From 891759f8609f300203d41cccc7337089b38858bd Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 29 Dec 2020 14:02:51 -0800 Subject: [PATCH 01/89] Clean up type annotations in caffe2/torch/nn/modules (#49938) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49938 Test Plan: Sandcastle tests Reviewed By: xush6528 Differential Revision: D25718705 fbshipit-source-id: 6a9e3e6d17aa458726cd32aa0a71a63c51b601d9 --- torch/nn/modules/activation.py | 5 ++--- torch/nn/modules/conv.py | 3 +-- torch/nn/modules/utils.py | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 8a16c8c27808..bc97b8d1a025 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -921,9 +921,8 @@ def __setstate__(self, state): super(MultiheadAttention, self).__setstate__(state) - def forward(self, query, key, value, key_padding_mask=None, - need_weights=True, attn_mask=None): - # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]] + def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: r""" Args: query, key, value: map a query and a set of key-value pairs to an output. diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index f22c35fa39ff..6df6eabf0646 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -530,8 +530,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, # dilation being an optional parameter is for backwards # compatibility - def _output_padding(self, input, output_size, stride, padding, kernel_size, dilation=None): - # type: (Tensor, Optional[List[int]], List[int], List[int], List[int], Optional[List[int]]) -> List[int] + def _output_padding(self, input: Tensor, output_size: Optional[List[int]], stride: List[int], padding: List[int], kernel_size: List[int], dilation: Optional[List[int]] = None) -> List[int]: if output_size is None: ret = _single(self.output_padding) # converting to list if was not already else: diff --git a/torch/nn/modules/utils.py b/torch/nn/modules/utils.py index 3e0b93c7afc0..97e4195619cb 100644 --- a/torch/nn/modules/utils.py +++ b/torch/nn/modules/utils.py @@ -26,8 +26,7 @@ def _reverse_repeat_tuple(t, n): return tuple(x for x in reversed(t) for _ in range(n)) -def _list_with_default(out_size, defaults): - # type: (List[int], List[int]) -> List[int] +def _list_with_default(out_size: List[int], defaults: List[int]) -> List[int]: if isinstance(out_size, int): return out_size if len(defaults) <= len(out_size): From 4c5a4dbb8cf6acfc714a992f0768295c77dae06f Mon Sep 17 00:00:00 2001 From: Protonu Basu Date: Tue, 29 Dec 2020 15:16:44 -0800 Subject: [PATCH 02/89] [Tensorexpr]Copying header files in tensorexpr dir (#49933) Summary: Previously header files from jit/tensorexpr were not copied, this PR should enable copying. This will allow other OSS projects like Glow to used TE. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49933 Reviewed By: Krovatkin, mruberry Differential Revision: D25725927 Pulled By: protonu fbshipit-source-id: 9d5a0586e9b73111230cacf044cd7e8f5c600ce9 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 01f173d6825b..8289b57e93be 100644 --- a/setup.py +++ b/setup.py @@ -892,6 +892,7 @@ def print_box(msg): 'include/torch/csrc/jit/serialization/*.h', 'include/torch/csrc/jit/python/*.h', 'include/torch/csrc/jit/testing/*.h', + 'include/torch/csrc/jit/tensorexpr/*.h', 'include/torch/csrc/onnx/*.h', 'include/torch/csrc/utils/*.h', 'include/pybind11/*.h', From 14edc726d93cfb5d1bce7712e16eb20c8d6a04b2 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 29 Dec 2020 15:42:12 -0800 Subject: [PATCH 03/89] Clean up some type annotations in caffe2/torch/quantization (#49942) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49942 Upgrades type annotations from Python2 to Python3 Test Plan: Sandcastle tests Reviewed By: vkuzo Differential Revision: D25717551 fbshipit-source-id: 1b63dc485ecf6641641b05f7ce095ae1d2d87346 --- torch/quantization/_numeric_suite_fx.py | 2 +- torch/quantization/fake_quantize.py | 6 ++---- torch/quantization/observer.py | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/torch/quantization/_numeric_suite_fx.py b/torch/quantization/_numeric_suite_fx.py index eb1596832c4d..aeba95bb4e8f 100644 --- a/torch/quantization/_numeric_suite_fx.py +++ b/torch/quantization/_numeric_suite_fx.py @@ -21,7 +21,7 @@ def remove_qconfig_observer_fx(model): # remove activation post process act_post_process_removed_graph = Graph() - env = {} # type: Dict[str, Any] + env: Dict[str, Any] = {} modules = dict(model.named_modules()) diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index f0ee8453557d..460b1c277a93 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -41,8 +41,7 @@ def calculate_qparams(self, **kwargs): pass @torch.jit.export - def enable_fake_quant(self, enabled=True): - # type: (bool) -> None + def enable_fake_quant(self, enabled: bool = True) -> None: self.fake_quant_enabled[0] = 1 if enabled else 0 @torch.jit.export @@ -50,8 +49,7 @@ def disable_fake_quant(self): self.enable_fake_quant(False) @torch.jit.export - def enable_observer(self, enabled=True): - # type: (bool) -> None + def enable_observer(self, enabled: bool = True) -> None: self.observer_enabled[0] = 1 if enabled else 0 @torch.jit.export diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py index 32d07c939695..7addaa622962 100644 --- a/torch/quantization/observer.py +++ b/torch/quantization/observer.py @@ -877,8 +877,7 @@ def _combine_histograms(self, orig_hist = orig_hist + interpolated_histogram.to(torch.float) return orig_hist - def forward(self, x_orig): - # type: (torch.Tensor) -> torch.Tensor + def forward(self, x_orig: torch.Tensor) -> torch.Tensor: x = x_orig.detach() min_val = self.min_val max_val = self.max_val From 01b57e1810340fa3653c90995bf6d87af1d57a0d Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Tue, 29 Dec 2020 16:40:43 -0800 Subject: [PATCH 04/89] Revert D25718705: Clean up type annotations in caffe2/torch/nn/modules Test Plan: revert-hammer Differential Revision: D25718705 (https://github.com/pytorch/pytorch/commit/891759f8609f300203d41cccc7337089b38858bd) Original commit changeset: 6a9e3e6d17aa fbshipit-source-id: 1a4ef0bfdec8eb8e7ce149bfbdb34a4ad8d964b6 --- 
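For context on what this revert undoes: patches 01 and 03 in this series replace Python 2 style `# type:` comments with inline Python 3 annotations, and this commit restores the comment form in torch/nn/modules. A minimal sketch of the two equivalent spellings follows; the function name and body are illustrative only and not taken from the patch:

    from typing import List

    # Python 2 style: the signature is untyped and the type checker reads the comment.
    def pad_sizes_py2(sizes, defaults):
        # type: (List[int], List[int]) -> List[int]
        return sizes + defaults[len(sizes):]

    # Python 3 style (the form D25718705 introduced before this revert):
    # the same information expressed as inline annotations.
    def pad_sizes_py3(sizes: List[int], defaults: List[int]) -> List[int]:
        return sizes + defaults[len(sizes):]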
torch/nn/modules/activation.py | 5 +++-- torch/nn/modules/conv.py | 3 ++- torch/nn/modules/utils.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index bc97b8d1a025..8a16c8c27808 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -921,8 +921,9 @@ def __setstate__(self, state): super(MultiheadAttention, self).__setstate__(state) - def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: + def forward(self, query, key, value, key_padding_mask=None, + need_weights=True, attn_mask=None): + # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]] r""" Args: query, key, value: map a query and a set of key-value pairs to an output. diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 6df6eabf0646..f22c35fa39ff 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -530,7 +530,8 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, # dilation being an optional parameter is for backwards # compatibility - def _output_padding(self, input: Tensor, output_size: Optional[List[int]], stride: List[int], padding: List[int], kernel_size: List[int], dilation: Optional[List[int]] = None) -> List[int]: + def _output_padding(self, input, output_size, stride, padding, kernel_size, dilation=None): + # type: (Tensor, Optional[List[int]], List[int], List[int], List[int], Optional[List[int]]) -> List[int] if output_size is None: ret = _single(self.output_padding) # converting to list if was not already else: diff --git a/torch/nn/modules/utils.py b/torch/nn/modules/utils.py index 97e4195619cb..3e0b93c7afc0 100644 --- a/torch/nn/modules/utils.py +++ b/torch/nn/modules/utils.py @@ -26,7 +26,8 @@ def _reverse_repeat_tuple(t, n): return tuple(x for x in reversed(t) for _ in range(n)) -def _list_with_default(out_size: List[int], defaults: List[int]) -> List[int]: +def _list_with_default(out_size, defaults): + # type: (List[int], List[int]) -> List[int] if isinstance(out_size, int): return out_size if len(defaults) <= len(out_size): From e482c70a3dbeca70cc5164ac28b87c2c6906edf3 Mon Sep 17 00:00:00 2001 From: Jony Karki <25265687+jonykarki@users.noreply.github.com> Date: Tue, 29 Dec 2020 16:42:53 -0800 Subject: [PATCH 05/89] added List as an option to the unflattened_size (#49838) Summary: Fixes https://github.com/pytorch/pytorch/issues/49743 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49838 Reviewed By: mruberry Differential Revision: D25727971 Pulled By: ngimel fbshipit-source-id: 60142dae84ef107f0083676a2a78ce6b0472b7e1 --- test/test_nn.py | 31 ++++++++++++++++--------------- torch/nn/modules/flatten.py | 26 ++++++++++++-------------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index 1d63be6e3075..386ba369dca6 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9283,18 +9283,19 @@ def test_flatten(self): def test_unflatten(self): tensor_input = torch.randn(2, 50) - # Unflatten Tensor + # Unflatten Tensor (unflattened_size as a tuple of ints and list of ints) - unflatten = nn.Unflatten(dim=1, unflattened_size=(2, 5, 5)) - tensor_output = unflatten(tensor_input) - self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) + for us in ((2, 5, 5), [2, 5, 5]): + unflatten = 
nn.Unflatten(dim=1, unflattened_size=us) + tensor_output = unflatten(tensor_input) + self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) # Unflatten NamedTensor unflatten = nn.Unflatten(dim='features', unflattened_size=(('C', 2), ('H', 5), ('W', 5))) named_tensor_input = tensor_input.refine_names('N', 'features') named_tensor_output = unflatten(named_tensor_input) - self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) + self.assertEqual(named_tensor_output.size(), torch.Size([2, 2, 5, 5])) def test_unflatten_invalid_arg(self): # Wrong type for unflattened_size (tuple of floats) @@ -9304,6 +9305,13 @@ def test_unflatten_invalid_arg(self): r"unflattened_size must be tuple of ints, but found element of type float at pos 2"): nn.Unflatten(dim=1, unflattened_size=(2, 5, 5.0)) + # Wrong type for unflattened_size (list of lists and list of tuples) + for us in ([['C', 2], ['W', 5], ['H', 5]], [('C', 2), ('W', 5), ('H', 5)]): + with self.assertRaisesRegex( + TypeError, + r"unflattened_size must be a tuple of tuples, but found type list"): + nn.Unflatten(dim='features', unflattened_size=us) + # Wrong type for unflattened_size (tuple of lists) with self.assertRaisesRegex( @@ -9311,19 +9319,12 @@ def test_unflatten_invalid_arg(self): r"unflattened_size must be tuple of tuples, but found element of type list at pos 0"): nn.Unflatten(dim='features', unflattened_size=(['C', 2], ['W', 5], ['H', 5])) - # Wrong type for unflattened_size (list of ints) - - with self.assertRaisesRegex( - TypeError, - r"unflattened_size must be a tuple of ints, but found type list"): - nn.Unflatten(dim=1, unflattened_size=[2, 5, 5]) - - # Wrong type for unflattened_size (list of lists) + # Wrong type for unflattened_size (tuple of dicts) with self.assertRaisesRegex( TypeError, - r"unflattened_size must be a tuple of tuples, but found type list"): - nn.Unflatten(dim='features', unflattened_size=[['C', 2], ['W', 5], ['H', 5]]) + r"unflattened_size must be tuple of tuples, but found element of type dict at pos 0"): + nn.Unflatten(dim='features', unflattened_size=({'C': 2}, {'W': 5}, {'H': 5})) def test_layer_norm_grads_with_create_graph_flag(self): atol = 1e-5 diff --git a/torch/nn/modules/flatten.py b/torch/nn/modules/flatten.py index c06b7a5534f6..dd491ba99620 100644 --- a/torch/nn/modules/flatten.py +++ b/torch/nn/modules/flatten.py @@ -2,7 +2,7 @@ from typing import Tuple, Union from torch import Tensor -from torch import Size +from torch.types import _size class Flatten(Module): @@ -53,8 +53,8 @@ class Unflatten(Module): be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively. * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be - a `tuple` of ints or `torch.Size` for `Tensor` input or a `NamedShape` (tuple of `(name, size)` tuples) - for `NamedTensor` input. + a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input; a `NamedShape` + (tuple of `(name, size)` tuples) for `NamedTensor` input. 
Shape: - Input: :math:`(N, *dims)` @@ -62,7 +62,7 @@ class Unflatten(Module): Args: dim (Union[int, str]): Dimension to be unflattened - unflattened_size (Union[torch.Size, NamedShape]): New shape of the unflattened dimension + unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension Examples: >>> input = torch.randn(2, 50) @@ -71,7 +71,7 @@ class Unflatten(Module): >>> nn.Linear(50, 50), >>> nn.Unflatten(1, (2, 5, 5)) >>> ) - >>> output = m(output) + >>> output = m(input) >>> output.size() torch.Size([2, 2, 5, 5]) >>> # With torch.Size @@ -79,15 +79,13 @@ class Unflatten(Module): >>> nn.Linear(50, 50), >>> nn.Unflatten(1, torch.Size([2, 5, 5])) >>> ) - >>> output = m(output) + >>> output = m(input) >>> output.size() torch.Size([2, 2, 5, 5]) >>> # With namedshape (tuple of tuples) - >>> m = nn.Sequential( - >>> nn.Linear(50, 50), - >>> nn.Unflatten('features', (('C', 2), ('H', 50), ('W',50))) - >>> ) - >>> output = m(output) + >>> input = torch.randn(2, 50, names=('N', 'features')) + >>> unflatten = nn.Unflatten('features', (('C', 2), ('H', 5), ('W', 5))) + >>> output = unflatten(input) >>> output.size() torch.Size([2, 2, 5, 5]) """ @@ -95,9 +93,9 @@ class Unflatten(Module): __constants__ = ['dim', 'unflattened_size'] dim: Union[int, str] - unflattened_size: Union[Size, NamedShape] + unflattened_size: Union[_size, NamedShape] - def __init__(self, dim: Union[int, str], unflattened_size: Union[Size, NamedShape]) -> None: + def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]) -> None: super(Unflatten, self).__init__() if isinstance(dim, int): @@ -121,7 +119,7 @@ def _require_tuple_tuple(self, input): "but found type {}".format(type(input).__name__)) def _require_tuple_int(self, input): - if (isinstance(input, tuple)): + if (isinstance(input, (tuple, list))): for idx, elem in enumerate(input): if not isinstance(elem, int): raise TypeError("unflattened_size must be tuple of ints, " + From 97c17b47720dd1afce88180b843c9182f1a665a9 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Tue, 29 Dec 2020 17:01:16 -0800 Subject: [PATCH 06/89] Fix auto exponent issue for torch.pow (#49809) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49809 Fixes https://github.com/pytorch/xla/issues/2688 #46936 Test Plan: Imported from OSS Reviewed By: nikithamalgifb Differential Revision: D25724176 Pulled By: anjali411 fbshipit-source-id: 16287a1f481e9475679b99d6fb45de840da225be --- aten/src/ATen/native/Pow.cpp | 6 ++-- aten/src/ATen/native/cpu/PowKernel.cpp | 2 +- aten/src/ATen/test/scalar_test.cpp | 20 +++++++++++++ c10/core/Scalar.cpp | 10 +++++++ c10/core/Scalar.h | 39 +++++++++++++++++++++++++ test/cpp/api/autograd.cpp | 2 +- torch/csrc/autograd/FunctionsManual.cpp | 17 +++++------ 7 files changed, 81 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp index bfc5f910e093..4d1601d3e6a0 100644 --- a/aten/src/ATen/native/Pow.cpp +++ b/aten/src/ATen/native/Pow.cpp @@ -31,11 +31,9 @@ Tensor& pow_out(Tensor& result, const Tensor& base, Scalar exp) { "result type ", common_dtype, "can't be cast to the desired output type ", result.scalar_type()); - auto exponent = (exp.isComplex()) ? 
exp.toComplexDouble() : exp.toDouble(); - - if (exponent == 0.0) { + if (exp.equal(0.0)) { result.resize_as_(base).fill_(1); - } else if (exponent == 1.0) { + } else if (exp.equal(1.0)) { result.resize_as_(base).copy_(base); } else { auto iter = TensorIterator::unary_op(result, base.to(common_dtype)); diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp b/aten/src/ATen/native/cpu/PowKernel.cpp index b7ec099a80da..6f0d153e978a 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -63,7 +63,7 @@ void pow_tensor_scalar_kernel(TensorIterator& iter, Scalar exp_scalar) { ); } else if (exp == -0.5) { cpu_kernel_vec(iter, - [](scalar_t base) -> scalar_t { + [](scalar_t base) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return 1.0 / std::sqrt(base); }, [](Vec base) -> Vec { return base.rsqrt(); } diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 68c0b4f3f71a..3b7bfb47fe62 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -138,3 +138,23 @@ TEST(TestScalar, TestConj) { ASSERT_EQ(float_scalar.conj().toDouble(), 3.0); ASSERT_EQ(complex_scalar.conj().toComplexDouble(), c10::complex(2.3, -3.5)); } + +TEST(TestScalar, TestEqual) { + ASSERT_FALSE(Scalar(1.0).equal(false)); + ASSERT_FALSE(Scalar(1.0).equal(true)); + ASSERT_FALSE(Scalar(true).equal(1.0)); + ASSERT_TRUE(Scalar(true).equal(true)); + + ASSERT_TRUE(Scalar(c10::complex{2.0, 5.0}).equal(c10::complex{2.0, 5.0})); + ASSERT_TRUE(Scalar(c10::complex{2.0, 0}).equal(2.0)); + ASSERT_TRUE(Scalar(c10::complex{2.0, 0}).equal(2)); + + ASSERT_TRUE(Scalar(2.0).equal(c10::complex{2.0, 0.0})); + ASSERT_FALSE(Scalar(2.0).equal(c10::complex{2.0, 4.0})); + ASSERT_FALSE(Scalar(2.0).equal(3.0)); + ASSERT_TRUE(Scalar(2.0).equal(2)); + + ASSERT_TRUE(Scalar(2).equal(c10::complex{2.0, 0})); + ASSERT_TRUE(Scalar(2).equal(2)); + ASSERT_TRUE(Scalar(2).equal(2.0)); +} diff --git a/c10/core/Scalar.cpp b/c10/core/Scalar.cpp index 35aa5d60f001..212c41d5b19c 100644 --- a/c10/core/Scalar.cpp +++ b/c10/core/Scalar.cpp @@ -21,4 +21,14 @@ Scalar Scalar::conj() const { } } +Scalar Scalar::log() const { + if (isComplex()) { + return std::log(v.z); + } else if (isFloatingPoint()) { + return std::log(v.d); + } else { + return std::log(v.i); + } +} + } // namespace c10 diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h index 6151f6d2b150..368228e8202e 100644 --- a/c10/core/Scalar.h +++ b/c10/core/Scalar.h @@ -88,6 +88,45 @@ class C10_API Scalar { Scalar operator-() const; Scalar conj() const; + Scalar log() const; + + template::value, int>::type = 0> + bool equal(T num) const { + if (isComplex()) { + auto val = v.z; + return (val.real() == num) && (val.imag() == T()); + } else if (isFloatingPoint()) { + return v.d == num; + } else if (isIntegral(/*includeBool=*/false)) { + return v.i == num; + } else { + // boolean scalar does not equal to a non boolean value + return false; + } + } + + template::value, int>::type = 0> + bool equal(T num) const { + if (isComplex()) { + return v.z == num; + } else if (isFloatingPoint()) { + return (v.d == num.real()) && (num.imag() == T()); + } else if (isIntegral(/*includeBool=*/false)) { + return (v.i == num.real()) && (num.imag() == T()); + } else { + // boolean scalar does not equal to a non boolean value + return false; + } + } + + bool equal(bool num) const { + if (isBoolean()) { + return static_cast(v.i) == num; + } else { + return false; + } + } + ScalarType type() const { if (isComplex()) { return 
ScalarType::ComplexDouble; diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp index e4bb96ece6fb..3f79c771c2be 100644 --- a/test/cpp/api/autograd.cpp +++ b/test/cpp/api/autograd.cpp @@ -175,7 +175,7 @@ TEST(AutogradAPITests, AnomalyMode) { auto y = x.pow(1.5); auto gr = grad({y}, {x}, {}, /*retain_graph=*/true, /*create_backward=*/true); - ASSERT_THROWS_WITH(grad({gr[0]}, {x});, "returned nan"); + ASSERT_THROWS_WITH(grad({gr[0]}, {x}, {torch::tensor({0.0})});, "returned nan"); auto msgs = warnings.messages(); ASSERT_EQ(msgs.size(), 2); ASSERT_TRUE( diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 0121fef95155..23c0be2e70d6 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -205,12 +205,12 @@ Tensor norm_backward(Tensor grad, const Tensor & self, const optional & return norm_backward(grad, self, p_, norm); } -Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent_) { - auto exponent = (exponent_.isComplex()) ? exponent_.toComplexDouble() : exponent_.toDouble(); - if (exponent == 0.0) { +Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent) { + if (exponent.equal(0.0)) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } else { - auto out = grad * (exponent * self.pow(exponent - 1)).conj(); + auto grad_lambda = [&](auto exp) { return grad * (exp * self.pow(exp - 1)).conj(); }; + Tensor out = (exponent.isComplex()) ? grad_lambda(exponent.toComplexDouble()) : grad_lambda(exponent.toDouble()); return handle_r_to_c(self, out); } } @@ -243,9 +243,8 @@ Tensor pow_backward_exponent(Tensor grad, const Tensor& self, const Tensor& expo } Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exponent, Tensor result) { - auto base_ = base.isComplex() ? 
base.toComplexDouble() : base.toDouble(); - auto grad_lambda = [](auto a, auto b) { return (a * std::log(b)).conj(); }; - if (base_ == 0.0) { + auto grad_lambda = [](Tensor a, Scalar b) { return (a * b.log()).conj(); }; + if (base.equal(0.0)) { auto cond = [](auto exp) { if (exp.is_complex()) { return at::logical_and(at::imag(exp) == 0, at::real(exp) >= 0); @@ -255,10 +254,10 @@ Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exp }; auto out = grad * at::where(cond(exponent), at::zeros({}, grad.options()), - grad_lambda(result, base_)); + grad_lambda(result, base)); return handle_r_to_c(exponent, out); } else { - auto out = grad * grad_lambda(result, base_); + auto out = grad * grad_lambda(result, base); return handle_r_to_c(exponent, out); } } From 12b73fdbbf4a89f3ec46983f90a2bb2d866cb338 Mon Sep 17 00:00:00 2001 From: Nikitha Malgi Date: Tue, 29 Dec 2020 20:22:19 -0800 Subject: [PATCH 07/89] Adding JIT support for cuda streams and events (#48020) Summary: ======= This PR addresses the following: * Adds JIT support for CUDA Streams * Adds JIT support for CUDA Events * Adds JIT support for CUDA Stream context manager Testing: ====== python test/test_jit.py -v TestCUDA Pull Request resolved: https://github.com/pytorch/pytorch/pull/48020 Reviewed By: navahgar Differential Revision: D25725749 Pulled By: nikithamalgifb fbshipit-source-id: b0addeb49630f8f0c430ed7badeca43bb9d2535c --- aten/src/ATen/core/interned_strings.h | 9 + test/cpp/jit/test_save_load.cpp | 28 ++ test/cpp/jit/tests_setup.py | 27 + test/jit/test_cuda.py | 476 ++++++++++++++++++ test/test_jit.py | 1 + tools/build_variables.bzl | 1 + torch/csrc/jit/cuda/cuda.h | 179 +++++++ .../csrc/jit/frontend/script_type_parser.cpp | 7 + torch/csrc/jit/ir/alias_analysis.cpp | 3 +- torch/csrc/jit/ir/ir.cpp | 7 +- torch/csrc/jit/ir/ir.h | 5 + .../csrc/jit/python/python_sugared_value.cpp | 32 ++ torch/csrc/jit/python/python_sugared_value.h | 14 + torch/csrc/jit/runtime/register_cuda_ops.cpp | 87 ++++ torch/jit/__init__.py | 1 + torch/jit/cuda.py | 182 +++++++ 16 files changed, 1057 insertions(+), 2 deletions(-) create mode 100644 test/jit/test_cuda.py create mode 100644 torch/csrc/jit/cuda/cuda.h create mode 100644 torch/csrc/jit/runtime/register_cuda_ops.cpp create mode 100644 torch/jit/cuda.py diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 8065300f0b32..f99dc3c07058 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -17,6 +17,7 @@ namespace c10 { #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ _(namespaces, aten) \ + _(namespaces, cuda) \ _(namespaces, onnx) \ _(namespaces, attr) \ _(namespaces, scope) \ @@ -284,6 +285,9 @@ namespace c10 { _(aten, zero_) \ _(aten, fill_) \ _(aten, masked_fill_) \ + _(cuda, _set_device) \ + _(cuda, set_stream) \ + _(cuda, _current_device) \ _(aten, swapaxes) \ _(aten, swapaxes_) \ _(aten, swapdims) \ @@ -383,6 +387,7 @@ namespace c10 { #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ _(namespaces, aten) \ + _(namespaces, cuda) \ _(namespaces, onnx) \ _(namespaces, attr) \ _(namespaces, scope) \ @@ -453,6 +458,7 @@ struct TORCH_API Symbol { // (and if it's not, you should add it to the built-ins list above.) 
static Symbol attr(const std::string & s); static Symbol aten(const std::string & s); + static Symbol cuda(const std::string & s); static Symbol onnx(const std::string & s); static Symbol prim(const std::string & s); static Symbol user(const std::string & s); @@ -463,6 +469,7 @@ struct TORCH_API Symbol { bool is_attr() const; bool is_aten() const; + bool is_cuda() const; bool is_prim() const; bool is_onnx() const; bool is_user() const; @@ -523,6 +530,7 @@ FORALL_NS_SYMBOLS(DEFINE_SYMBOL) inline Symbol Symbol::attr(const std::string & s) { return Symbol::fromQualString("attr::" + s); } inline Symbol Symbol::aten(const std::string & s) { return Symbol::fromQualString("aten::" + s); } +inline Symbol Symbol::cuda(const std::string & s) { return Symbol::fromQualString("cuda::" + s); } inline Symbol Symbol::onnx(const std::string & s) { return Symbol::fromQualString("onnx::" + s); } inline Symbol Symbol::prim(const std::string & s) { return Symbol::fromQualString("prim::" + s); } inline Symbol Symbol::scope(const std::string & s) { return Symbol::fromQualString("scope::" + s); } @@ -531,6 +539,7 @@ inline Symbol Symbol::caffe2(const std::string & s) { return Symbol::fromQualStr inline Symbol Symbol::dimname(const std::string & s) { return Symbol::fromQualString("dimname::" + s); } inline bool Symbol::is_attr() const { return ns() == namespaces::attr; } inline bool Symbol::is_aten() const { return ns() == namespaces::aten; } +inline bool Symbol::is_cuda() const { return ns() == namespaces::cuda; } inline bool Symbol::is_prim() const { return ns() == namespaces::prim; } inline bool Symbol::is_onnx() const { return ns() == namespaces::onnx; } inline bool Symbol::is_user() const { return ns() == namespaces::user; } diff --git a/test/cpp/jit/test_save_load.cpp b/test/cpp/jit/test_save_load.cpp index 2e59358b4e00..e102a6ff767c 100644 --- a/test/cpp/jit/test_save_load.cpp +++ b/test/cpp/jit/test_save_load.cpp @@ -120,5 +120,33 @@ TEST(SerializationTest, TypeTags) { } } +TEST(SerializationTest, TestJitStream_CUDA) { + torch::jit::Module model; + std::vector inputs; + // Deserialize the ScriptModule from a file using torch::jit::load(). + // Load the scripted model. 
This should have been generated by tests_setup.py + // Refer: TorchSaveJitStream_CUDA in test/cpp/jit/tests_setup.py + model = torch::jit::load("saved_stream_model.pt"); + + auto output = model.forward(inputs); + auto list_of_elements = output.toTuple()->elements(); + auto is_stream_s = list_of_elements[0].toBool(); + + // a,b: These are the two input tensors + // c: This is output tensor generated by the operation torch.cat(a,b) + auto a = list_of_elements[1].toTensor(); + auto b = list_of_elements[2].toTensor(); + auto c = list_of_elements[3].toTensor(); + // op: this is used to verify if the cat operation produced the same results + // as that on the GPU with torch.cat + auto op = at::cat({a, b}, 0); + + // Check if the stream is set + ASSERT_TRUE(is_stream_s); + // Check if the sizes of the outputs (op and c) is same on the GPU and CPU + ASSERT_EQ(op.sizes(), c.sizes()); + // Check if both the output tensors are equal + ASSERT_TRUE(op.equal(c)); +} } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests_setup.py b/test/cpp/jit/tests_setup.py index 68871d1c21d2..928a06d9b5a0 100644 --- a/test/cpp/jit/tests_setup.py +++ b/test/cpp/jit/tests_setup.py @@ -63,11 +63,38 @@ def setup(self): torch.save(value, self.path, _use_new_zipfile_serialization=False) +class TorchSaveJitStream_CUDA(FileSetup): + path = 'saved_stream_model.pt' + + def setup(self): + if not torch.cuda.is_available(): + return + + class Model(torch.nn.Module): + def forward(self): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + a = torch.rand(3, 4, device="cuda") + b = torch.rand(3, 4, device="cuda") + + with torch.jit.cuda.stream(s): + is_stream_s = torch.cuda.current_stream(s.device_index()).id() == s.id() + c = torch.cat((a, b), 0).to("cuda") + s.synchronize() + return is_stream_s, a, b, c + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + torch.jit.save(script_model, self.path) + tests = [ EvalModeForLoadedModule(), SerializationInterop(), TorchSaveError(), + TorchSaveJitStream_CUDA() ] def setup(): diff --git a/test/jit/test_cuda.py b/test/jit/test_cuda.py new file mode 100644 index 000000000000..f7af8e3a2efc --- /dev/null +++ b/test/jit/test_cuda.py @@ -0,0 +1,476 @@ +import os +import sys +import gc +import unittest + +import torch +from typing import NamedTuple +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import skipIfRocm, skipCUDANonDefaultStreamIf + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) + +# Check if GPU is available +TEST_CUDA = torch.cuda.is_available() +# Check if multiple GPU's are available +TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 + +# If GPU is not available, then do not run the tests +if not TEST_CUDA: + print('CUDA not available, skipping tests', file=sys.stderr) + JitTestCase = object # noqa: F811 + +TEST_LARGE_TENSOR = TEST_CUDA + +# If GPU is available, then initialize the cuda context and check +# if there is memory available to allocate for LARGE Tensors. +if TEST_CUDA: + torch.ones(1).cuda() # initialize cuda context + TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 5e9 + +if __name__ == "__main__": + raise RuntimeError( + "This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead." 
+ ) + +class TestCUDA(JitTestCase): + """ + A suite of tests for the CUDA API in TorchScript. + """ + def setUp(self): + super(TestCUDA, self).setUp() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + super(TestCUDA, self).tearDown() + + @skipIfRocm + @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + def test_current_stream(self): + # Test current stream on the device and check if the stream device index + # matches with the device ID + @torch.jit.script + def fn(): + device_index = torch.cuda._current_device() + s0 = torch.cuda.current_stream(device_index) + s1 = torch.cuda.current_stream(1) + s2 = torch.cuda.current_stream(0) + + return s0.device_index(), s1.device_index(), s2.device_index() + + d0, d1, d2 = fn() + + # By default, the current device ID is 0. + self.assertEqual(0, d0) + self.assertEqual(1, d1) + self.assertEqual(0, d2) + self.assertEqual(d0, d2) + + @skipIfRocm + @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") + @skipCUDANonDefaultStreamIf(True) + def test_streams_and_events(self): + # This test checks for the default stream ID is set to 0 on the device + @torch.jit.script + def test_default_streams(): + s0 = torch.cuda.default_stream(0) + s1 = torch.cuda.default_stream(1) + + d = torch.device('cuda:1') + + # Check the current stream id and default id are same + # on the current device. The current device id by default is 0 + s2 = torch.cuda.current_stream(0) + check_s2 = s2.id() == s0.id() + check_d0 = torch.cuda._current_device() == s2.device_index() + + # Set the current device to d1 and check if the stream + # has been set to the default stream on d1 + with torch.jit.cuda.device(d): + s3 = torch.cuda.current_stream(1) + check_s3 = s3.id() == s1.id() + check_d1 = torch.cuda._current_device() == s3.device_index() + + # Check if the current device was reset to 0 + is_device_d0 = torch.cuda._current_device() == s2.device_index() + + return s0.device_index(), s1.device_index(), check_s2, check_s3, check_d0, check_d1, is_device_d0 + + d0, d1, check_s2, check_s3, check_d0, check_d1, is_device_d0 = test_default_streams() + + self.assertEqual(d0, 0) + self.assertEqual(d1, 1) + self.assertTrue(check_s2) + self.assertTrue(check_s3) + self.assertTrue(check_d0) + self.assertTrue(check_d1) + self.assertTrue(is_device_d0) + + # This test checks if the Stream Context manager is a no op + # when the stream is none for `with torch.jit.cuda.stream` + @torch.jit.script + def test_set_none_stream(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + default_stream = torch.cuda.default_stream(device_index) + + # When stream is none, check if this operation is a no-op + with torch.jit.cuda.stream(None): + cur_device_index = torch.cuda._current_device() + is_device_index_same = cur_device_index == device_index + is_current_stream_same = torch.cuda.current_stream(cur_device_index).id() == current_stream.id() + is_default_stream_same = torch.cuda.default_stream(device_index).id() == default_stream.id() + + # Check if the device index, current stream and default streams have not changed + are_streams_same = is_device_index_same and is_current_stream_same and is_default_stream_same + return are_streams_same + self.assertTrue(test_set_none_stream()) + + # This test checks if the Device Context manager is a no op + # when the device is none for `with torch.jit.cuda.device` + @torch.jit.script + def test_set_device_none(): + device_index 
= torch.cuda._current_device() + # When device is none, check if this operation is a no-op + with torch.jit.cuda.device(None): + # Check if the current device is the same + is_device_same = torch.cuda._current_device() == device_index + return is_device_same + self.assertTrue(test_set_device_none()) + + # Check if a CUDA JIT stream is created + # on the _current_device + @torch.jit.script + def test_simple_stream(): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + return device_index == s.device_index() + + self.assertTrue(test_simple_stream(), "Could not create Stream!") + + # Class used to store results for the test: test_get_stream. + class Result(NamedTuple): + t1 : torch.Tensor + t2 : torch.Tensor + is_current_and_default_stream_same : bool + is_default_and_user_stream_not_same : bool + is_stream_set : bool + is_stream_reset : bool + default_stream_query : bool + default_stream_id : int + user_stream_id : int + + # The test aims at checking different stream proporties. + @torch.jit.script + def test_get_stream(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + default_stream = torch.cuda.default_stream(device_index) + user_stream = torch.jit.cuda.Stream(device_index, 0) + + # Check if the current and default streams are the same on the device + is_current_and_default_stream_same = current_stream.id() == default_stream.id() + # Check if user stream and default stream are not the same on the device + is_default_and_user_stream_not_same = default_stream.id() != user_stream.id() + + with torch.jit.cuda.stream(user_stream): + is_stream_set = torch.cuda.current_stream(device_index).id() == user_stream.id() + + # Check if the stream was reset to current_stream + is_stream_reset = torch.cuda.current_stream(device_index).id() == current_stream.id() + + tensor1 = torch.rand(10000, 10000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + default_stream.synchronize() + default_stream_query = default_stream.query() + + # Capture all the results in the class Result + res = Result( + tensor1, tensor2, is_current_and_default_stream_same, + is_default_and_user_stream_not_same, is_stream_set, + is_stream_reset, default_stream_query, default_stream.id(), user_stream.id()) + return res + + result = test_get_stream() + + self.assertEqual(torch.matmul(result.t1, result.t1), result.t2) + self.assertTrue(result.is_current_and_default_stream_same) + self.assertTrue(result.is_default_and_user_stream_not_same) + self.assertTrue(result.is_stream_set) + self.assertTrue(result.is_stream_reset) + self.assertTrue(result.default_stream_query) + self.assertEqual(result.default_stream_id, 0) # Check if the default stream ID is always 0 + self.assertNotEqual(result.user_stream_id, 0) # Check if the user stream is always non zero + + # Test the stream context manager. This test checks if the stream is switched + # to the user stream on using the stream context manager. 
+ @torch.jit.script + def test_stream_context(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + user_stream = torch.jit.cuda.Stream(device_index, 0) + A = torch.rand(1000, 1000, device="cuda") + + with torch.jit.cuda.stream(user_stream): + check = torch.cuda.current_stream(device_index).id() == user_stream.id() + B = torch.mm(A, A).to("cuda") + # Wait for B to be computed + user_stream.synchronize() + # Check if the stream has been reset on the current device + is_stream_reset = torch.cuda.current_stream(device_index).id() == current_stream.id() + + return A, B, check, is_stream_reset + + A, B, is_stream_set, is_stream_reset = test_stream_context() + self.assertEqual(torch.matmul(A, A), B) + self.assertTrue(is_stream_set, "Error: Current stream was not set to user stream!") + self.assertTrue(is_stream_reset, "Error: The stream was not restored to previous stream!") + + # Test multiple nested streams. Check if the operations are computed as expected on the streams + # This test has been adapted from the eager mode tests available at test/test_cuda.py + @torch.jit.script + def test_multiple_stream(): + prev_device_index = torch.cuda._current_device() + prev_current_stream = torch.cuda.current_stream(prev_device_index) + s1 = torch.jit.cuda.Stream(0, 0) + s2 = torch.jit.cuda.Stream(1, 0) + + A = torch.rand(1000, 1000, device="cuda") + B = torch.rand(1000, 1000, device="cuda") + with torch.jit.cuda.stream(s1): + C = torch.mm(A, A).to("cuda") + # Check if the stream and device have been set to s1 + is_stream_s1 = torch.cuda.current_stream(s1.device_index()).id() == s1.id() + is_device_s1 = torch.cuda._current_device() == s1.device_index() + with torch.jit.cuda.stream(s2): + # Check if the stream and device have been set to s2 + is_stream_s2 = torch.cuda.current_stream(s2.device_index()).id() == s2.id() + is_device_s2 = torch.cuda._current_device() == s2.device_index() + D = torch.mm(B, B).to("cuda") + # Check if the stream and device have been set to s1 + is_stream_s1_after = torch.cuda.current_stream(s1.device_index()).id() == s1.id() + is_device_s1_after = torch.cuda._current_device() == s1.device_index() + # Wait for D to be computed + s2.synchronize() + # Wait for C to be computed on S1 + s1.synchronize() + + # Check if the stream and device has been restored to previous stream and device + is_device_current = torch.cuda._current_device() == prev_device_index + is_stream_current = torch.cuda.current_stream(prev_device_index).id() == prev_current_stream.id() + + check_stream = is_stream_s1 and is_stream_s2 and is_stream_s1_after and is_stream_current + check_device = is_device_s1 and is_device_s2 and is_device_s1_after and is_device_current + return A, B, C, D, check_stream, check_device + A, B, C, D, check_stream, check_device = test_multiple_stream() + + self.assertEqual(torch.matmul(A, A), C) + self.assertEqual(torch.matmul(B, B), D) + self.assertTrue(check_stream) + self.assertTrue(check_device) + + # Test multiple streams waiting on each other for the operations to be completed. 
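+        # The next test exercises the cross-stream dependency idiom that the new API
+        # enables: the producer stream records an Event after queuing its work, and the
+        # consumer stream calls wait_event(event) so that kernels it launches afterwards
+        # are ordered after the producer's work without blocking the host.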
+ @torch.jit.script + def test_data_dependency_between_streams(): + device_index = torch.cuda._current_device() + prev_current_stream = torch.cuda.current_stream(device_index) + s1 = torch.jit.cuda.Stream(0, 0) + s2 = torch.jit.cuda.Stream(0, 0) + event = torch.jit.cuda.Event(False, False, False) + + A = torch.rand(1000, 1000, device="cuda") + with torch.jit.cuda.stream(s1): + is_stream_s1 = torch.cuda.current_stream(device_index).id() == s1.id() + B = torch.mm(A, A).to("cuda") + s1.record_event(event) + # Check if the current_stream is reset + is_current_stream_1 = torch.cuda.current_stream(device_index).id() == prev_current_stream.id() + # Wait for ops on s1 to be computed + s2.wait_event(event) + with torch.jit.cuda.stream(s2): + is_stream_s2 = torch.cuda.current_stream(device_index).id() == s2.id() + C = torch.mm(B, B).to("cuda") + # Wait for C to be computed + s2.synchronize() + # Check if the current_stream is reset + is_current_stream_2 = torch.cuda.current_stream(device_index).id() == prev_current_stream.id() + + check_stream = is_current_stream_1 and is_current_stream_2 and is_stream_s1 and is_stream_s2 + return A, B, C, check_stream + + A, B, C, check_stream = test_data_dependency_between_streams() + self.assertEqual(torch.matmul(A, A), B) + self.assertEqual(torch.matmul(B, B), C) + self.assertTrue(check_stream) + + # Test a simple CUDA event. Test if the CUDA event was created successfully + @torch.jit.script + def test_simple_event(): + e = torch.jit.cuda.Event(True, False, False) + return e is not None + self.assertTrue(test_simple_event(), "Could not create CUDA Event!") + + # Record the CUDA event for operation torch.mm on the current stream + # and then test if the elapsed time is greater than 0. This test is also + # an adaption from eager mdoe CUDA tests available at test/test_cuda.py + @torch.jit.script + def test_event(): + device_index = torch.cuda._current_device() + stream = torch.cuda.current_stream(device_index) + event = torch.jit.cuda.Event(True, False, False) + is_true_event_query = event.query() + start_event = torch.jit.cuda.Event(True, False, False) + stream.record_event(start_event) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + stream.record_event(event) + event.synchronize() + is_again_true_event_query = event.query() + + if not (is_true_event_query and is_again_true_event_query): + return -1.0 + return start_event.elapsed_time(event) + + self.assertGreater(test_event(), 0) + + # Check for stream synchronization , when a large tensor multiplication is + # computed on the stream. The stream.query should be true once the synchroniztion is done + @torch.jit.script + def test_stream_synchronize() -> float: + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, False, False) + e_tok = torch.jit.cuda.Event(True, False, False) + + e_tik.record(s) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s): + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + s.synchronize() + e_tok.record(s) + e_tok.synchronize() + + if not s.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + self.assertGreater(test_stream_synchronize(), 0) + + # Test event synchronization for the event that records a stream doing + # a large tensor multiplication. 
Check if the elapsed time is greater than 0 + # and the stream.query evaluates to true. + @torch.jit.script + def test_event_synchronize() -> float: + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, False, False) + e_tok = torch.jit.cuda.Event(True, False, False) + + e_tik.record(s) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s): + tensor = torch.mm(tensor1, tensor1).to("cuda") + s.record_event(e_tok) + e_tok.synchronize() + s.synchronize() + + if not s.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + + self.assertGreater(test_event_synchronize(), 0) + + # Test for event wait. Check if event waits for the all the operations on + # the stream to be done. Check for synchronizations and query on the streams + # and events. This test is adapted from eager mode tests for CUDA. Please refer + # test/test_cuda.py + @torch.jit.script + def test_event_wait() -> float: + device_index = torch.cuda._current_device() + s0 = torch.cuda.current_stream(device_index) + s1 = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, True, False) + e_tok = torch.jit.cuda.Event(True, True, False) + + e_tik.record(s0) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s0): + tensor2 = torch.mm(tensor1, tensor1).cuda() + e_sync = torch.jit.cuda.Event(True, False, False) + e_sync.record(torch.cuda.current_stream(device_index)) + e_sync.wait(s1) + with torch.jit.cuda.stream(s1): + tensor3 = torch.rand(1000000000, 1000000000, device="cuda") + tensor4 = torch.mm(tensor3, tensor3).cuda() + s1.synchronize() + e_tok.record(torch.cuda.current_stream(device_index)) + e_tok.synchronize() + s0.synchronize() + + if not s0.query() or not s1.query() or not e_sync.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + self.assertGreater(test_event_wait(), 0) + + # Test for stream wait_event. 
Checks if the stream waits on the event + @torch.jit.script + def test_wait_event(): + d1 = torch.device('cuda:1') + + with torch.jit.cuda.device(d1): + s0 = torch.cuda.current_stream(1) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + e0 = torch.jit.cuda.Event(False, False, False) + s0.record_event(e0) + + s1 = torch.cuda.current_stream(0) + s1.wait_event(e0) + s1.synchronize() + + return e0.query() and s0.query() and s1.query() + self.assertTrue(test_wait_event()) + + # Test if a scripted module with cuda streams can be saved, loaded and executed + def test_save_load(self): + class Model(torch.nn.Module): + def forward(self): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + a = torch.rand(3, 4, device="cuda") + b = torch.rand(3, 4, device="cuda") + + with torch.jit.cuda.stream(s): + is_stream_s = torch.cuda.current_stream(s.device_index()).id() == s.id() + c = torch.cat((a, b), 0).cuda() + s.synchronize() + return is_stream_s, a, b, c + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + is_stream_s, a, b, c = script_model() + # Verify if the output is correct + self.assertTrue(is_stream_s) + self.assertEqual(torch.cat((a, b), 0), c) + + # Save and load scripted model + load_model = self.getExportImportCopy(script_model) + is_stream_s, a_load, b_load, c_load = load_model() + self.assertTrue(is_stream_s) + self.assertEqual(torch.cat((a_load, b_load), 0), c_load) diff --git a/test/test_jit.py b/test/test_jit.py index ff89429534ac..a683a8eb0b8c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -35,6 +35,7 @@ from jit.test_slice import TestSlice # noqa: F401 from jit.test_warn import TestWarn # noqa: F401 from jit.test_isinstance import TestIsinstance # noqa: F401 +from jit.test_cuda import TestCUDA # noqa: F401 from jit.test_hash import TestHash # noqa: F401 # Torch diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index a214684ab29c..ec53f1d3c772 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -408,6 +408,7 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/transform_rfactor.cpp", "torch/csrc/jit/codegen/cuda/type.cpp", "torch/csrc/jit/tensorexpr/cuda_codegen.cpp", + "torch/csrc/jit/runtime/register_cuda_ops.cpp", ] libtorch_cuda_sources = libtorch_cuda_core_sources + [ diff --git a/torch/csrc/jit/cuda/cuda.h b/torch/csrc/jit/cuda/cuda.h new file mode 100644 index 000000000000..fa92ce22d6e4 --- /dev/null +++ b/torch/csrc/jit/cuda/cuda.h @@ -0,0 +1,179 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { + +class CUDAEvent; +// This class is a wrapper around c10::cuda::CUDAStream. +// It is needed because TorchBind does not support all of the argument types +// for c10::cuda::CUDAStream. For more details, please refer to +// c10/cuda/CUDAStream.h. 
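+// The wrapper classes in this header are registered below via TORCH_LIBRARY(cuda, m),
+// so they surface in TorchScript as __torch__.torch.classes.cuda.Stream and
+// __torch__.torch.classes.cuda.Event; the "torch.classes." prefix handling added to
+// script_type_parser.cpp in this PR is what lets annotations such as
+// torch.classes.cuda.Stream resolve to these custom classes.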
+class CUDAStream final : public CustomClassHolder { + public: + CUDAStream(int64_t device = -1, int64_t priority = 0) { + constexpr int64_t PRIORITY_INDEX = 0; + stream_ = std::make_unique( + c10::cuda::getStreamFromPool(priority < PRIORITY_INDEX, device)); + } + + CUDAStream(c10::cuda::CUDAStream s) { + stream_ = std::make_unique(s); + } + + bool query() { + return stream_->query(); + } + + c10::intrusive_ptr recordEvent( + c10::intrusive_ptr event); + + void synchronize() { + stream_->synchronize(); + } + + void waitEvent(c10::intrusive_ptr event); + + void waitStream(c10::intrusive_ptr stream); + + /// Get the CUDA device index that this stream is associated with. + int64_t device_index() const { + return stream_->device_index(); + } + + /// Get the full Device that this stream is associated with. The Device + /// is guaranteed to be a CUDA device. + c10::Device device() const { + return stream_->device(); + } + + /// Return the stream ID corresponding to this particular stream. + int64_t id() const { + return stream_->id(); + } + + /// Pack a CUDAStream to uint64_t representation. + /// The CUDAStream can be unpacked using unpack(). The format of + /// the uint64_t is unspecified and may be changed. + int64_t pack() const { + return stream_->pack(); + } + + private: + std::unique_ptr stream_; + friend class CUDAEvent; +}; + +// This class is a wrapper around at::cuda::CUDAStream. +// It is needed because TorchBind does not support all of the argument types +// for at::cuda::CUDAEvent. For more details, please refer to +// aten/src/ATen/cuda/CUDAEvent.h. +class CUDAEvent final : public CustomClassHolder { + public: + CUDAEvent( + bool enable_timing = false, + bool blocking = false, + bool interprocess = false) { + int flags = cudaEventDisableTiming; + if (enable_timing) { + flags = cudaEventDefault; + } + if (blocking) { + flags |= cudaEventBlockingSync; + } + if (interprocess) { + TORCH_CHECK(!enable_timing); + flags |= cudaEventInterprocess; + } + + event_ = std::make_unique(flags); + } + + double elapsedTime(c10::intrusive_ptr end) { + return event_->elapsed_time(*end->event_); + } + + std::string ipcHandle() { + cudaIpcEventHandle_t handle; + event_->ipc_handle(&handle); + std::string str_handle((const char*)&handle, sizeof(handle)); + return str_handle; + } + + bool query() { + return event_->query(); + } + + void record(c10::intrusive_ptr stream); + + void synchronize() { + event_->synchronize(); + } + void wait(c10::intrusive_ptr stream); + + private: + void recordInternal(CUDAStream* stream); + std::unique_ptr event_; + + friend class CUDAStream; +}; + +c10::intrusive_ptr CUDAStream::recordEvent( + c10::intrusive_ptr event) { + if (!event) { + event = c10::make_intrusive(); + } + + event->recordInternal(this); + return event; +} + +void CUDAStream::waitEvent(c10::intrusive_ptr event) { + event->event_->block(*stream_); +} + +void CUDAStream::waitStream(c10::intrusive_ptr stream) { + auto ev = c10::make_intrusive(); + stream->recordEvent(ev); + waitEvent(ev); +} + +void CUDAEvent::record(c10::intrusive_ptr stream) { + event_->record(*stream->stream_); +} + +void CUDAEvent::recordInternal(CUDAStream* stream) { + event_->record(*stream->stream_); +} + +void CUDAEvent::wait(c10::intrusive_ptr stream) { + event_->block(*stream->stream_); +} + +TORCH_LIBRARY(cuda, m) { + auto stream_class = m.class_("Stream").def( + torch::init()); + auto event_class = m.class_("Event").def( + torch::init()); + + stream_class.def("query", &CUDAStream::query) + .def("record_event", 
&CUDAStream::recordEvent) + .def("synchronize", &CUDAStream::synchronize) + .def("wait_event", &CUDAStream::waitEvent) + .def("wait_stream", &CUDAStream::waitStream) + .def("device_index", &CUDAStream::device_index) + .def("device", &CUDAStream::device) + .def("pack", &CUDAStream::pack) + .def("id", &CUDAStream::id); + + event_class.def("elapsed_time", &CUDAEvent::elapsedTime) + .def("query", &CUDAEvent::query) + .def("record", &CUDAEvent::record) + .def("synchronize", &CUDAEvent::synchronize) + .def("wait", &CUDAEvent::wait); +}; + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index 8b1aa58b5aff..f4c1fa2c920d 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -211,6 +211,13 @@ TypePtr ScriptTypeParser::parseTypeFromExprImpl(const Expr& expr) const { } } + // Check if the type is a custom class. This is done by checking + // if type_name starts with "torch.classes." + if (type_name.find("torch.classes.") == 0) { + auto custom_class_type = getCustomClass("__torch__." + type_name); + return custom_class_type; + } + throw ErrorReport(expr) << "Unknown type name '" << type_name << "'"; } else if (auto name = parseBaseTypeName(expr)) { auto itr = string_to_type_lut().find(*name); diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 0b3e4a4a7b41..1ca0f48f9e17 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -572,7 +572,8 @@ void AliasDb::analyzeImpl(Node* node) { !aliasAnalysisHasSpecialCaseFor(node->kind()), "Special cases should be handled already if we're here."); - if (node->kind().is_aten() || node->kind().is_prim()) { + if (node->kind().is_aten() || node->kind().is_prim() || + node->kind().is_cuda()) { // TODO There is nothing in the system that relies on aten:: and prim:: // ops using AliasAnalysisKind::FROM_SCHEMA or // AliasAnalysisKind::INTERNAL_SPECIAL_CASE, but this is the intended diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 65b410d82069..eb75928e5952 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -1079,6 +1079,11 @@ bool Node::hasSideEffects() const { case prim::rpc_sync: // It represents RPC message sent. case prim::rpc_remote: // It represents RPC message sent. case aten::wait: // It can represent RPC message received. 
+#ifndef __HIP_PLATFORM_HCC__ + case cuda::set_stream: + case cuda::_set_device: + case cuda::_current_device: +#endif case prim::Enter: case prim::Exit: return true; @@ -1094,7 +1099,7 @@ bool Node::hasSideEffects() const { return false; } - if (kind_.is_prim() || kind_.is_aten()) { + if (kind_.is_prim() || kind_.is_aten() || kind_.is_cuda()) { // TODO There is nothing in the system that relies on aten:: and prim:: // ops using AliasAnalysisKind::FROM_SCHEMA, // AliasAnalysisKind::INTERNAL_SPECIAL_CASE, or diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 21f172f01465..02867b8639cd 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -72,6 +72,11 @@ using namespace ::c10::attr; namespace aten { using namespace ::c10::aten; } +namespace cuda { +#ifndef __HIP_PLATFORM_HCC__ +using namespace ::c10::cuda; +#endif +} // namespace cuda struct Function; struct MatchedSchema; diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 933d3bb1a867..056e23d06f02 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -217,6 +217,32 @@ std::shared_ptr PythonModuleValue::attr( return toSugaredValue(member, m, loc, /*is_constant=*/true); } +#ifndef __HIP_PLATFORM_HCC__ +std::shared_ptr CUDAPythonModuleValue::attr( + const SourceRange& loc, + Function& m, + const std::string& field) { + // List of all the cuda operators which are supported in JIT + const std::unordered_set cuda_ops = {"current_stream", + "default_stream", + "_current_device", + "_set_device", + "device_index", + "device_count", + "set_stream"}; + + if (cuda_ops.find(field) != cuda_ops.end()) { + return std::make_shared(Symbol::cuda(field), c10::nullopt); + } + + py::object member = getattr(loc, field); + // note: is_constant = true because we consider that global properties + // on modules like math.pi or torch.float to be constants + // even though it is possible, though rare, for someone to mutate them + return toSugaredValue(member, m, loc, /*is_constant=*/true); +} +#endif + Value* ModuleValue::asValue(const SourceRange& loc, Function& m) { return self_; } @@ -938,6 +964,12 @@ std::shared_ptr toSugaredValue( if (auto callee = as_function(obj)) { return std::make_shared(callee->function_); } else if (py::isinstance(obj)) { +#ifndef USE_ROCM + std::string obj_name = py::cast(py::getattr(obj, "__name__")); + if (obj_name.compare("torch.cuda") == 0) { + return std::make_shared(obj); + } +#endif return std::make_shared(obj); } else if ( obj.ptr() == py::module::import("torch.jit").attr("_fork").ptr() || diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index b5d8f4490b3e..1edbc6c15cad 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -91,6 +91,20 @@ struct VISIBILITY_HIDDEN PythonModuleValue : public PythonValue { const std::string& field) override; }; +// Used for desugaring uses of the torch.cuda module. All the CUDA APIs with +// torch.cuda.* are resolved using CUDAPythonModuleValue. 
+#ifndef __HIP_PLATFORM_HCC__ +struct VISIBILITY_HIDDEN CUDAPythonModuleValue : public PythonValue { + explicit CUDAPythonModuleValue(py::object mod) + : PythonValue(std::move(mod)) {} + + std::shared_ptr attr( + const SourceRange& loc, + Function& m, + const std::string& field) override; +}; +#endif + // Represents all the parameters of a module as a List[Tensor] struct VISIBILITY_HIDDEN ConstantParameterList : public SugaredValue { ConstantParameterList(Value* the_list) : the_list_(the_list) {} diff --git a/torch/csrc/jit/runtime/register_cuda_ops.cpp b/torch/csrc/jit/runtime/register_cuda_ops.cpp new file mode 100644 index 000000000000..5cf31d626dd0 --- /dev/null +++ b/torch/csrc/jit/runtime/register_cuda_ops.cpp @@ -0,0 +1,87 @@ +// This file registers special JIT operators used to implement the PyTorch CUDA +// API in TorchScript. +#ifndef __HIP_PLATFORM_HCC__ +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +namespace { + +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +RegisterOperators const reg({ + Operator( + "cuda::current_stream(int64_t val) -> __torch__.torch.classes.cuda.Stream", + [](Stack* stack) { + auto idx = uint16_t(pop(stack).toInt()); + auto s = c10::cuda::getCurrentCUDAStream(idx); + auto st = make_custom_class(s); + push(stack, IValue(st)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::default_stream(int64_t val) -> __torch__.torch.classes.cuda.Stream", + [](Stack* stack) { + auto idx = uint16_t(pop(stack).toInt()); + auto s = c10::cuda::getDefaultCUDAStream(idx); + auto st = make_custom_class(s); + push(stack, IValue(st)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::_current_device() -> int", + [](Stack* stack) { + auto v = c10::cuda::current_device(); + push(stack, static_cast(v)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::_set_device(int64_t val) -> ()", + [](Stack* stack) { + int64_t idx = -1; + pop(stack, idx); + c10::cuda::set_device(static_cast(idx)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::device_index(Device device) -> int", + [](Stack* stack) { + auto device = pop(stack); + auto idx = device.toDevice().index(); + push(stack, idx); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::device_count() -> int", + [](Stack* stack) { push(stack, at::cuda::device_count()); }, + aliasAnalysisFromSchema()), + Operator( + "cuda::set_stream(__torch__.torch.classes.cuda.Stream stream) -> ()", + [](Stack* stack) { + auto v = pop(stack); + auto s = v.toCustomClass(); + // To set the current CUDA stream using + // c10::cuda::setCurrentCUDAStream, the jit::CUDAStream object needs + // to be converted to c10::cuda::CUDAStream. Since the latter cannot + // be returned from a class registered via TorchBind, this can only be + // achieved by packing the c10::cuda::CUDAStream instance contained + // inside the jit::CUDAStream object to a uint64_t representation, and + // unpacking it inside this operator. The unpacked stream is then used + // to set the current CUDA stream. 
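+          // For reference, torch.jit.cuda's StreamContext (torch/jit/cuda.py)
+          // ultimately reaches this operator through torch.cuda.set_stream when
+          // a stream context is entered or exited in scripted code.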
+ auto packed = s->pack(); + auto unpacked = c10::cuda::CUDAStream::unpack(packed); + c10::cuda::setCurrentCUDAStream(unpacked); + }, + aliasAnalysisFromSchema()), +}); +} // namespace +} // namespace jit +} // namespace torch +#endif diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f2b0c5c53a99..cfd327165899 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -44,6 +44,7 @@ from torch.jit._serialization import save, load from torch.jit._fuser import optimized_execution, fuser, last_executed_optimized_graph +from torch.jit.cuda import stream from torch.jit._freeze import freeze # For backwards compatibility diff --git a/torch/jit/cuda.py b/torch/jit/cuda.py new file mode 100644 index 000000000000..16805301600b --- /dev/null +++ b/torch/jit/cuda.py @@ -0,0 +1,182 @@ +# mypy: ignore-errors + +r""" +This package adds support for JIT compilation for CUDA Streams and events, +This is similar to API's available in the eager mode +:ref:`cuda-semantics` has more details about working with CUDA. +""" + +import torch +from typing import Optional, Any +from torch import device as _device + +def get_current_device_index() -> int: + r"""Checks if there are CUDA devices available and + returns the device index of the current default CUDA device. + Returns -1 in case there are no CUDA devices available. + + Arguments: ``None`` + """ + if torch.cuda.device_count() > 0: + return torch.cuda._current_device() + return -1 + +def get_device_index(device: Optional[_device] = None, optional: bool = False, allow_cpu: bool = False) -> int: + r"""Gets the device index from :attr:`device`, which can be a torch.device + object, a Python integer, or ``None``. + + If :attr:`device` is a torch.device object, returns the device index if it + is a CUDA device. Note that for a CUDA device without a specified index, + , this will return the current default CUDA device if :attr:`optional` is ``True``. + If :attr:`allow_cpu` is ``True``,CPU devices will be accepted and ``-1`` will be + returned in this case. + + If :attr:`device` is a Python integer, it is returned as is. + + If :attr:`device` is ``None``, this will return the current default CUDA + device if :attr:`optional` is ``True``. + """ + if device is None: + if optional: + return get_current_device_index() + else: + raise ValueError('Expected a torch.device with a specified index ' + f'or an integer, but got: {device}') + device_index = -1 + if isinstance(device, str): + device = torch.device(device) + + if isinstance(device, torch.device): + if not allow_cpu and device.type == 'cpu': + raise ValueError(f'Expected a non cpu device, but got: {device}') + device_index = -1 if device.type == 'cpu' else torch.cuda.device_index(device) + + if isinstance(device, int): + device_index = device + + return device_index + +class device(object): + r"""Context-manager that changes the selected device. + This is similar to device (torch.device or int), but has been + introduced for JIT compatibility. + Arguments: + device (torch.device or int): device index to select. It's a no-op if + this argument is a negative integer or ``None``. 
+ """ + def __init__(self, device: Optional[_device]): + self.idx = -1 + self.prev_idx = -1 + self.device = device + + def __enter__(self): + self.idx = get_device_index(self.device, optional=True) + + if self.idx == -1: + return + self.prev_idx = torch.cuda._current_device() + + if self.prev_idx != self.idx: + torch.cuda._set_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + if self.prev_idx != self.idx: + torch.cuda._set_device(self.prev_idx) + +class StreamContext(object): + r"""Context-manager that selects a given stream. + All CUDA kernels queued within its context will be enqueued on a selected + stream. + Arguments: + StreamContext (Stream): selected stream. This manager is a no-op if it's + ``None``. + .. note:: Streams are per-device. If the selected stream is not on the + current device, this function will also change the current device to + match the stream. + """ + cur_stream : Optional['torch.classes.cuda.Stream'] + + def __init__(self, stream: Optional['torch.classes.cuda.Stream']): + self.idx = -1 + self.stream = stream + # Initialize the below streams to default stream on the current device + self.device_index = get_current_device_index() + self.src_prev_stream = torch.cuda.default_stream(self.device_index) + self.dst_prev_stream = torch.cuda.default_stream(self.device_index) + + def __enter__(self): + self.idx = get_device_index(device=None, optional=True) + # If there is no CUDA device available, return + if self.idx == -1: + return + + # Local cur_stream variable for type refinement + cur_stream = self.stream + # Return if stream is None + if cur_stream is None: + return + self.src_prev_stream = torch.cuda.current_stream(self.idx) + # If the stream is not on the current device, then change the device + # and set the current stream on the device + if self.src_prev_stream.device_index() != cur_stream.device_index(): + with device(cur_stream.device()): + self.dst_prev_stream = torch.cuda.current_stream(cur_stream.device_index()) + torch.cuda._set_device(cur_stream.device_index()) + torch.cuda.set_stream(cur_stream) + + def __exit__(self, type: Any, value: Any, traceback: Any): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # If stream is None or no CUDA device available, return + if cur_stream is None or self.idx == -1: + return + # If the stream was not on the current device, restore the previous stream on + # the destination device and also reset the current device to the previous device. + # Set the current stream on the device to the src_prev_stream + if self.src_prev_stream.device_index() != cur_stream.device_index(): + torch.cuda.set_stream(self.dst_prev_stream) + torch.cuda._set_device(self.idx) + torch.cuda.set_stream(self.src_prev_stream) + +def stream(stream: Optional['torch.classes.cuda.Stream']) -> StreamContext: + r"""Wrapper around the Context-manager that selects a given stream. + All CUDA kernels queued within its context will be enqueued on a selected + stream. + Arguments: + stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + """ + return StreamContext(stream) + +def Stream(device: int = -1, priority: int = 0) -> 'torch.classes.cuda.Stream': + r"""Wrapper around a CUDA stream. + A CUDA stream is a linear sequence of execution that belongs to a specific + device, independent from other streams. See :ref:`cuda-semantics` for + details. + Arguments: + device(int, optional): a device on which to allocate + the stream. 
If :attr:`device` is ``None`` (default) or a negative + integer, this will use the current device. + priority(int, optional): priority of the stream. Can be either + -1 (high priority) or 0 (low priority). By default, streams have + priority 0. + .. note:: Although CUDA versions >= 11 support more than two levels of + priorities, in PyTorch, we only support two levels of priorities. + """ + return torch.classes.cuda.Stream(device, priority) + +def Event(enable_timing: bool = False, blocking: bool = False, interprocess: bool = False) -> 'torch.classes.cuda.Event': + r"""Wrapper around a CUDA event. + CUDA events are synchronization markers that can be used to monitor the + device's progress, to accurately measure timing, and to synchronize CUDA + streams. + Arguments: + enable_timing (bool, optional): indicates if the event should measure time + (default: ``False``) + blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``) + interprocess (bool): if ``True``, the event can be shared between processes + (default: ``False``) + .. _CUDA Event Documentation: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html + """ + return torch.classes.cuda.Event(enable_timing, blocking, interprocess) From cfc3db0ca9e5f3c9a83321b5db876dd7c6557f42 Mon Sep 17 00:00:00 2001 From: Qifan Lu Date: Wed, 30 Dec 2020 02:59:57 -0800 Subject: [PATCH 08/89] Remove THPWrapper (#49871) Summary: Remove `THPWrapper` from PyTorch C code since it is not used anymore and because we have dropped Python 2 compatibility, its usage can be replaced by capsule objects (`PyCapsule_New`, `PyCapsule_CheckExact`, `PyCapsule_GetPointer` and `PyCapsule_GetDestructor`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49871 Reviewed By: mruberry Differential Revision: D25715038 Pulled By: albanD fbshipit-source-id: cc3b6f967bbe0dc42c692adf76dff4e4b667fdd5 --- tools/build_variables.bzl | 1 - torch/csrc/Module.cpp | 1 - torch/csrc/PtrWrapper.cpp | 102 -------------------------- torch/csrc/PtrWrapper.h | 16 ---- torch/csrc/THP.h | 1 - torch/csrc/autograd/python_engine.cpp | 1 - 6 files changed, 122 deletions(-) delete mode 100644 torch/csrc/PtrWrapper.cpp delete mode 100644 torch/csrc/PtrWrapper.h diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index ec53f1d3c772..8eeffe724c8e 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -504,7 +504,6 @@ libtorch_python_core_sources = [ "torch/csrc/MemoryFormat.cpp", "torch/csrc/QScheme.cpp", "torch/csrc/Module.cpp", - "torch/csrc/PtrWrapper.cpp", "torch/csrc/python_dimname.cpp", "torch/csrc/Size.cpp", "torch/csrc/Storage.cpp", diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index aeac5bafd56f..f70bd1a0ad95 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -722,7 +722,6 @@ PyObject* initModule() { methods.data() }; ASSERT_TRUE(module = PyModule_Create(&torchmodule)); - ASSERT_TRUE(THPWrapper_init(module)); ASSERT_TRUE(THPGenerator_init(module)); ASSERT_TRUE(THPException_init(module)); THPSize_init(module); diff --git a/torch/csrc/PtrWrapper.cpp b/torch/csrc/PtrWrapper.cpp deleted file mode 100644 index aa48c49949b9..000000000000 --- a/torch/csrc/PtrWrapper.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include - -static PyObject* THPWrapperClass = nullptr; - -struct THPWrapper { - PyObject_HEAD - void *data; - void (*destructor)(void*); -}; - -PyObject * THPWrapper_New(void *data, void (*destructor)(void*)) -{ - PyObject *args = PyTuple_New(0); - if (!args) 
{ - return nullptr; - } - PyObject *result = PyObject_Call(THPWrapperClass, args, nullptr); - if (result) { - THPWrapper* wrapper = (THPWrapper*) result; - wrapper->data = data; - wrapper->destructor = destructor; - } - Py_DECREF(args); - return result; -} - -bool THPWrapper_check(PyObject * obj) -{ - return (PyObject*)Py_TYPE(obj) == THPWrapperClass; -} - -void * THPWrapper_get(PyObject * obj) -{ - return ((THPWrapper*)obj)->data; -} - -static PyObject * THPWrapper_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) -{ - PyObject* self = type->tp_alloc(type, 0); - THPWrapper* wrapper = (THPWrapper*) self; - wrapper->data = nullptr; - wrapper->destructor = nullptr; - return self; -} - -static void THPWrapper_dealloc(THPWrapper* self) -{ - self->destructor(self->data); - Py_TYPE(self)->tp_free((PyObject*)self); -} - -PyTypeObject THPWrapperType = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch._C._PtrWrapper", /* tp_name */ - sizeof(THPWrapper), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)THPWrapper_dealloc, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ - nullptr, /* tp_getattr */ - nullptr, /* tp_setattr */ - nullptr, /* tp_reserved */ - nullptr, /* tp_repr */ - nullptr, /* tp_as_number */ - nullptr, /* tp_as_sequence */ - nullptr, /* tp_as_mapping */ - nullptr, /* tp_hash */ - nullptr, /* tp_call */ - nullptr, /* tp_str */ - nullptr, /* tp_getattro */ - nullptr, /* tp_setattro */ - nullptr, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - nullptr, /* tp_doc */ - nullptr, /* tp_traverse */ - nullptr, /* tp_clear */ - nullptr, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - nullptr, /* tp_iter */ - nullptr, /* tp_iternext */ - nullptr, /* tp_methods */ - nullptr, /* tp_members */ - nullptr, /* tp_getset */ - nullptr, /* tp_base */ - nullptr, /* tp_dict */ - nullptr, /* tp_descr_get */ - nullptr, /* tp_descr_set */ - 0, /* tp_dictoffset */ - nullptr, /* tp_init */ - nullptr, /* tp_alloc */ - THPWrapper_pynew, /* tp_new */ -}; - -bool THPWrapper_init(PyObject *module) -{ - THPWrapperClass = (PyObject*)&THPWrapperType; - if (PyType_Ready(&THPWrapperType) < 0) - return false; - Py_INCREF(&THPWrapperType); - return true; -} diff --git a/torch/csrc/PtrWrapper.h b/torch/csrc/PtrWrapper.h deleted file mode 100644 index 985193c74c9b..000000000000 --- a/torch/csrc/PtrWrapper.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef THP_PTR_WRAPPER_H -#define THP_PTR_WRAPPER_H - -#include - -/** - * Python wrapper around arbitrary opaque C++ class - */ - -bool THPWrapper_init(PyObject *module); - -PyObject * THPWrapper_New(void *data, void (*destructor)(void*)); -void * THPWrapper_get(PyObject * obj); -bool THPWrapper_check(PyObject * obj); - -#endif diff --git a/torch/csrc/THP.h b/torch/csrc/THP.h index edf4621765f8..26f6c06b3d20 100644 --- a/torch/csrc/THP.h +++ b/torch/csrc/THP.h @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp index eee29481bea5..a9c7d709466e 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include From b54ad0897872fb5f4cb7a90046b0bf33836ad559 Mon Sep 17 00:00:00 2001 From: Venkata Chintapalli Date: Wed, 30 Dec 2020 09:58:52 -0800 Subject: [PATCH 09/89] Enable test_fusions TanhQuantize (#49970) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49970 enable test_fusions:test_tanhquantize Test Plan: 
https://internalfb.com/intern/testinfra/testrun/6755399469176694 Reviewed By: hyuen Differential Revision: D25732684 fbshipit-source-id: b8479e43b5248ba5510f0c78c993d534d3ffc2b0 --- caffe2/contrib/fakelowp/test/test_fusions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 335159c8318e..3e22d7c5937b 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -27,7 +27,7 @@ class Fusions(serial.SerializedTestCase): rand_seed=st.integers(0, 65534), ) @settings(deadline=datetime.timedelta(seconds=10)) - def Skip_test_tanhquantize(self, scale, zp, size, rand_seed): + def test_tanhquantize(self, scale, zp, size, rand_seed): np.random.seed(rand_seed) workspace.ResetWorkspace() From 42d2e31cd6d798fe887559465452613378e4b821 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 30 Dec 2020 10:31:50 -0800 Subject: [PATCH 10/89] [numpy] `torch.rsqrt` : promote integer inputs to float (#47909) Summary: Reference https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47909 Reviewed By: ngimel Differential Revision: D25730876 Pulled By: mruberry fbshipit-source-id: c87a8f686e1dd64e511640e0278021c4a584ccf2 --- aten/src/ATen/native/UnaryOps.cpp | 8 ++++++-- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 4 ++-- test/test_torch.py | 1 - test/test_unary_ufuncs.py | 1 - torch/csrc/jit/tensorexpr/eval.cpp | 8 ++++++-- torch/csrc/jit/tensorexpr/kernel.cpp | 5 +++-- .../_internal/common_methods_invocations.py | 14 ++++++++++++++ 7 files changed, 31 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index e6dd1bc4afde..0f6da7e4292a 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -326,8 +326,12 @@ Tensor& reciprocal_out(Tensor& result, const Tensor& self) { return unary_op_imp Tensor reciprocal(const Tensor& self) { return unary_op_impl_float(self, reciprocal_stub); } Tensor& reciprocal_(Tensor& self) { return unary_op_impl_(self, at::reciprocal_out); } -Tensor& rsqrt_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, rsqrt_stub); } -Tensor rsqrt(const Tensor& self) { return unary_op_impl(self, at::rsqrt_out); } +Tensor& rsqrt_out(Tensor& result, const Tensor& self) { + return unary_op_impl_float_out(result, self, rsqrt_stub); +} +Tensor rsqrt(const Tensor& self) { + return unary_op_impl_float(self, rsqrt_stub); +} Tensor& rsqrt_(Tensor& self) { return unary_op_impl_(self, at::rsqrt_out); } Tensor& sign_out(Tensor& result, const Tensor& self) { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 049b3eff6b5b..32ebaf7752f7 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -587,10 +587,10 @@ static void random_full_64_bits_range_kernel(TensorIterator& iter, c10::optional } static void rsqrt_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "rsqrt_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "rsqrt_cpu", [&] { cpu_kernel_vec( iter, - [=](scalar_t a) -> scalar_t { + [=](scalar_t a) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return (static_cast(1)) / std::sqrt(a); }, [=](Vec256 a) { return a.rsqrt(); }); diff --git a/test/test_torch.py b/test/test_torch.py index 
6532c2e5e17d..8872516ddd28 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6870,7 +6870,6 @@ def inner(self, device, dtype): ('rot90', 'k1_d12', _small_3d, lambda t, d: [1, [1, 2]], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), ('rot90', 'k1_neg_d', _small_3d, lambda t, d: [1, [1, -1]], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), ('rot90', 'default', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), - ('rsqrt', '', lambda t, d: _small_3d(t, d) + 1, lambda t, d: [], 1e-2, 1e-5, 1e-4, _float_types_no_half), ('sinh', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types), ('tan', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types), ('tan', 'complex', lambda t, d: _small_3d(t, d), lambda t, d: [], 1e-3, 1e-5, 1e-5, _complex_types), diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 776482306f4d..1daecc24f79f 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -1715,7 +1715,6 @@ def _medium_2d(dtype, device): _TorchMathTestMeta('ceil'), _TorchMathTestMeta('rad2deg'), _TorchMathTestMeta('deg2rad'), - _TorchMathTestMeta('rsqrt', reffn=lambda x: np.reciprocal(np.sqrt(x))), _TorchMathTestMeta('frac', reffn='fmod', refargs=lambda x: (x.numpy(), 1)), _TorchMathTestMeta('trunc'), _TorchMathTestMeta('round'), diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index e60a0bd704bf..186af3ca822f 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -834,8 +834,12 @@ class SimpleIREvaluatorImpl : public IRVisitor { return std::erfc(v); case kSqrt: return std::sqrt(v); - case kRsqrt: - return 1.0f / std::sqrt(v); + case kRsqrt: { + auto rsqrt = [](TInput v) __ubsan_ignore_float_divide_by_zero__ { + return 1.0f / std::sqrt(v); + }; + return rsqrt(v); + } case kCeil: return std::ceil(v); case kFloor: diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 999186d4c4ed..0145014ee8f5 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1282,8 +1282,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::rsqrt: { - return computeOneOperand( - "aten_rsqrt", v, [](const ExprHandle& a) { return rsqrt(a); }); + return computeOneOperand("aten_rsqrt", v, [](const ExprHandle& a) { + return rsqrt(promoteIntegerToDefaultType(a)); + }); } break; case aten::abs: { diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 87d0baa895e8..0c5a5a6353df 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1001,6 +1001,16 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', dtypes=[torch.bfloat16]), )), + UnaryUfuncInfo('rsqrt', + ref=lambda x: np.reciprocal(np.sqrt(x)), + domain=(0, float('inf')), + dtypes=all_types_and_complex_and(torch.bool), + dtypesIfCPU=all_types_and_complex_and(torch.bool), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half), + decorators=(precisionOverride({torch.half: 5e-2}),), + promotes_integers_to_float=True, + assert_autodiffed=True, + handles_complex_extremals=False), UnaryUfuncInfo('sqrt', ref=np.sqrt, domain=(0, float('inf')), @@ -1466,6 +1476,10 @@ def 
method_tests(): ('ceil', (), NO_ARGS, 'scalar', (True,)), ('rad2deg', (S, S, S), NO_ARGS), ('deg2rad', (S, S, S), NO_ARGS), + # Removing the 'rsqrt' entries leads to failure in + # test_index_fill_variable_dim_* + # TODO: Remove when fixed. + # Reference: https://github.com/pytorch/pytorch/issues/48230 ('rsqrt', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), ('rsqrt', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), ('rsqrt', torch.rand(S, S, S, dtype=torch.cfloat) + 1e-2, NO_ARGS, 'complex', (True,)), From 6b56b71e61e14bf4de5b371f0d8f2f2029065b31 Mon Sep 17 00:00:00 2001 From: Sameer Deshmukh Date: Wed, 30 Dec 2020 13:27:45 -0800 Subject: [PATCH 11/89] Accept input tensor with 0-dim batch size for MultiLabelMarginLoss (#46975) Summary: Fix for one of the layers listed in https://github.com/pytorch/pytorch/issues/12013 or https://github.com/pytorch/pytorch/issues/38115 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46975 Reviewed By: mruberry Differential Revision: D25719980 Pulled By: ngimel fbshipit-source-id: 83414bad37c0b004bc7cced04df8b9c89bdba3e6 --- aten/src/ATen/native/LossMulti.h | 72 ++++++++++++++ aten/src/ATen/native/LossMultiLabelMargin.cpp | 97 ++++++------------- aten/src/ATen/native/LossMultiMargin.cpp | 45 ++------- .../generic/MultiLabelMarginCriterion.cu | 51 +++++++--- .../THCUNN/generic/MultiMarginCriterion.cu | 49 ++++++++-- test/test_nn.py | 29 ++++++ 6 files changed, 222 insertions(+), 121 deletions(-) create mode 100644 aten/src/ATen/native/LossMulti.h diff --git a/aten/src/ATen/native/LossMulti.h b/aten/src/ATen/native/LossMulti.h new file mode 100644 index 000000000000..4282c346702c --- /dev/null +++ b/aten/src/ATen/native/LossMulti.h @@ -0,0 +1,72 @@ +#include +#include +#include + +#pragma once + +namespace at { namespace native { +namespace { + static void multilabel_margin_loss_shape_check( + int64_t& nframe, + int64_t& dim, + const int64_t& ndims, + TensorArg& target_arg, + const Tensor& input, + const Tensor& target) { + bool valid_inputs = (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0; + TORCH_CHECK( + valid_inputs, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input.sizes()); + + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 1 : input.size(0); + TORCH_CHECK( + valid_inputs && target.dim() <= 1 && target.numel() == dim, + "inconsistent size ", + target.sizes(), + " for ", + target_arg); + } else { + nframe = input.size(0); + dim = input.size(1); + TORCH_CHECK( + valid_inputs && target.dim() == 2 && target.size(0) == nframe && + target.size(1) == dim, + "inconsistent size ", + target.sizes(), + " for ", + target_arg); + } + } + + static void multi_margin_loss_shape_check( + int64_t& nframe, + int64_t& dim, + const int64_t& ndims, + TensorArg& target_arg, + const Tensor& input, + const Tensor& target) { + bool valid_inputs = (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0; + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 
1 : input.size(0); + } else { + nframe = input.size(0); + dim = input.size(1); + } + + TORCH_CHECK( + valid_inputs, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input.sizes()); + TORCH_CHECK( + valid_inputs && target.dim() <= 1 && target.numel() == nframe, + "inconsistent target size, got: ", + target.sizes()); + } + + +} // anonymous namespace +}} // namespace at::native diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index 9582bf661a32..3cd0f46e0a95 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace at { namespace native { @@ -39,6 +40,7 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( } } } + return sum; } @@ -100,47 +102,17 @@ static void multilabel_margin_loss_forward_out_cpu_template( Tensor& is_target, int64_t reduction) { auto target_arg = TensorArg(target, "target", 2); - - const auto ndims = input.dim(); - - TORCH_CHECK( - input.numel() > 0 && ndims <= 2, - "non-empty vector or matrix expected, got size: ", - input.sizes()); - int64_t nframe, dim; + const int64_t ndims = input.dim(); if (ndims <= 1) { nframe = 1; dim = ndims == 0 ? 1 : input.size(0); - TORCH_CHECK( - target.numel() > 0 && target.dim() <= 1 && target.numel() == dim, - "inconsistent size ", - target.sizes(), - " for ", - target_arg); - } else { + } + else { nframe = input.size(0); dim = input.size(1); - TORCH_CHECK( - target.numel() > 0 && target.dim() == 2 && target.size(0) == nframe && - target.size(1) == dim, - "inconsistent size ", - target.sizes(), - " for ", - target_arg); } - - TORCH_CHECK( - target.min().item() >= -1, target_arg, " is out of range"); - TORCH_CHECK( - target.max().item() < dim, target_arg, " is out of range"); - - auto input_contiguous = input.contiguous(); - auto target_contiguous = target.contiguous(); - - is_target.resize_as_(target); - TORCH_CHECK(is_target.is_contiguous(), "is_target must be contiguous"); - is_target.zero_(); + multilabel_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); // special case target.dim() <= 1: produce scalar output for scalar inputs // even if reduction == Reduction::None @@ -150,6 +122,22 @@ static void multilabel_margin_loss_forward_out_cpu_template( output.resize_({nframe}); } + is_target.resize_as_(target); + TORCH_CHECK(is_target.is_contiguous(), "is_target must be contiguous"); + is_target.zero_(); + + if (input.numel() == 0) { + return; + } + + TORCH_CHECK( + target.min().item() >= -1, target_arg, " is out of range"); + TORCH_CHECK( + target.max().item() < dim, target_arg, " is out of range"); + + auto input_contiguous = input.contiguous(); + auto target_contiguous = target.contiguous(); + AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multilabel_margin_loss_forward_out_frame", [&] { multilabel_margin_loss_forward_out_frame( @@ -232,39 +220,22 @@ static void multilabel_margin_loss_backward_out_cpu_template( const Tensor& target, int64_t reduction, const Tensor& is_target) { + int64_t nframe, dim; CheckedFrom c = "multilabel_margin_loss_backward_cpu_template"; auto target_arg = TensorArg(target, "target", 3); auto is_target_arg = TensorArg(is_target, "is_target", 5); + const int64_t ndims = input.dim(); - const auto ndims = input.dim(); - - TORCH_CHECK( - input.numel() > 0 && ndims <= 2, - "non-empty vector or matrix expected, got size: ", - input.sizes()); + 
multilabel_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); + checkSameSize(c, target_arg, is_target_arg); - int64_t nframe, dim; - if (ndims <= 1) { - nframe = 1; - dim = ndims == 0 ? 1 : input.size(0); - TORCH_CHECK( - target.numel() > 0 && target.dim() <= 1 && target.numel() == dim, - "inconsistent size ", - target.sizes(), - " for ", - target_arg); - } else { - nframe = input.size(0); - dim = input.size(1); - TORCH_CHECK( - target.numel() > 0 && target.dim() == 2 && target.size(0) == nframe && - target.size(1) == dim, - "inconsistent size ", - target.sizes(), - " for ", - target_arg); + grad_input.resize_as_(input); + if (grad_input.numel() == 0) { + return; } - checkSameSize(c, target_arg, is_target_arg); + + TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); + grad_input.zero_(); TORCH_CHECK( target.min().item() >= -1, target_arg, " is out of range"); @@ -275,10 +246,6 @@ static void multilabel_margin_loss_backward_out_cpu_template( auto target_contiguous = target.contiguous(); auto is_target_contiguous = is_target.contiguous(); - grad_input.resize_as_(input); - TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); - grad_input.zero_(); - AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multilabel_margin_loss_backward_out_frame", [&] { multilabel_margin_loss_backward_out_frame( diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index 48446a98559d..db18d1f655d4 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { namespace native { @@ -93,27 +94,13 @@ void multi_margin_loss_out_cpu_template( Scalar margin, const Tensor& weight, int64_t reduction) { + int64_t nframe, dim; const auto ndims = input.dim(); - TORCH_CHECK( - input.numel() > 0 && ndims <= 2, - "non-empty vector or matrix expected, got size: ", - input.sizes()); + auto target_arg = TensorArg(target, "target", 2); TORCH_CHECK(p == 1 || p == 2, "only p == 1 and p == 2 supported"); - int64_t nframe, dim; - if (ndims <= 1) { - nframe = 1; - dim = ndims == 0 ? 1 : input.size(0); - } else { - nframe = input.size(0); - dim = input.size(1); - } - - TORCH_CHECK( - target.numel() > 0 && target.dim() <= 1 && target.numel() == nframe, - "inconsistent target size, got: ", - target.sizes()); + multi_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); // produce a scalar output for 1d input if (reduction == Reduction::None && target.dim() > 0) { @@ -121,6 +108,9 @@ void multi_margin_loss_out_cpu_template( } else { output.resize_({}); } + if (input.numel() == 0) { + return; + } auto input_contiguous = input.contiguous(); auto target_contiguous = target.contiguous(); @@ -212,28 +202,13 @@ void multi_margin_loss_backward_out_cpu_template( Scalar margin, const Tensor& weight, int64_t reduction) { + int64_t nframe, dim; + auto target_arg = TensorArg(target, "target", 2); const auto ndims = input.dim(); - TORCH_CHECK( - input.numel() > 0 && ndims <= 2, - "non-empty vector or matrix expected, got size: ", - input.sizes()); TORCH_CHECK(p == 1 || p == 2, "only p == 1 and p == 2 supported"); - int64_t nframe, dim; - if (ndims <= 1) { - nframe = 1; - dim = ndims == 0 ? 
1 : input.size(0); - } else { - nframe = input.size(0); - dim = input.size(1); - } - - TORCH_CHECK( - target.numel() > 0 && target.dim() <= 1 && target.numel() == nframe, - "inconsistent target size, got: ", - target.sizes()); - + multi_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); grad_input.resize_as_(input); TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); diff --git a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu index ab8d2cb1ad68..6e8d9bc91976 100644 --- a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu @@ -3,21 +3,30 @@ #else static inline void THNN_(MultiLabelMarginCriterion_shapeCheck)( - THCState *state, - THCTensor *input, THCTensor *target) { - if (input->dim() <= 1) { + THCState *state, + THCTensor *input, THCTensor *target) { + int64_t ndims = input->dim(); + bool valid_inputs = (ndims == 2 && input->size(1) != 0) || (ndims == 1 && input->size(0) != 0) || ndims == 0; + TORCH_CHECK( + valid_inputs, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input->sizes()); + + if (ndims <= 1) { int dim = input->dim() == 0 ? 1 : input->size(0); int target_size = target->dim() == 0 ? 1 : target->size(0); - TORCH_CHECK(!target->is_empty() && (target->dim() <= 1) && (target_size == dim), - "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); - } else if (input->dim() == 2) { + + TORCH_CHECK(valid_inputs && target->dim() <= 1 && target->numel() == dim, + "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); + } else if (ndims == 2) { int nframe = input->size(0); int dim = input->size(1); - TORCH_CHECK(!target->is_empty() && (target->dim() == 2) - && (target->size(0) == nframe) && (target->size(1) == dim), - "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); + + TORCH_CHECK( + valid_inputs && target->dim() == 2 && target->size(0) == nframe && target->size(1) == dim, + "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); } else { - TORCH_CHECK(false, "non-empty vector or matrix expected, got size: ", input->sizes()); + TORCH_CHECK(false, "Expected input of ndims <= 2, but got ndims: ", ndims); } } @@ -31,6 +40,9 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t reduction) { THNN_(MultiLabelMarginCriterion_shapeCheck)(state, input, target); + if (input->numel() == 0) { + return; + } input = THCTensor_(newContiguous)(state, input); target = THCIndexTensor_(newContiguous)(state, target); istarget = THCTensor_(newContiguous)(state, istarget); @@ -100,7 +112,8 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( } } else { - TORCH_INTERNAL_ASSERT(false, "non-empty vector or matrix expected (shouldn't get here)"); + TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", + input->sizes()); } THCTensor_(free)(state, input); @@ -117,11 +130,17 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( THCTensor *istarget, int64_t reduction) { + THNN_(MultiLabelMarginCriterion_shapeCheck)(state, input, target); input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, gradInput, input); + if (input->numel() == 0) { + THCTensor_(free)(state, input); + return; + } + target = THCIndexTensor_(newContiguous)(state, target); istarget = 
THCTensor_(newContiguous)(state, istarget); gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resizeAs)(state, gradInput, input); if(gradInput->dim() <= 1) { @@ -149,10 +168,11 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( { int nframe = gradInput->size(0); int dim = gradInput->size(1); - THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) + THArgCheck((input->size(1) != 0) && (target->dim() == 2) && (target->size(0) == nframe) && (target->size(1) == dim), 3, "inconsistent target size"); - THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size(0) == nframe) + THArgCheck((istarget->dim() == 2) && (istarget->size(0) == nframe) && (istarget->size(1) == dim), 3, "inconsistent isTarget size"); + dim3 blocks(gradInput->size(0)); dim3 threads(MULTILABELMARGIN_THREADS); @@ -168,7 +188,8 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( reduction != at::Reduction::None); } else { - AT_ERROR("non-empty vector or matrix expected, got size: ", gradInput->sizes()); + TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", + gradInput->sizes()); } THCudaCheck(cudaGetLastError()); diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index f2df15054a4c..129413f0b7b2 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -2,6 +2,30 @@ #define THC_GENERIC_FILE "THCUNN/generic/MultiMarginCriterion.cu" #else +static inline void THNN_(MultiMarginCriterion_shapeCheck)( + THCState *state, + THCTensor *input, THCTensor *target) { + int64_t nframe, dim; + int64_t ndims = input->dim(); + bool valid_inputs = (ndims == 2 && input->size(1) != 0) || (ndims == 1 && input->size(0) != 0) || ndims == 0; + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 1 : input->size(0); + } else { + nframe = input->size(0); + dim = input->size(1); + } + + TORCH_CHECK( + valid_inputs, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input->sizes()); + TORCH_CHECK( + valid_inputs && target->dim() <= 1 && target->numel() == nframe, + "inconsistent target size, got: ", + target->sizes()); +} + // TODO: improve error messages void THNN_(MultiMarginCriterion_updateOutput)( THCState *state, @@ -13,6 +37,10 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor *weights, accreal margin_) { + THNN_(MultiMarginCriterion_shapeCheck)(state, input, target); + if (input->numel() == 0) { + return; + } scalar_t margin = ScalarConvert::to(margin_); THCUNN_assertSameGPU(state, 2, input, target); input = THCTensor_(newContiguous)(state, input); @@ -59,7 +87,8 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + // allow zero-dim target for 2D input. 
+ THArgCheck((input->size(1) != 0) && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -130,7 +159,8 @@ void THNN_(MultiMarginCriterion_updateOutput)( } else { - AT_ERROR("non-empty vector or matrix expected, got sizes: ", input->sizes()); + TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", + input->sizes()); } THCTensor_(free)(state, input); @@ -149,11 +179,17 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor *weights, accreal margin_) { + THNN_(MultiMarginCriterion_shapeCheck)(state, input, target); + input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, gradInput, input); + if (input->numel() == 0) { + THCTensor_(free)(state, input); + return; + } scalar_t margin = ScalarConvert::to(margin_); THCUNN_assertSameGPU(state, 3, input, gradInput, target); - input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resizeAs)(state, gradInput, input); + if(weights) weights = THCTensor_(newContiguous)(state, weights); @@ -195,7 +231,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + THArgCheck((input->size(1) != 0) && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -232,7 +268,8 @@ void THNN_(MultiMarginCriterion_updateGradInput)( } else { - AT_ERROR("non-empty vector or matrix expected, got ", input->sizes()); + TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", + input->sizes()); } THCTensor_(free)(state, input); diff --git a/test/test_nn.py b/test/test_nn.py index 386ba369dca6..ef9ea4c8e6b1 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10864,6 +10864,35 @@ def test_ReflectionPad_empty(self, device): inp = torch.randn(3, 0, 10, 10, device=device) mod(inp) + + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double) + def test_MarginLoss_empty(self, device, dtype): + for mod, x, y in [ + (torch.nn.MultiMarginLoss().to(device), + torch.randn(0, 10, requires_grad=True, device=device, dtype=dtype), + torch.ones(0, device=device).type(torch.long)), + (torch.nn.MultiLabelMarginLoss().to(device), + torch.randn(0, 10, requires_grad=True, device=device, dtype=dtype), + torch.ones(0, 10, device=device).type(torch.long))]: + + out = mod(x, y) + out.sum().backward() + + self.assertEqual(x, torch.zeros_like(x)) + self.assertEqual(x.grad, torch.zeros_like(x)) + + with self.assertRaisesRegex(RuntimeError, 'Expected'): + x = torch.randn(0, requires_grad=True, device=device, dtype=dtype) + y = torch.ones(10, device=device).type(torch.long) + mod(x, y) + + with self.assertRaisesRegex(RuntimeError, 'Expected'): + x = torch.randn(10, 0, requires_grad=True, device=device, dtype=dtype) + y = torch.ones(10, 0, device=device).type(torch.long) + mod(x, y) + + @onlyOnCPUAndCUDA def test_Unfold_empty(self, device): inp = torch.randn(0, 3, 3, 4, device=device) From 6a951a6f4c06dff162e3b81e99a964c8b6ad84f0 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 30 
Dec 2020 14:15:49 -0800 Subject: [PATCH 12/89] Fix a KaTeX crash and many docstring issues (#49684) Summary: The first commit fixes the `MultiheadAttention` docstrings, which are causing a cryptic KaTeX crash. The second commit fixes many documentation issues in `torch/_torch_docs.py`, and closes gh-43667 (missing "Keyword arguments" headers). It also fixes a weird duplicate docstring for `torch.argmin`; there's more of these, it looks like they were written based on whether the C++ implementation has an overload. That makes little sense to a Python user though, and the content is simply duplicate. The `Shape:` heading for https://pytorch.org/docs/master/generated/torch.nn.MultiheadAttention.html looked bad, here's what it looks like with this PR: image Pull Request resolved: https://github.com/pytorch/pytorch/pull/49684 Reviewed By: ngimel Differential Revision: D25730909 Pulled By: mruberry fbshipit-source-id: d25bcf8caf928e7e8e918017d119de12e10a46e9 --- torch/_torch_docs.py | 63 +++++++++++++++------------------- torch/nn/modules/activation.py | 25 +++++++------- 2 files changed, 41 insertions(+), 47 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 9c767822b11b..fe7237b5a370 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1026,7 +1026,6 @@ def merge_dicts(*dicts): tensor([ 0, 1, -4], dtype=torch.int8) """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.bmm, r""" bmm(input, mat2, *, deterministic=False, out=None) -> Tensor @@ -2934,7 +2933,6 @@ def merge_dicts(*dicts): tensor([ 0., 1.]) """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.eye, r""" eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -2944,6 +2942,8 @@ def merge_dicts(*dicts): Args: n (int): the number of rows m (int, optional): the number of columns with default being :attr:`n` + +Keyword arguments: {out} {dtype} {layout} @@ -4174,7 +4174,6 @@ def merge_dicts(*dicts): tensor([ 0.5724, 0.0000, -0.1208]) """.format(**common_args)) -# TODO: update kwargs formatting (see https://github.com/pytorch/pytorch/issues/43667) add_docstr(torch.linspace, r""" linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -4201,6 +4200,8 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points end (float): the ending value for the set of points steps (int): size of the constructed tensor + +Keyword arguments: {out} {dtype} {layout} @@ -4537,7 +4538,6 @@ def merge_dicts(*dicts): tensor([ True, True, False, False]) """.format(**common_args)) -# TODO: update kwargs formatting (see https://github.com/pytorch/pytorch/issues/43667) add_docstr(torch.logspace, """ logspace(start, end, steps, base=10.0, *, \ out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -4568,7 +4568,9 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points end (float): the ending value for the set of points steps (int): size of the constructed tensor - base (float): base of the logarithm function. Default: ``10.0``. + base (float, optional): base of the logarithm function. Default: ``10.0``. 
+ +Keyword arguments: {out} {dtype} {layout} @@ -5469,36 +5471,15 @@ def merge_dicts(*dicts): add_docstr(torch.argmin, r""" -argmin(input) -> LongTensor +argmin(input, dim=None, keepdim=False) -> LongTensor -Returns the indices of the minimum value of all elements in the :attr:`input` tensor. +Returns the indices of the minimum value(s) of the flattened tensor or along a dimension This is the second value returned by :meth:`torch.min`. See its documentation for the exact semantics of this method. .. note:: If there are multiple minimal values then the indices of the first minimal value are returned. -Args: - {input} - -Example:: - - >>> a = torch.randn(4, 4) - >>> a - tensor([[ 0.1139, 0.2254, -0.1381, 0.3687], - [ 1.0100, -1.1975, -0.0102, -0.4732], - [-0.9240, 0.1207, -0.7506, -1.0213], - [ 1.7809, -1.2960, 0.9384, 0.1438]]) - >>> torch.argmin(a) - tensor(13) - -.. function:: argmin(input, dim, keepdim=False) -> LongTensor - -Returns the indices of the minimum values of a tensor across a dimension. - -This is the second value returned by :meth:`torch.min`. See its -documentation for the exact semantics of this method. - Args: {input} {dim} If ``None``, the argmin of the flattened input is returned. @@ -5512,8 +5493,15 @@ def merge_dicts(*dicts): [ 1.0100, -1.1975, -0.0102, -0.4732], [-0.9240, 0.1207, -0.7506, -1.0213], [ 1.7809, -1.2960, 0.9384, 0.1438]]) + >>> torch.argmin(a) + tensor(13) >>> torch.argmin(a, dim=1) tensor([ 2, 1, 3, 1]) + >>> torch.argmin(a, dim=1, keepdim=True) + tensor([[2], + [1], + [3], + [1]]) """.format(**single_dim_common)) add_docstr(torch.mm, @@ -6328,7 +6316,6 @@ def merge_dicts(*dicts): """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.ones, r""" ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -6339,6 +6326,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword arguments: {out} {dtype} {layout} @@ -6356,7 +6345,6 @@ def merge_dicts(*dicts): """.format(**factory_common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.ones_like, r""" ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor @@ -6372,6 +6360,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword arguments: {dtype} {layout} {device} @@ -8260,7 +8250,7 @@ def merge_dicts(*dicts): Args: input (Tensor): the input tensor of size :math:`(*, n, n)` where `*` is zero or more batch dimensions consisting of symmetric matrices. - eigenvectors(boolean, optional): controls whether eigenvectors have to be computed + eigenvectors(bool, optional): controls whether eigenvectors have to be computed upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region Keyword args: @@ -9270,7 +9260,7 @@ def merge_dicts(*dicts): add_docstr(torch.full_like, """ -full_like(input, fill_value, \\*, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +full_like(input, fill_value, \\*, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. @@ -9489,9 +9479,10 @@ def merge_dicts(*dicts): Batched version for complex inputs is only supported on the CPU. 
Arguments: - input (Tensor): The input tensor of size :math:`(*, m, n)` where :math:`*` is zero or more batch dimensions - rcond (float): A floating point value to determine the cutoff for small singular values. - Default: 1e-15 + input (Tensor): The input tensor of size :math:`(*, m, n)` where :math:`*` is + zero or more batch dimensions. + rcond (float, optional): A floating point value to determine the cutoff for + small singular values. Default: ``1e-15``. Returns: The pseudo-inverse of :attr:`input` of dimensions :math:`(*, n, m)` @@ -9887,6 +9878,8 @@ def merge_dicts(*dicts): Arguments: y (Tensor): The values of the function to integrate + +Keyword args: dx (float): The distance between points at which `y` is sampled. dim (int): The dimension along which to integrate. By default, use the last dimension. diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 8a16c8c27808..0c5258615bfd 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -365,11 +365,11 @@ class SiLU(Module): \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.} .. note:: - See `Gaussian Error Linear Units (GELUs) `_ - where the SiLU (Sigmoid Linear Unit) was originally coined, and see - `Sigmoid-Weighted Linear Units for Neural Network Function Approximation - in Reinforcement Learning `_ and `Swish: - a Self-Gated Activation Function `_ + See `Gaussian Error Linear Units (GELUs) `_ + where the SiLU (Sigmoid Linear Unit) was originally coined, and see + `Sigmoid-Weighted Linear Units for Neural Network Function Approximation + in Reinforcement Learning `_ and `Swish: + a Self-Gated Activation Function `_ where the SiLU was experimented with later. Shape: @@ -937,8 +937,7 @@ def forward(self, query, key, value, key_padding_mask=None, attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all the batches while a 3D mask allows to specify a different mask for the entries of each batch. - Shape: - - Inputs: + Shapes for inputs: - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is @@ -949,15 +948,17 @@ def forward(self, query, key, value, key_padding_mask=None, If a ByteTensor is provided, the non-zero positions will be ignored while the position with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. - - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - 3D mask :math:`(N*\text{num_heads}, L, S)` where N is the batch size, L is the target sequence length, - S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked - positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + - attn_mask: if a 2D mask: :math:`(L, S)` where L is the target sequence length, S is the + source sequence length. + + If a 3D mask: :math:`(N\cdot\text{num\_heads}, L, S)` where N is the batch size, L is the target sequence + length, S is the source sequence length. ``attn_mask`` ensure that position i is allowed to attend + the unmasked positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend while the zero positions will be unchanged. 
If a BoolTensor is provided, positions with ``True`` is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor is provided, it will be added to the attention weight. - - Outputs: + Shapes for outputs: - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - attn_output_weights: :math:`(N, L, S)` where N is the batch size, From a7e1f4f37a12806e5fcc08e1cebbb8d73822e71b Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Wed, 30 Dec 2020 14:51:54 -0800 Subject: [PATCH 13/89] Remove incorrect usage of layout(std430) on uniform buffers, correctly now treated as error in the latest release of Vulkan SDK. (#49572) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49572 Differential Revision: D25729888 Test Plan: Imported from OSS Reviewed By: SS-JIA Pulled By: AshkanAliabadi fbshipit-source-id: 15dd4acef3dfae72f03e7e3085b1ff5936becf3d --- aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl | 1 - aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl | 1 - aten/src/ATen/native/vulkan/glsl/add.glsl | 1 - aten/src/ATen/native/vulkan/glsl/add_.glsl | 1 - aten/src/ATen/native/vulkan/glsl/add_scalar.glsl | 1 - aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl | 1 - aten/src/ATen/native/vulkan/glsl/addmm.glsl | 1 - aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl | 1 - aten/src/ATen/native/vulkan/glsl/clamp.glsl | 1 - aten/src/ATen/native/vulkan/glsl/clamp_.glsl | 1 - aten/src/ATen/native/vulkan/glsl/conv2d.glsl | 1 - aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl | 1 - aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl | 1 - aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl | 1 - aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl | 1 - aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl | 1 - aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl | 1 - aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl | 1 - aten/src/ATen/native/vulkan/glsl/mean.glsl | 1 - aten/src/ATen/native/vulkan/glsl/mean2d.glsl | 1 - aten/src/ATen/native/vulkan/glsl/mm.glsl | 1 - aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl | 1 - aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl | 1 - aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl | 1 - aten/src/ATen/native/vulkan/glsl/permute.glsl | 1 - aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl | 1 - 26 files changed, 26 deletions(-) diff --git a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl index 58394dca19da..2c02e034603e 100644 --- a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) readonly buffer kernel { vec4 data[]; diff --git a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl index d5b9af843dbe..75243a69bca3 100644 --- a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add.glsl b/aten/src/ATen/native/vulkan/glsl/add.glsl index 
8dcff0476edf..361927373a49 100644 --- a/aten/src/ATen/native/vulkan/glsl/add.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_.glsl b/aten/src/ATen/native/vulkan/glsl/add_.glsl index ed82d0cbe87b..d6360a376c58 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl index 8882ba0d8ff2..735086a8150a 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl index bffd680669fb..a418a28bb5c3 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/addmm.glsl b/aten/src/ATen/native/vulkan/glsl/addmm.glsl index 61f76fa8cf5d..a8f09252a167 100644 --- a/aten/src/ATen/native/vulkan/glsl/addmm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/addmm.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl index df2bbcf18014..5de8cf13225f 100644 --- a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/clamp.glsl b/aten/src/ATen/native/vulkan/glsl/clamp.glsl index c394dfd26627..52c2d2d96c26 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl index b16258685114..3f138bb93ec6 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index 9646eb8c9f19..bb2508aefe65 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl 
b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index fe50262f7d46..0f49515718b2 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl index 37a5898b9f10..5155c07669c1 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl index b73c58e0f54d..89411284fed4 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl index 5cef89c2727f..8baae9b5fcd5 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 48d9f785008b..1355b2c09b05 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl index d19c370ec9bd..01d653bf06de 100644 --- a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl index 948b797a5207..88373605d010 100644 --- a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform 
PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform constBlock { diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl index 130d716ca9e6..551fd747f103 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl index 266226aa708b..b8d0add329f2 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mm.glsl b/aten/src/ATen/native/vulkan/glsl/mm.glsl index 00ab5f31e6db..157acfe9c074 100644 --- a/aten/src/ATen/native/vulkan/glsl/mm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mm.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl index d3a98ba30bea..c0ae48fe3883 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl index b49252e128cc..f959052879ad 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl index fb87b5a36918..adbafcbd0438 100644 --- a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/permute.glsl b/aten/src/ATen/native/vulkan/glsl/permute.glsl index af8e33588f78..3d1191ff6eea 100644 --- a/aten/src/ATen/native/vulkan/glsl/permute.glsl +++ b/aten/src/ATen/native/vulkan/glsl/permute.glsl @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set = 0, binding = 0) writeonly buffer outputBuffer { float data[]; } diff --git a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl index efb1c5c7fc9a..b4db9b87dacb 100644 --- a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ From ffbb68af8adb3c3c1921981b41778a9a9f8590fd Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 30 Dec 2020 14:53:48 -0800 Subject: [PATCH 14/89] quant docs: add common errors section (#49902) 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49902 Adds a common errors section, and details the two errors we see often on the discuss forums, with recommended solutions. Test Plan: build the docs on Mac OS, the new section renders correctly. Reviewed By: supriyar Differential Revision: D25718195 Pulled By: vkuzo fbshipit-source-id: c5ef2b24831d18d57bbafdb82d26d8fbf3a90781 --- docs/source/quantization.rst | 65 ++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index a389de60416a..1cac90ffab86 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -530,6 +530,71 @@ Best Practices ``fbgemm`` backend. This argument prevents overflow on some int8 instructions by reducing the range of quantized data type by 1 bit. +Common Errors +--------------------------------------- + +Passing a non-quantized Tensor into a quantized kernel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you see an error similar to:: + + RuntimeError: Could not run 'quantized::some_operator' with arguments from the 'CPU' backend... + +This means that you are trying to pass a non-quantized Tensor to a quantized +kernel. A common workaround is to use ``torch.quantization.QuantStub`` to +quantize the tensor. This needs to be done manually in Eager mode quantization. +An e2e example:: + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + # during the convert step, this will be replaced with a + # `quantize_per_tensor` call + x = self.quant(x) + x = self.conv(x) + return x + +Passing a quantized Tensor into a non-quantized kernel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you see an error similar to:: + + RuntimeError: Could not run 'aten::thnn_conv2d_forward' with arguments from the 'QuantizedCPU' backend. + +This means that you are trying to pass a quantized Tensor to a non-quantized +kernel. A common workaround is to use ``torch.quantization.DeQuantStub`` to +dequantize the tensor. This needs to be done manually in Eager mode quantization. 
+An e2e example:: + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + # this module will not be quantized (see `qconfig = None` logic below) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + # during the convert step, this will be replaced with a + # `quantize_per_tensor` call + x = self.quant(x) + x = self.conv1(x) + # during the convert step, this will be replaced with a + # `dequantize` call + x = self.dequant(x) + x = self.conv2(x) + return x + + m = M() + m.qconfig = some_qconfig + # turn off quantization for conv2 + m.conv2.qconfig = None + Modules that provide quantization functions and classes ------------------------------------------------------- From 04a8412b86addab0067e7fce937a0eb5a752b8a9 Mon Sep 17 00:00:00 2001 From: Zafar Date: Wed, 30 Dec 2020 15:19:55 -0800 Subject: [PATCH 15/89] [quant] Quantizable LSTM (#49671) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49671 - Introduces the `torch.nn.quantizable` namespace - Adds the `torch.nn.quantizable.LSTM` module The point of the `quantizable` namespace is to segregate the purely quantized modules with the modules that could be quantized through a normal quantization flow, but are not using the quantized kernels explicitly. That means the quantizable modules are functionally and numerically equivalent to the FP ones and can be used instead of the FP ones without any loss. The main difference between the `torch.nn.LSTM` and the `torch.nn.quantizable.LSTM` is that the former one does not support observation for the linear layers, because all the computation is internal to the `aten` namespace. The `torch.nn.quantizable.LSTM`, however, uses explicit linear layers that can be observed for further quantization. 
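As a minimal sketch of the intended eager-mode usage (adapted from the `test_custom_module_lstm` test added in this PR; the `'fbgemm'` qengine, tensor shapes, and quantization parameters below are illustrative assumptions, not part of the change itself):

```
import torch

# Float model wrapping nn.LSTM, in eval mode for post-training quantization.
float_model = torch.nn.Sequential(
    torch.nn.LSTM(input_size=12, hidden_size=8, num_layers=2)
)
float_model.eval()

# Ask the eager-mode flow to swap nn.LSTM for the observable
# nn.quantizable.LSTM during the prepare step.
custom_module_config = {
    'float_to_observed_custom_module_class': {
        torch.nn.LSTM: torch.nn.quantizable.LSTM
    }
}

float_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
prepared = torch.quantization.prepare(
    float_model, prepare_custom_config_dict=custom_module_config)

# Calibrate with representative float data of shape (seq_len, batch, input_size).
x = torch.randn(8, 4, 12)
prepared(x)

# Convert; the quantized module then consumes a quantized input tensor.
quantized = torch.quantization.convert(prepared)
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=128, dtype=torch.quint8)
out, (hn, cn) = quantized(qx)
```

Because the quantizable LSTM decomposes each cell into explicit Linear and FloatFunctional ops, the prepare step above can attach observers to them, which is exactly what the aten-backed torch.nn.LSTM does not allow.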
Test Plan: Imported from OSS Differential Revision: D25663870 Reviewed By: vkuzo Pulled By: z-a-f fbshipit-source-id: 70ff5463bd759b9a7922571a5712d3409dfdfa06 --- test/quantization/test_quantized_op.py | 83 +++- test/test_quantization.py | 1 + torch/__init__.py | 1 + torch/nn/quantizable/__init__.py | 1 + torch/nn/quantizable/modules/__init__.py | 7 + torch/nn/quantizable/modules/rnn.py | 403 ++++++++++++++++++++ torch/quantization/quantize.py | 6 +- torch/testing/_internal/common_quantized.py | 29 ++ 8 files changed, 529 insertions(+), 2 deletions(-) create mode 100644 torch/nn/quantizable/__init__.py create mode 100644 torch/nn/quantizable/modules/__init__.py create mode 100644 torch/nn/quantizable/modules/rnn.py diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index c676ccc0f793..be044fa5211a 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -23,7 +23,7 @@ from torch.testing._internal.common_utils import IS_PPC, TEST_WITH_UBSAN, IS_MACOS from torch.testing._internal.common_quantization import skipIfNoFBGEMM from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \ - override_quantized_engine, supported_qengines, override_qengines + override_quantized_engine, supported_qengines, override_qengines, _snr from torch.testing._internal.common_quantized import qengine_is_qnnpack from torch.quantization import PerChannelMinMaxObserver @@ -2314,6 +2314,87 @@ def test_advanced_indexing(self): torch.quantize_per_tensor(x_fp32_s4, scale, zp, dtype) self.assertEqual(x_q_s4, x_fp32_s4_ref) + @override_qengines + def test_custom_module_lstm(self): + qengine = torch.backends.quantized.engine + + batch_size = 4 + seq_len = 8 + input_size = 12 + + hidden_size = 8 + num_layers = 2 + + dropout = 0 # This is not supported + + Bias = [False, True] + Batch_first = [False, True] + Bidirectional = [False, True] + + dtype = np.uint8 + qtype = torch.quint8 + + custom_module_config = { + 'float_to_observed_custom_module_class': { + torch.nn.LSTM: torch.nn.quantizable.LSTM + } + } + + x = np.random.randn(seq_len, batch_size, input_size) + scale, zero_point = _calculate_dynamic_qparams(x, dtype=dtype) + x = torch.from_numpy(x).to(torch.float) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, + dtype=qtype) + x = qx.dequantize() + + with torch.no_grad(): + for bias, batch_first, bidirectional in itertools.product( + Bias, Batch_first, Bidirectional): + # Assume 12dB is sufficient for functional equivalence + # Without the bias, linear performs poorly + min_power = 10 if bias else 5 + max_mse = 5e-6 if bias else 5e-1 + + if batch_first: + x = x.reshape(batch_size, seq_len, input_size) + qx = qx.reshape(batch_size, seq_len, input_size) + else: + x = x.reshape(seq_len, batch_size, input_size) + qx = qx.reshape(seq_len, batch_size, input_size) + + lstm = torch.nn.Sequential( + torch.nn.LSTM(input_size, hidden_size, + num_layers=num_layers, + bias=bias, batch_first=batch_first, + dropout=dropout, + bidirectional=bidirectional)) + lstm.eval() + y_ref = lstm(x) + + # Prepare + lstm.qconfig = torch.quantization.get_default_qconfig(qengine) + lstm_prepared = torch.quantization.prepare( + lstm, prepare_custom_config_dict=custom_module_config) + self.assertTrue(hasattr(lstm_prepared[0], 'layers')) + self.assertEqual(num_layers, len(lstm_prepared[0].layers)) + + # Calibrate + y = lstm_prepared(x) + self.assertEqual(y_ref, y) + + # Quantize + lstm_quantized = 
torch.quantization.convert(lstm_prepared) + qy = lstm_quantized(qx) + + snr = _snr(y, qy) + snr = [snr[0]] + snr[1] + + for signal, mse, power in snr: + self.assertTrue( + power > min_power or mse < max_mse, + msg=(f"Error is too high: SNR(dB): {power}, " + f"Signal: {signal}, MSE: {mse}")) + class TestDynamicQuantizedLinear(TestCase): """Tests the correctness of the dynamic quantized linear and linear_relu op.""" diff --git a/test/test_quantization.py b/test/test_quantization.py index f68bfcd058b6..1c370913c6d0 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -15,6 +15,7 @@ from quantization.test_quantized_op import TestPadding # noqa: F401 from quantization.test_quantized_op import TestQuantizedEmbeddingOps # noqa: F401 from quantization.test_quantized_op import TestDynamicQuantizedRNNOp # noqa: F401 + # Quantized Functional from quantization.test_quantized_functional import TestQuantizedFunctional # noqa: F401 diff --git a/torch/__init__.py b/torch/__init__.py index 04955623ab2a..9ae1010a3ba8 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -574,6 +574,7 @@ def _assert(condition, message): import torch.futures import torch.nn import torch.nn.intrinsic +import torch.nn.quantizable import torch.nn.quantized import torch.optim import torch.optim._multi_tensor diff --git a/torch/nn/quantizable/__init__.py b/torch/nn/quantizable/__init__.py new file mode 100644 index 000000000000..270dcebaa5f4 --- /dev/null +++ b/torch/nn/quantizable/__init__.py @@ -0,0 +1 @@ +from .modules import * diff --git a/torch/nn/quantizable/modules/__init__.py b/torch/nn/quantizable/modules/__init__.py new file mode 100644 index 000000000000..b3480b717a2d --- /dev/null +++ b/torch/nn/quantizable/modules/__init__.py @@ -0,0 +1,7 @@ +from .rnn import LSTM +from .rnn import LSTMCell + +__all__ = [ + 'LSTM', + 'LSTMCell', +] diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py new file mode 100644 index 000000000000..cfe076fac16c --- /dev/null +++ b/torch/nn/quantizable/modules/rnn.py @@ -0,0 +1,403 @@ +import numbers +from typing import Optional, Tuple +import warnings + +import torch +from torch import Tensor + +""" +We will recreate all the RNN modules as we require the modules to be decomposed +into its building blocks to be able to observe. +""" + +class LSTMCell(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM) cell. 
+ + For the description and the argument types, please, refer to :class:`~torch.nn.LSTMCell` + + Examples:: + + >>> import torch.nn.quantizable as nnqa + >>> rnn = nnqa.LSTMCell(10, 20) + >>> input = torch.randn(3, 10) + >>> hx = torch.randn(3, 20) + >>> cx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + hx, cx = rnn(input[i], (hx, cx)) + output.append(hx) + """ + _FLOAT_MODULE = torch.nn.LSTMCell + + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True): + super().__init__() + self.input_size = input_dim + self.hidden_size = hidden_dim + self.bias = bias + + self.igates = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=bias) + self.hgates = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=bias) + self.gates = torch.nn.quantized.FloatFunctional() + + self.fgate_cx = torch.nn.quantized.FloatFunctional() + self.igate_cgate = torch.nn.quantized.FloatFunctional() + self.fgate_cx_igate_cgate = torch.nn.quantized.FloatFunctional() + + self.ogate_cy = torch.nn.quantized.FloatFunctional() + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: + if hidden is None or hidden == (None, None): + hidden = self.initialize_hidden(x.shape[0], x.is_quantized) + hx, cx = hidden + + igates = self.igates(x) + hgates = self.hgates(hx) + gates = self.gates.add(igates, hgates) + + input_gate, forget_gate, cell_gate, out_gate = gates.chunk(4, 1) + + input_gate = torch.sigmoid(input_gate) + forget_gate = torch.sigmoid(forget_gate) + cell_gate = torch.tanh(cell_gate) + out_gate = torch.sigmoid(out_gate) + + fgate_cx = self.fgate_cx.mul(forget_gate, cx) + igate_cgate = self.igate_cgate.mul(input_gate, cell_gate) + fgate_cx_igate_cgate = self.fgate_cx_igate_cgate.add(fgate_cx, igate_cgate) + cy = fgate_cx_igate_cgate + + tanh_cy = torch.tanh(cy) + hy = self.ogate_cy.mul(out_gate, tanh_cy) + return hy, cy + + def initialize_hidden(self, batch_size: int, is_quantized: bool = False) -> Tuple[Tensor, Tensor]: + h, c = torch.zeros((batch_size, self.hidden_size)), torch.zeros((batch_size, self.hidden_size)) + if is_quantized: + h = torch.quantize_per_tensor(h, scale=1.0, zero_point=0, dtype=torch.quint8) + c = torch.quantize_per_tensor(c, scale=1.0, zero_point=0, dtype=torch.quint8) + return h, c + + def _get_name(self): + return 'QuantizableLSTMCell' + + @classmethod + def from_params(cls, wi, wh, bi=None, bh=None): + """Uses the weights and biases to create a new LSTM cell. + + Args: + wi, wh: Weights for the input and hidden layers + bi, bh: Biases for the input and hidden layers + """ + assert (bi is None) == (bh is None) # Either both None or both have values + input_size = wi.shape[1] + hidden_size = wh.shape[1] + cell = cls(input_dim=input_size, hidden_dim=hidden_size, + bias=(bi is not None)) + cell.igates.weight = torch.nn.Parameter(wi) + if bi is not None: + cell.igates.bias = torch.nn.Parameter(bi) + cell.hgates.weight = torch.nn.Parameter(wh) + if bh is not None: + cell.hgates.bias = torch.nn.Parameter(bh) + return cell + + @classmethod + def from_float(cls, other): + assert type(other) == cls._FLOAT_MODULE + assert hasattr(other, 'qconfig'), "The float module must have 'qconfig'" + observed = cls.from_params(other.weight_ih, other.weight_hh, + other.bias_ih, other.bias_hh) + observed.qconfig = other.qconfig + observed.igates.qconfig = other.qconfig + observed.hgates.qconfig = other.qconfig + return observed + + +class _LSTMSingleLayer(torch.nn.Module): + r"""A single one-directional LSTM layer. 
+ + The difference between a layer and a cell is that the layer can process a + sequence, while the cell only expects an instantaneous value. + """ + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True): + super().__init__() + self.cell = LSTMCell(input_dim, hidden_dim, bias=bias) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + result = [] + for xx in x: + hidden = self.cell(xx, hidden) + result.append(hidden[0]) # type: ignore + result_tensor = torch.stack(result, 0) + return result_tensor, hidden + + @classmethod + def from_params(cls, *args, **kwargs): + cell = LSTMCell.from_params(*args, **kwargs) + layer = cls(cell.input_size, cell.hidden_size, cell.bias) + layer.cell = cell + return layer + + +class _LSTMLayer(torch.nn.Module): + r"""A single bi-directional LSTM layer.""" + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True, + batch_first: bool = False, bidirectional: bool = False): + super().__init__() + self.batch_first = batch_first + self.bidirectional = bidirectional + self.layer_fw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias) + if self.bidirectional: + self.layer_bw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + if self.batch_first: + x = x.transpose(0, 1) + if hidden is None: + hx_fw, cx_fw = (None, None) + else: + hx_fw, cx_fw = hidden + if self.bidirectional: + if hx_fw is None: + hx_bw = None + else: + hx_bw = hx_fw[1] + hx_fw = hx_fw[0] + if cx_fw is None: + cx_bw = None + else: + cx_bw = cx_fw[1] + cx_fw = cx_fw[0] + hidden_bw = hx_bw, cx_bw + hidden_fw = hx_fw, cx_fw + result_fw, hidden_fw = self.layer_fw(x, hidden_fw) + + if self.bidirectional: + x_reversed = x.flip(0) + result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw) + result_bw = result_bw.flip(0) + + result = torch.cat([result_fw, result_bw], result_fw.dim() - 1) + h = torch.stack([hidden_fw[0], hidden_bw[0]], 0) # type: ignore + c = torch.stack([hidden_fw[1], hidden_bw[1]], 0) # type: ignore + else: + result = result_fw + h, c = hidden_fw # type: ignore + + if self.batch_first: + result.transpose_(0, 1) + + return result, (h, c) + + @classmethod + def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs): + r""" + There is no FP equivalent of this class. This function is here just to + mimic the behavior of the `prepare` within the `torch.quantization` + flow. 
+ """ + assert hasattr(other, 'qconfig') or (qconfig is not None) + + input_size = kwargs.get('input_size', other.input_size) + hidden_size = kwargs.get('hidden_size', other.hidden_size) + bias = kwargs.get('bias', other.bias) + batch_first = kwargs.get('batch_first', other.batch_first) + bidirectional = kwargs.get('bidirectional', other.bidirectional) + + layer = cls(input_size, hidden_size, bias, batch_first, bidirectional) + layer.qconfig = getattr(other, 'qconfig', qconfig) + wi = getattr(other, f'weight_ih_l{layer_idx}') + wh = getattr(other, f'weight_hh_l{layer_idx}') + bi = getattr(other, f'bias_ih_l{layer_idx}', None) + bh = getattr(other, f'bias_hh_l{layer_idx}', None) + + layer.layer_fw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) + + if other.bidirectional: + wi = getattr(other, f'weight_ih_l{layer_idx}_reverse') + wh = getattr(other, f'weight_hh_l{layer_idx}_reverse') + bi = getattr(other, f'bias_ih_l{layer_idx}_reverse', None) + bh = getattr(other, f'bias_hh_l{layer_idx}_reverse', None) + layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) + return layer + + # Getters for the weights and biases + # Note that jit currently doesn't support the `porperty`, so if you need to + # access the weights/biases you would need to navigate manually to the + # `layer_fw.cell.igates.*`: https://github.com/pytorch/pytorch/issues/37883 + @property + def weight_ih(self): + return self.layer_fw.cell.igates.weight + + @property + def weight_hh(self): + return self.layer_fw.cell.hgates.weight + + @property + def bias_ih(self): + return self.layer_fw.cell.igates.bias + + @property + def bias_hh(self): + return self.layer_fw.cell.hgates.bias + + @property + def weight_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.weight + + @property + def weight_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.weight + + @property + def bias_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.bias + + @property + def bias_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.bias + + +class LSTM(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM). + + For the description and the argument types, please, refer to :class:`~torch.nn.LSTM` + + Attributes: + layers : instances of the `_LSTMLayer` + + .. note:: + To access the weights and biases, you need to access them per layer. + See examples below. 
+ + Examples:: + + >>> import torch.nn.quantizable as nnqa + >>> rnn = nnqa.LSTM(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> c0 = torch.randn(2, 3, 20) + >>> output, (hn, cn) = rnn(input, (h0, c0)) + >>> # To get the weights: + >>> print(rnn.layers[0].weight_ih) + tensor([[...]]) + >>> print(rnn.layers[0].weight_hh) + AssertionError: There is no reverse path in the non-bidirectional layer + """ + _FLOAT_MODULE = torch.nn.LSTM + + def __init__(self, input_size: int, hidden_size: int, + num_layers: int = 1, bias: bool = True, + batch_first: bool = False, dropout: float = 0., + bidirectional: bool = False): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = float(dropout) + self.bidirectional = bidirectional + self.training = False # We don't want to train using this module + num_directions = 2 if bidirectional else 1 + + if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \ + isinstance(dropout, bool): + raise ValueError("dropout should be a number in range [0, 1] " + "representing the probability of an element being " + "zeroed") + if dropout > 0: + warnings.warn("dropout option for quantizable LSTM is ignored. " + "If you are training, please, use nn.LSTM version " + "followed by `prepare` step.") + if num_layers == 1: + warnings.warn("dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + "num_layers greater than 1, but got dropout={} " + "and num_layers={}".format(dropout, num_layers)) + + layers = [_LSTMLayer(self.input_size, self.hidden_size, + self.bias, batch_first=False, + bidirectional=self.bidirectional)] + for layer in range(1, num_layers): + layers.append(_LSTMLayer(self.hidden_size, self.hidden_size, + self.bias, batch_first=False, + bidirectional=self.bidirectional)) + self.layers = torch.nn.ModuleList(layers) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + if self.batch_first: + x = x.transpose(0, 1) + + max_batch_size = x.size(1) + num_directions = 2 if self.bidirectional else 1 + if hidden is None: + zeros = torch.zeros(num_directions, max_batch_size, + self.hidden_size, dtype=torch.float, + device=x.device) + zeros.squeeze_(0) + if x.is_quantized: + zeros = torch.quantize_per_tensor(zeros, scale=1.0, + zero_point=0, dtype=x.dtype) + hxcx = [(zeros, zeros) for _ in range(self.num_layers)] + else: + hidden_non_opt = torch.jit._unwrap_optional(hidden) + if isinstance(hidden_non_opt[0], Tensor): + hx = hidden_non_opt[0].reshape(self.num_layers, num_directions, + max_batch_size, + self.hidden_size).unbind(0) + cx = hidden_non_opt[1].reshape(self.num_layers, num_directions, + max_batch_size, + self.hidden_size).unbind(0) + hxcx = [] + for idx in range(self.num_layers): + hxcx.append((hx[idx].squeeze_(0), cx[idx].squeeze_(0))) + else: + hxcx = hidden_non_opt + + for idx in range(self.num_layers): + x, hxcx[idx] = self.layers[idx](x, hxcx[idx]) + + hx_list = [] + cx_list = [] + for idx in range(self.num_layers): + hx_list.append(hxcx[idx][0]) + cx_list.append(hxcx[idx][1]) + hx_tensor = torch.stack(hx_list) + cx_tensor = torch.stack(cx_list) + + # We are creating another dimension for bidirectional case + # need to collapse it + hx_tensor = hx_tensor.reshape(-1, *hx_tensor.shape[-2:]) + cx_tensor = cx_tensor.reshape(-1, *cx_tensor.shape[-2:]) + + if self.batch_first: + x = x.transpose(0, 1) + + return x, 
(hx_tensor, cx_tensor) + + def _get_name(self): + return 'QuantizableLSTM' + + @classmethod + def from_float(cls, other, qconfig=None): + assert isinstance(other, cls._FLOAT_MODULE) + assert (hasattr(other, 'qconfig') or qconfig) + observed = cls(other.input_size, other.hidden_size, other.num_layers, + other.bias, other.batch_first, other.dropout, + other.bidirectional) + observed.qconfig = getattr(other, 'qconfig', qconfig) + for idx in range(other.num_layers): + observed.layers[idx] = _LSTMLayer.from_float(other, idx, qconfig, + batch_first=False) + observed.eval() + observed = torch.quantization.prepare(observed, inplace=True) + return observed + + def from_observed(self, other): + return torch.quantization.convert(self, inplace=False, + remove_qconfig=True) diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index a9417ecb80f3..1be867e0a299 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -5,6 +5,7 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq +import torch.nn.quantizable as nnqa from torch.nn.intrinsic import _FusedModule from .quantization_mappings import ( @@ -152,7 +153,10 @@ def insert_activation_post_process(m, special_act_post_process=None): elif needs_observation(child) and type(child) in custom_module_class_mapping: observed_child = custom_module_class_mapping[type(child)].from_float(child) setattr(module, name, observed_child) - insert_activation_post_process(observed_child) + # TODO: These are the modules that cannot be observed + # Once there are more, we should move them to a separate list + if custom_module_class_mapping[type(child)] != nnqa.LSTM: + insert_activation_post_process(observed_child) else: add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) diff --git a/torch/testing/_internal/common_quantized.py b/torch/testing/_internal/common_quantized.py index 243cd964b96d..f14556597128 100644 --- a/torch/testing/_internal/common_quantized.py +++ b/torch/testing/_internal/common_quantized.py @@ -102,6 +102,35 @@ def _calculate_dynamic_per_channel_qparams(X, dtype): return scale, zero_point +def _snr(x, x_hat): + """Calculates the signal to noise ratio and returns the signal and noise + power, as well as the SNR in dB. + If the input is a list/tuple this function is called recursively on each + element. The result will have the same nested structure as the inputs. + + Args: + x, x_hat: Either a tensor or a nested list/tuple of tensors. + Returns: + signal, noise, SNR(in dB): Either floats or a nested list of floats + """ + if isinstance(x, (list, tuple)): + assert(len(x) == len(x_hat)) + res = [] + for idx in range(len(x)): + res.append(_snr(x[idx], x_hat[idx])) + return res + if x_hat.is_quantized: + x_hat = x_hat.dequantize() + if x.is_quantized: + x = x.dequantize() + noise = (x - x_hat).norm() + if noise == 0: + return 0.0, float('inf'), float('inf') + signal = x.norm() + snr = signal / noise + snr_db = 20 * snr.log10() + return signal, noise, snr_db + @contextmanager def override_quantized_engine(qengine): previous = torch.backends.quantized.engine From 46afd7fc9faa90b3bca3a4966e54bae22fa2b2d8 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Wed, 30 Dec 2020 15:32:16 -0800 Subject: [PATCH 16/89] [PyTorch] Decouple version numbers from c10 and caffe2 targets (#49905) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49905 There's size regression in model delivery in D25682312. 
Only the model version numbers are used. However, the dependency of the entire c10 (128 KB) is pulled in. This diff is to decouple the version numbers to a separate header file, versions.h. Other targets referring to version numbers only can have deps of ```caffe2:version_headers```. ghstack-source-id: 119161467 Test Plan: CI Reviewed By: xcheng16, guangyfb Differential Revision: D25716601 fbshipit-source-id: 07634bcf46eacfefa4aa75f2e4c9b9ee30c6929d --- caffe2/serialize/inline_container.h | 63 +------------------------- caffe2/serialize/versions.h | 68 +++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 62 deletions(-) create mode 100644 caffe2/serialize/versions.h diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index a34a6db70115..87c3151bbb76 100644 --- a/caffe2/serialize/inline_container.h +++ b/caffe2/serialize/inline_container.h @@ -12,6 +12,7 @@ #include "caffe2/serialize/istream_adapter.h" #include "caffe2/serialize/read_adapter_interface.h" +#include "caffe2/serialize/versions.h" extern "C" { typedef struct mz_zip_archive mz_zip_archive; @@ -90,68 +91,6 @@ typedef struct mz_zip_archive mz_zip_archive; namespace caffe2 { namespace serialize { -constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; -constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L; - -// Versions (i.e. why was the version number bumped?) - -// Note [Dynamic Versions and torch.jit.save vs. torch.save] -// -// Our versioning scheme has a "produced file format version" which -// describes how an archive is to be read. The version written in an archive -// is at least this current produced file format version, but may be greater -// if it includes certain symbols. We refer to these conditional versions -// as "dynamic," since they are identified at runtime. -// -// Dynamic versioning is useful when an operator's semantics are updated. -// When using torch.jit.save we want those semantics to be preserved. If -// we bumped the produced file format version on every change, however, -// then older versions of PyTorch couldn't read even simple archives, like -// a single tensor, from newer versions of PyTorch. Instead, we -// assign dynamic versions to these changes that override the -// produced file format version as needed. That is, when the semantics -// of torch.div changed it was assigned dynamic version 4, and when -// torch.jit.saving modules that use torch.div those archives also have -// (at least) version 4. This prevents earlier versions of PyTorch -// from accidentally performing the wrong kind of division. Modules -// that don't use torch.div or other operators with dynamic versions -// can write the produced file format version, and these programs will -// run as expected on earlier versions of PyTorch. -// -// While torch.jit.save attempts to preserve operator semantics, -// torch.save does not. torch.save is analogous to pickling Python, so -// a function that uses torch.div will have different behavior if torch.saved -// and torch.loaded across PyTorch versions. From a technical perspective, -// torch.save ignores dynamic versioning. - -// 1. Initial version -// 2. Removed op_version_set version numbers -// 3. Added type tags to pickle serialization of container types -// 4. (Dynamic) Stopped integer division using torch.div -// (a versioned symbol preserves the historic behavior of versions 1--3) -// 5. (Dynamic) Stops torch.full inferring a floating point dtype -// when given bool or integer fill values. 
-constexpr uint64_t kProducedFileFormatVersion = 0x3L; - -// the version we write when the archive contains bytecode. -// It must be higher or eq to kProducedFileFormatVersion. -// Because torchscript changes is likely introduce bytecode change. -// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion -// should be increased too. The relationship is: -// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion -// >= kProducedFileFormatVersion -constexpr uint64_t kProducedBytecodeVersion = 0x4L; - -static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, - "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion."); - -// Introduce kMinSupportedBytecodeVersion for limited backward compatibility -// support of bytecode. If -// kMinSupportedBytecodeVersion <= model_version <= kProducedBytecodeVersion (in loader), -// we should support this model_version. For example, we provide a wrapper to -// handle an updated operator. -constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; - class TORCH_API PyTorchStreamReader final { public: explicit PyTorchStreamReader(const std::string& file_name); diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h new file mode 100644 index 000000000000..4da4b2c50305 --- /dev/null +++ b/caffe2/serialize/versions.h @@ -0,0 +1,68 @@ +#pragma once + +namespace caffe2 { +namespace serialize { + +constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; +constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L; + +// Versions (i.e. why was the version number bumped?) + +// Note [Dynamic Versions and torch.jit.save vs. torch.save] +// +// Our versioning scheme has a "produced file format version" which +// describes how an archive is to be read. The version written in an archive +// is at least this current produced file format version, but may be greater +// if it includes certain symbols. We refer to these conditional versions +// as "dynamic," since they are identified at runtime. +// +// Dynamic versioning is useful when an operator's semantics are updated. +// When using torch.jit.save we want those semantics to be preserved. If +// we bumped the produced file format version on every change, however, +// then older versions of PyTorch couldn't read even simple archives, like +// a single tensor, from newer versions of PyTorch. Instead, we +// assign dynamic versions to these changes that override the +// produced file format version as needed. That is, when the semantics +// of torch.div changed it was assigned dynamic version 4, and when +// torch.jit.saving modules that use torch.div those archives also have +// (at least) version 4. This prevents earlier versions of PyTorch +// from accidentally performing the wrong kind of division. Modules +// that don't use torch.div or other operators with dynamic versions +// can write the produced file format version, and these programs will +// run as expected on earlier versions of PyTorch. +// +// While torch.jit.save attempts to preserve operator semantics, +// torch.save does not. torch.save is analogous to pickling Python, so +// a function that uses torch.div will have different behavior if torch.saved +// and torch.loaded across PyTorch versions. From a technical perspective, +// torch.save ignores dynamic versioning. + +// 1. Initial version +// 2. Removed op_version_set version numbers +// 3. Added type tags to pickle serialization of container types +// 4. 
(Dynamic) Stopped integer division using torch.div +// (a versioned symbol preserves the historic behavior of versions 1--3) +// 5. (Dynamic) Stops torch.full inferring a floating point dtype +// when given bool or integer fill values. +constexpr uint64_t kProducedFileFormatVersion = 0x3L; + +// the version we write when the archive contains bytecode. +// It must be higher or eq to kProducedFileFormatVersion. +// Because torchscript changes is likely introduce bytecode change. +// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion +// should be increased too. The relationship is: +// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion +// >= kProducedFileFormatVersion +constexpr uint64_t kProducedBytecodeVersion = 0x4L; + +static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, + "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion."); + +// Introduce kMinSupportedBytecodeVersion for limited backward compatibility +// support of bytecode. If +// kMinSupportedBytecodeVersion <= model_version <= kProducedBytecodeVersion (in loader), +// we should support this model_version. For example, we provide a wrapper to +// handle an updated operator. +constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; +} // namespace serialize +} // namespace caffe2 From cd608fe59b70fa7cafb07110096b2e023a8b6e9c Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Wed, 30 Dec 2020 17:04:40 -0800 Subject: [PATCH 17/89] Revert D25719980: [pytorch][PR] Accept input tensor with 0-dim batch size for MultiLabelMarginLoss Test Plan: revert-hammer Differential Revision: D25719980 (https://github.com/pytorch/pytorch/commit/6b56b71e61e14bf4de5b371f0d8f2f2029065b31) Original commit changeset: 83414bad37c0 fbshipit-source-id: 27eddd711a2b9e0adbc08bfab12100562e63ac21 --- aten/src/ATen/native/LossMulti.h | 72 -------------- aten/src/ATen/native/LossMultiLabelMargin.cpp | 97 +++++++++++++------ aten/src/ATen/native/LossMultiMargin.cpp | 45 +++++++-- .../generic/MultiLabelMarginCriterion.cu | 51 +++------- .../THCUNN/generic/MultiMarginCriterion.cu | 49 ++-------- test/test_nn.py | 29 ------ 6 files changed, 121 insertions(+), 222 deletions(-) delete mode 100644 aten/src/ATen/native/LossMulti.h diff --git a/aten/src/ATen/native/LossMulti.h b/aten/src/ATen/native/LossMulti.h deleted file mode 100644 index 4282c346702c..000000000000 --- a/aten/src/ATen/native/LossMulti.h +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include -#include - -#pragma once - -namespace at { namespace native { -namespace { - static void multilabel_margin_loss_shape_check( - int64_t& nframe, - int64_t& dim, - const int64_t& ndims, - TensorArg& target_arg, - const Tensor& input, - const Tensor& target) { - bool valid_inputs = (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0; - TORCH_CHECK( - valid_inputs, - "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", - input.sizes()); - - if (ndims <= 1) { - nframe = 1; - dim = ndims == 0 ? 
1 : input.size(0); - TORCH_CHECK( - valid_inputs && target.dim() <= 1 && target.numel() == dim, - "inconsistent size ", - target.sizes(), - " for ", - target_arg); - } else { - nframe = input.size(0); - dim = input.size(1); - TORCH_CHECK( - valid_inputs && target.dim() == 2 && target.size(0) == nframe && - target.size(1) == dim, - "inconsistent size ", - target.sizes(), - " for ", - target_arg); - } - } - - static void multi_margin_loss_shape_check( - int64_t& nframe, - int64_t& dim, - const int64_t& ndims, - TensorArg& target_arg, - const Tensor& input, - const Tensor& target) { - bool valid_inputs = (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0; - if (ndims <= 1) { - nframe = 1; - dim = ndims == 0 ? 1 : input.size(0); - } else { - nframe = input.size(0); - dim = input.size(1); - } - - TORCH_CHECK( - valid_inputs, - "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", - input.sizes()); - TORCH_CHECK( - valid_inputs && target.dim() <= 1 && target.numel() == nframe, - "inconsistent target size, got: ", - target.sizes()); - } - - -} // anonymous namespace -}} // namespace at::native diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index 3cd0f46e0a95..9582bf661a32 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -2,7 +2,6 @@ #include #include #include -#include namespace at { namespace native { @@ -40,7 +39,6 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( } } } - return sum; } @@ -102,32 +100,34 @@ static void multilabel_margin_loss_forward_out_cpu_template( Tensor& is_target, int64_t reduction) { auto target_arg = TensorArg(target, "target", 2); + + const auto ndims = input.dim(); + + TORCH_CHECK( + input.numel() > 0 && ndims <= 2, + "non-empty vector or matrix expected, got size: ", + input.sizes()); + int64_t nframe, dim; - const int64_t ndims = input.dim(); if (ndims <= 1) { nframe = 1; dim = ndims == 0 ? 
1 : input.size(0); - } - else { + TORCH_CHECK( + target.numel() > 0 && target.dim() <= 1 && target.numel() == dim, + "inconsistent size ", + target.sizes(), + " for ", + target_arg); + } else { nframe = input.size(0); dim = input.size(1); - } - multilabel_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); - - // special case target.dim() <= 1: produce scalar output for scalar inputs - // even if reduction == Reduction::None - if (reduction != Reduction::None || target.dim() <= 1) { - output.resize_({}); - } else { - output.resize_({nframe}); - } - - is_target.resize_as_(target); - TORCH_CHECK(is_target.is_contiguous(), "is_target must be contiguous"); - is_target.zero_(); - - if (input.numel() == 0) { - return; + TORCH_CHECK( + target.numel() > 0 && target.dim() == 2 && target.size(0) == nframe && + target.size(1) == dim, + "inconsistent size ", + target.sizes(), + " for ", + target_arg); } TORCH_CHECK( @@ -138,6 +138,18 @@ static void multilabel_margin_loss_forward_out_cpu_template( auto input_contiguous = input.contiguous(); auto target_contiguous = target.contiguous(); + is_target.resize_as_(target); + TORCH_CHECK(is_target.is_contiguous(), "is_target must be contiguous"); + is_target.zero_(); + + // special case target.dim() <= 1: produce scalar output for scalar inputs + // even if reduction == Reduction::None + if (reduction != Reduction::None || target.dim() <= 1) { + output.resize_({}); + } else { + output.resize_({nframe}); + } + AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multilabel_margin_loss_forward_out_frame", [&] { multilabel_margin_loss_forward_out_frame( @@ -220,22 +232,39 @@ static void multilabel_margin_loss_backward_out_cpu_template( const Tensor& target, int64_t reduction, const Tensor& is_target) { - int64_t nframe, dim; CheckedFrom c = "multilabel_margin_loss_backward_cpu_template"; auto target_arg = TensorArg(target, "target", 3); auto is_target_arg = TensorArg(is_target, "is_target", 5); - const int64_t ndims = input.dim(); - multilabel_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); - checkSameSize(c, target_arg, is_target_arg); + const auto ndims = input.dim(); - grad_input.resize_as_(input); - if (grad_input.numel() == 0) { - return; - } + TORCH_CHECK( + input.numel() > 0 && ndims <= 2, + "non-empty vector or matrix expected, got size: ", + input.sizes()); - TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); - grad_input.zero_(); + int64_t nframe, dim; + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 
1 : input.size(0); + TORCH_CHECK( + target.numel() > 0 && target.dim() <= 1 && target.numel() == dim, + "inconsistent size ", + target.sizes(), + " for ", + target_arg); + } else { + nframe = input.size(0); + dim = input.size(1); + TORCH_CHECK( + target.numel() > 0 && target.dim() == 2 && target.size(0) == nframe && + target.size(1) == dim, + "inconsistent size ", + target.sizes(), + " for ", + target_arg); + } + checkSameSize(c, target_arg, is_target_arg); TORCH_CHECK( target.min().item() >= -1, target_arg, " is out of range"); @@ -246,6 +275,10 @@ static void multilabel_margin_loss_backward_out_cpu_template( auto target_contiguous = target.contiguous(); auto is_target_contiguous = is_target.contiguous(); + grad_input.resize_as_(input); + TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); + grad_input.zero_(); + AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multilabel_margin_loss_backward_out_frame", [&] { multilabel_margin_loss_backward_out_frame( diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index db18d1f655d4..48446a98559d 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -1,7 +1,6 @@ #include #include #include -#include namespace at { namespace native { @@ -94,13 +93,27 @@ void multi_margin_loss_out_cpu_template( Scalar margin, const Tensor& weight, int64_t reduction) { - int64_t nframe, dim; const auto ndims = input.dim(); - auto target_arg = TensorArg(target, "target", 2); + TORCH_CHECK( + input.numel() > 0 && ndims <= 2, + "non-empty vector or matrix expected, got size: ", + input.sizes()); TORCH_CHECK(p == 1 || p == 2, "only p == 1 and p == 2 supported"); - multi_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); + int64_t nframe, dim; + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 1 : input.size(0); + } else { + nframe = input.size(0); + dim = input.size(1); + } + + TORCH_CHECK( + target.numel() > 0 && target.dim() <= 1 && target.numel() == nframe, + "inconsistent target size, got: ", + target.sizes()); // produce a scalar output for 1d input if (reduction == Reduction::None && target.dim() > 0) { @@ -108,9 +121,6 @@ void multi_margin_loss_out_cpu_template( } else { output.resize_({}); } - if (input.numel() == 0) { - return; - } auto input_contiguous = input.contiguous(); auto target_contiguous = target.contiguous(); @@ -202,13 +212,28 @@ void multi_margin_loss_backward_out_cpu_template( Scalar margin, const Tensor& weight, int64_t reduction) { - int64_t nframe, dim; - auto target_arg = TensorArg(target, "target", 2); const auto ndims = input.dim(); + TORCH_CHECK( + input.numel() > 0 && ndims <= 2, + "non-empty vector or matrix expected, got size: ", + input.sizes()); TORCH_CHECK(p == 1 || p == 2, "only p == 1 and p == 2 supported"); - multi_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); + int64_t nframe, dim; + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 
1 : input.size(0); + } else { + nframe = input.size(0); + dim = input.size(1); + } + + TORCH_CHECK( + target.numel() > 0 && target.dim() <= 1 && target.numel() == nframe, + "inconsistent target size, got: ", + target.sizes()); + grad_input.resize_as_(input); TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); diff --git a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu index 6e8d9bc91976..ab8d2cb1ad68 100644 --- a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu @@ -3,30 +3,21 @@ #else static inline void THNN_(MultiLabelMarginCriterion_shapeCheck)( - THCState *state, - THCTensor *input, THCTensor *target) { - int64_t ndims = input->dim(); - bool valid_inputs = (ndims == 2 && input->size(1) != 0) || (ndims == 1 && input->size(0) != 0) || ndims == 0; - TORCH_CHECK( - valid_inputs, - "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", - input->sizes()); - - if (ndims <= 1) { + THCState *state, + THCTensor *input, THCTensor *target) { + if (input->dim() <= 1) { int dim = input->dim() == 0 ? 1 : input->size(0); int target_size = target->dim() == 0 ? 1 : target->size(0); - - TORCH_CHECK(valid_inputs && target->dim() <= 1 && target->numel() == dim, - "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); - } else if (ndims == 2) { + TORCH_CHECK(!target->is_empty() && (target->dim() <= 1) && (target_size == dim), + "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); + } else if (input->dim() == 2) { int nframe = input->size(0); int dim = input->size(1); - - TORCH_CHECK( - valid_inputs && target->dim() == 2 && target->size(0) == nframe && target->size(1) == dim, - "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); + TORCH_CHECK(!target->is_empty() && (target->dim() == 2) + && (target->size(0) == nframe) && (target->size(1) == dim), + "inconsistent target size: ", target->sizes(), " for input of size: ", input->sizes()); } else { - TORCH_CHECK(false, "Expected input of ndims <= 2, but got ndims: ", ndims); + TORCH_CHECK(false, "non-empty vector or matrix expected, got size: ", input->sizes()); } } @@ -40,9 +31,6 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t reduction) { THNN_(MultiLabelMarginCriterion_shapeCheck)(state, input, target); - if (input->numel() == 0) { - return; - } input = THCTensor_(newContiguous)(state, input); target = THCIndexTensor_(newContiguous)(state, target); istarget = THCTensor_(newContiguous)(state, istarget); @@ -112,8 +100,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( } } else { - TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", - input->sizes()); + TORCH_INTERNAL_ASSERT(false, "non-empty vector or matrix expected (shouldn't get here)"); } THCTensor_(free)(state, input); @@ -130,17 +117,11 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( THCTensor *istarget, int64_t reduction) { - THNN_(MultiLabelMarginCriterion_shapeCheck)(state, input, target); input = THCTensor_(newContiguous)(state, input); - THCTensor_(resizeAs)(state, gradInput, input); - if (input->numel() == 0) { - THCTensor_(free)(state, input); - return; - } - target = THCIndexTensor_(newContiguous)(state, target); istarget = THCTensor_(newContiguous)(state, istarget); gradOutput = THCTensor_(newContiguous)(state, gradOutput); + 
THCTensor_(resizeAs)(state, gradInput, input); if(gradInput->dim() <= 1) { @@ -168,11 +149,10 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( { int nframe = gradInput->size(0); int dim = gradInput->size(1); - THArgCheck((input->size(1) != 0) && (target->dim() == 2) && (target->size(0) == nframe) + THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) && (target->size(1) == dim), 3, "inconsistent target size"); - THArgCheck((istarget->dim() == 2) && (istarget->size(0) == nframe) + THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size(0) == nframe) && (istarget->size(1) == dim), 3, "inconsistent isTarget size"); - dim3 blocks(gradInput->size(0)); dim3 threads(MULTILABELMARGIN_THREADS); @@ -188,8 +168,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( reduction != at::Reduction::None); } else { - TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", - gradInput->sizes()); + AT_ERROR("non-empty vector or matrix expected, got size: ", gradInput->sizes()); } THCudaCheck(cudaGetLastError()); diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index 129413f0b7b2..f2df15054a4c 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -2,30 +2,6 @@ #define THC_GENERIC_FILE "THCUNN/generic/MultiMarginCriterion.cu" #else -static inline void THNN_(MultiMarginCriterion_shapeCheck)( - THCState *state, - THCTensor *input, THCTensor *target) { - int64_t nframe, dim; - int64_t ndims = input->dim(); - bool valid_inputs = (ndims == 2 && input->size(1) != 0) || (ndims == 1 && input->size(0) != 0) || ndims == 0; - if (ndims <= 1) { - nframe = 1; - dim = ndims == 0 ? 1 : input->size(0); - } else { - nframe = input->size(0); - dim = input->size(1); - } - - TORCH_CHECK( - valid_inputs, - "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", - input->sizes()); - TORCH_CHECK( - valid_inputs && target->dim() <= 1 && target->numel() == nframe, - "inconsistent target size, got: ", - target->sizes()); -} - // TODO: improve error messages void THNN_(MultiMarginCriterion_updateOutput)( THCState *state, @@ -37,10 +13,6 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor *weights, accreal margin_) { - THNN_(MultiMarginCriterion_shapeCheck)(state, input, target); - if (input->numel() == 0) { - return; - } scalar_t margin = ScalarConvert::to(margin_); THCUNN_assertSameGPU(state, 2, input, target); input = THCTensor_(newContiguous)(state, input); @@ -87,8 +59,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - // allow zero-dim target for 2D input. 
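For context, a minimal libtorch sketch of the 2-D input/target shapes the restored checks above expect; the loss options, sizes, and values here are illustrative assumptions, not part of the patch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // Illustrative 2-D case only: input is (nframe, dim); MultiMarginLoss wants a
  // 1-D target of nframe class indices, MultiLabelMarginLoss a (nframe, dim)
  // target of label indices padded with -1.
  const int64_t nframe = 3, dim = 5;
  auto input = torch::randn({nframe, dim});

  torch::nn::MultiMarginLoss multi_margin;
  auto class_target = torch::tensor({1, 0, 4});            // shape (nframe,), dtype long
  std::cout << multi_margin(input, class_target) << "\n";

  torch::nn::MultiLabelMarginLoss multilabel;
  auto label_target = torch::tensor({{1,  3, -1, -1, -1},  // shape (nframe, dim)
                                     {0, -1, -1, -1, -1},
                                     {2,  4, -1, -1, -1}});
  std::cout << multilabel(input, label_target) << "\n";
}
```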
- THArgCheck((input->size(1) != 0) && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -159,8 +130,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( } else { - TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", - input->sizes()); + AT_ERROR("non-empty vector or matrix expected, got sizes: ", input->sizes()); } THCTensor_(free)(state, input); @@ -179,17 +149,11 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor *weights, accreal margin_) { - THNN_(MultiMarginCriterion_shapeCheck)(state, input, target); - input = THCTensor_(newContiguous)(state, input); - THCTensor_(resizeAs)(state, gradInput, input); - if (input->numel() == 0) { - THCTensor_(free)(state, input); - return; - } scalar_t margin = ScalarConvert::to(margin_); THCUNN_assertSameGPU(state, 3, input, gradInput, target); + input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); - + THCTensor_(resizeAs)(state, gradInput, input); if(weights) weights = THCTensor_(newContiguous)(state, weights); @@ -231,7 +195,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck((input->size(1) != 0) && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -268,8 +232,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( } else { - TORCH_CHECK(false, "Expected 2D input with optional zero batch dim, or 1D input with non-zero dims, but got sizes: ", - input->sizes()); + AT_ERROR("non-empty vector or matrix expected, got ", input->sizes()); } THCTensor_(free)(state, input); diff --git a/test/test_nn.py b/test/test_nn.py index ef9ea4c8e6b1..386ba369dca6 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10864,35 +10864,6 @@ def test_ReflectionPad_empty(self, device): inp = torch.randn(3, 0, 10, 10, device=device) mod(inp) - - @onlyOnCPUAndCUDA - @dtypes(torch.float, torch.double) - def test_MarginLoss_empty(self, device, dtype): - for mod, x, y in [ - (torch.nn.MultiMarginLoss().to(device), - torch.randn(0, 10, requires_grad=True, device=device, dtype=dtype), - torch.ones(0, device=device).type(torch.long)), - (torch.nn.MultiLabelMarginLoss().to(device), - torch.randn(0, 10, requires_grad=True, device=device, dtype=dtype), - torch.ones(0, 10, device=device).type(torch.long))]: - - out = mod(x, y) - out.sum().backward() - - self.assertEqual(x, torch.zeros_like(x)) - self.assertEqual(x.grad, torch.zeros_like(x)) - - with self.assertRaisesRegex(RuntimeError, 'Expected'): - x = torch.randn(0, requires_grad=True, device=device, dtype=dtype) - y = torch.ones(10, device=device).type(torch.long) - mod(x, y) - - with self.assertRaisesRegex(RuntimeError, 'Expected'): - x = torch.randn(10, 0, requires_grad=True, device=device, dtype=dtype) - y = torch.ones(10, 0, device=device).type(torch.long) - mod(x, y) - - @onlyOnCPUAndCUDA def test_Unfold_empty(self, device): inp 
= torch.randn(0, 3, 3, 4, device=device) From 730965c246192c94c804e5ac4a95f175dca2fb18 Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Wed, 30 Dec 2020 20:32:51 -0800 Subject: [PATCH 18/89] Improve `torch.flatten` docs and add tests to test_view_ops (#49501) Summary: Addresses https://github.com/pytorch/pytorch/issues/39474 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49501 Reviewed By: mruberry Differential Revision: D25734450 Pulled By: soulitzer fbshipit-source-id: 993667dd07acd81a4616465e0a3b94bde449193e --- test/test_view_ops.py | 63 +++++++++++++++++++++++++++++++++++++++++++ torch/_torch_docs.py | 12 ++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 3a1411d1a167..76d168031a0c 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -100,6 +100,12 @@ def is_view_of(self, base, other): return True + # Returns true if v1 and v2 are views of the same base + def is_view_of_same_base(self, v1, v2): + if (not v1._is_view() or v1 is v2): + return False + return self.is_view_of(v1._base, v2) + # Performs transpose if contiguous=True, else returns the input tensor as is def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): if contiguous: @@ -457,6 +463,63 @@ def test_reshape_nonview(self, device): nv[6] = 0 self.assertNotEqual(t[1, 1], nv[6]) + def test_flatten_view(self, device): + def test_writes_propagate(t, v): + idx_t = (0,) * t.ndim + idx_v = (0,) * v.ndim + v[idx_v] = 0 + self.assertEqual(t[idx_t], v[idx_v]) + + t = torch.ones(1, 2, 3, 4, device=device) + v = t.flatten() + self.assertTrue(self.is_view_of(t, v)) + test_writes_propagate(t, v) + + # zero-dimensional tensor + t = torch.tensor(1, device=device) + v = t.flatten() + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of(t, v)) + + t = torch.ones(1, 2, 3, 4, device=device).transpose(2, 3) + v = t.flatten(0, 1) + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of_same_base(t, v)) + + # stride[i] = stride[i + 1] * size[i + 1] is satisfied for 3 groups: + t = torch.ones(720, device=device) \ + .as_strided((2, 3, 2, 3, 5, 4), (6, 2, 15, 5, 1, 0)) + # [--1--|---2---|-3-] [--1--|----2---|-3-] + v1 = t.flatten(0, 1) + v2 = v1.flatten(1, 3) + v3 = v2.flatten(2, 2) + test_writes_propagate(t, v1) + self.assertTrue(self.is_view_of_same_base(t, v1)) + test_writes_propagate(t, v2) + self.assertTrue(self.is_view_of_same_base(t, v2)) + test_writes_propagate(t, v3) + self.assertTrue(self.is_view_of_same_base(t, v3)) + + def test_flatten_nonview(self, device): + def assert_is_nonview(t, nv): + idx_t = (0,) * t.ndim + idx_nv = (0,) * nv.ndim + self.assertTrue(not nv._is_view()) + nv[idx_nv] = 0 + self.assertNotEqual(t[idx_t], nv[idx_nv]) + t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) + nv = t.flatten(1, 3) + assert_is_nonview(t, nv) + + t = torch.ones(2, 2, device=device).T + nv = t.flatten() + assert_is_nonview(t, nv) + + # flatten returns the original object if start_dim=end_dim + t = t = torch.ones(2, 2, device=device) + nv = t.flatten(1, 1) + self.assertTrue(t is nv) + def test_basic_indexing_slice_view(self, device): t = torch.ones(5, 5, device=device) v = t[:2, :3] diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index fe7237b5a370..4a1c36df7497 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3095,7 +3095,17 @@ def merge_dicts(*dicts): r""" flatten(input, start_dim=0, end_dim=-1) -> Tensor -Flattens a contiguous range of dims in a tensor. 
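As a rough sketch of the view-versus-copy behavior the new docs and tests above describe (illustrative libtorch code under those same assumptions, not taken from the patch):

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto t = torch::ones({2, 3, 4});
  auto v = t.flatten();                 // contiguous input: the result is a view
  v[0] = 0;                             // so writes propagate back to t
  std::cout << t[0][0][0].item<float>() << "\n";   // prints 0

  auto nt = torch::ones({2, 3, 4}).transpose(1, 2);
  auto nv = nt.flatten();               // cannot be expressed as a view: data is copied
  nv[0] = 0;
  std::cout << nt[0][0][0].item<float>() << "\n";  // prints 1, nt is untouched

  auto s = torch::ones({2, 2});
  std::cout << s.flatten(1, 1).is_same(s) << "\n"; // prints 1: nothing flattened, s itself returned
}
```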
+Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` +are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. +The order of elements in :attr:`input` is unchanged. + +Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, +or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can +be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the +flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + +.. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. Args: {input} From 4677fc69a27974a655115a31318c34e81dd48f3f Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Wed, 30 Dec 2020 21:11:55 -0800 Subject: [PATCH 19/89] Fix inf norm grad (reland) (#48611) Summary: Reland of https://github.com/pytorch/pytorch/issues/48122 Does this result in a regression? No significant regression observed. Timer script: ``` import torch from torch.utils.benchmark import Timer setup=""" a = torch.rand((2, 2), requires_grad=True) gradient = torch.ones(2) """ stmt=""" torch.autograd.grad(torch.norm(a, dim=(0,), keepdim=False), a, gradient) """ timer = Timer(stmt, setup) print(timer.timeit(10000)) print(timer.collect_callgrind(100)) ``` Note: small matrix, keepdim is False, and dims is non-empty Before change ``` Runtime 37.37 us 1 measurement, 10000 runs , 1 thread All Noisy symbols removed Instructions: 15279045 15141710 Baseline: 4257 3851 100 runs per measurement, 1 thread ``` After change ``` Runtime 36.08 us 1 measurement, 10000 runs , 1 thread All Noisy symbols removed Instructions: 15296974 15153534 Baseline: 4257 3851 100 runs per measurement, 1 thread ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/48611 Reviewed By: albanD, mruberry Differential Revision: D25309997 Pulled By: soulitzer fbshipit-source-id: 5fb950dc9259234342985c0e84ada25a7e3814d6 --- aten/src/ATen/native/cpu/ReduceOpsKernel.cpp | 2 +- aten/src/ATen/native/cuda/ReduceNormKernel.cu | 2 +- test/test_autograd.py | 14 +++++++ torch/csrc/autograd/FunctionsManual.cpp | 41 +++++++++---------- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 5f96e01ab319..32033abcd4e2 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -225,7 +225,7 @@ static void norm_kernel_tensor_iterator_impl( binary_kernel_reduce( iter, AbsMaxOps(), - std::numeric_limits::min() + acc_t(0) ); }); } else if (val == -INFINITY) { diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu index 3953f16b69c9..3a24f00f6ebf 100644 --- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu @@ -28,7 +28,7 @@ void norm_kernel_cuda_impl(TensorIterator& iter, Scalar val) { } else if (p == static_cast(2)) { gpu_reduce_kernel(iter, NormTwoOps(), 0); } else if (p == static_cast(INFINITY)) { - gpu_reduce_kernel(iter, AbsMaxOps(), std::numeric_limits::min()); + gpu_reduce_kernel(iter, AbsMaxOps(), 0); } else if (p == static_cast(-INFINITY)) { gpu_reduce_kernel(iter, AbsMinOps(), std::numeric_limits::max()); } else { diff --git a/test/test_autograd.py 
b/test/test_autograd.py index 2107bfb3eb15..34c38eefa342 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2971,6 +2971,20 @@ def run_test(input_size, norm_deg): run_test((10,), 3) run_test((10,), 1) run_test((10,), 1.5) + run_test((10,), inf) + + def test_norm_inf_subgradient(self): + def run_test(input, expected, dim=None): + x = torch.tensor(input, requires_grad=True) + out = x.norm(inf, dim=dim, keepdim=True) + out.backward(torch.ones(out.size())) + self.assertEqual(x.grad, expected) + + run_test([0., 0., 0.], [0., 0., 0.]) + run_test([1., 0., 1.], [0.5, 0., 0.5]) + run_test([[1., 0., 1.], [0., 1., 1.]], [[0.25, 0., 0.25], [0., 0.25, 0.25]]) + run_test([[1., 0., 1.], [0., 1., 0.]], [[0.5, 0., 0.5], [0., 1., 0.]], (1,)) + run_test(torch.ones((2, 2, 2)), torch.full((2, 2, 2), 0.25), (0, 2)) def test_pow_zero_tensor_gradient(self): def run_test(input_size, exponent): diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 23c0be2e70d6..3c84a0da4a99 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -160,10 +160,21 @@ std::tuple _euclidean_dist_backward(const Tensor & grad, const T x2 * ratio.sum(-2, false).unsqueeze(-1) - ratio.transpose(-2, -1).matmul(x1)}; } -Tensor norm_backward(const Tensor & grad, const Tensor & self, const optional & p_, const Tensor & norm) { +Tensor norm_backward(const Tensor& grad, const Tensor& self, const optional & p_, const Tensor& norm) { + return norm_backward(grad, self, p_, norm, {}, true); +} + +Tensor norm_backward(Tensor grad, const Tensor& self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { + size_t ndim = self.sizes().size(); double p = p_.value_or(2.0).toDouble(); Tensor self_scaled; Tensor scale_v; + + if (!keepdim && self.dim() != 0) { + grad = unsqueeze_multiple(grad, dim, ndim); + norm = unsqueeze_multiple(norm, dim, ndim); + } + if (p == 0.0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } else if (p == 1.0) { @@ -172,8 +183,13 @@ Tensor norm_backward(const Tensor & grad, const Tensor & self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { - IntArrayRef sizes = self.sizes(); - if (!keepdim && self.dim() != 0) { - if (dim.size()==1) { - grad = grad.unsqueeze(dim[0]); - norm = norm.unsqueeze(dim[0]); - } else { - auto dims_to_unsqueeze = at::dim_list_to_bitset(dim, sizes.size()); - for (size_t i = 0; i < sizes.size(); i++){ - if (dims_to_unsqueeze[i]) { - grad = grad.unsqueeze(i); - norm = norm.unsqueeze(i); - } - } - } - } - return norm_backward(grad, self, p_, norm); -} - Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent) { if (exponent.equal(0.0)) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); From de3d8f8c35c42d965ebfe8782c9c0ab4fc035bf3 Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Wed, 30 Dec 2020 22:03:27 -0800 Subject: [PATCH 20/89] Revert D25734450: [pytorch][PR] Improve `torch.flatten` docs and add tests to test_view_ops Test Plan: revert-hammer Differential Revision: D25734450 (https://github.com/pytorch/pytorch/commit/730965c246192c94c804e5ac4a95f175dca2fb18) Original commit changeset: 993667dd07ac fbshipit-source-id: 603af25311fc8b29bb033167f3b2704da79c3147 --- test/test_view_ops.py | 63 ------------------------------------------- torch/_torch_docs.py | 12 +-------- 2 files changed, 1 insertion(+), 74 deletions(-) diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 76d168031a0c..3a1411d1a167 100644 --- 
a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -100,12 +100,6 @@ def is_view_of(self, base, other): return True - # Returns true if v1 and v2 are views of the same base - def is_view_of_same_base(self, v1, v2): - if (not v1._is_view() or v1 is v2): - return False - return self.is_view_of(v1._base, v2) - # Performs transpose if contiguous=True, else returns the input tensor as is def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): if contiguous: @@ -463,63 +457,6 @@ def test_reshape_nonview(self, device): nv[6] = 0 self.assertNotEqual(t[1, 1], nv[6]) - def test_flatten_view(self, device): - def test_writes_propagate(t, v): - idx_t = (0,) * t.ndim - idx_v = (0,) * v.ndim - v[idx_v] = 0 - self.assertEqual(t[idx_t], v[idx_v]) - - t = torch.ones(1, 2, 3, 4, device=device) - v = t.flatten() - self.assertTrue(self.is_view_of(t, v)) - test_writes_propagate(t, v) - - # zero-dimensional tensor - t = torch.tensor(1, device=device) - v = t.flatten() - test_writes_propagate(t, v) - self.assertTrue(self.is_view_of(t, v)) - - t = torch.ones(1, 2, 3, 4, device=device).transpose(2, 3) - v = t.flatten(0, 1) - test_writes_propagate(t, v) - self.assertTrue(self.is_view_of_same_base(t, v)) - - # stride[i] = stride[i + 1] * size[i + 1] is satisfied for 3 groups: - t = torch.ones(720, device=device) \ - .as_strided((2, 3, 2, 3, 5, 4), (6, 2, 15, 5, 1, 0)) - # [--1--|---2---|-3-] [--1--|----2---|-3-] - v1 = t.flatten(0, 1) - v2 = v1.flatten(1, 3) - v3 = v2.flatten(2, 2) - test_writes_propagate(t, v1) - self.assertTrue(self.is_view_of_same_base(t, v1)) - test_writes_propagate(t, v2) - self.assertTrue(self.is_view_of_same_base(t, v2)) - test_writes_propagate(t, v3) - self.assertTrue(self.is_view_of_same_base(t, v3)) - - def test_flatten_nonview(self, device): - def assert_is_nonview(t, nv): - idx_t = (0,) * t.ndim - idx_nv = (0,) * nv.ndim - self.assertTrue(not nv._is_view()) - nv[idx_nv] = 0 - self.assertNotEqual(t[idx_t], nv[idx_nv]) - t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) - nv = t.flatten(1, 3) - assert_is_nonview(t, nv) - - t = torch.ones(2, 2, device=device).T - nv = t.flatten() - assert_is_nonview(t, nv) - - # flatten returns the original object if start_dim=end_dim - t = t = torch.ones(2, 2, device=device) - nv = t.flatten(1, 1) - self.assertTrue(t is nv) - def test_basic_indexing_slice_view(self, device): t = torch.ones(5, 5, device=device) v = t[:2, :3] diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 4a1c36df7497..fe7237b5a370 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3095,17 +3095,7 @@ def merge_dicts(*dicts): r""" flatten(input, start_dim=0, end_dim=-1) -> Tensor -Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` -are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. -The order of elements in :attr:`input` is unchanged. - -Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, -or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can -be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the -flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. - -.. note:: - Flattening a zero-dimensional tensor will return a one-dimensional view. +Flattens a contiguous range of dims in a tensor. 
Args: {input} From 749f8b78508c43f9e6331f2395a4202785068442 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 30 Dec 2020 23:46:53 -0800 Subject: [PATCH 21/89] Remove flops warnings from the default profiler use case (#49896) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49896 Add missing check for with_flops option set Test Plan: python test/test_profiler.py CI Reviewed By: xuzhao9, ngimel Differential Revision: D25716930 Pulled By: ilia-cher fbshipit-source-id: 0da0bbb6c1a52328f665237e503406f877b41449 --- torch/_C/_autograd.pyi | 3 ++- torch/autograd/profiler.py | 4 +++- torch/csrc/autograd/init.cpp | 2 +- torch/csrc/autograd/profiler_legacy.cpp | 6 ++++-- torch/csrc/autograd/profiler_legacy.h | 7 +++++-- torch/distributed/rpc/server_process_global_profiler.py | 1 + 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi index cfcb66896ad7..15a286f2370c 100644 --- a/torch/_C/_autograd.pyi +++ b/torch/_C/_autograd.pyi @@ -25,7 +25,8 @@ class ProfilerConfig: state: ProfilerState, report_input_shapes: bool, profile_memory: bool, - with_stack: bool + with_stack: bool, + with_flops: bool ) -> None: ... ... diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index a5c078e84f4c..a3d0da1aef9d 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -468,7 +468,8 @@ def config(self): self.profiler_kind, self.record_shapes, self.profile_memory, - self.with_stack) + self.with_stack, + self.with_flops) def __enter__(self): if not self.enabled: @@ -746,6 +747,7 @@ def __enter__(self): torch.autograd.ProfilerState.NVTX, self.record_shapes, False, + False, False) ) return self diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index ca419522dff8..d86073a7af79 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -52,7 +52,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("CUDA", ActivityType::CUDA); py::class_(m, "ProfilerConfig") - .def(py::init()); + .def(py::init()); py::class_(m, "ProfilerEvent") .def("kind", &LegacyEvent::kindStr) diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 3b1d254e985b..85272677a06b 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -226,8 +226,10 @@ void ProfilerThreadLocalState::pushRange( evt.setSequenceNr(fn.seqNr()); evt.setFwdThreadId(fn.forwardThreadId()); evt.setScope((uint8_t)fn.scope()); - evt.setExtraArgs(saveExtraArgs(fn)); - evt.setFlops(computeFlops(std::string(fn.name().str()), evt.extraArgs())); + if (config_.with_flops) { + evt.setExtraArgs(saveExtraArgs(fn)); + evt.setFlops(computeFlops(std::string(fn.name().str()), evt.extraArgs())); + } #ifndef C10_MOBILE // backward nodes source range corresponds to the forward node // TODO: consider using C++ stack trace diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index 3e07c8cb541b..23169cd33450 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -387,16 +387,19 @@ struct TORCH_API ProfilerConfig { ProfilerState state, bool report_input_shapes = false, bool profile_memory = false, - bool with_stack = false) + bool with_stack = false, + bool with_flops = false) : state(state), report_input_shapes(report_input_shapes), profile_memory(profile_memory), - with_stack(with_stack) {} + with_stack(with_stack), + 
with_flops(with_flops) {} ~ProfilerConfig() = default; ProfilerState state; bool report_input_shapes; bool profile_memory; bool with_stack; + bool with_flops; // Returns IValues corresponding to ProfilerConfig struct, to be used for // serialization. diff --git a/torch/distributed/rpc/server_process_global_profiler.py b/torch/distributed/rpc/server_process_global_profiler.py index 6cd7b168ec6a..d8de89bfc937 100644 --- a/torch/distributed/rpc/server_process_global_profiler.py +++ b/torch/distributed/rpc/server_process_global_profiler.py @@ -116,6 +116,7 @@ def __enter__(self): profiler_kind, self.record_shapes, self.profile_memory, + False, False) _enable_server_process_global_profiler(profiler_config) return self From 8aad66a7bd35782a0e662d133276830204499656 Mon Sep 17 00:00:00 2001 From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com> Date: Fri, 1 Jan 2021 02:09:35 -0800 Subject: [PATCH 22/89] [c10/**] Fix typos (#49815) Summary: All pretty minor. I avoided renaming `class DestructableMock` to `class DestructibleMock` and similar such symbol renames (in this PR). Pull Request resolved: https://github.com/pytorch/pytorch/pull/49815 Reviewed By: VitalyFedyunin Differential Revision: D25734507 Pulled By: mruberry fbshipit-source-id: bbe8874a99d047e9d9814bf92ea8c036a5c6a3fd --- c10/CMakeLists.txt | 2 +- c10/core/DispatchKeySet.h | 4 ++-- c10/core/MemoryFormat.h | 4 ++-- c10/core/Scalar.cpp | 2 +- c10/core/Stream.cpp | 2 +- c10/core/TensorImpl.h | 6 +++--- c10/core/impl/DeviceGuardImplInterface.h | 2 +- c10/cuda/CMakeLists.txt | 2 +- c10/cuda/CUDACachingAllocator.cpp | 2 +- c10/cuda/CUDAStream.cpp | 2 +- c10/cuda/CUDAStream.h | 2 +- c10/macros/Macros.h | 2 +- c10/mobile/CPUCachingAllocator.cpp | 2 +- c10/mobile/CPUCachingAllocator.h | 4 ++-- c10/mobile/CPUProfilingAllocator.cpp | 12 ++++++------ c10/test/util/bfloat16_test.cpp | 6 +++--- c10/test/util/intrusive_ptr_test.cpp | 14 +++++++------- c10/util/Bitset.h | 4 ++-- c10/util/Flags.h | 2 +- c10/util/Logging.h | 2 +- c10/util/SmallVector.h | 2 +- c10/util/TypeCast.h | 2 +- c10/util/complex.h | 2 +- c10/util/intrusive_ptr.h | 2 +- c10/util/typeid.cpp | 2 +- 25 files changed, 44 insertions(+), 44 deletions(-) diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 48bceb440954..b175e5bdd6ce 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -23,7 +23,7 @@ configure_file( ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h) # Note: if you want to add ANY dependency to the c10 library, make sure you -# check with the core PyTorch developers as the dependendency will be +# check with the core PyTorch developers as the dependency will be # transitively passed on to all libraries dependent on PyTorch. file(GLOB C10_SRCS *.cpp diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 486272ece92e..58d456b950ed 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -124,7 +124,7 @@ class DispatchKeySet final { public: // STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the // set. The iterator is only invalidated by the destruction of the underlying - // DispatchKeySet as the iterator stores a pointer to the raw represenation of + // DispatchKeySet as the iterator stores a pointer to the raw representation of // the DispatchKeySet. 
class iterator { public: @@ -235,7 +235,7 @@ C10_API DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t); C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); // This API exists because we have a use case for checking -// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefind) +// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) // in OperatorEntry.cpp but we disallow it in has() API. C10_API bool isIncludedInAlias(DispatchKey k, DispatchKey alias); diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h index e25814cd0717..6528f6c8f110 100644 --- a/c10/core/MemoryFormat.h +++ b/c10/core/MemoryFormat.h @@ -98,7 +98,7 @@ inline std::vector get_channels_last_strides_3d(IntArrayRef sizes) { // 1. Please do not combine these helper functions, each helper function handles // exactly one case of sizes + memory_format, by doing this, the strides indices // will be a constant array and we can access it using constant index number, -// the complier will fully unroll the loop on strides indices to gain a better +// the compiler will fully unroll the loop on strides indices to gain a better // performance. // 2. No error check in helper function, caller ensures the correctness of the input // 3. All helper functions have similar comments, only 1st helper function is commented here. @@ -205,7 +205,7 @@ inline bool is_channels_last_strides_3d_s5(const IntArrayRef sizes, const IntArr // a. we identify corner cases where the implementation compromises on. // // By the time accumulated permutation is enabled to replace implicit -// memory_foramt through strides, we should be updating our tests and fix the +// memory_format through strides, we should be updating our tests and fix the // issues in our tests. // // We use Channels Last 2d as an example above. diff --git a/c10/core/Scalar.cpp b/c10/core/Scalar.cpp index 212c41d5b19c..203b544924ec 100644 --- a/c10/core/Scalar.cpp +++ b/c10/core/Scalar.cpp @@ -3,7 +3,7 @@ namespace c10 { Scalar Scalar::operator-() const { - TORCH_CHECK(!isBoolean(), "torch boolean negative, the `-` operator, is not suppported."); + TORCH_CHECK(!isBoolean(), "torch boolean negative, the `-` operator, is not supported."); if (isFloatingPoint()) { return Scalar(-v.d); } else if (isComplex()) { diff --git a/c10/core/Stream.cpp b/c10/core/Stream.cpp index 9a5c838c73fe..1a56c9d68567 100644 --- a/c10/core/Stream.cpp +++ b/c10/core/Stream.cpp @@ -2,7 +2,7 @@ namespace c10 { -// Not very parseable, but I don't know a good compact syntax for streams. +// Not very parsable, but I don't know a good compact syntax for streams. // Feel free to change this into something more compact if needed. std::ostream& operator<<(std::ostream& stream, const Stream& s) { stream << "stream " << s.id() << " on device " << s.device(); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 3326404e1d07..e7f9c1260263 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -19,7 +19,7 @@ #include // A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to +// is shrunk to a smaller size. As a result, a Tensor is always going to // keep the memory allocated for its maximum capacity reshaped to so far. 
// // This parameter is respected "upper-case" methods which call Resize() @@ -625,7 +625,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * The API is as follows: * - "new_grad" is a Tensor containing the new value of the gradient that should * be set - * - "self" should reprensent the Tensor whose forward grad is accessed. It is + * - "self" should represent the Tensor whose forward grad is accessed. It is * required when dealing with view. * - "level" allows to specify the level of forward AD nesting for which the * gradient should be set. Note that since levels are not fully supported @@ -1381,7 +1381,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // error in attempt to invoke TypeMeta::ctor() static_assert( std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); + "Tensor can't hold non-default-constructable types"); return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); } diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index 2ef02b57d3be..258f8953f4de 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -126,7 +126,7 @@ struct C10_API DeviceGuardImplInterface { /** * Increments the event's version and enqueues a job with this version * in the stream's work queue. When the stream process that job - * it nofifies all streams waiting on / blocked by that version of the + * it notifies all streams waiting on / blocked by that version of the * event to continue and marks that version as recorded. * */ virtual void record( diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt index c8fa53df6f02..256fc54b08a1 100644 --- a/c10/cuda/CMakeLists.txt +++ b/c10/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ configure_file( ${CMAKE_BINARY_DIR}/c10/cuda/impl/cuda_cmake_macros.h) # Note: if you want to add ANY dependency to the c10 library, make sure you -# check with the core PyTorch developers as the dependendency will be +# check with the core PyTorch developers as the dependency will be # transitively passed on to all libraries dependent on PyTorch. # Note: if you add a new source file/header, you will need to update diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 0b5d2992538c..493296248e5b 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -62,7 +62,7 @@ constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 M constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer -constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB typedef std::bitset(StatType::NUM_TYPES)> StatTypes; diff --git a/c10/cuda/CUDAStream.cpp b/c10/cuda/CUDAStream.cpp index 457331f4a00d..d1e290c3f02c 100644 --- a/c10/cuda/CUDAStream.cpp +++ b/c10/cuda/CUDAStream.cpp @@ -60,7 +60,7 @@ static LeakyStreamInternals default_streams[C10_COMPILE_TIME_MAX_GPUS]; // in the pool to be returned when a stream is requested (round-robin fashion // , see the note in CUDAStream.h). 
// -// unique_ptr is used instead of vector because T might be non-moveable +// unique_ptr is used instead of vector because T might be non-movable // and non-copyable. static std::once_flag device_flags[C10_COMPILE_TIME_MAX_GPUS]; static std::atomic low_priority_counters[C10_COMPILE_TIME_MAX_GPUS]; diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index 41802b3bc9ef..05eddf5ce122 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -152,7 +152,7 @@ class C10_CUDA_API CUDAStream { static std::tuple priority_range() { // Note: this returns the range of priority **supported by PyTorch**, not // the range of priority **supported by CUDA**. The former is a subset of - // the latter. Curently PyTorch only supports 0 and -1, which are "low" and + // the latter. Currently PyTorch only supports 0 and -1, which are "low" and // "high" priority. int least_priority, greatest_priority; C10_CUDA_CHECK( diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 46ff50621417..5499a7d8b81c 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -316,7 +316,7 @@ __host__ __device__ #define C10_MOBILE 1 #endif // ANDROID / IOS -// Portably determine if a type T is trivially copyable or not. +// Portable determination of whether type T is trivially copyable. // Warning: __has_trivial_copy for GCC may not always detect the non-POD // correctly. For example, T = std::unique_ptr may evaluate to true and be // treated as POD. This can cause unexpected behavior. diff --git a/c10/mobile/CPUCachingAllocator.cpp b/c10/mobile/CPUCachingAllocator.cpp index bde4067d45dc..0114856ca89b 100644 --- a/c10/mobile/CPUCachingAllocator.cpp +++ b/c10/mobile/CPUCachingAllocator.cpp @@ -61,7 +61,7 @@ void CPUCachingAllocator::record_free(void* ptr) { // is being freed outside the scope of this allocator. // At the moment only way to capture this is to have the allocator, // that uses this CachingAllocator as the backing allocator, - // call this function explicity upon freeing memory while + // call this function explicitly upon freeing memory while // outside the scope of caching allocator. // If the memory is freed in some other way, then we will likely // have undefined behavior or page fault. But this can be diff --git a/c10/mobile/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h index 2f11e6ea8669..c80fee0682eb 100644 --- a/c10/mobile/CPUCachingAllocator.h +++ b/c10/mobile/CPUCachingAllocator.h @@ -26,7 +26,7 @@ * What are the cons? * There are some cons that were observed where use of caching allocator led to * worse performance on some platforms. Reason being that the caching mechanism - * used by this allocator left us worse off compared to the corresonding platform's + * used by this allocator left us worse off compared to the corresponding platform's * tuned memory allocator. In that case it seemed better to not use this allocator. * Note there are some ideas to fix this in the works. * @@ -63,7 +63,7 @@ class C10_API CPUCachingAllocator { // returned the memory to OS via free_cached. // 1.1. Therefore even when the said memory is "freed" via this // allocator (and thus cached), it will continue to stay - // in allocaiton_map_. Furthermore it will also exist in + // in allocation_map_. Furthermore it will also exist in // available_map_. Thus an allocated memory pointer can be in both // allocation_map_ and available_map_ simultaneously. // 2. 
Memory pointer maybe removed from allocation_map_, when it diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp index 5f2b28b4b2d0..0118d0a29587 100644 --- a/c10/mobile/CPUProfilingAllocator.cpp +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -133,7 +133,7 @@ std::vector formulate_greedy_allocation_plan( ska::flat_hash_map::iterator> free_end_offset_to_size_iter; // Upon free end_ptr = offset + size // If end_ptr exists merge freed allocation - // Also find coresponding offset in size_to_offet + // Also find corresponding offset in size_to_offset // Remove that entry and update with new size and offset // If end_ptr does not exist then just insert offset,size // in map and correspondingly size, offset in the other map. @@ -176,7 +176,7 @@ std::vector formulate_greedy_allocation_plan( } allocation_offsets[mem_event.allocation_id] = alloc_offset; } else { - // 1. Check if freed block is adjancent to an existing free block + // 1. Check if freed block is adjacent to an existing free block // at its end boundary. This is done by checking // free_end_offset_to_size_iter. // If we find such a block, remove it and adjust size of @@ -186,7 +186,7 @@ std::vector formulate_greedy_allocation_plan( // free_start_offset_to_size_iter. // If we find such a block, remove it and adjust size of // the block being freed. - // 3. Inser the freed block in map. + // 3. Insert the freed block in map. auto freed_offset = allocation_offsets[mem_event.allocation_id]; auto freed_size = mem_event.size; auto end_offset = freed_offset + freed_size; @@ -223,7 +223,7 @@ std::vector formulate_greedy_allocation_plan( } } TORCH_CHECK(validate_allocation_plan(mem_events, allocation_offsets), - "ProfilingAllocator: Allocation plan invaild."); + "ProfilingAllocator: Allocation plan invalid."); return allocation_offsets; } @@ -394,7 +394,7 @@ CPUProfilingAllocator::~CPUProfilingAllocator() { WithProfileAllocationsGuard::WithProfileAllocationsGuard( AllocationPlan* plan) { - // Nesting of allocation profiling does not seem meanigful. + // Nesting of allocation profiling does not seem meaningful. TORCH_CHECK(allocation_planner == nullptr, "Nesting profiling allocations is not supported."); planner_ = std::make_unique(plan); @@ -409,7 +409,7 @@ WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( AllocationPlan* plan, bool* success) { - // Nesting of allocation profiling does not seem meanigful. + // Nesting of allocation profiling does not seem meaningful. TORCH_CHECK(allocation_planner == nullptr, "Nesting profiling allocations is not supported."); planner_ = std::make_unique(plan, true); diff --git a/c10/test/util/bfloat16_test.cpp b/c10/test/util/bfloat16_test.cpp index d08f512053ab..af00bab99c5b 100644 --- a/c10/test/util/bfloat16_test.cpp +++ b/c10/test/util/bfloat16_test.cpp @@ -87,7 +87,7 @@ namespace { } TEST(BFloat16Math, Addition) { - // This test verifies that if only first 7 bits of float's mantisa are + // This test verifies that if only first 7 bits of float's mantissa are // changed after addition, we should have no loss in precision. // input bits @@ -108,8 +108,8 @@ namespace { EXPECT_EQ(res, expected); } - TEST(BFloat16Math, Substraction) { - // This test verifies that if only first 7 bits of float's mantisa are + TEST(BFloat16Math, Subtraction) { + // This test verifies that if only first 7 bits of float's mantissa are // changed after subtraction, we should have no loss in precision. 
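A small illustrative sketch of the precision property this test comment describes, assuming a libtorch build with bfloat16 support; the values are examples only:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // bfloat16 keeps only 7 explicit mantissa bits, so a result that fits in those
  // leading bits survives addition/subtraction exactly, while a much smaller
  // contribution rounds away.
  auto one  = torch::tensor(1.0f, torch::kBFloat16);
  auto half = torch::tensor(0.5f, torch::kBFloat16);
  std::cout << (one + half).item<float>() << "\n";  // 1.5: exactly representable
  auto tiny = torch::tensor(0.001f, torch::kBFloat16);
  std::cout << (one + tiny).item<float>() << "\n";  // 1: spacing near 1.0 is 2^-7, the addend is lost
}
```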
// input bits diff --git a/c10/test/util/intrusive_ptr_test.cpp b/c10/test/util/intrusive_ptr_test.cpp index 2ea283d1a4f0..9df5b004a094 100644 --- a/c10/test/util/intrusive_ptr_test.cpp +++ b/c10/test/util/intrusive_ptr_test.cpp @@ -694,21 +694,21 @@ TEST(IntrusivePtrTest, Equality_Nullptr) { EXPECT_FALSE(var1 != var2); } -TEST(IntrusivePtrTest, Nonequality) { +TEST(IntrusivePtrTest, Inequality) { intrusive_ptr var1 = make_intrusive(); intrusive_ptr var2 = make_intrusive(); EXPECT_TRUE(var1 != var2); EXPECT_FALSE(var1 == var2); } -TEST(IntrusivePtrTest, Nonequality_NullptrLeft) { +TEST(IntrusivePtrTest, Inequality_NullptrLeft) { intrusive_ptr var1; intrusive_ptr var2 = make_intrusive(); EXPECT_TRUE(var1 != var2); EXPECT_FALSE(var1 == var2); } -TEST(IntrusivePtrTest, Nonequality_NullptrRight) { +TEST(IntrusivePtrTest, Inequality_NullptrRight) { intrusive_ptr var1 = make_intrusive(); intrusive_ptr var2; EXPECT_TRUE(var1 != var2); @@ -2487,28 +2487,28 @@ TEST(WeakIntrusivePtrTest, Equality_Invalid) { EXPECT_FALSE(var1 != var2); } -TEST(WeakIntrusivePtrTest, Nonequality) { +TEST(WeakIntrusivePtrTest, Inequality) { IntrusiveAndWeak var1 = make_intrusive(); IntrusiveAndWeak var2 = make_intrusive(); EXPECT_TRUE(var1.weak != var2.weak); EXPECT_FALSE(var1.weak == var2.weak); } -TEST(WeakIntrusivePtrTest, Nonequality_InvalidLeft) { +TEST(WeakIntrusivePtrTest, Inequality_InvalidLeft) { weak_intrusive_ptr var1 = make_invalid_weak(); IntrusiveAndWeak var2 = make_intrusive(); EXPECT_TRUE(var1 != var2.weak); EXPECT_FALSE(var1 == var2.weak); } -TEST(WeakIntrusivePtrTest, Nonequality_InvalidRight) { +TEST(WeakIntrusivePtrTest, Inequality_InvalidRight) { IntrusiveAndWeak var1 = make_intrusive(); weak_intrusive_ptr var2 = make_invalid_weak(); EXPECT_TRUE(var1.weak != var2); EXPECT_FALSE(var1.weak == var2); } -TEST(WeakIntrusivePtrTest, Nonequality_WeakOnly) { +TEST(WeakIntrusivePtrTest, Inequality_WeakOnly) { weak_intrusive_ptr var1 = make_weak_only(); weak_intrusive_ptr var2 = make_weak_only(); EXPECT_TRUE(var1 != var2); diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h index e849563e60fe..964146be05e7 100644 --- a/c10/util/Bitset.h +++ b/c10/util/Bitset.h @@ -64,7 +64,7 @@ struct bitset final { bitset cur = *this; size_t index = cur.find_first_set(); while (0 != index) { - // -1 because find_first_set() is not one-indiced. + // -1 because find_first_set() is not one-indexed. index -= 1; func(index); cur.unset(index); @@ -73,7 +73,7 @@ struct bitset final { } private: - // Return the index of the first set bit. The returned index is one-indiced + // Return the index of the first set bit. The returned index is one-indexed // (i.e. if the very first bit is set, this function returns '1'), and a return // of '0' means that there was no bit set. size_t find_first_set() const { diff --git a/c10/util/Flags.h b/c10/util/Flags.h index 6bfe62507fcd..b4352510c997 100644 --- a/c10/util/Flags.h +++ b/c10/util/Flags.h @@ -4,7 +4,7 @@ /* Commandline flags support for C10. * * This is a portable commandline flags tool for c10, so we can optionally - * choose to use gflags or a lightweighted custom implementation if gflags is + * choose to use gflags or a lightweight custom implementation if gflags is * not possible on a certain platform. If you have gflags installed, set the * macro C10_USE_GFLAGS will seamlessly route everything to gflags. 
* diff --git a/c10/util/Logging.h b/c10/util/Logging.h index acab3cfecd23..6fa7e93f26d8 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -284,7 +284,7 @@ BINARY_COMP_HELPER(LessEquals, <=) * Very lightweight logging for the first time API usage. It's beneficial for * tracking of individual functionality usage in larger applications. * - * In order to ensure light-weightness of logging, we utilize static variable + * In order to ensure light-weightedness of logging, we utilize static variable * trick - LogAPIUsage will be invoked only once and further invocations will * just do an atomic check. * diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index 076a1d401065..9b32d8edfe7f 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -832,7 +832,7 @@ SmallVectorImpl& SmallVectorImpl::operator=( // If we have to grow to have enough elements, destroy the current elements. // This allows us to avoid copying them during the grow. - // FIXME: don't do this if they're efficiently moveable. + // FIXME: don't do this if they're efficiently movable. if (this->capacity() < RHSSize) { // Destroy current elements. this->destroy_range(this->begin(), this->end()); diff --git a/c10/util/TypeCast.h b/c10/util/TypeCast.h index df15509d7e0f..85513ecc5e2f 100644 --- a/c10/util/TypeCast.h +++ b/c10/util/TypeCast.h @@ -44,7 +44,7 @@ struct static_cast_with_inter_type { // Note: Converting from negative float values to unsigned integer types is // undefined behavior in C++, and current CPU and GPU compilers exhibit // divergent behavior. Casting from negative float values to signed -// integer types and then to unsigned integer types is not undefiend, +// integer types and then to unsigned integer types is not undefined, // however, so this cast improves the consistency of type conversions // to uint8 across compilers. // Further note: Type conversions across compilers still have other undefined diff --git a/c10/util/complex.h b/c10/util/complex.h index 2578da2957ab..d4d5525170af 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -61,7 +61,7 @@ namespace c10 { // Since we only support float and double, on will use `complex& operator=(T x)` // - Copy assignment operator and converting assignment operator // - There is no specialization of converting assignment operators, which type is -// convertible is soly depend on whether the scalar type is convertable +// convertible is solely dependent on whether the scalar type is convertible // // In addition to the standard assignment, we also provide assignment operators with std and thrust // diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 761dd27d6d46..637db95991f2 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -700,7 +700,7 @@ class weak_intrusive_ptr final { /** * Takes an owning (but must be weakly referenced) pointer to TTarget* and * creates a weak_intrusive_ptr that takes over ownership. - * Thas means the weakcount is not increased. + * This means that the weakcount is not increased. * This is the counter-part to weak_intrusive_ptr::release() and the pointer * passed in *must* have been created using weak_intrusive_ptr::release(). 
*/ diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index f3fe048b4cca..79c093cbeb31 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -60,7 +60,7 @@ CAFFE_KNOWN_TYPE(bool*) CAFFE_KNOWN_TYPE(char*) CAFFE_KNOWN_TYPE(int*) -// For some of the compilers, long is definied separately from int32_t and +// For some of the compilers, long is defined separately from int32_t and // int64_t. As a result we will need to actually define them separately. // It is recommended that one does NOT use long - use int32_t and int64_t // explicitly. Explicit long type annotation may go away in the future. From e44b2b72bd4ccecf9c2f6c18d09c11eff446b5a3 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Sun, 3 Jan 2021 00:10:04 -0800 Subject: [PATCH 23/89] Back out "[pytorch][PR] Preserve memory format in qconv op" (#49994) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49994 Revert preserving memory format in qconv op because it is negatively affecting performance, will revert revert after fixing all issues Test Plan: pytest fbcode/caffe2/test/quantization/test_quantized_op.py Reviewed By: kimishpatel Differential Revision: D25731279 fbshipit-source-id: 908dbb127210a93b27ada7ccdfa531177edf679a --- aten/src/ATen/native/quantized/cpu/qconv.cpp | 2 +- test/quantization/test_quantized_op.py | 32 ++++++-------------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index b7d893ad55fc..05762bfb036f 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -746,7 +746,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( run_status == pytorch_qnnp_status_success, "failed to run quantized::conv2d (qnnpack) operator"); - return output.contiguous(act.suggest_memory_format()); + return output; } template at::Tensor PackedConvWeightsQnnp<2>::apply( diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index be044fa5211a..a192eddca234 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -3427,7 +3427,7 @@ def _make_qconv_tensors( self, batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, - use_bias, use_channelwise, use_transpose, memory_format=torch.contiguous_format + use_bias, use_channelwise, use_transpose ): assert not (use_channelwise and use_transpose), \ "Cannot generate channelwise qconv_transpose_tensors " @@ -3475,7 +3475,6 @@ def _make_qconv_tensors( (batch_size, input_channels,) + input_feature_map_shape, ) X = X_scale * (X_init - X_zero_point).float() - X = X.to(memory_format=memory_format) if use_channelwise: W_shape = (-1, 1) + (1,) * len(kernels) @@ -3508,15 +3507,13 @@ def _test_qconv_impl( input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, o_pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, - Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose, - memory_format=torch.contiguous_format + Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose ): (X, W), (X_q, W_q), bias_float = self._make_qconv_tensors( batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, - W_zero_point, use_bias, 
use_channelwise, use_transpose, - memory_format) + W_zero_point, use_bias, use_channelwise, use_transpose) # Assign weights W = W_q.dequantize() X = X_q.dequantize() @@ -3564,14 +3561,6 @@ def _test_qconv_impl( pads: {pads}, o_pads: {o_pads}, dilations: {dilations}, groups: {groups}, y_s: {Y_scale}, y_zp: {Y_zero_point}''') - # fbgemm for now forces output to be NHWC (channels last) to opportunistically - # improve performance - if torch.backends.quantized.engine == 'qnnpack': - # Make sure memory format is preserved - self.assertEqual( - X_q.is_contiguous(memory_format=memory_format), - Y_q.is_contiguous(memory_format=memory_format)) - # Return the quantized data for later reuse return X_q, W_q, bias_float @@ -3644,14 +3633,12 @@ def test_qconv2d( dilations, groups, ) - for memory_format in (torch.contiguous_format, torch.channels_last): - self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, None, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, - memory_format) + self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, None, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) """Tests the correctness of quantized convolution op.""" @given(batch_size=st.integers(1, 3), @@ -4244,7 +4231,6 @@ def test_qconv3d_unpack( (stride_d, stride_h, stride_w), (pad_d, pad_h, pad_w), (o_pad, o_pad, o_pad), channelwise) - class TestPadding(TestCase): @given(batch_size=st.integers(1, 64), channels=st.integers(1, 64), From c7e9abb66abef127f8cebccbe0aa27c6ded9ead6 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 4 Jan 2021 05:01:02 -0800 Subject: [PATCH 24/89] Making ops c10-full: list of optional tensors (#49138) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49138 See for details: https://fb.quip.com/QRtJAin66lPN We need to model optional types explicitly, mostly for schema inference. So we cannot pass a `Tensor?[]` as `ArrayRef`, instead we need to pass it as an optional type. This PR changes it to `torch::List>`. It also makes the ops c10-full that were blocked by this. ## Backwards Compatibility - This should not break the Python API because the representation in Python is the same and python_arg_parser just transforms the python list into a `List>` instead of into a `List`. - This should not break serialized models because there's some logic that allows loading a serialized `List` as `List>`, see https://github.com/pytorch/pytorch/pull/49138/files#diff-9315f5dd045f47114c677174dcaa2f982721233eee1aa19068a42ff3ef775315R57 - This will break backwards compatibility for the C++ API. There is no implicit conversion from `ArrayRef` (which was the old argument type) to `List>`. One common call pattern is `tensor.index({indices_tensor})`, where indices_tensor is another `Tensor`, and that will continue working because the `{}` initializer_list constructor for `List>` can take `Tensor` elements that are implicitly converted to `optional`, but another common call pattern was `tensor.index(indices_tensor)`, where previously, the `Tensor` got implicitly converted to an `ArrayRef`, and to implicitly convert `Tensor -> optional -> List>` would be two implicit conversions. 
C++ doesn't allow chaining. two implicit conversions. So those call sites have to be rewritten to `tensor.index({indices_tensor})`. ghstack-source-id: 119269131 Test Plan: ## Benchmarks (C++ instruction counts): ### Forward #### Script ```py from torch.utils.benchmark import Timer counts = Timer( stmt=""" auto t = {{op call to measure}}; """, setup=""" using namespace torch::indexing; auto x = torch::ones({4, 4, 4}); """, language="cpp", ).collect_callgrind(number=1_000) print(counts) ``` #### Results | Op call |before |after |delta | | |------------------------------------------------------------------------|---------|--------|-------|------| |x[0] = 1 |11566015 |11566015|0 |0.00% | |x.index({0}) |6807019 |6801019 |-6000 |-0.09%| |x.index({0, 0}) |13529019 |13557019|28000 |0.21% | |x.index({0, 0, 0}) |10677004 |10692004|15000 |0.14% | |x.index({"..."}) |5512015 |5506015 |-6000 |-0.11%| |x.index({Slice(None, None, None)}) |6866016 |6936016 |70000 |1.02% | |x.index({None}) |8554015 |8548015 |-6000 |-0.07%| |x.index({false}) |22400000 |22744000|344000 |1.54% | |x.index({true}) |27624088 |27264393|-359695|-1.30%| |x.index({"...", 0, true, Slice(1, None, 2), torch::tensor({1, 2})})|123472000|123463306|-8694|-0.01%| ### Autograd #### Script ```py from torch.utils.benchmark import Timer counts = Timer( stmt=""" auto t = {{op call to measure}}; """, setup=""" using namespace torch::indexing; auto x = torch::ones({4, 4, 4}, torch::requires_grad()); """, language="cpp", ).collect_callgrind(number=1_000) print(counts) ``` Note: the script measures the **forward** path of an op call with autograd enabled (i.e. calls into VariableType). It does not measure the backward path. #### Results | Op call |before |after |delta | | |------------------------------------------------------------------------|---------|--------|-------|------| |x.index({0}) |14839019|14833019|-6000| 0.00% | |x.index({0, 0}) |28342019|28370019|28000| 0.00% | |x.index({0, 0, 0}) |24434004|24449004|15000| 0.00% | |x.index({"..."}) |12773015|12767015|-6000| 0.00% | |x.index({Slice(None, None, None)}) |14837016|14907016|70000| 0.47% | |x.index({None}) |15926015|15920015|-6000| 0.00% | |x.index({false}) |36958000|37477000|519000| 1.40% | |x.index({true}) |41971408|42426094|454686| 1.08% | |x.index({"...", 0, true, Slice(1, None, 2), torch::tensor({1, 2})}) |168184392|164545682|-3638710| -2.16% | Reviewed By: bhosmer Differential Revision: D25454632 fbshipit-source-id: 28ab0cffbbdbdff1c40b4130ca62ee72f981b76d --- aten/src/ATen/ATen.h | 1 + aten/src/ATen/ParallelOpenMP.cpp | 1 + aten/src/ATen/TensorIndexing.h | 11 +- aten/src/ATen/autocast_mode.cpp | 2 +- aten/src/ATen/core/List.h | 2 +- aten/src/ATen/core/List_inl.h | 16 +- aten/src/ATen/core/Variadic.h | 10 + aten/src/ATen/core/jit_type.h | 187 +---------------- aten/src/ATen/core/jit_type_base.h | 195 ++++++++++++++++++ aten/src/ATen/native/Embedding.cpp | 2 +- aten/src/ATen/native/IndexingUtils.h | 63 +++--- aten/src/ATen/native/LinearAlgebra.cpp | 4 +- .../ATen/native/TensorAdvancedIndexing.cpp | 26 ++- aten/src/ATen/native/TensorAdvancedIndexing.h | 4 +- aten/src/ATen/native/cuda/IndexKernel.cu | 2 +- aten/src/ATen/native/cuda/Indexing.cu | 4 +- aten/src/ATen/native/native_functions.yaml | 4 + aten/src/ATen/native/sparse/SparseTensor.cpp | 4 +- aten/src/ATen/templates/TensorBody.h | 1 + caffe2/contrib/aten/aten_op.cc | 16 +- caffe2/contrib/aten/aten_op_template.h | 12 +- caffe2/contrib/aten/gen_op.py | 18 +- test/cpp/api/tensor_indexing.cpp | 16 +- test/test_overrides.py | 2 + 
tools/autograd/gen_autograd_functions.py | 11 +- tools/autograd/gen_trace_type.py | 3 +- tools/autograd/gen_variable_type.py | 51 ++++- tools/autograd/templates/Functions.h | 9 + tools/autograd/templates/VariableType.h | 1 - tools/codegen/api/cpp.py | 8 +- tools/codegen/api/native.py | 2 +- tools/codegen/api/python.py | 11 +- torch/csrc/autograd/FunctionsManual.cpp | 35 ++-- torch/csrc/autograd/FunctionsManual.h | 2 +- torch/csrc/autograd/VariableTypeManual.cpp | 11 +- torch/csrc/autograd/VariableTypeUtils.h | 19 ++ torch/csrc/jit/backends/backend_detail.h | 1 + torch/csrc/jit/frontend/tracer.cpp | 13 ++ torch/csrc/jit/frontend/tracer.h | 4 + torch/csrc/jit/mobile/module.h | 1 + torch/csrc/jit/runtime/interpreter.h | 1 + torch/csrc/jit/runtime/register_prim_ops.cpp | 8 +- torch/csrc/jit/runtime/vararg_functions.h | 1 + torch/csrc/utils/python_arg_parser.cpp | 3 +- torch/csrc/utils/python_arg_parser.h | 17 ++ 45 files changed, 510 insertions(+), 305 deletions(-) create mode 100644 aten/src/ATen/core/jit_type_base.h diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index ae95ef43f21c..8d29a9204420 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -31,3 +31,4 @@ #include #include #include +#include diff --git a/aten/src/ATen/ParallelOpenMP.cpp b/aten/src/ATen/ParallelOpenMP.cpp index 07fc4e279557..261f6cdd46b5 100644 --- a/aten/src/ATen/ParallelOpenMP.cpp +++ b/aten/src/ATen/ParallelOpenMP.cpp @@ -1,4 +1,5 @@ #include +#include #if AT_PARALLEL_OPENMP #include diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 3890662123a2..f6c3bbbe09cc 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -10,6 +10,8 @@ // There is some back story, see https://github.com/pytorch/pytorch/issues/48684 #include +#include + namespace at { namespace indexing { @@ -261,14 +263,15 @@ static inline void recordTensorIndex(const Tensor& tensor, std::vector& (*dim_ptr)++; }; -static inline std::vector typeConvertIndices(const Tensor& self, std::vector&& indices) { - std::vector converted_inds(indices.size()); +static inline c10::List> typeConvertIndices(const Tensor& self, std::vector&& indices) { + c10::List> converted_inds; + converted_inds.reserve(indices.size()); for (size_t i = 0; i < indices.size(); ++i) { const auto &ind = indices[i]; if (ind.defined()) { - converted_inds[i] = ind.to(ind.options().device(self.device())); + converted_inds.push_back(ind.to(ind.options().device(self.device()))); } else { - converted_inds[i] = std::move(indices[i]); + converted_inds.push_back(std::move(indices[i])); } } return converted_inds; diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 8c82f965ef0f..dfb8e3ac0f32 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -406,7 +406,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional), promote) KERNEL(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote) - KERNEL_UNBOXED_ONLY(ADD_NS(index_put), "index_put", Tensor (const Tensor &, TensorList, const Tensor &, bool), promote) + KERNEL(ADD_NS(index_put), "index_put", Tensor (const Tensor &, const torch::List>&, const Tensor &, bool), promote) KERNEL(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote) KERNEL(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), 
promote) diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index 40f733784fe5..f911722c51e1 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -243,7 +243,7 @@ class List final { * Example: * List a({2, 3, 4}); */ - explicit List(std::initializer_list initial_values); + List(std::initializer_list initial_values); explicit List(ArrayRef initial_values); /** diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index 3cbd7a310275..ab3ddae55770 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include namespace c10 { @@ -50,7 +50,17 @@ List::List(TypePtr elementType) namespace impl { template List toTypedList(impl::GenericList list) { - TORCH_INTERNAL_ASSERT(*getTypePtr() == *list.impl_->elementType, "Tried to cast a List<", toString(list.impl_->elementType), "> to a List<", toString(getTypePtr()), ">. Types mismatch."); + // If there's other instances of the list (i.e. list.use_count() > 1), then we have to be invariant + // because upcasting would allow people to add types into the new list that would break the old list. + // However, if there aren't any other instances of this list (i.e. list.use_count() == 1), then we can + // allow upcasting. This can be a perf improvement since we can cast List to List> + // without having to copy it. This is also used to provide backwards compatibility with some old models + // that serialized the index arguments to aten::index, aten::index_put, aten::index_put_ and aten::index_put_impl_ + // as List before we changed that argument to be List>. When deserializing, we + // have list.use_count() == 1 and can deserialize the List directly as List>. + TORCH_CHECK(*list.impl_->elementType == *getTypePtr() + || (list.use_count() == 1 && list.impl_->elementType->isSubtypeOf(getTypePtr())) + , "Tried to cast a List<", toString(list.impl_->elementType), "> to a List<", toString(getTypePtr()), ">. Types mismatch."); return List(std::move(list.impl_)); } @@ -312,3 +322,5 @@ void List::unsafeSetElementType(TypePtr t) { impl_->elementType = std::move(t); } } + +#include diff --git a/aten/src/ATen/core/Variadic.h b/aten/src/ATen/core/Variadic.h index b49d94bba1c8..d33f3d575177 100644 --- a/aten/src/ATen/core/Variadic.h +++ b/aten/src/ATen/core/Variadic.h @@ -6,6 +6,7 @@ #include #include +#include namespace at { @@ -56,6 +57,15 @@ struct IterArgs { } } + template + void operator()(const torch::List& args) { + for (const auto& arg : args) { + self()(arg); + if (self().short_circuit()) + return; + } + } + // NB: we need to specify std::vector manually as C++ won't // do an implicit conversion to make a template deduction go through. 
template diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index f6902cd4beb6..a3ae813616e0 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1,10 +1,11 @@ #pragma once +#include #include #include #include -#include #include +#include #include #include @@ -17,197 +18,17 @@ struct ClassType; namespace torch { namespace jit { struct CompilationUnit; +struct Function; } // namespace jit } // namespace torch namespace c10 { +struct IValue; struct FunctionSchema; struct NamedType; using OptNameList = c10::optional>; -#define C10_FORALL_TYPES(_) \ - _(AnyType) \ - _(EnumType) \ - _(AnyEnumType) \ - _(TensorType) \ - _(StorageType) \ - _(TupleType) \ - _(ListType) \ - _(DictType) \ - _(NumberType) \ - _(FloatType) \ - _(FutureType) \ - _(RRefType) \ - _(IntType) \ - _(NoneType) \ - _(StringType) \ - _(GeneratorType) \ - _(QuantizerType) \ - _(BoolType) \ - _(OptionalType) \ - _(VarType) \ - _(DeviceObjType) \ - _(StreamObjType) \ - _(FunctionType) \ - _(ClassType) \ - _(PyObjectType) \ - _(CapsuleType) \ - _(InterfaceType) \ - _(QSchemeType) \ - _(LayoutType) \ - _(ScalarTypeType) \ - _(AnyListType) \ - _(AnyTupleType) \ - _(AnyClassType) - -enum class TypeKind { -#define DEFINE_TYPE(T) T, - C10_FORALL_TYPES(DEFINE_TYPE) -#undef DEFINE_TYPE -}; - -TORCH_API const char* typeKindToString(TypeKind kind); - -struct Type; -using TypePtr = std::shared_ptr; -using ConstTypePtr = std::shared_ptr; - -// Use this to customize how a Type is printed using `annotation_str()`. If -// c10::nullopt is returned, `annotation_str()` falls through to its default -// implementation. -using TypePrinter = - std::function(const ConstTypePtr&)>; - -struct TORCH_API Type : std::enable_shared_from_this { - private: - TypeKind kind_; - - protected: - Type(TypeKind kind) : kind_(kind) {} - - virtual std::string annotation_str_impl(TypePrinter printer) const { - return str(); - } - - public: - virtual bool operator==(const Type& rhs) const = 0; - - // subtyping relation. By default, we return true for the case - // when the type is exactly equal or if this <: T where rhs = Optional[T] - - // if this returns false and the why_not stream is non-null, it contains - // additional details that describe why this is not a subtype of 'rhs'. - // This additional information should only contain details that are not obvious - // from the annotation_str() that describes the type. For instance it is clear that `int <: str` is false - // but not clear why `Foo <: InterfaceBar` might be false. - virtual bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const; - virtual bool is_module() const; - bool isSubtypeOf(const TypePtr& rhs) const { - return isSubtypeOfExt(rhs, nullptr); - } - - // How this type will appear in FunctionSchema declarations - virtual std::string str() const = 0; - - // How this type will appear as if it were a type annotation in Python - // which is sometimes different than how it appears in declarations (e.g. - // int[] vs List[int]) - // - // Takes a custom printer that users can pass in to customize the output of - // this method. - std::string annotation_str(TypePrinter printer) const { - if (printer) { - // the printer can return nullopt to fall through to the default impl - if (auto renamed = printer(shared_from_this())) { - return *renamed; - } - } - return annotation_str_impl(printer); - } - std::string annotation_str() const { - // Overload instead of define a default value for `printer` to help - // debuggers out. 
- return annotation_str(nullptr); - } - - // Returns a human readable string that includes additional information like - // "type is inferred rather than explictly defined" to help construct more - // user-friendly messages. - virtual std::string repr_str() const { - return annotation_str(); - } - - TypeKind kind() const { - return kind_; - } - - virtual bool requires_grad() const { - for (const auto& ct : containedTypes()) { - if (ct->requires_grad()) { - return true; - } - } - return false; - } - - // Dynamically cast this object to the subclass indicated by the - // template variable, returning nullptr if the cast is invalid. - template - std::shared_ptr cast() { - if (T::Kind == kind()) { - return std::static_pointer_cast(shared_from_this()); - } - return nullptr; - } - template - std::shared_ptr cast() const { - if (T::Kind == kind()) { - return std::static_pointer_cast(shared_from_this()); - } - return nullptr; - } - template - std::shared_ptr expect() { - auto r = cast(); - AT_ASSERT(r); - return r; - } - template - std::shared_ptr expect() const { - auto r = cast(); - AT_ASSERT(r); - return r; - } - virtual ~Type() = default; - virtual bool hasFreeVariables() const { - return false; - } - // list of types this type contains, e.g. for a List then element type of a - // list for a tuple, the types of the tuple elements - virtual at::ArrayRef containedTypes() const { - return {}; - } - // create a new version of this type, replacing its contained types with - // contained_types - TypePtr withContained(std::vector contained_types) { - auto current_contained = containedTypes(); - AT_ASSERT(current_contained.size() == contained_types.size()); - if (current_contained.equals(contained_types)) { - return shared_from_this(); - } - return createWithContained(std::move(contained_types)); - } - // per-type constructor, you only need to override this if the - // containedTypes() is not empty - virtual TypePtr createWithContained( - std::vector contained_types) const { - AT_ERROR( - "type with contained types did not overload createWithContained: ", - str()); - } -}; - struct AnyType; using AnyTypePtr = std::shared_ptr; // Any is the top of the type hierarchy, all other types are subtypes diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h new file mode 100644 index 000000000000..37da9ad7ef8d --- /dev/null +++ b/aten/src/ATen/core/jit_type_base.h @@ -0,0 +1,195 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +#define C10_FORALL_TYPES(_) \ + _(AnyType) \ + _(EnumType) \ + _(AnyEnumType) \ + _(TensorType) \ + _(StorageType) \ + _(TupleType) \ + _(ListType) \ + _(DictType) \ + _(NumberType) \ + _(FloatType) \ + _(FutureType) \ + _(RRefType) \ + _(IntType) \ + _(NoneType) \ + _(StringType) \ + _(GeneratorType) \ + _(QuantizerType) \ + _(BoolType) \ + _(OptionalType) \ + _(VarType) \ + _(DeviceObjType) \ + _(StreamObjType) \ + _(FunctionType) \ + _(ClassType) \ + _(PyObjectType) \ + _(CapsuleType) \ + _(InterfaceType) \ + _(QSchemeType) \ + _(LayoutType) \ + _(ScalarTypeType) \ + _(AnyListType) \ + _(AnyTupleType) \ + _(AnyClassType) + +enum class TypeKind { +#define DEFINE_TYPE(T) T, + C10_FORALL_TYPES(DEFINE_TYPE) +#undef DEFINE_TYPE +}; + +TORCH_API const char* typeKindToString(TypeKind kind); + +struct Type; +using TypePtr = std::shared_ptr; +using ConstTypePtr = std::shared_ptr; + +// Use this to customize how a Type is printed using `annotation_str()`. 
If +// c10::nullopt is returned, `annotation_str()` falls through to its default +// implementation. +using TypePrinter = + std::function(const ConstTypePtr&)>; + +struct TORCH_API Type : std::enable_shared_from_this { + private: + TypeKind kind_; + + protected: + Type(TypeKind kind) : kind_(kind) {} + + virtual std::string annotation_str_impl(TypePrinter printer) const { + return str(); + } + + public: + virtual bool operator==(const Type& rhs) const = 0; + + // subtyping relation. By default, we return true for the case + // when the type is exactly equal or if this <: T where rhs = Optional[T] + + // if this returns false and the why_not stream is non-null, it contains + // additional details that describe why this is not a subtype of 'rhs'. + // This additional information should only contain details that are not obvious + // from the annotation_str() that describes the type. For instance it is clear that `int <: str` is false + // but not clear why `Foo <: InterfaceBar` might be false. + virtual bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const; + virtual bool is_module() const; + bool isSubtypeOf(const TypePtr& rhs) const { + return isSubtypeOfExt(rhs, nullptr); + } + + // How this type will appear in FunctionSchema declarations + virtual std::string str() const = 0; + + // How this type will appear as if it were a type annotation in Python + // which is sometimes different than how it appears in declarations (e.g. + // int[] vs List[int]) + // + // Takes a custom printer that users can pass in to customize the output of + // this method. + std::string annotation_str(TypePrinter printer) const { + if (printer) { + // the printer can return nullopt to fall through to the default impl + if (auto renamed = printer(shared_from_this())) { + return *renamed; + } + } + return annotation_str_impl(printer); + } + std::string annotation_str() const { + // Overload instead of define a default value for `printer` to help + // debuggers out. + return annotation_str(nullptr); + } + + // Returns a human readable string that includes additional information like + // "type is inferred rather than explictly defined" to help construct more + // user-friendly messages. + virtual std::string repr_str() const { + return annotation_str(); + } + + TypeKind kind() const { + return kind_; + } + + virtual bool requires_grad() const { + for (const auto& ct : containedTypes()) { + if (ct->requires_grad()) { + return true; + } + } + return false; + } + + // Dynamically cast this object to the subclass indicated by the + // template variable, returning nullptr if the cast is invalid. + template + std::shared_ptr cast() { + if (T::Kind == kind()) { + return std::static_pointer_cast(shared_from_this()); + } + return nullptr; + } + template + std::shared_ptr cast() const { + if (T::Kind == kind()) { + return std::static_pointer_cast(shared_from_this()); + } + return nullptr; + } + template + std::shared_ptr expect() { + auto r = cast(); + AT_ASSERT(r); + return r; + } + template + std::shared_ptr expect() const { + auto r = cast(); + AT_ASSERT(r); + return r; + } + virtual ~Type() = default; + virtual bool hasFreeVariables() const { + return false; + } + // list of types this type contains, e.g. 
for a List then element type of a + // list for a tuple, the types of the tuple elements + virtual at::ArrayRef containedTypes() const { + return {}; + } + // create a new version of this type, replacing its contained types with + // contained_types + TypePtr withContained(std::vector contained_types) { + auto current_contained = containedTypes(); + AT_ASSERT(current_contained.size() == contained_types.size()); + if (current_contained.equals(contained_types)) { + return shared_from_this(); + } + return createWithContained(std::move(contained_types)); + } + // per-type constructor, you only need to override this if the + // containedTypes() is not empty + virtual TypePtr createWithContained( + std::vector contained_types) const { + AT_ERROR( + "type with contained types did not overload createWithContained: ", + str()); + } +}; + +} diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index bf74e8b356c7..a4854e1ced4d 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -68,7 +68,7 @@ Tensor embedding_sparse_backward( Tensor indices = indices_; Tensor grad = grad_; if (padding_idx != -1) { - auto c = indices != padding_idx; + torch::List> c({indices != padding_idx}); indices = indices.index(c); grad = grad.index(c); } diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h index 94d61b02dd0b..92f6957f25ad 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include @@ -15,40 +16,45 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, } -static std::vector expandTensors(const Tensor & self, TensorList indices) { +static std::vector expandTensors(const Tensor & self, const torch::List>& indices) { // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors std::vector result; - for (const auto & index : indices) { - if (index.scalar_type() == kByte || index.scalar_type() == kBool) { - if (index.scalar_type() == kByte) { - TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ - " please use a dtype torch.bool instead."); - } - // The sizes of the ByteTensor mask or bool tensor must match the sizes of the - // corresponding dimensions in self - for (int64_t j = 0; j < index.dim(); j++) { - int64_t srcIdx = result.size() + j; - if (index.size(j) != self.size(srcIdx)) { - invalid_mask(self, srcIdx, index, j); + for (c10::optional index_opt : indices) { + if (!index_opt.has_value()) { + result.emplace_back(); + } else { + Tensor index = std::move(*index_opt); + if (index.scalar_type() == kByte || index.scalar_type() == kBool) { + if (index.scalar_type() == kByte) { + TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ + " please use a dtype torch.bool instead."); } + // The sizes of the ByteTensor mask or bool tensor must match the sizes of the + // corresponding dimensions in self + for (int64_t j = 0; j < index.dim(); j++) { + int64_t srcIdx = result.size() + j; + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + auto nonzero = index.nonzero(); + for (int64_t j = 0; j < index.dim(); j++) { + result.emplace_back(nonzero.select(1, j)); + } + } else { + result.emplace_back(std::move(index)); } - // Replace with nonzeros - auto nonzero = index.nonzero(); - for (int64_t j = 0; j < index.dim(); j++) { - 
result.emplace_back(nonzero.select(1, j)); - } - } else { - result.emplace_back(index); } } return result; } -static void checkIndexTensorTypes(TensorList indices) { - for (auto& tensor : indices) { - if (tensor.defined()) { - auto scalarType = tensor.scalar_type(); +static void checkIndexTensorTypes(const torch::List>& indices) { + for (c10::optional tensor : indices) { + if (tensor.has_value() && tensor->defined()) { + auto scalarType = tensor->scalar_type(); if (scalarType != kLong && scalarType != kByte && scalarType != kBool) { TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors"); } @@ -56,6 +62,15 @@ static void checkIndexTensorTypes(TensorList indices) { } } +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const Tensor& a : list) { + result.push_back(a); + } + return result; +} + static bool hasContiguousSubspace(TensorList tl) { // true if all the non-null tensors are adjacent auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index da8d2bd6db47..a37d1046bac2 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +74,8 @@ Tensor logdet(const Tensor& self) { // U is singular when U(i, i) = 0 for some i in [1, self.size(-1)]. Tensor logdet_vals = diag_U.abs_().log_().sum(-1); if (self.dim() > 2) { - logdet_vals.index_put_((det_sign < 0).nonzero_numpy(), at::full({}, NAN, self.options())); + auto indices = toListOfOptionalTensors((det_sign < 0).nonzero_numpy()); + logdet_vals.index_put_(std::move(indices), at::full({}, NAN, self.options())); } else if (det_sign.item() < 0) { logdet_vals.fill_(NAN); } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 1d9f9d9d2a12..2d79a4e3713f 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -206,7 +206,7 @@ AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list) } } -static AdvancedIndex make_info(Tensor self, TensorList orig) { +static AdvancedIndex make_info(Tensor self, const torch::List>& orig) { checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors auto indices = expandTensors(self, orig); @@ -281,7 +281,7 @@ static TensorIterator make_index_out_iterator(const AdvancedIndex& info, Tensor& return config.build(); } -Tensor index(const Tensor & self, TensorList indices) { +Tensor index(const Tensor & self, const torch::List>& indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); auto info = make_info(self, indices); @@ -290,7 +290,7 @@ Tensor index(const Tensor & self, TensorList indices) { return iter.output(); } -Tensor quantized_index(const Tensor & self, TensorList indices) { +Tensor quantized_index(const Tensor & self, const torch::List>& indices) { TORCH_INTERNAL_ASSERT( self.qscheme() == c10::kPerTensorAffine || self.qscheme() == c10::kPerTensorSymmetric, @@ -311,12 +311,14 @@ Tensor quantized_index(const Tensor & self, TensorList indices) { res, self.q_scale(), self.q_zero_point(), self.scalar_type()); } -Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { 
+Tensor& index_out(Tensor& result, const Tensor & self, const torch::List>& indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); at::assert_no_internal_overlap(result); at::assert_no_overlap(result, self); - for (auto& index: indices) { - at::assert_no_overlap(result, index); + for (const c10::optional& index: indices) { + if (index.has_value()) { + at::assert_no_overlap(result, *index); + } } auto info = make_info(self, indices); @@ -325,11 +327,11 @@ Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { return result; } -Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value, bool accumulate) { +Tensor index_put(const Tensor & self, const torch::List>& indices, const Tensor & value, bool accumulate) { return self.clone(at::MemoryFormat::Preserve).index_put_(indices, value, accumulate); } -Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor & _index_put_impl_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); if (at::has_internal_overlap(self) == MemOverlap::YES) { TORCH_WARN( @@ -338,8 +340,10 @@ Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & valu "This also applies to advanced indexing e.g. tensor[indices] = tensor"); } at::assert_no_overlap(self, value); - for (auto& index: indices) { - at::assert_no_overlap(self, index); + for (const c10::optional& index: indices) { + if (index.has_value()) { + at::assert_no_overlap(self, *index); + } } if (accumulate && self.device().type() == kCUDA) { @@ -356,7 +360,7 @@ Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & valu } -Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value, const bool accumulate) { +Tensor & index_put_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate) { return at::_index_put_impl_(self, indices, value, accumulate, /*unsafe=*/false); } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index 560b46162546..0e0958606de1 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -15,7 +15,7 @@ enum class SCATTER_GATHER_OP: uint8_t {REDUCE_ADD, REDUCE_MULTIPLY}; using index_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides); using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate); -using index_put_accum_fn = void(*)(Tensor &, TensorList , const Tensor &, bool unsafe); +using index_put_accum_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool unsafe); using masked_fill_fn = void(*)(TensorIterator &, Scalar scalar); using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride); @@ -42,6 +42,6 @@ DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub); DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); -TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices); +TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); }} // 
namespace at::native diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index cb4aa644fee2..d88f202487af 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -190,7 +190,7 @@ static Tensor & masked_select_out_cuda_impl(Tensor & result, const Tensor & self Tensor _mask = (mask.dim() == 0) ? mask.unsqueeze(0) : mask; Tensor _self = (self.dim() == 0) ? self.unsqueeze(0) : self; std::tie(_mask, _self) = expand_outplace(_mask, _self); - at::native::index_out(result, _self, _mask); + at::native::index_out(result, _self, c10::List>({_mask})); return result; } diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index d630d727019f..e372f8bdb697 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -160,7 +160,7 @@ computeLinearIndex(const Tensor & src, TensorList indices, bool check_range) { } -static std::tuple> makeLinearIndex(Tensor self, TensorList orig, bool check_range) { +static std::tuple> makeLinearIndex(Tensor self, const c10::List>& orig, bool check_range) { checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors auto indices = expandTensors(self, orig); @@ -184,7 +184,7 @@ static std::tuple>& indices, const Tensor & value, bool unsafe) { if (indices.size() > (size_t)self.dim()) { TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a5b945399da8..6b0aaa8f4d9b 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2226,6 +2226,7 @@ use_c10_dispatcher: full - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: index @@ -2254,6 +2255,7 @@ variants: function, method - func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: index_put_ @@ -2264,9 +2266,11 @@ # - Tensor & Tensor::index_put_(std::initializer_list indices, Scalar v) - func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor + use_c10_dispatcher: full variants: function, method - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _index_put_impl_ diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index d621efafee41..fb7e16539c15 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -14,7 +15,6 @@ namespace at { namespace native { using namespace at::sparse; - /****************************************************************************** * access methods ******************************************************************************/ @@ -328,7 +328,7 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim){ Tensor values; if (self.dim() > 0) { - std::vector ix = indices.chunk(indices.size(0), 0); + auto ix = toListOfOptionalTensors(indices.chunk(indices.size(0), 0)); values = self.index(ix).squeeze(0).clone(at::MemoryFormat::Preserve); } else { AT_ASSERT(nz.sizes().equals({0, 1})); diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index d42c8c23fe9c..1c0a04a318d0 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -28,6 +28,7 @@ class Tensor; } namespace c10{ struct TensorOptions; +template class List; } namespace at { struct Generator; diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index 9e7479141ad4..dba68d21c2dd 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -6,13 +6,17 @@ namespace caffe2 { namespace internal { at::Tensor index_with_uint8_handling( const at::Tensor& self, - at::TensorList indices) { + const torch::List>& indices) { // Support BC only for the simplest case of mask indexing - if (indices.size() == 1 && indices[0].scalar_type() == at::kByte) { - TORCH_WARN( - "Indexing with uint8 mask tensor in ATenOp is now deprecated," - " please use a bool mask instead."); - return at::index(self, {indices[0].to(at::kBool)}); + if (indices.size() == 1) { + c10::optional first = indices[0]; + if (first.has_value() + && first->scalar_type() == at::kByte) { + TORCH_WARN( + "Indexing with uint8 mask tensor in ATenOp is now deprecated," + " please use a bool mask instead."); + return at::index(self, {first->to(at::kBool)}); + } } return at::index(self, indices); } diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index f3a42dbd8f59..cd1ce7651b48 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -21,7 +21,7 @@ using at::Half; // for AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ...) 
namespace internal { TORCH_API at::Tensor index_with_uint8_handling( const at::Tensor& self, - at::TensorList indices); + const torch::List>& indices); } template @@ -86,6 +86,16 @@ class ATenOp : public Operator { std::vector peekSlice(size_t i, size_t len, size_t N) { std::vector results; + results.reserve(len); + for (size_t ii = i; ii < i + len; ++ii) { + results.push_back(peek(ii, N)); + } + return results; + } + + torch::List> peekSliceOptionals(size_t i, size_t len, size_t N) { + torch::List> results; + results.reserve(len); for (size_t ii = i; ii < i + len; ++ii) { results.push_back(peek(ii, N)); } diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 2a822058bfdf..ba29ab933da9 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -68,7 +68,7 @@ def value_has_tensors(v): def value_is_tensor_type(v): - return value_has_tensors(v) and v['dynamic_type'] != 'TensorList' + return value_has_tensors(v) and v['dynamic_type'] not in ['TensorList', 'const c10::List> &'] # for each aten type, how do we handle a return value of that type? @@ -208,7 +208,7 @@ def self_as_first_argument(arguments): def get_num_inputs(o): args = 0 for a in o['arguments']: - if a['type'] == 'TensorList': + if a['type'] in ['TensorList', 'const c10::List> &']: return '*' elif value_has_tensors(a): args += 1 @@ -277,10 +277,10 @@ def emit_assignments(o, env): # e.g. "Float" is at::kFloat assert('Type' in o['method_of']) - static_tensor_inputs = sum(arg['type'] != 'TensorList' and value_is_tensor_type(arg) for arg in o['arguments']) - has_tensorlist = any(arg['type'] == 'TensorList' for arg in o['arguments']) + static_tensor_inputs = sum(arg['type'] not in ['TensorList', 'const c10::List> &'] and value_is_tensor_type(arg) for arg in o['arguments']) + has_tensorlist = any(arg['type'] in ['TensorList', 'const c10::List> &'] for arg in o['arguments']) if has_tensorlist: - tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] == 'TensorList'][0] + tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] in ['TensorList', 'const c10::List> &']][0] real_inputs = 0 for i, arg in enumerate(o['arguments']): @@ -290,10 +290,16 @@ def emit_assignments(o, env): view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs if arg['type'] == 'TensorList': # NOTE: do not advance real_inputs here. After this we will - # switch to indexing the "stack" from the end as if we only had + # switch to indexing the "stack" from the end env['statements'].append( 'auto {} = peekSlice({}, InputSize() - {}, InputSize());' .format(arg['name'], real_inputs, static_tensor_inputs)) + elif arg['type'] == 'const c10::List> &': + # NOTE: do not advance real_inputs here. 
After this we will + # switch to indexing the "stack" from the end + env['statements'].append( + 'auto {} = peekSliceOptionals({}, InputSize() - {}, InputSize());' + .format(arg['name'], real_inputs, static_tensor_inputs)) elif value_is_tensor_type(arg): # load tensor inputs from Caffe2 env['statements'].append( diff --git a/test/cpp/api/tensor_indexing.cpp b/test/cpp/api/tensor_indexing.cpp index efb153fbf481..03600c5c882e 100644 --- a/test/cpp/api/tensor_indexing.cpp +++ b/test/cpp/api/tensor_indexing.cpp @@ -83,27 +83,27 @@ TEST(TensorIndexingTest, TestNoIndices) { ASSERT_THROWS_WITH(tensor.index_put_(indices, value), "Passing an empty index list to Tensor::index_put_() is not valid syntax"); } -TEST(TensorIndexingTest, TestAdvancedIndexingWithArrayRefOfTensor) { +TEST(TensorIndexingTest, TestAdvancedIndexingWithListOfTensor) { { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index(at::ArrayRef({index})); + torch::Tensor result = at::index(tensor, {index}); torch::Tensor result_with_init_list = tensor.index({index}); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index_put_(at::ArrayRef({index}), torch::ones({20})); + torch::Tensor result = at::index_put_(tensor, {index}, torch::ones({20})); torch::Tensor result_with_init_list = tensor.index_put_({index}, torch::ones({20})); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index_put_(at::ArrayRef({index}), torch::ones({1, 20})); + torch::Tensor result = at::index_put_(tensor, {index}, torch::ones({1, 20})); torch::Tensor result_with_init_list = tensor.index_put_({index}, torch::ones({1, 20})); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } } @@ -173,7 +173,7 @@ TEST(TensorIndexingTest, TestBoolIndices) { TEST(TensorIndexingTest, TestBoolIndicesAccumulate) { auto mask = torch::zeros({10}, torch::kBool); auto y = torch::ones({10, 10}); - y.index_put_({mask}, y.index({mask}), /*accumulate=*/true); + y.index_put_({mask}, {y.index({mask})}, /*accumulate=*/true); assert_tensor_equal(y, torch::ones({10, 10})); } diff --git a/test/test_overrides.py b/test/test_overrides.py index 95f94504d84e..f32b04cb2e53 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -563,6 +563,8 @@ def instance_gen(): func_args.append(instance_gen()) elif t == 'TensorList': func_args.append([instance_gen(), instance_gen()]) + elif t == 'c10::List>': + func_args.append([instance_gen(), instance_gen()]) elif t == 'IntArrayRef': size = arg.get('size', 2) if size == 1: diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py index a22154b5c01d..4724b99a8742 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -141,7 +141,7 @@ def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str compute_index_ranges: List[str] = [] for arg in info.args_with_derivatives: - if arg.type == 'TensorList': + if arg.type == 'TensorList' 
or arg.type == 'const c10::List> &': size = f'{arg.name}_size_' saved_list_sizes.append(f'size_t {arg.name}_size_;') else: @@ -166,6 +166,15 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: release_variables.append(f'{name}_released_ = true;') unpack.append(f'auto {name} = unpack_list({name}_);') asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') + elif var.type == 'c10::List>': + saved_variables.append(f'std::vector {name}_;') + saved_variables.append(f'bool {name}_released_ = false;') + # Just clear() is sufficient, we don't need to loop and clear each variable. + # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well. + release_variables.append(f'{name}_.clear();') + release_variables.append(f'{name}_released_ = true;') + unpack.append(f'auto {name} = unpack_opt_list({name}_);') + asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') elif var.type == 'IntArrayRef': saved_variables.append(f'std::vector {name};') elif var.type == 'c10::optional': diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index b2dfe2667128..78c460843d94 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -112,9 +112,8 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen ] else: name = arg.name - # XXX: For arg that have type of Tensor?[], tracer will pass allow_undefined to addInputs if str(arg.type) == 'Tensor?[]': - return [f'jit::tracer::addInputs(node, "{name}", {name}, true);'] + return [f'jit::tracer::addInputs(node, "{name}", {name});'] else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 1d75ae46e9c9..f97fb55ab012 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -118,6 +118,21 @@ } """) +SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +std::vector> ${tensorlist_name}_storage_saved(${tensorlist_name}.size()); +for (const c10::optional& tensor : ${tensorlist_name}) + ${tensorlist_name}_storage_saved.push_back( + tensor.has_value() && tensor->has_storage() ? c10::optional(tensor->storage()) : c10::nullopt); +""") + +ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +for (size_t i=0; i<${tensorlist_name}.size(); i++) { + if (${tensorlist_name}_storage_saved[i].has_value()) + AT_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of( + static_cast>(${tensorlist_name}[i])->storage())); +} +""") + SAVE_TENSOR_IMPL = CodeTemplate("""\ c10::intrusive_ptr ${tensor_name}_impl_saved; if (${tensor_name}.defined()) ${tensor_name}_impl_saved = ${tensor_name}.getIntrusivePtr(); @@ -140,6 +155,21 @@ } """) +SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +std::vector> ${tensorlist_name}_impl_saved(${tensorlist_name}.size()); +for (size_t i=0; i<${tensorlist_name}.size(); i++) { + c10::optional t = ${tensorlist_name}[i]; + if (t.has_value() && t->defined()) ${tensorlist_name}_impl_saved[i] = t->getIntrusivePtr(); +} +""") + +ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +for (size_t i=0; i<${tensorlist_name}.size(); i++) { + if (${tensorlist_name}_impl_saved[i]) + AT_ASSERT(${tensorlist_name}_impl_saved[i] == static_cast>(${tensorlist_name}[i])->getIntrusivePtr()); +} +""") + # The following list contains functions that we don't enforce the invariant on. 
DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE = { # These functions are expected to change impl or storage of input tensors @@ -466,7 +496,8 @@ def emit_save_inputs(): if func is None: return setup - has_tensorlist_arg = any(arg.type == 'TensorList' for arg in func.args_with_derivatives) + has_tensorlist_arg = \ + any(arg.type in ['TensorList', 'const c10::List> &'] for arg in func.args_with_derivatives) # We don't want to save tensors if we know that they will never be used # when computing the derivative, so we add guards to those statements @@ -515,7 +546,7 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: setup.extend(save_variables(func.all_saved_inputs, False, guard_for)) for arg in func.args_with_derivatives: - if arg.type == 'TensorList': + if arg.type in ['TensorList', 'const c10::List> &']: setup.append(f'grad_fn->{arg.name}_size_ = {arg.name}.size();') return setup @@ -554,7 +585,7 @@ def emit_check_if_in_complex_autograd_allowlist(): return body for arg in differentiable_outputs: name = arg['name'] - if arg['type'] == 'Tensor' or arg['type'] == 'TensorList': + if arg['type'] in ['Tensor', 'TensorList', 'const c10::List> &']: body.append('throw_error_for_complex_autograd({}, "{}");'.format(name, base_name)) return body @@ -599,7 +630,7 @@ def save_variables( expr = f'SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})' else: expr = f'SavedVariable({var}, {str(is_output).lower()})' - elif arg.type == 'TensorList': + elif arg.type in ['TensorList', 'c10::List>']: name += '_' expr = f'make_saved_variable_list({arg.name})' elif arg.type == 'IntArrayRef': @@ -699,7 +730,7 @@ def wrap_output(return_values, var): # Only allow rebasing of the history if we return a single Tensor # If we are in a no grad block, raise a warning # See NOTE [ View + Inplace detection ] for more details about this logic - if return_info['dynamic_type'] == 'TensorList': + if return_info['dynamic_type'] in ['TensorList', 'const c10::List> &']: if base_name in MULTI_OUTPUT_SAFE_FUNCTIONS: creation_meta = "CreationMeta::MULTI_OUTPUT_SAFE" else: @@ -736,6 +767,11 @@ def enforce_same_tensorimpl_and_storage(env, call): SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] + elif simple_type == 'c10::List>': + save_ptrs_stmts += [SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), + SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] + enforce_same_ptrs_stmts += [ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), + ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] elif simple_type == 'Tensor': save_ptrs_stmts += [SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), SAVE_TENSOR_IMPL.substitute(tensor_name=arg)] @@ -836,7 +872,7 @@ def emit_increment_version(): def unpack_args(env, declaration): def requires_unpack(arg): - return 'Tensor' in arg['dynamic_type'] + return 'Tensor' in arg['dynamic_type'] and 'c10::optional' not in arg['type'] body = [] unpacked_args = [] @@ -855,9 +891,8 @@ def requires_unpack(arg): dynamic_type = arg['dynamic_type'] if 'TensorOptions' not in dynamic_type: is_nullable = arg.get('is_nullable', False) - ref = (not is_nullable) and dynamic_type not in ['TensorList'] + ref = (not is_nullable) and dynamic_type != 'TensorList' suffix = '_opt' if is_nullable and dynamic_type != 'TensorList' else '' - body.append(UNPACK_TENSOR.substitute( arg_name=arg['name'], 
arg_pos=i, diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 03240e2a5a2b..0540bb65b33b 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -32,6 +32,15 @@ inline std::vector unpack_list(at::ArrayRef xs) { }); } +inline c10::List> unpack_opt_list(at::ArrayRef xs) { + torch::List> result; + result.reserve(xs.size()); + for (const SavedVariable& v : xs) { + result.push_back(v.unpack()); + } + return result; +} + struct TypeAndSize { TypeAndSize() : options(at::TensorOptions()) {} /* implicit */ diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 9062a4d08e34..fc8ffa5799c1 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -49,7 +49,6 @@ namespace VariableType { at::Tensor & unpack(Tensor & t, const char * name, int pos); const at::Tensor & unpack(const Tensor & t, const char * name, int pos); at::Tensor unpack_opt(const Tensor & t, const char * name, int pos); - c10::optional unpack_opt(const c10::optional & t, const char * name, int pos); std::vector unpack(at::TensorList tl, const char *name, int pos); }; diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index ffd9626601a0..c27a6768300a 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -104,9 +104,11 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: return BaseCType("TensorList", binds) elif str(t.elem) == 'Dimname': return BaseCType("DimnameList", binds) - # TODO: do something reasonable about lists of optional tensors - elif (not local.use_c10_dispatcher().dispatcher_uses_new_style()) and str(t.elem) == 'Tensor?': - return BaseCType("TensorList", binds) + elif str(t.elem) == 'Tensor?': + if local.use_c10_dispatcher().dispatcher_uses_new_style(): + return BaseCType("const c10::List> &", binds) + else: + return BaseCType("TensorList", binds) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) # TODO: explicitly qualify namespace here return BaseCType(f"ArrayRef<{elem.cpp_type()}>", binds) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 3b793527edd9..9781c46884e7 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -34,7 +34,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: else: return ConstRefCType(BaseCType('Tensor', binds)) elif str(t) == 'Tensor?[]': - return BaseCType('TensorList', binds) + return BaseCType('const c10::List> &', binds) return cpp.argumenttype_type(t, mutable=mutable, binds=binds) def returns_type(rs: Sequence[Return]) -> str: diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 059032869675..bdb31d4d8616 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -228,7 +228,7 @@ class PythonArgument: # Compute argument formal for python argument parsing. # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. 
def argument_str(self, *, method: bool = False) -> str: - type_str = argument_type_str(self.type) + type_str = argument_type_str(self.type).replace('const ', '').replace(' &', '') name = self.name # s/self/input/ outside method bindings @@ -624,10 +624,9 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: return f'ScalarList[{size}]' if size is not None else 'ScalarList' elif str(t.elem) == 'Tensor?': if simple_type: - return 'TensorList' + return 'c10::List>' else: - # TODO: clone the old codegen behavior but does it make sense? - return 'TensorList?' + return 'const c10::List> &' elif str(t.elem) == 'Dimname': return f'DimnameList[{size}]' if size is not None else 'DimnameList' elem = argument_type_str(t.elem, simple_type=simple_type) @@ -1051,12 +1050,14 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: return 'toDimnameListOptional' elif isinstance(t, ListType): - if str(t.elem) == 'Tensor' or str(t.elem) == 'Tensor?': + if str(t.elem) == 'Tensor': # accept and use definite size if t.size is not None: return f'tensorlist_n<{t.size}>' else: return 'tensorlist' + elif str(t.elem) == 'Tensor?': + return 'list_of_optional_tensors' elif str(t.elem) == 'Dimname': # accept definite size return 'dimnamelist' diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 3c84a0da4a99..6558295d58cb 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -2211,15 +2212,17 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) return nonsingular_case_backward(grad, self, det); } } else { - auto nonzero_det_indices = at::where(det); + auto nonzero_det_indices = at::native::toListOfOptionalTensors(at::where(det)); + c10::optional first_nonzero_det_index = nonzero_det_indices[0]; - if (nonzero_det_indices[0].size(0) == det.numel()) { // all determinants are nonzero (non-singular) + if (first_nonzero_det_index->size(0) == det.numel()) { // all determinants are nonzero (non-singular) return nonsingular_case_backward(grad, self, det); } - auto zero_det_indices = at::where(det == 0); + auto zero_det_indices = at::native::toListOfOptionalTensors(at::where(det == 0)); + c10::optional first_zero_det_index = zero_det_indices[0]; - if (zero_det_indices[0].size(0) == det.numel()) { // all determinants are zero (singular) + if (first_zero_det_index->size(0) == det.numel()) { // all determinants are zero (singular) return singular_case_backward(grad, self, det); } @@ -2261,15 +2264,17 @@ Tensor logdet_backward(const Tensor & grad, const Tensor& self, const Tensor& lo return singular_case_backward(grad, self); } } else { - auto finite_logdet_indices = at::where(logdet != -INFINITY); + auto finite_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet != -INFINITY)); + c10::optional first_finite_logdet_index = finite_logdet_indices[0]; - if (finite_logdet_indices[0].size(0) == logdet.numel()) { // all log determinants are finite (non-singular) + if (first_finite_logdet_index->size(0) == logdet.numel()) { // all log determinants are finite (non-singular) return nonsingular_case_backward(grad, self); } - auto neginf_logdet_indices = at::where(logdet == -INFINITY); + auto neginf_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet == -INFINITY)); + c10::optional first_neginf_logdet_index = neginf_logdet_indices[0]; - if (neginf_logdet_indices[0].size(0) == 
logdet.numel()) { // all log determinants are -inf (singular) + if (first_neginf_logdet_index->size(0) == logdet.numel()) { // all log determinants are -inf (singular) return singular_case_backward(grad, self); } @@ -2313,15 +2318,17 @@ Tensor slogdet_backward(const Tensor& grad_logabsdet, return nonsingular_case_backward(grad_logabsdet, self); } } else { - auto nonzero_signdet_indices = at::where(signdet); + auto nonzero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet)); + c10::optional first_nonzero_signdet_index = nonzero_signdet_indices[0]; - if (nonzero_signdet_indices[0].size(0) == logabsdet.numel()) { // all log determinants are finite (non-singular) + if (first_nonzero_signdet_index->size(0) == logabsdet.numel()) { // all log determinants are finite (non-singular) return nonsingular_case_backward(grad_logabsdet, self); } - auto zero_signdet_indices = at::where(signdet == 0); + auto zero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet == 0)); + c10::optional first_zero_signdet_index = zero_signdet_indices[0]; - if (zero_signdet_indices[0].size(0) == logabsdet.numel()) { // all log determinants are -inf (singular) + if (first_zero_signdet_index->size(0) == logabsdet.numel()) { // all log determinants are -inf (singular) return singular_case_backward(grad_logabsdet, self); } @@ -2873,8 +2880,8 @@ Tensor embedding_dense_double_backward(const Tensor & grad, const Tensor & indic return gg_weight.view(size); } -Tensor index_backward(Tensor zeros_like_self, TensorList indices, const Tensor& grad) { - return at::_index_put_impl_(zeros_like_self, indices, grad, true, true); +Tensor index_backward(Tensor zeros_like_self, const torch::List>& indices, const Tensor& grad) { + return at::_index_put_impl_(zeros_like_self, indices, grad, true, true); } Tensor _cudnn_ctc_loss_backward(const Tensor& grad_out, const Tensor& loss, const Tensor& raw_grad, bool zero_infinity) { diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 3814e8078b23..30736e13f58a 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -124,7 +124,7 @@ at::Tensor slogdet_backward(const at::Tensor& grad_logabsdet, const at::Tensor& at::Tensor log1p_backward(const at::Tensor& grad, const at::Tensor& self); at::Tensor sparse_constructor_values_backward(const at::Tensor& sparse_grad_out, const at::Tensor& indices, at::IntArrayRef values_shape); at::Tensor embedding_dense_double_backward(const at::Tensor & grad, const at::Tensor & indices, int64_t padding_idx); -at::Tensor index_backward(at::Tensor zeros_like_self, at::TensorList indices, const at::Tensor& grad); +at::Tensor index_backward(at::Tensor zeros_like_self, const torch::List>& indices, const at::Tensor& grad); at::Tensor _cudnn_ctc_loss_backward(const at::Tensor& grad_out, const at::Tensor& loss, const at::Tensor& raw_grad, bool zero_infinity); Tensor svd_backward(const std::vector &grads, const Tensor& self, diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index 0663d7f46fa8..d1f15fff3669 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -66,10 +66,6 @@ Tensor unpack_opt(const Tensor & t, const char * name, int pos) { return unpack(t, name, pos); } -c10::optional unpack_opt(const c10::optional & t, const char * name, int pos) { - return t; -} - std::vector unpack(at::TensorList tl, const char *name, int pos) { std::vector 
ret(tl.size()); for (size_t i = 0; i < tl.size(); ++i) { @@ -94,7 +90,7 @@ void _backward( // instead of us having to unwrap it to Tensor _gradient here. Tensor _gradient = gradient.has_value() ? *gradient : Tensor(); std::vector input_vars(inputs.begin(), inputs.end()); - torch::autograd::backward({self}, {_gradient}, std::move(keep_graph), create_graph, input_vars); + torch::autograd::backward({self}, {_gradient}, keep_graph, create_graph, input_vars); } void set_data(Tensor & self, const Tensor & new_data) { @@ -230,7 +226,6 @@ Tensor _fw_primal(const Tensor & self, int64_t level) { // We don't have an outplace copy, so this can't be generated automatically Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) { - jit::Value* output = nullptr; // TODO: once copy is exposed in Declarations.yaml we may be able to bind // it automatically auto& self_ = unpack(self, "self", 0); @@ -282,7 +277,7 @@ Tensor& resize_( } { at::AutoNonVariableTypeMode non_var_type_mode(true); - self_.resize_(size, std::move(optional_memory_format)); + self_.resize_(size, optional_memory_format); } if (self.fw_grad(/* level */ 0).defined()) { @@ -303,7 +298,7 @@ Tensor& resize_as_( } { at::AutoNonVariableTypeMode non_var_type_mode(true); - at::resize_as_(self_, the_template_, std::move(optional_memory_format)); + at::resize_as_(self_, the_template_, optional_memory_format); } // Handle fw grad diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index af02de68fc27..509a12e01140 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -266,12 +266,31 @@ inline void check_no_requires_grad(TensorList tensors, const char* name) { } } +inline void check_no_requires_grad(const c10::List>& tensors, const char* name) { + for (c10::optional tensor : tensors) { + if (tensor.has_value()) { + check_no_requires_grad(*tensor, name); + } + } +} + // Assumed that saved tensor lists are never inplace outputs inline std::vector make_saved_variable_list(TensorList tensors) { return fmap(tensors, [](const Tensor& tensor) -> SavedVariable { return SavedVariable{tensor, false /* is output */}; }); } +// Assumed that saved tensor lists are never inplace outputs +inline std::vector make_saved_variable_list(const c10::List>& tensors) { + return fmap(tensors, [](const c10::optional& tensor) -> SavedVariable { + if (tensor.has_value()) { + return SavedVariable{*tensor, false /* is output */}; + } else { + return SavedVariable{Tensor(), false /* is output */}; + } + }); +} + inline std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { diff --git a/torch/csrc/jit/backends/backend_detail.h b/torch/csrc/jit/backends/backend_detail.h index 2d19f2ed8950..00f0f2f9eb44 100644 --- a/torch/csrc/jit/backends/backend_detail.h +++ b/torch/csrc/jit/backends/backend_detail.h @@ -1,5 +1,6 @@ #pragma once +#include #include namespace torch { diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 72ccd77f2220..1bab391bd393 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -103,6 +103,9 @@ void TracingState::delValue(const IValue& var) { Value* getValueTrace(const IValue& var) { return getTracingState()->getValue(var); } +Value* getOptTensorValueTrace(const c10::optional& var) { + return getValueTrace(IValue(var)); +} Value* TracingState::getValue(const IValue& var) { // allow tracing of tuples passed 
to List[Tensor] or Tuple[Tensor...] // arguments @@ -686,6 +689,16 @@ void addInputs( } n->addInput(list_node->output()); } +TORCH_API void addInputs( + Node* n, + const char* name, + const List>& value) { + Graph* g = n->owningGraph(); + Node* list_node = nullptr; + list_node = g->insertNode(g->createList( + OptionalType::ofTensor(), fmap(value, getOptTensorValueTrace))); + n->addInput(list_node->output()); +} void addInputs( Node* n, diff --git a/torch/csrc/jit/frontend/tracer.h b/torch/csrc/jit/frontend/tracer.h index 61d79cb3efd2..f5cbd821bda4 100644 --- a/torch/csrc/jit/frontend/tracer.h +++ b/torch/csrc/jit/frontend/tracer.h @@ -255,6 +255,10 @@ TORCH_API void addInputs( const char* name, ArrayRef value, bool allow_undefined = false); +TORCH_API void addInputs( + Node* n, + const char* name, + const List>& value); TORCH_API void addInputs( Node* n, const char* name, diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 8b7da739df9a..2be75c61b6b5 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -1,5 +1,6 @@ #pragma once //#include +#include #include #include diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h index 120a3ffb7507..a4bb209cd17e 100644 --- a/torch/csrc/jit/runtime/interpreter.h +++ b/torch/csrc/jit/runtime/interpreter.h @@ -5,6 +5,7 @@ #include #include +#include #include #include diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index f23b09dc0e74..fe75ec52046e 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -908,7 +908,7 @@ RegisterOperators reg( TORCH_SELECTIVE_SCHEMA( "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), [](Stack* stack) { - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index(self, indices); push(stack, std::move(result)); @@ -921,7 +921,7 @@ RegisterOperators reg( auto unsafe = pop(stack).toBool(); auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::_index_put_impl_(self, indices, values, accumulate, unsafe); @@ -934,7 +934,7 @@ RegisterOperators reg( [](Stack* stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index_put_(self, indices, values, accumulate); push(stack, std::move(result)); @@ -946,7 +946,7 @@ RegisterOperators reg( [](Stack* stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index_put_(self, indices, values, accumulate); push(stack, std::move(result)); diff --git a/torch/csrc/jit/runtime/vararg_functions.h b/torch/csrc/jit/runtime/vararg_functions.h index 36bef721d626..d6eba7f5d191 100644 --- a/torch/csrc/jit/runtime/vararg_functions.h +++ b/torch/csrc/jit/runtime/vararg_functions.h @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace torch { diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index c7fdf844945e..ee3a0bc71f2f 100644 --- 
a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -24,6 +24,7 @@ static std::unordered_map type_map = { {"double", ParameterType::DOUBLE}, {"complex", ParameterType::COMPLEX}, {"TensorList", ParameterType::TENSOR_LIST}, + {"c10::List>", ParameterType::TENSOR_LIST}, {"IntArrayRef", ParameterType::INT_LIST}, {"ArrayRef", ParameterType::FLOAT_LIST}, {"Generator", ParameterType::GENERATOR}, @@ -390,7 +391,7 @@ bool is_float_or_complex_list(PyObject* obj) { } auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); - if (size > 0) { + if (size > 0) { PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, 0) : PyList_GET_ITEM(obj, 0); if (!THPUtils_checkDouble(iobj) && !PyComplex_Check(iobj)) { return false; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index ccf3ba6b42c4..9fa490139cbd 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -160,6 +160,7 @@ struct PythonArgs { inline at::Scalar scalarWithDefault(int i, at::Scalar default_scalar); inline std::vector scalarlist(int i); inline std::vector tensorlist(int i); + inline torch::List> list_of_optional_tensors(int i); template inline std::array tensorlist_n(int i); inline std::vector intlist(int i); @@ -327,6 +328,22 @@ inline std::vector PythonArgs::tensorlist(int i) { return res; } +inline torch::List> PythonArgs::list_of_optional_tensors(int i) { + if (!args[i]) return torch::List>(); + auto tuple = six::isTuple(args[i]); + THPObjectPtr arg = six::maybeAsTuple(args[i]); + auto size = tuple ? PyTuple_GET_SIZE(arg.get()) : PyList_GET_SIZE(arg.get()); + torch::List> res; + res.reserve(size); + for (int idx = 0; idx < size; idx++) { + PyObject* obj = tuple ? PyTuple_GET_ITEM(arg.get(), idx) : PyList_GET_ITEM(arg.get(), idx); + // This is checked by the argument parser so it's safe to cast without checking + // if this is a tensor first + res.push_back(reinterpret_cast(obj)->cdata); + } + return res; +} + template inline std::array PythonArgs::tensorlist_n(int i) { auto res = std::array(); From 211f35631f2babd914ca590a0b9e2b2dee64d286 Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Mon, 4 Jan 2021 09:26:00 -0800 Subject: [PATCH 25/89] Add type annotations to _tensorboard_vis.py and hipify_python.py (#49834) Summary: closes gh-49833 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49834 Reviewed By: mruberry Differential Revision: D25725341 Pulled By: malfet fbshipit-source-id: 7454c7afe07a3ff829826afe02aba05b7f649d9b --- mypy.ini | 18 ------------------ torch/_C/__init__.pyi.in | 4 ++++ torch/contrib/_tensorboard_vis.py | 3 ++- torch/utils/hipify/hipify_python.py | 4 +++- 4 files changed, 9 insertions(+), 20 deletions(-) diff --git a/mypy.ini b/mypy.ini index 8c900bcced76..7d6161bddd17 100644 --- a/mypy.ini +++ b/mypy.ini @@ -104,24 +104,6 @@ ignore_errors = True [mypy-torch._utils] ignore_errors = True -[mypy-torch._overrides] -ignore_errors = True - -[mypy-torch.utils.tensorboard._caffe2_graph] -ignore_errors = True - -[mypy-torch.contrib._tensorboard_vis] -ignore_errors = True - -[mypy-torch.nn.utils.prune] -ignore_errors = True - -[mypy-torch.utils.show_pickle] -ignore_errors = True - -[mypy-torch.utils.hipify.hipify_python] -ignore_errors = True - [mypy-torch.utils.benchmark.examples.*] ignore_errors = True diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 79c93cb191f1..6427a4a4ed80 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -344,6 +344,10 @@ def 
_propagate_and_assign_input_shapes( propagate: _bool ) -> Graph: ... +# Defined in torch/csrc/jit/runtime/graph_executor.h +class GraphExecutorState: + ... + # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: def eraseInput(self, i: _int) -> None: ... diff --git a/torch/contrib/_tensorboard_vis.py b/torch/contrib/_tensorboard_vis.py index b3039f4cdd4f..b1b8d35a511d 100644 --- a/torch/contrib/_tensorboard_vis.py +++ b/torch/contrib/_tensorboard_vis.py @@ -1,6 +1,7 @@ import time from collections import defaultdict from functools import partial +from typing import DefaultDict import torch @@ -104,7 +105,7 @@ def inline_graph(subgraph, name, node): for out, val in zip(subgraph.outputs(), node.outputs()): value_map[val.unique()] = rec_value_map[out.unique()] - op_id_counter = defaultdict(int) + op_id_counter: DefaultDict[str, int] = defaultdict(int) def name_for(node): kind = node.kind()[node.kind().index('::') + 2:] diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py index d1639d20adba..adc480793d82 100755 --- a/torch/utils/hipify/hipify_python.py +++ b/torch/utils/hipify/hipify_python.py @@ -782,7 +782,9 @@ def repl(m): os.path.relpath(header_filepath, output_directory), all_files, includes, stats, hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress) - return templ.format(os.path.relpath(HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"], header_dir)) + value = HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"] + assert value is not None + return templ.format(os.path.relpath(value, header_dir)) return m.group(0) return repl From 22bd277891612e9003fde7ad6d855df77b56cefa Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 4 Jan 2021 09:26:01 -0800 Subject: [PATCH 26/89] Run test_type_hints first (#49748) Summary: Since it is sort of a linter check and fails frequently Pull Request resolved: https://github.com/pytorch/pytorch/pull/49748 Reviewed By: vkuzo Differential Revision: D25682980 Pulled By: malfet fbshipit-source-id: 7dba28242dced0277bad56dc887d3273c1e9e575 --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index e13753e93348..93484f7a583e 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -19,6 +19,7 @@ from typing import Dict, Optional TESTS = [ + 'test_type_hints', 'test_autograd', 'benchmark_utils/test_benchmark_utils', 'test_binary_ufuncs', @@ -72,7 +73,6 @@ 'test_testing', 'test_torch', 'test_type_info', - 'test_type_hints', 'test_unary_ufuncs', 'test_utils', 'test_view_ops', From b76822eb49588d0c8aa3eab94a88c77c79cb90a0 Mon Sep 17 00:00:00 2001 From: Yuxin Wu Date: Mon, 4 Jan 2021 10:12:19 -0800 Subject: [PATCH 27/89] Update update_s3_htmls.yml (#49934) Summary: It is now running for forks, and generates a lot of failure messages for the owners of forks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49934 Reviewed By: mruberry Differential Revision: D25739552 Pulled By: seemethere fbshipit-source-id: 0f9cc430316c0a5e9972de3cdd06d225528c81c2 --- .github/workflows/update_s3_htmls.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/update_s3_htmls.yml b/.github/workflows/update_s3_htmls.yml index 92f9a66a0fd8..f2320ce2fcbf 100644 --- a/.github/workflows/update_s3_htmls.yml +++ b/.github/workflows/update_s3_htmls.yml @@ -9,6 +9,7 @@ on: jobs: update-html: runs-on: ubuntu-latest + if: ${{ github.repository_owner == 'pytorch' }} strategy: matrix: prefix: ["whl", "whl/test", "whl/nightly"] From fdb81c538a23a50e47daa34e8bcc14f1500d8596 Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Mon, 4 Jan 2021 11:10:08 -0800 Subject: [PATCH 28/89] Improve `torch.flatten` docs and add tests to test_view_ops (#49501) Summary: Addresses https://github.com/pytorch/pytorch/issues/39474 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49501 Reviewed By: mrshenli Differential Revision: D25740586 Pulled By: soulitzer fbshipit-source-id: 3d7bdbab91eb208ac9e6832bb766d9d95a00c103 --- test/test_view_ops.py | 64 +++++++++++++++++++++++++++++++++++++++++++ torch/_torch_docs.py | 12 +++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 3a1411d1a167..be33aa1ab44a 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -100,6 +100,12 @@ def is_view_of(self, base, other): return True + # Returns true if v1 and v2 are views of the same base + def is_view_of_same_base(self, v1, v2): + if (not v1._is_view() or v1 is v2): + return False + return self.is_view_of(v1._base, v2) + # Performs transpose if contiguous=True, else returns the input tensor as is def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): if contiguous: @@ -457,6 +463,64 @@ def test_reshape_nonview(self, device): nv[6] = 0 self.assertNotEqual(t[1, 1], nv[6]) + def test_flatten_view(self, device): + def test_writes_propagate(t, v): + idx_t = (0,) * t.ndim + idx_v = (0,) * v.ndim + v[idx_v] = 0 + self.assertEqual(t[idx_t], v[idx_v]) + + t = torch.ones(1, 2, 3, 4, device=device) + v = t.flatten() + self.assertTrue(self.is_view_of(t, v)) + test_writes_propagate(t, v) + + # zero-dimensional tensor + t = torch.tensor(1, device=device) + v = t.flatten() + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of(t, v)) + + t = torch.ones(1, 2, 3, 4, device=device).transpose(2, 3) + v = t.flatten(0, 1) + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of_same_base(t, v)) + + # stride[i] = stride[i + 1] * size[i + 1] is satisfied for 3 groups: + t = torch.ones(720, device=device) \ + .as_strided((2, 3, 2, 3, 5, 4), (6, 2, 15, 5, 1, 0)) + # [--1--|---2---|-3-] [--1--|----2---|-3-] + v1 = t.flatten(0, 1) + v2 = v1.flatten(1, 3) + v3 = v2.flatten(2, 2) + test_writes_propagate(t, v1) + self.assertTrue(self.is_view_of_same_base(t, v1)) + test_writes_propagate(t, v2) + self.assertTrue(self.is_view_of_same_base(t, v2)) + test_writes_propagate(t, v3) + self.assertTrue(self.is_view_of_same_base(t, v3)) + + @onlyOnCPUAndCUDA + def test_flatten_nonview(self, device): + def assert_is_nonview(t, nv): + idx_t = (0,) * t.ndim + idx_nv = (0,) * nv.ndim + self.assertTrue(not nv._is_view()) + nv[idx_nv] = 0 + self.assertNotEqual(t[idx_t], nv[idx_nv]) + t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) + nv = t.flatten(1, 3) + assert_is_nonview(t, nv) + + t = torch.ones(2, 2, 
device=device).T + nv = t.flatten() + assert_is_nonview(t, nv) + + # flatten returns the original object if start_dim=end_dim + t = t = torch.ones(2, 2, device=device) + nv = t.flatten(1, 1) + self.assertTrue(t is nv) + def test_basic_indexing_slice_view(self, device): t = torch.ones(5, 5, device=device) v = t[:2, :3] diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index fe7237b5a370..4a1c36df7497 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3095,7 +3095,17 @@ def merge_dicts(*dicts): r""" flatten(input, start_dim=0, end_dim=-1) -> Tensor -Flattens a contiguous range of dims in a tensor. +Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` +are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. +The order of elements in :attr:`input` is unchanged. + +Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, +or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can +be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the +flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + +.. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. Args: {input} From 6e84a018be3819f9933911fe884cdf7c069c9cee Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Mon, 4 Jan 2021 11:16:59 -0800 Subject: [PATCH 29/89] move to non-legacy magma v2 headers (#49978) Summary: We recently (https://github.com/pytorch/pytorch/issues/7582) dropped magma v1 support, but we were still including the legacy compatibility headers and using functions only provided by them. This changes the includes to the new magma_v2 header and fixes the triangular solve functions to use the v2-style magma_queue-using API. 
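In case the distinction is unfamiliar: the magma v2 BLAS entry points take an explicit `magma_queue_t` as their final argument rather than relying on an implicitly synchronized global stream, which is why the `MagmaStreamSyncGuard` disappears in the diff below. A minimal sketch of the call pattern the patch switches to; `MAGMAQueue`, `get_queue()` and `magma_dtrsm` are taken from the diff itself, while the wrapping function and its parameters are invented for illustration:

```
// Sketch only: solve L * X = B for one double matrix with the v2-style API.
// The explicit queue replaces the old MagmaStreamSyncGuard around the call.
void solve_lower_triangular(magma_int_t m, magma_int_t n,
                            double* dA, magma_int_t ldda,
                            double* dB, magma_int_t lddb,
                            int device) {
  MAGMAQueue magma_queue(device);       // RAII wrapper that owns a magma_queue_t
  magma_dtrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
              m, n, /*alpha=*/1, dA, ldda, dB, lddb,
              magma_queue.get_queue()); // queue is passed explicitly in magma v2
}
```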
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49978 Reviewed By: mrshenli Differential Revision: D25752499 Pulled By: ngimel fbshipit-source-id: 26d916bc5ce63978b341aefb072af228f140637d --- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 2 +- .../ATen/native/cuda/BatchLinearAlgebra.cu | 141 ++++++++++++++---- aten/src/ATen/native/cuda/MiscUtils.h | 2 +- aten/src/THC/THCTensorMathMagma.cu | 2 +- aten/src/THC/THCTensorMathMagma.cuh | 2 +- 5 files changed, 116 insertions(+), 33 deletions(-) diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 00424ab83ba0..f38860e8ef13 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -21,7 +21,7 @@ #endif #ifdef USE_MAGMA -#include +#include #endif #ifdef __HIP_PLATFORM_HCC__ diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 7e6384c44b24..3fbd693d17b1 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -16,8 +16,8 @@ #include // for USE_MAGMA #ifdef USE_MAGMA -#include #include +#include const bool use_magma_ = true; #else @@ -95,10 +95,18 @@ void magmaCholeskyBatched( magma_uplo_t uplo, magma_int_t n, scalar_t** dA_array, magma_int_t ldda, magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue); -template +template void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - scalar_t* dA, magma_int_t ldda, scalar_t* dB, magma_int_t lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + scalar_t* dA, + magma_int_t ldda, + scalar_t* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue); template void magmaTriangularSolveBatched( @@ -662,45 +670,117 @@ void magmaCholeskyBatched>( AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - double* dA, magma_int_t ldda, double* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; - magma_dtrsm(MagmaLeft, uplo, trans, diag, m, n, 1, dA, ldda, dB, lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + double* dA, + magma_int_t ldda, + double* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { + magma_dtrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + 1, + dA, + ldda, + dB, + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - float* dA, magma_int_t ldda, float* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; - magma_strsm(MagmaLeft, uplo, trans, diag, m, n, 1, dA, ldda, dB, lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + float* dA, + magma_int_t ldda, + float* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { + magma_strsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + 1, + dA, + ldda, + dB, + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve>( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - c10::complex* dA, magma_int_t ldda, c10::complex* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; + 
magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + c10::complex* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { magmaDoubleComplex alpha({1, 0}); - magma_ztrsm(MagmaLeft, uplo, trans, diag, m, n, alpha, - reinterpret_cast(dA), ldda, - reinterpret_cast(dB), lddb); + magma_ztrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + alpha, + reinterpret_cast(dA), + ldda, + reinterpret_cast(dB), + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve>( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - c10::complex* dA, magma_int_t ldda, c10::complex* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + c10::complex* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { magmaFloatComplex alpha({1, 0}); - magma_ctrsm(MagmaLeft, uplo, trans, diag, m, n, alpha, - reinterpret_cast(dA), ldda, - reinterpret_cast(dB), lddb); + magma_ctrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + alpha, + reinterpret_cast(dA), + ldda, + reinterpret_cast(dB), + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } @@ -1636,11 +1716,14 @@ AT_ERROR("triangular_solve: MAGMA library not found in " magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); magma_int_t batch_size = magma_int_cast(batchCount(A), "batchCount"); + MAGMAQueue magma_queue(b.get_device()); + // batch_size == 1 implies that: // 1. the RHS and LHS tensors have 2 dimensions, or // 2. the RHS and LHS tensors have more than 2 dimensions but all batch dimensions are 1 if (batch_size == 1) { - magmaTriangularSolve(uplo, trans, diag, n, nrhs, A_data, n, b_data, n); + magmaTriangularSolve( + uplo, trans, diag, n, nrhs, A_data, n, b_data, n, magma_queue); } else { auto A_mat_stride = matrixStride(A); auto b_mat_stride = matrixStride(b); diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index 31e6d69aa0a1..8f78e8d78003 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -6,8 +6,8 @@ #include // for USE_MAGMA #ifdef USE_MAGMA -#include #include +#include #endif namespace at { diff --git a/aten/src/THC/THCTensorMathMagma.cu b/aten/src/THC/THCTensorMathMagma.cu index ce6ca38afd2b..36316a6bf2eb 100644 --- a/aten/src/THC/THCTensorMathMagma.cu +++ b/aten/src/THC/THCTensorMathMagma.cu @@ -8,7 +8,7 @@ #include #ifdef USE_MAGMA -#include +#include #endif #ifndef DIVUP diff --git a/aten/src/THC/THCTensorMathMagma.cuh b/aten/src/THC/THCTensorMathMagma.cuh index 5ceac465c317..1fb5821afce5 100644 --- a/aten/src/THC/THCTensorMathMagma.cuh +++ b/aten/src/THC/THCTensorMathMagma.cuh @@ -2,7 +2,7 @@ #define THC_TENSOR_MATH_MAGMA_CUH #ifdef USE_MAGMA -#include +#include #endif #ifdef USE_MAGMA From 69ca5e1397f2412b0aeca6617e7613ef8dd81c2c Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 4 Jan 2021 11:19:08 -0800 Subject: [PATCH 30/89] Enforce c10-fullness for all ops (#49619) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49619 This is a minimal-change PR that enforces that all operators are c10-full by making it the default. This does not clean up any code yet, that will happen in PRs stacked on top. 
But this PR already ensures that there are no non-c10-full ops left and there will be no non-c10-full ops introduced anymore. ghstack-source-id: 119269182 Test Plan: waitforsandcastle Reviewed By: bhosmer Differential Revision: D25650198 fbshipit-source-id: efc53e884cb53193bf58a4834bf148453e689ea1 --- aten/src/ATen/native/README.md | 33 +++++++++++++++++++++------------ tools/codegen/model.py | 2 +- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 6f17ac860cf8..6e7664c1e1a5 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -335,22 +335,31 @@ set of reviewers. ### `use_c10_dispatcher` ``` -use_c10_dispatcher: 'with_codegenerated_unboxing_wrapper' -use_c10_dispatcher: 'hacky_wrapper_for_legacy_signatures' use_c10_dispatcher: 'full' +use_c10_dispatcher: 'hacky_wrapper_for_legacy_signatures' ``` This will indicate the level of integration with the c10 dispatcher. -If setting this to 'full' works for your operator, please do. -This will enabled the full templated boxing and unboxing for your operator. -Some ops use features that aren't supported by those templates yet, -and enabling `use_c10_dispatcher: full` for those will result in a compiler error. -For those, use `use_c10_dispatcher: 'with_codegenerated_unboxing_wrapper'` instead, -or just omit the argument because 'with_codegenerated_unboxing_wrapper' is the default. -`use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` is similar to `full` -but adds a wrapper around the kernel before registering it with the dispatcher -to support some legacy function signatures for kernels that we didn't migrate to -the new signatures yet. +For any new ops, please set this to 'full'. This is also the default, +so you can just omit it. +This requires the operator function signature to be aligned with the +function schema in native_functions.yaml, i.e. +- out arguments have to be in the end of the argument list instead of in the beginning +- TensorOptions are taken as separate arguments +``` + const c10::optional& dtype, + const c10::optional& layout, + const c10::optional& device, + const c10::optional& pin_memory +``` + instead of one `TensorOptions` argument +- optional tensors are taken as `const c10::optional&` instead of `Tensor` +Some of our kernels are still written in a legacy way, not doing those things, +and need an adapter to work with the dispatcher calling convention. For those, we use +`use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` to codegenerate a corresponding +adapter around them in the operator registration call. Over time, we will migrate all +those kernels to the new calling convention and hacky_wrapper will die. +Please don't use it for new operators. 
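As a concrete illustration of this convention, here is roughly how the `add.out` schema used elsewhere in this series maps onto the two signatures. This is a hedged sketch: the `_legacy`/`_full` suffixes are invented for the comparison and are not real ATen symbols, and the factory example is simplified.

```
// Schema: aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
#include <ATen/ATen.h>

// Legacy kernel signature (needs hacky_wrapper_for_legacy_signatures):
// the out argument is taken first.
at::Tensor& add_out_legacy(at::Tensor& result, const at::Tensor& self,
                           const at::Tensor& other, at::Scalar alpha);

// c10-full signature: arguments follow the schema order, so the out
// argument moves to the end of the list.
at::Tensor& add_out_full(const at::Tensor& self, const at::Tensor& other,
                         at::Scalar alpha, at::Tensor& result);

// c10-full signature of a factory-style op: TensorOptions is exploded
// into four separate optional arguments.
at::Tensor empty_like_full(const at::Tensor& self,
                           c10::optional<at::ScalarType> dtype,
                           c10::optional<at::Layout> layout,
                           c10::optional<at::Device> device,
                           c10::optional<bool> pin_memory);
```

The out-argument reordering is exactly the change applied to `add_out`, `mkldnn_add_out` and the sparse `add_out_*` kernels later in this series.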
### `manual_kernel_registration` diff --git a/tools/codegen/model.py b/tools/codegen/model.py index a007e1a76f7c..6156dd5f89fc 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -171,7 +171,7 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) if use_c10_dispatcher_s is None: - use_c10_dispatcher = UseC10Dispatcher.with_codegenerated_unboxing_wrapper + use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'full': use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'hacky_wrapper_for_legacy_signatures': From f0945537af78d9f79b93f95757bc2b23aee2f127 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 4 Jan 2021 11:27:05 -0800 Subject: [PATCH 31/89] .circleci: Ignore unbound variables for conda (#50053) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50053 For some reason conda likes to re-activate the conda environment when attempting this install which means that a deactivate is run and some variables might not exist when that happens, namely CONDA_MKL_INTERFACE_LAYER_BACKUP from libblas so let's just ignore unbound variables when it comes to the conda installation commands Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: samestep Differential Revision: D25760737 Pulled By: seemethere fbshipit-source-id: 9e7720eb8a4f8028dbaa7bcfc304e5c1ca73ad08 --- .circleci/scripts/binary_linux_test.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index 0b2e60b48f8e..26cc77c8ff9c 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -51,7 +51,14 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then else cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}" fi - retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" + ( + # For some reason conda likes to re-activate the conda environment when attempting this install + # which means that a deactivate is run and some variables might not exist when that happens, + # namely CONDA_MKL_INTERFACE_LAYER_BACKUP from libblas so let's just ignore unbound variables when + # it comes to the conda installation commands + set +u + retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" + ) fi elif [[ "$PACKAGE_TYPE" != libtorch ]]; then pip install "\$pkg" From 8e20594b38837d312070001ad2e62218a9f8163f Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 4 Jan 2021 11:51:28 -0800 Subject: [PATCH 32/89] Construct CppSignatureGroup from NativeFunction (#49245) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49245 This will make it easier to implement the POC in https://github.com/peterbell10/pytorch/commit/d534f7d4c555a37fd178c143098b8537a5a05d61 see also https://github.com/pytorch/pytorch/pull/45666 Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: smessmer Differential Revision: D25594005 Pulled By: ezyang fbshipit-source-id: e458d3dc3a765ec77425761b9b17f23769cecf9e --- tools/autograd/gen_python_functions.py | 2 +- tools/autograd/gen_trace_type.py | 4 ++-- tools/autograd/gen_variable_factories.py | 2 +- tools/autograd/gen_variable_type.py | 2 +- tools/autograd/load_derivatives.py | 2 +- tools/codegen/api/python.py | 2 +- tools/codegen/api/types.py | 3 ++- tools/codegen/gen.py | 6 +++--- 8 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 1f61ce3dfa20..0450983a8e41 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -230,7 +230,7 @@ def signature_original(f: NativeFunction) -> str: opname += '_out' if f.func.name.name.inplace and pyi: opname += '_' - args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() + args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments() # Simply ignore TensorOptionsArguments as it does not exist in deprecated.yaml. types = ', '.join(argument_type_str(a.argument.type) for a in args if isinstance(a.argument, Argument)) diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 78c460843d94..31eb8aacf296 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -121,7 +121,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen if f.use_c10_dispatcher.dispatcher_uses_new_style(): args = list(f.func.schema_order_arguments()) else: - sig_group = CppSignatureGroup.from_schema(f.func, method=False) + sig_group = CppSignatureGroup.from_native_function(f, method=False) args = [cpp_args.argument for cpp_args in sig_group.signature.arguments() if not isinstance(cpp_args.argument, SelfArgument)] @@ -380,7 +380,7 @@ def method_definition(f: NativeFunction) -> Optional[str]: for a in f.func.schema_order_arguments() ) else: - sig_group = CppSignatureGroup.from_schema(f.func, method=False) + sig_group = CppSignatureGroup.from_native_function(f, method=False) formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) return METHOD_DEFINITION.substitute( diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index a8c07aef4181..f8ab30dc4580 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -48,7 +48,7 @@ def process_function(f: NativeFunction) -> Optional[str]: if Variant.function not in f.variants or not is_factory: return None - sig = CppSignatureGroup.from_schema(f.func, method=False).signature + sig = CppSignatureGroup.from_native_function(f, method=False).signature formals: List[str] = [] exprs: List[str] = [] requires_grad = 'false' diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index f97fb55ab012..7346bfeb34c7 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -974,7 +974,7 @@ def get_decl_signature(declaration: Dict[Any, Any], use_base_variant: bool = Fal @with_native_function def get_func_signature(f: NativeFunction) -> str: - args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() + args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments() types = ', '.join(python.argument_type_str(a.argument.type, simple_type=True) if 
isinstance(a.argument, Argument) else 'TensorOptions' for a in args) diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index bc2de6bb14d7..d5c742bb6fa5 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -62,7 +62,7 @@ def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Seque @with_native_function def cpp_arguments(f: NativeFunction) -> Sequence[Binding]: - return CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() + return CppSignatureGroup.from_native_function(f, method=False).signature.arguments() def create_derivative(f: NativeFunction, formula: str, var_names: Tuple[str, ...]) -> Derivative: arguments = cpp_arguments(f) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index bdb31d4d8616..bc5cbb440b98 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -566,7 +566,7 @@ class DispatchLambdaArgumentExprs: # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: - return CppSignatureGroup.from_schema(f.func, method=method).signature + return CppSignatureGroup.from_native_function(f, method=method).signature def has_tensor_options(f: NativeFunction) -> bool: return f.func.arguments.tensor_options is not None diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 5532c35b4ed2..6036454b850b 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -168,7 +168,8 @@ class CppSignatureGroup: faithful_signature: Optional[CppSignature] @staticmethod - def from_schema(func: FunctionSchema, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': + def from_native_function(f: NativeFunction, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': + func = f.func faithful_signature: Optional[CppSignature] if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: faithful_signature = CppSignature(func=func, faithful=True, method=method, fallback_binding=fallback_binding) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 37f4ea7cc174..bb261c314f24 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -596,7 +596,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=f.manual_cpp_binding) + sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"TORCH_API {sig_group.signature.decl()};\n" @@ -650,7 +650,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=True, fallback_binding=f.manual_cpp_binding) + sig_group = CppSignatureGroup.from_native_function(f, method=True, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"{sig_group.signature.decl()} const;\n" @@ -1032,7 +1032,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) out_arg_set = set(a.name for a in f.func.arguments.out) - sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=False) + sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) cpp_args = sig_group.signature.arguments() arguments = [ 
compute_cpp_argument_yaml( From 7202c0ec50caea003fcb5749066f23a0a8561402 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 4 Jan 2021 11:51:28 -0800 Subject: [PATCH 33/89] Tighten up error checking on manual_kernel_registration (#49341) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49341 I noticed that #49097 was using manual_kernel_registration incorrectly, so this diff tightens up the testing so that: 1. We don't generate useless wrapper functions when manual_kernel_registration is on (it's not going to be registered, so it does nothing). 2. manual_kernel_registration shouldn't affect generation of functions in Functions.h; if you need to stop bindings, use manual_cpp_binding 3. Structured and manual_kernel_registration are a hard error 4. We raise an error if you set dispatch and manual_kernel_registration at the same time. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: smessmer Differential Revision: D25594003 Pulled By: ezyang fbshipit-source-id: 655b10e9befdfd8bc95f1631b2f48f995a31a59a --- tools/codegen/gen.py | 5 +++-- tools/codegen/model.py | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index bb261c314f24..a04f3001c3da 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -388,6 +388,7 @@ def gen_structured(self, g: StructuredNativeFunctions) -> List[str]: @with_native_function def gen_one(f: NativeFunction) -> Optional[str]: assert self.target is not Target.DECLARATION + assert not f.manual_kernel_registration # TODO: put this into StructuredNativeFunctions itself functional_func = g.out.func.signature() @@ -487,6 +488,8 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: if self.dispatch_key not in f.dispatch: return None + if f.manual_kernel_registration: + return None op_name = f"aten::{f.func.name}" if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): @@ -589,8 +592,6 @@ class ComputeFunction: @method_with_native_function def __call__(self, f: NativeFunction) -> Optional[str]: - if f.manual_kernel_registration: - return None if Variant.function not in f.variants: return None diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 6156dd5f89fc..4d0fca8eabe6 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -222,6 +222,9 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert raw_dispatch is None or isinstance(raw_dispatch, dict), e dispatch: Dict[str, str] = {} if raw_dispatch is not None: + assert not manual_kernel_registration, \ + "cannot specify both manual_kernel_registration and dispatch; with " \ + "manual registration, dispatch has no effect!" for ks, v in raw_dispatch.items(): if ks == '__line__': continue # not worth tracking line numbers for dispatch entries From 8eee8460f821b7acb7a060df7b3b557218128fd9 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 4 Jan 2021 11:51:28 -0800 Subject: [PATCH 34/89] codegen: Resolve overload ambiguities created by defaulted arguments (#49348) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49348 This is a redux of #45666 post refactor, based off of https://github.com/peterbell10/pytorch/commit/d534f7d4c555a37fd178c143098b8537a5a05d61 Credit goes to peterbell10 for the implementation. Fixes #43945. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: smessmer Differential Revision: D25594004 Pulled By: ezyang fbshipit-source-id: c8eb876bb3348308d6dc8ba7bf091a2a3389450f --- aten/src/ATen/native/TestOps.cpp | 18 +++++++++++++ aten/src/ATen/native/native_functions.yaml | 11 ++++++++ test/cpp/api/misc.cpp | 8 ++++++ tools/codegen/api/cpp.py | 30 ++++++++++++++++------ tools/codegen/api/types.py | 29 ++++++++++++++++++--- tools/codegen/gen.py | 4 ++- tools/codegen/model.py | 13 ++++++++++ 7 files changed, 100 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index 0ebdce6795aa..7a6f7c6e8e05 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace at { namespace native { @@ -50,5 +51,22 @@ Tensor _test_string_default(const Tensor& dummy, std::string a, std::string b) { return dummy; } +// Test that overloads with ambiguity created by defaulted parameters work. +// The operator declared first should have priority always + +// Overload a +Tensor _test_ambiguous_defaults(const Tensor& dummy, int64_t a, int64_t b) { + TORCH_CHECK(a == 1); + TORCH_CHECK(b == 1); + return c10::scalar_to_tensor(1); +} + +// Overload b +Tensor _test_ambiguous_defaults(const Tensor& dummy, int64_t a, std::string b) { + TORCH_CHECK(a == 2); + TORCH_CHECK(b == "2"); + return c10::scalar_to_tensor(2); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 6b0aaa8f4d9b..cf1658af03f5 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -10229,3 +10229,14 @@ - func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor use_c10_dispatcher: full python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor + use_c10_dispatcher: full + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.b(Tensor dummy, int a=2, str b="2") -> Tensor + cpp_no_default_args: ['a', 'b'] + use_c10_dispatcher: full + python_module: nn diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index 160075d0d268..a8d6320e9533 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -82,3 +82,11 @@ TEST_F(AutogradTest, CanPassCustomGradientInputs) { z.sum().backward(torch::ones({}) * 2); ASSERT_TRUE(x.grad().allclose(y * 2)); } + +TEST(UtilsTest, AmbiguousOperatorDefaults) { + auto tmp = at::empty({}, at::kCPU); + at::_test_ambiguous_defaults(tmp); + at::_test_ambiguous_defaults(tmp, 1); + at::_test_ambiguous_defaults(tmp, 1, 1); + at::_test_ambiguous_defaults(tmp, 2, "2"); +} diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index c27a6768300a..29a29e215f4f 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,7 +1,7 @@ from tools.codegen.model import * from tools.codegen.api.types import * import tools.codegen.local as local -from typing import Optional, Sequence, Union, List +from typing import Optional, Sequence, Union, List, Set # This file describes the translation of JIT schema to the public C++ # API, which is what people use when they call functions like at::add. 
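To make the ambiguity concrete, here is a small self-contained sketch (plain C++ with invented names, not the generated ATen bindings) of why two overloads whose defaulted tails overlap cannot both keep their defaults:

```
#include <string>

struct Tensor {};  // stand-in for at::Tensor

// Overload a: defaults kept in the C++ binding (int a=1, int b=1).
Tensor test_defaults(const Tensor& dummy, long a = 1, long b = 1) { return dummy; }

// Overload b: schema defaults are int a=2, str b="2", but with
// cpp_no_default_args: ['a', 'b'] the C++ binding drops them.
Tensor test_defaults(const Tensor& dummy, long a, std::string b) { return dummy; }

int main() {
  Tensor t;
  test_defaults(t);          // only overload a is viable, so the first-declared overload wins
  test_defaults(t, 1);       // likewise resolves to overload a
  test_defaults(t, 2, "2");  // only overload b accepts a string, so it resolves to overload b
  // If overload b kept its defaults, test_defaults(t) and test_defaults(t, 1)
  // would be ambiguous and fail to compile.
  return 0;
}
```

Dropping the defaults from the second overload means any call that omits arguments can only match the first one, which is what the comment in `TestOps.cpp` above means by the operator declared first having priority.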
@@ -239,26 +239,37 @@ def default_expr(d: str, t: Type) -> str: def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument], - *, method: bool = False, faithful: bool = False, - has_tensor_options: bool = False + *, cpp_no_default_args: Set[str], method: bool, faithful: bool, + has_tensor_options: bool ) -> List[Binding]: + def sub_argument(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> List[Binding]: + return argument( + a, cpp_no_default_args=cpp_no_default_args, method=method, faithful=faithful, + has_tensor_options=has_tensor_options) + if isinstance(a, Argument): binds: ArgName if a.name == "memory_format" and has_tensor_options: binds = SpecialArgName.possibly_redundant_memory_format else: binds = a.name + default: Optional[str] = None + if a.name not in cpp_no_default_args and a.default is not None: + default = default_expr(a.default, a.type) return [Binding( ctype=argument_type(a, binds=binds), name=a.name, - default=default_expr(a.default, a.type) if a.default is not None else None, + default=default, argument=a, )] elif isinstance(a, TensorOptionsArguments): if faithful: - return argument(a.dtype) + argument(a.layout) + argument(a.device) + argument(a.pin_memory) + return sub_argument(a.dtype) + sub_argument(a.layout) + \ + sub_argument(a.device) + sub_argument(a.pin_memory) else: default = None + # Enforced by NativeFunction.__post_init__ + assert 'options' not in cpp_no_default_args if all(x.default == "None" for x in a.all()): default = '{}' elif a.dtype.default == "long": @@ -274,13 +285,13 @@ def argument( # Caller is responsible for installing implicit this in context! return [] else: - return argument(a.argument) + return sub_argument(a.argument) else: assert_never(a) def arguments( arguments: Arguments, - *, faithful: bool, method: bool + *, faithful: bool, method: bool, cpp_no_default_args: Set[str] ) -> List[Binding]: args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] if faithful: @@ -291,5 +302,8 @@ def arguments( args.extend(arguments.non_out) return [ r.no_default() if faithful else r for a in args - for r in argument(a, faithful=faithful, method=method, has_tensor_options=arguments.tensor_options is not None) + for r in argument( + a, faithful=faithful, method=method, + has_tensor_options=arguments.tensor_options is not None, + cpp_no_default_args=cpp_no_default_args) ] diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 6036454b850b..ea03a1799cfb 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -1,6 +1,6 @@ from tools.codegen.model import * from dataclasses import dataclass -from typing import Optional, Union, Sequence, TypeVar, List +from typing import Optional, Union, Sequence, TypeVar, List, Set from enum import Enum _T = TypeVar('_T') @@ -128,13 +128,22 @@ class CppSignature: # (i.e. with a potential TensorOptions argument and out arguments in the front) faithful: bool + # The set of C++ arguments which should not have defaults applied to them + cpp_no_default_args: Set[str] + + # Is this a fallback C++ binding? Fallback bindings are enabled by + # manual_cpp_binding: True and are alternate, non-public API that + # lets manual C++ binding implementors access the binding that would + # have been automatically generated fallback_binding: bool = False # Return the unpacked argument structure of this signature, # discarding information about which arguments are semantically # related to each other. 
def arguments(self) -> Sequence[Binding]: - return cpp.arguments(self.func.arguments, faithful=self.faithful, method=self.method) + return cpp.arguments( + self.func.arguments, faithful=self.faithful, + method=self.method, cpp_no_default_args=self.cpp_no_default_args) def name(self) -> str: n = cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) @@ -172,10 +181,22 @@ def from_native_function(f: NativeFunction, *, method: bool, fallback_binding: b func = f.func faithful_signature: Optional[CppSignature] if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: - faithful_signature = CppSignature(func=func, faithful=True, method=method, fallback_binding=fallback_binding) + faithful_signature = CppSignature( + func=func, + faithful=True, + method=method, + fallback_binding=fallback_binding, + cpp_no_default_args=f.cpp_no_default_args + ) else: faithful_signature = None - signature = CppSignature(func=func, faithful=False, method=method, fallback_binding=fallback_binding) + signature = CppSignature( + func=func, + faithful=False, + method=method, + fallback_binding=fallback_binding, + cpp_no_default_args=f.cpp_no_default_args + ) return CppSignatureGroup( func=func, signature=signature, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index a04f3001c3da..a61a74cb3198 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -1053,7 +1053,9 @@ def compute_declaration_yaml(f: NativeFunction) -> object: cpp_schema_order_types = [ # NB: method here doesn't matter - r.type for a in schema_order_jit_arguments for r in cpp.argument(a, method=False) + r.type for a in schema_order_jit_arguments + for r in cpp.argument( + a, method=False, cpp_no_default_args=set(), faithful=False, has_tensor_options=False) ] cpp_returns = cpp.returns_type(f.func.returns) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 4d0fca8eabe6..6d6ddba46de9 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -137,6 +137,10 @@ class NativeFunction: # changes the semantics of set_output to call the parent class. structured_inherits: Optional[str] + # Argument names whose default should be excluded from the C++ interface. + # Intended for resolving overload ambiguities between signatures. 
+ cpp_no_default_args: Set[str] + # Note [Abstract ATen methods] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # An abstract ATen method is one whose dispatch differs between @@ -169,6 +173,10 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert isinstance(funcs, str), f'not a str: {funcs}' func = FunctionSchema.parse(funcs) + cpp_no_default_args_list = e.pop('cpp_no_default_args', []) + assert isinstance(cpp_no_default_args_list, list) + cpp_no_default_args = set(cpp_no_default_args_list) + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) if use_c10_dispatcher_s is None: use_c10_dispatcher = UseC10Dispatcher.full @@ -258,6 +266,7 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': dispatch=dispatch, device_guard=device_guard, loc=loc, + cpp_no_default_args=cpp_no_default_args, ) def validate_unstructured(self) -> None: @@ -293,6 +302,10 @@ def __post_init__(self) -> None: # happen assert not (self.structured and self.structured_delegate), \ "Cannot have both structured and structured_delegate on function" + defaulted_arguments = {a.name for a in self.func.schema_order_arguments() + if a.default is not None} + invalid_args = set.difference(self.cpp_no_default_args, defaulted_arguments) + assert len(invalid_args) == 0, f'Invalid cpp_no_default_args: {invalid_args}' SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) From 6c833efd65599acfe61f2709c0ee5ce750ae0ee3 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 4 Jan 2021 11:51:28 -0800 Subject: [PATCH 35/89] Move default or no default logic into native.argument (#49489) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49489 Previously, it was done at a use site, but that meant other use sites don't get the right logic. Pushing it in makes sure everyone gets it. I also fixed one case of confusion where defn() was used to define a decl(). If you want to define a declaration with no defaults, say no_default().decl() which is more direct and will give us code reviewers a clue if you should have pushed this logic in. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: smessmer Differential Revision: D25595407 Pulled By: ezyang fbshipit-source-id: 89c664f0ed4d95699794a0d3123d54d0f7e4cba4 --- tools/codegen/api/native.py | 42 +++++++++++++++++++++++++------------ tools/codegen/gen.py | 14 ++----------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 9781c46884e7..936500b560db 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -4,7 +4,7 @@ import tools.codegen.api.cpp as cpp from tools.codegen import local -from typing import Union, Sequence, List +from typing import Union, Sequence, List, Optional # This file describes the translation of JIT schema to the native functions API. # This looks a lot like the C++ API (which makes historical sense, because the @@ -43,26 +43,36 @@ def returns_type(rs: Sequence[Return]) -> str: def argument_type(a: Argument, *, binds: ArgName) -> CType: return argumenttype_type(a.type, mutable=a.is_write, binds=binds) -def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[Binding]: +def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out: bool) -> List[Binding]: + # Ideally, we NEVER default native functions. However, there are a number + # of functions that call native:: directly and rely on the defaulting + # existing. 
So for BC, we generate defaults for non-out variants (but not + # for out variants, where it is impossible to generate an appropriate + # default) + should_default = not is_out or local.use_c10_dispatcher() is not UseC10Dispatcher.full if isinstance(a, Argument): + default: Optional[str] = None + if should_default and a.default is not None: + default = cpp.default_expr(a.default, a.type) return [Binding( ctype=argument_type(a, binds=a.name), name=a.name, - default=cpp.default_expr(a.default, a.type) if a.default is not None else None, + default=default, argument=a, )] elif isinstance(a, SelfArgument): # Erase SelfArgument from the distinction - return argument(a.argument) + return argument(a.argument, is_out=is_out) elif isinstance(a, TensorOptionsArguments): if local.use_c10_dispatcher() in [UseC10Dispatcher.hacky_wrapper_for_legacy_signatures, UseC10Dispatcher.with_codegenerated_unboxing_wrapper]: # TODO: expunge this logic entirely default = None - if all(x.default == "None" for x in a.all()): - default = '{}' - elif a.dtype.default == "long": - default = 'at::kLong' # TODO: this is wrong + if should_default: + if all(x.default == "None" for x in a.all()): + default = '{}' + elif a.dtype.default == "long": + default = 'at::kLong' # TODO: this is wrong return [Binding( ctype=ConstRefCType(BaseCType('TensorOptions', 'options')), name='options', @@ -71,29 +81,35 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[B )] else: assert local.use_c10_dispatcher() == UseC10Dispatcher.full + default = None + if should_default: + default = '{}' + # TODO: Not sure why the arguments assigned here are for + # TensorOptionsArguments and not the constituent pieces. It seems + # to matter return [ Binding( ctype=OptionalCType(BaseCType('ScalarType', 'dtype')), name='dtype', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('Layout', 'layout')), name='layout', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('Device', 'device')), name='device', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('bool', 'pin_memory')), name='pin_memory', - default='{}', + default=default, argument=a, )] else: @@ -107,4 +123,4 @@ def arguments(func: FunctionSchema) -> List[Binding]: else: args.extend(func.arguments.out) args.extend(func.arguments.non_out) - return [r for arg in args for r in argument(arg)] + return [r for arg in args for r in argument(arg, is_out=func.is_out_fn())] diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index a61a74cb3198..6164a4d64713 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -730,17 +730,7 @@ def compute_native_function_declaration(g: Union[StructuredNativeFunctions, Nati if is_structured_dispatch_key(k): continue seen.add(n) - if f.func.is_out_fn() and local.use_c10_dispatcher() is UseC10Dispatcher.full: - # out overloads don't get default arguments because - # defaulted arguments would be before the out argument - # in the argument list and that doesn't work. 
- # TODO We should consider if we just want to remove - # default arguments from all at::native functions - # but that would be a larger change because we need - # to change a lot of call sites - args_str = ', '.join(a.defn() for a in args) - else: - args_str = ', '.join(a.decl() for a in args) + args_str = ', '.join(a.decl() for a in args) rs.append(f"TORCH_API {returns_type} {n}({args_str});") return rs @@ -1094,7 +1084,7 @@ def compute_registration_declarations(f: NativeFunction) -> str: name = dispatcher.name(f.func) returns_type = dispatcher.returns_type(f.func.returns) args = dispatcher.arguments(f.func) - args_str = ', '.join(a.defn() for a in args) + args_str = ', '.join(a.no_default().decl() for a in args) comment_data : Dict[str, str] = { 'schema': f'aten::{f.func}', # TODO: What exactly is the semantics of the 'dispatch' field? From 0216366f0d5df3a14daaf0ec394740589d7d0866 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 4 Jan 2021 11:51:28 -0800 Subject: [PATCH 36/89] Make use_c10_dispatcher: full mandatory for structured kernels (#49490) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49490 No reason to let people to do the legacy thing for the brand new kernel. This simplifies the codegen. I have to port the two structured kernels to this new format. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: smessmer Differential Revision: D25595406 Pulled By: ezyang fbshipit-source-id: b5931873379afdd0f3b00a012e0066af05de0a69 --- aten/src/ATen/native/BinaryOps.cpp | 2 +- aten/src/ATen/native/UpSampleNearest1d.cpp | 10 ++++++---- .../src/ATen/native/cuda/UpSampleNearest1d.cu | 10 ++++++---- aten/src/ATen/native/mkldnn/BinaryOps.cpp | 12 +++++++----- aten/src/ATen/native/native_functions.yaml | 6 +++--- .../ATen/native/sparse/SparseTensorMath.cpp | 2 +- .../sparse/cuda/SparseCUDATensorMath.cu | 2 +- tools/codegen/gen.py | 19 ++++--------------- tools/codegen/model.py | 3 +++ torch/csrc/jit/runtime/static/ops.cpp | 2 +- 10 files changed, 33 insertions(+), 35 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 9103eafb1f12..fdd8057f9afd 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -71,7 +71,7 @@ static Tensor wrapped_scalar_tensor(Scalar scalar) { } TORCH_IMPL_FUNC(add_out) ( - Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha + const Tensor& self, const Tensor& other, Scalar alpha, Tensor& result ) { add_stub(device_type(), *this, alpha); TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index b9dd52dffa5d..6478bbb58eaf 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -66,19 +66,21 @@ TORCH_META_FUNC(upsample_nearest1d_backward) ( namespace native { TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( - Tensor& output, const Tensor& input, IntArrayRef output_size, - c10::optional scales) { + c10::optional scales, + Tensor& output +) { upsample_nearest1d_kernel(kCPU, output, input, scales); } TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cpu) ( - Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + c10::optional scales, + Tensor& grad_input +) { grad_input.zero_(); upsample_nearest1d_backward_kernel(kCPU, grad_input, grad_output, scales); } diff --git 
a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 99488108ac26..b269bd303e76 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -197,19 +197,21 @@ static void upsample_nearest1d_backward_out_cuda_template( } // namespace TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( - Tensor& output, const Tensor& input, IntArrayRef output_size, - c10::optional scales) { + c10::optional scales, + Tensor& output +) { upsample_nearest1d_out_cuda_template(output, input, output_size, scales); } TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cuda) ( - Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + c10::optional scales, + Tensor& grad_input +) { upsample_nearest1d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales); } diff --git a/aten/src/ATen/native/mkldnn/BinaryOps.cpp b/aten/src/ATen/native/mkldnn/BinaryOps.cpp index 029b1d225d14..3358079f4df5 100644 --- a/aten/src/ATen/native/mkldnn/BinaryOps.cpp +++ b/aten/src/ATen/native/mkldnn/BinaryOps.cpp @@ -8,10 +8,11 @@ namespace at { namespace native { Tensor& mkldnn_add_out( - Tensor& result, const Tensor& self, const Tensor& other, - Scalar alpha) { + Scalar alpha, + Tensor& result + ) { TORCH_CHECK(false, "mkldnn_add_out: ATen not compiled with MKLDNN support"); } @@ -46,10 +47,11 @@ namespace at { namespace native { Tensor& mkldnn_add_out( - Tensor& result, const Tensor& self, const Tensor& other, - Scalar alpha) { + Scalar alpha, + Tensor& result + ) { ideep::tensor& x = itensor_from_mkldnn(self); ideep::tensor& y = itensor_from_mkldnn(other); @@ -73,7 +75,7 @@ Tensor mkldnn_add(const Tensor& self, const Tensor& other, Scalar alpha) { } Tensor& mkldnn_add_(Tensor& self, const Tensor& other, Scalar alpha) { - return native::mkldnn_add_out(self, self, other, alpha); + return native::mkldnn_add_out(self, other, alpha, self); } Tensor& mkldnn_mul_out(Tensor& result, const Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index cf1658af03f5..da5f2d10cfd3 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -412,7 +412,7 @@ MkldnnCPU: mkldnn_add_ - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full structured: True structured_inherits: TensorIteratorBase dispatch: @@ -9439,7 +9439,7 @@ CUDA: upsample_trilinear3d_backward_cuda - func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full python_module: nn structured: True dispatch: @@ -9452,7 +9452,7 @@ structured_delegate: upsample_nearest1d.out - func: upsample_nearest1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
- use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full python_module: nn structured: True dispatch: diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 60df74061c7a..9bb679beb3d0 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -544,7 +544,7 @@ SparseTensor& add_out_sparse_non_contiguous(SparseTensor& r, const SparseTensor& Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTensor& sparse_, Scalar value); -SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { +SparseTensor& add_out_sparse_cpu(const SparseTensor& t, const SparseTensor& src, Scalar value, SparseTensor& r) { if (!t.is_sparse()) { return add_out_dense_sparse_cpu(r, t, src, value); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 753ea9fa4937..c8366f71618e 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -399,7 +399,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT Tensor& add_out_dense_sparse_cuda(Tensor& r, const Tensor& dense, const SparseTensor& sparse_, Scalar value); -SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const SparseTensor& src, Scalar value) { +SparseTensor& add_out_sparse_cuda(const SparseTensor& t, const SparseTensor& src, Scalar value, SparseTensor& r_) { if (!t.is_sparse()) { return add_out_dense_sparse_cuda(r_, t, src, value); } diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 6164a4d64713..782d8b919e7e 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -432,7 +432,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: if self.dispatch_key == 'Meta': impl_call = "" else: - impl_call = f"op.impl({out_expr}, {functional_exprs});" + impl_call = f"op.impl({functional_exprs}, {out_expr});" # For an overview of what this template code looks like, see # https://github.com/pytorch/rfcs/pull/9 @@ -455,19 +455,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: elif self.target is Target.REGISTRATION: dispatcher_sig = DispatcherSignature.from_schema(f.func) - if local.use_c10_dispatcher() is UseC10Dispatcher.full: - payload = f"TORCH_FN({sig.name()})" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: - payload = f""" -c10::impl::hacky_wrapper_for_legacy_signatures< - {dispatcher_sig.type()}, - {len(f.func.arguments.out)} ->(TORCH_FN({sig.name()})) -""" - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - payload = f"torch::CppFunction::makeUnboxedOnly(&{sig.name()})" - return f'm.impl("{f.func.name}", {payload});' + assert local.use_c10_dispatcher() is UseC10Dispatcher.full + return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));' else: assert_never(self.target) # Silence mypy's "Missing return statement" error @@ -760,7 +749,7 @@ def compute_meta_function_declaration(g: StructuredNativeFunctions) -> str: sig = g.signature() name = meta.name(g) args = native.arguments(sig) - args_str = ', '.join(a.defn() for a in args) + args_str = ', '.join(a.decl() for a in args) parent_class = g.out.structured_inherits if parent_class is None: parent_class = "at::impl::MetaBase" diff --git a/tools/codegen/model.py 
b/tools/codegen/model.py index 6d6ddba46de9..ea667a0922cf 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -306,6 +306,9 @@ def __post_init__(self) -> None: if a.default is not None} invalid_args = set.difference(self.cpp_no_default_args, defaulted_arguments) assert len(invalid_args) == 0, f'Invalid cpp_no_default_args: {invalid_args}' + if self.structured or self.structured_delegate: + assert self.use_c10_dispatcher is UseC10Dispatcher.full, \ + "Structured kernels MUST be use_c10_dispatcher: full; port your argument order" SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 11fb5dae2d6c..5c118f513565 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -88,7 +88,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { auto out_t = p_node->Output(0, reg).toTensor(); static_add op{out_t}; op.meta(in0_t, in1_t, in2_s); - op.impl(out_t, in0_t, in1_t, in2_s); + op.impl(in0_t, in1_t, in2_s, out_t); }; }); From da790eca696398f89079c128db553ef34a58baa7 Mon Sep 17 00:00:00 2001 From: Erjia Guan Date: Mon, 4 Jan 2021 11:54:34 -0800 Subject: [PATCH 37/89] Add trace batching forward/backward rule (#49979) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49979 Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D25734379 Pulled By: ejguan fbshipit-source-id: 8f9346afaf324e7ab17bafd6ecc97eed8442fd38 --- aten/src/ATen/BatchingRegistrations.cpp | 21 +++++++++++++++++++++ test/test_vmap.py | 14 ++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 419c454257d8..73f8216bd4b5 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -287,6 +287,25 @@ Tensor squeeze_dim_batching_rule(const Tensor& self, int64_t dim) { return self_physical.getPhysicalToLogicalMap().apply(result); } +Tensor trace_batching_rule(const Tensor& self) { + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + // Batched Diagonal View + auto self_diag = at::diagonal(self_physical.tensor(), /*offset*/0, /*dim1*/-2, /*dim2*/-1); + auto result = at::sum(self_diag, -1); + return self_physical.getPhysicalToLogicalMap().apply(result); +} + +Tensor trace_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes) { + auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad); + auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options()); + // Batched Diagonal View + auto grad_input_diag = at::diagonal(grad_input, /*offset*/0, /*dim1*/-2, /*dim2*/-1); + // Append a dimension of size one to the grad output + auto grad_physical_tensor = grad_physical.tensor().unsqueeze(-1); + grad_input_diag.copy_(grad_physical_tensor); + return grad_physical.getPhysicalToLogicalMap().apply(grad_input); +} + Tensor transpose_int_batching_rule(const Tensor& self, int64_t dim0, int64_t dim1) { // PyTorch has a special case where scalar_tensor.transpose(dim0, dim1) works // for dim0, dim1 in {0, -1} and returns the scalar tensor. 
If the following happens: @@ -1029,6 +1048,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("squeeze", squeeze_batching_rule); m.impl("squeeze.dim", squeeze_dim_batching_rule); m.impl("t", native::t); // composite wrt autograd + m.impl("trace", trace_batching_rule); m.impl("transpose.int", transpose_int_batching_rule); m.impl("unbind.int", unbind_batching_rule); m.impl("unfold", unfold_batching_rule); @@ -1150,6 +1170,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { // backward operators m.impl("select_backward", select_backward_batching_rule); m.impl("slice_backward", slice_backward_batching_rule); + m.impl("trace_backward", trace_backward_batching_rule); m.impl("diagonal_backward", diagonal_backward_batching_rule); // Tensor.new_* operators diff --git a/test/test_vmap.py b/test/test_vmap.py index cc25dff3b306..e7943cf2940b 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -1907,6 +1907,16 @@ def test_split(self): test(vmap(vmap(lambda t: op(t, [4] * 8 + [8] * 4, 1), in_dims=2)), (torch.rand(B1, 2, B0, 64, B2),), in_dims=2) + def test_trace(self): + op = torch.trace + test = self._vmap_test + B0, B1, B2 = 7, 11, 13 + + test(op, (torch.rand(B0, 2, 5),)) + test(op, (torch.rand(2, B0, 5),), in_dims=1) + test(vmap(op), (torch.rand(B1, 2, B0, 5),), in_dims=2) + test(vmap(vmap(op, in_dims=2)), (torch.rand(B1, 2, B0, 5, B2),), in_dims=2) + def test_transpose(self): op = torch.transpose test = self._vmap_view_test @@ -2313,6 +2323,10 @@ def test_slice(self, device): self._batched_grad_test(lambda x: x[:, 1:3], (x,)) self._batched_grad_test(lambda x: x[..., 1:3], (x,)) + def test_trace(self, device): + x = torch.randn(2, 3, device=device, requires_grad=True) + self._batched_grad_test(Tensor.trace, (x,)) + @allowVmapFallbackUsage def test_symeig(self, device): def op(x): From 483670ff0f94a19474e678707f8221660320c8f3 Mon Sep 17 00:00:00 2001 From: Kaiwen Wang Date: Mon, 4 Jan 2021 12:22:12 -0800 Subject: [PATCH 38/89] [pytorch] add threshold_backward batching for vmap (#49881) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49881 title Test Plan: pytest test/test_vmap.py -v -k "BatchedGrad" Reviewed By: zou3519 Differential Revision: D25711289 fbshipit-source-id: f1856193249fda70da41e36e15bc26ea7966b510 --- aten/src/ATen/BatchingRegistrations.cpp | 7 +++++++ test/test_vmap.py | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 73f8216bd4b5..9bdec2dce77e 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -1109,6 +1109,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { #undef TO_BATCHING_RULE m.impl("clone", clone_batching_rule); + using TensorTensorScalarType = Tensor (*)(const Tensor&, const Tensor&, Scalar); using TensorTensorType = Tensor (*)(const Tensor&, const Tensor&); using TensorScalarType = Tensor (*)(const Tensor&, Scalar); @@ -1135,6 +1136,12 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("pow.Scalar", pow_scalar_Tensor_batching_rule); m.impl("sigmoid_backward", binary_pointwise_batching_rule); + m.impl( + "threshold_backward", + binary_pointwise_batching_rule< + TensorTensorScalarType, + at::threshold_backward, + Scalar>); // for at::result_type, call the native::result_type implementation. 
// We don't have to do anything special because native::result_type operates diff --git a/test/test_vmap.py b/test/test_vmap.py index e7943cf2940b..b722fc126b24 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -2336,6 +2336,11 @@ def op(x): self._batched_grad_test(op, (x,), {}) self._batched_grad_grad_test(op, (x,), {}) + def test_threshold(self, device): + x = torch.randn(2, 3, device=device, requires_grad=True) + self._batched_grad_test(lambda x: F.threshold(x, 0.5, 0.0), (x,)) + + @allowVmapFallbackUsage def test_inplace_view(self, device): leaf = torch.randn(4, 5, requires_grad=True) From 74dcb6d363824a35060bc0546854530daa92c8fc Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 4 Jan 2021 12:39:49 -0800 Subject: [PATCH 39/89] torch.xlogy: Use wrapped_scalar_tensor / gpu_with_scalars to speed up GPU kernel. (#49926) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49926 While investigating https://github.com/pytorch/pytorch/issues/49758, I changed the xlogy kernel to use the recommended wrapped_scaler_tensor pattern instead of moving the scalar to the GPU as a tensor. While this doesn't avoid a synchronization (there is no synchronization in the move, as its done via fill), this does significantly speed up the GPU kernel (almost ~50%, benchmark in PR comments). From looking at the nvprof output, it looks like this code path avoids broadcasting. Aside: this seems unnecessary, as there is nothing special from the point-of-view of broadcasting whether the Tensor is ()-sized or marked as a wrapped_scalar. Still, this is a useful change to make as we avoid extra kernel launches and dispatches to create and fill the tensor. Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D25724215 Pulled By: gchanan fbshipit-source-id: 4adcd5d8b3297502672ffeafc77e8af80592f460 --- aten/src/ATen/native/BinaryOps.cpp | 10 +++++----- .../ATen/native/cuda/BinaryMiscOpsKernels.cu | 2 +- test/test_binary_ufuncs.py | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index fdd8057f9afd..d4b514f6797b 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1109,11 +1109,11 @@ Tensor& xlogy_out(Tensor& result, const Tensor& self, const Tensor& other) { } Tensor& xlogy_out(Tensor& result, Scalar self, const Tensor& other) { - return at::xlogy_out(result, c10::scalar_to_tensor(self, other.device()), other); + return at::xlogy_out(result, wrapped_scalar_tensor(self), other); } Tensor& xlogy_out(Tensor& result, const Tensor& self, Scalar other) { - return at::xlogy_out(result, self, c10::scalar_to_tensor(other, self.device())); + return at::xlogy_out(result, self, wrapped_scalar_tensor(other)); } Tensor xlogy(const Tensor& x, const Tensor& y) { @@ -1124,11 +1124,11 @@ Tensor xlogy(const Tensor& x, const Tensor& y) { } Tensor xlogy(Scalar x, const Tensor& y) { - return at::xlogy(c10::scalar_to_tensor(x, y.device()), y); + return at::xlogy(wrapped_scalar_tensor(x), y); } Tensor xlogy(const Tensor& x, Scalar y) { - return at::xlogy(x, c10::scalar_to_tensor(y, x.device())); + return at::xlogy(x, wrapped_scalar_tensor(y)); } Tensor& xlogy_(Tensor& x, const Tensor& y) { @@ -1136,7 +1136,7 @@ Tensor& xlogy_(Tensor& x, const Tensor& y) { } Tensor& xlogy_(Tensor& x, Scalar y) { - return at::xlogy_out(x, x, c10::scalar_to_tensor(y, x.device())); + return at::xlogy_out(x, x, wrapped_scalar_tensor(y)); } } // namespace native diff 
--git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index 2379877e91ba..bc1884d8d642 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -32,7 +32,7 @@ void mse_kernel_cuda(TensorIterator& iter) { void xlogy_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "xlogy_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t x, scalar_t y) -> scalar_t { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t x, scalar_t y) -> scalar_t { if (at::_isnan(y)){ return NAN; } diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 5739fb569628..2ff12396701e 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1082,13 +1082,13 @@ def test_maximum_minimum_cross_device(self, device): ops = (torch.maximum, torch.minimum) for torch_op in ops: - with self.assertRaisesRegex(RuntimeError, + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): torch_op(a, b) - with self.assertRaisesRegex(RuntimeError, + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): - torch_op(b, a) + torch_op(b, a) # test cuda tensor and cpu scalar ops = ((torch.maximum, np.maximum), (torch.minimum, np.minimum)) @@ -2560,6 +2560,17 @@ def inplace_variant_helper(x, y): self.compare_with_numpy(torch_fn, reference_fn, t, exact_dtype=False) out_variant_helper(torch.xlogy, 0, t) + def test_xlogy_scalar_type_promotion(self, device): + # Test that python numbers don't participate in type promotion at the same + # priority level as 0-dim tensors + t = torch.randn((), dtype=torch.float32, device=device) + + self.assertEqual(t.dtype, torch.xlogy(t, 5).dtype) + self.assertEqual(t.dtype, torch.xlogy(t, 5.).dtype) + + self.assertEqual(t.dtype, torch.xlogy(5, t).dtype) + self.assertEqual(t.dtype, torch.xlogy(5., t).dtype) + @skipIf(not TEST_SCIPY, "Scipy required for the test.") def test_xlogy_bfloat16(self, device): def _compare_helper(x, y): From 71766d89ea55bb8d4e180bac1c21bd95a00d54d5 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Mon, 4 Jan 2021 13:41:00 -0800 Subject: [PATCH 40/89] [BE] unified run_process_no_exception code (#49774) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49774 Reviewed By: janeyx99 Differential Revision: D25756811 Pulled By: walterddr fbshipit-source-id: 4d2b3bd772572764ff96e5aad70323b58393e332 --- test/test_multiprocessing.py | 14 ++------------ test/test_testing.py | 15 ++++++--------- torch/testing/_internal/common_utils.py | 19 +++++++++++++------ 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index 75b486043c42..81b33c5900db 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -3,7 +3,6 @@ import os import sys import time -import subprocess import unittest import copy from sys import platform @@ -525,7 +524,7 @@ def test_cuda_bad_call(self): @unittest.skipIf(IS_WINDOWS, 'not applicable to Windows (only fails with fork)') @unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available') def test_wrong_cuda_fork(self): - results = self.run_process_no_exception("""\ + stderr = TestCase.runWithPytorchAPIUsageStderr("""\ import torch from torch.multiprocessing import Process def run(rank): @@ -542,7 +541,7 @@ def run(rank): for p in processes: 
p.join() """) - self.assertRegex(results[1].decode('ascii'), "Cannot re-initialize CUDA in forked subprocess.") + self.assertRegex(stderr, "Cannot re-initialize CUDA in forked subprocess.") @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @@ -831,15 +830,6 @@ def test_cuda_parameter_sharing(self): param = Parameter(torch.arange(1., 26, device='cuda').view(5, 5)) self._test_autograd_sharing(param, mp.get_context('spawn'), is_parameter=True) - @staticmethod - def run_process_no_exception(code): - popen = subprocess.Popen( - [sys.executable, '-c', code], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - pipes = popen.communicate() - return pipes - @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") def test_integer_parameter_serialization(self): diff --git a/test/test_testing.py b/test/test_testing.py index 8cdca871185b..4ff215233fe2 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -442,10 +442,9 @@ def test_assert_messages(self, device): @slowTest def test_cuda_assert_should_stop_test_suite(self, device): # This test is slow because it spawn another process to run another test suite. - import subprocess - import sys - problematic_test_script = """\ + # Test running of cuda assert test suite should early terminate. + stderr = TestCase.runWithPytorchAPIUsageStderr("""\ #!/usr/bin/env python import torch @@ -479,14 +478,12 @@ def test_trivial_passing_test_case_on_cpu_cuda(self, device): if __name__ == '__main__': run_tests() -""" - - # Test running of cuda assert test suite should early terminate. - p = subprocess.run([sys.executable, '-c', problematic_test_script], stderr=subprocess.PIPE, timeout=120) +""") # should capture CUDA error - self.assertIn('CUDA error: device-side assert triggered', p.stderr.decode('ascii')) + self.assertIn('CUDA error: device-side assert triggered', stderr) # should run only 1 test because it throws unrecoverable error. - self.assertIn('Ran 1 test', p.stderr.decode('ascii')) + self.assertIn('Ran 1 test', stderr) + instantiate_device_type_tests(TestTesting, globals()) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index bea572722ae6..9f70551eb3b2 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1386,19 +1386,26 @@ def assertExpectedStripMangled(self, s, subname=None): s = re.sub(r'__torch__[^ ]+', '', s) self.assertExpected(s, subname) - # returns captured stderr + # run code in subprocess and capture exceptions. 
@staticmethod - def runWithPytorchAPIUsageStderr(code): + def run_process_no_exception(code, env=None): import subprocess - env = os.environ.copy() - env["PYTORCH_API_USAGE_STDERR"] = "1" - pipes = subprocess.Popen( + popen = subprocess.Popen( [sys.executable, '-c', code], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - return pipes.communicate()[1].decode('ascii') + (stdout, stderr) = popen.communicate() + return (stdout, stderr) + + # returns captured stderr + @staticmethod + def runWithPytorchAPIUsageStderr(code): + env = os.environ.copy() + env["PYTORCH_API_USAGE_STDERR"] = "1" + (stdout, stderr) = TestCase.run_process_no_exception(code, env=env) + return stderr.decode('ascii') def download_file(url, binary=True): From f96ce3305ccea78017c1504a7ba690f80ccdb2aa Mon Sep 17 00:00:00 2001 From: mattip Date: Mon, 4 Jan 2021 14:37:02 -0800 Subject: [PATCH 41/89] prohibit assignment to a sparse tensor (#50040) Summary: Fixes https://github.com/pytorch/pytorch/issues/48225 by prohibiting assignment to a sparse Tensor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50040 Reviewed By: mrshenli Differential Revision: D25757125 Pulled By: zou3519 fbshipit-source-id: 3db6f48932eb10bf6ca5e97a6091afcabb60e478 --- test/test_sparse.py | 8 ++++++++ torch/csrc/autograd/python_variable_indexing.cpp | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/test/test_sparse.py b/test/test_sparse.py index 6daf3f1931d2..4e982b8333d9 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -3168,6 +3168,14 @@ def different_dtypes(): test_sparse_matmul(2, 0, [0, 10], [10, 0]) test_error_cases() + def test_assign(self): + def assign_to(a): + a, i_a, v_a = self._gen_sparse(2, 5, [2, 3]) + a[0] = 100 + + self.assertRaises(TypeError, assign_to) + + class TestUncoalescedSparse(TestSparse): def setUp(self): super(TestUncoalescedSparse, self).setUp() diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 4b38d924c91b..285161a49ef2 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -351,6 +351,10 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) { } auto& self_ = reinterpret_cast(self)->cdata; + if (self_.is_sparse()) + { + throw TypeError("Cannot assign to a sparse tensor"); + } OptionalDeviceGuard device_guard(device_of(self_)); at::Device self_device = self_.device(); Variable value; From 240c0b318aa726fe2ba17796aaa8db3eeaeb0321 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 4 Jan 2021 14:51:04 -0800 Subject: [PATCH 42/89] Suppress "statement is unreachable" warning (#49495) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49495 Compiling PyTorch currently generates a large number of warnings like this: ``` caffe2/aten/src/ATen/core/builtin_function.h(105): warning: statement is unreachable ``` The offending code ``` std::string pretty_print_schema() const override { TORCH_INTERNAL_ASSERT(false); return ""; } ``` has an unreachable return which prevents a "no return" warning. We resolve the situation by using NVCC's pragma system to suppress this warning within this function. Test Plan: The warning appears when running: ``` buck build mode/dev-nosan //caffe2/torch/fb/sparsenn:test ``` As well as a number of other build commands. 
Reviewed By: ngimel Differential Revision: D25546542 fbshipit-source-id: 71cddd4fdb5fd16022a6d7b2daf0e6d55e6e90e2 --- aten/src/ATen/core/builtin_function.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 8bfb4f7e9d16..adeaa1039638 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -101,8 +101,17 @@ struct BuiltinOpFunction : public Function { } std::string pretty_print_schema() const override { + #ifdef __NVCC__ + // Disable the "statement is unreachable" warning + #pragma diag_suppress code_is_unreachable + #endif + TORCH_INTERNAL_ASSERT(false); return ""; + + #ifdef __NVCC__ + #pragma diag_default code_is_unreachable + #endif } Function& setSchema(c10::FunctionSchema schema) override { From c439a6534de80be5ef30bfb39633a83cdba74838 Mon Sep 17 00:00:00 2001 From: David Date: Mon, 4 Jan 2021 15:09:20 -0800 Subject: [PATCH 43/89] [ONNX] Handle Sub-block index_put in _jit_pass_onnx_remove_inplace_ops_for_onnx (#48734) Summary: For the added UT and existing UTs, this code is independent and ready for review. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48734 Reviewed By: izdeby Differential Revision: D25502677 Pulled By: bzinodev fbshipit-source-id: 788b4eaa5e5e8b5df1fb4956fbd25928127bb199 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 29 ++++++ .../onnx/remove_inplace_ops_for_onnx.cpp | 98 ++++++++++++++++++- torch/onnx/symbolic_opset9.py | 3 + 3 files changed, 129 insertions(+), 1 deletion(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index b0c7143d0129..c2e4865cee89 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1554,6 +1554,35 @@ def forward(self, x, update): update = torch.randn(4, 1, 3, 2) self.run_test(IndexPutModel2(), (x, update)) + @skipIfUnsupportedMinOpsetVersion(11) + def test_index_put_loop(self): + @torch.jit.script + def ngram_attention_bias(sequence_length: int, ngram: int, device: torch.device, dtype: torch.dtype): + bias = torch.ones((ngram, sequence_length), device=device, dtype=dtype) * float("-inf") + for stream_idx in range(ngram): + for i in range(sequence_length): + bias[stream_idx, i] = 5 + return bias + + class ScriptModel(torch.nn.Module): + def __init__(self): + super(ScriptModel, self).__init__() + self.ngram = 2 + self.max_target_positions = 512 + + def forward(self, hidden_states): + seq_length, batch_size = hidden_states.shape[:2] + predict_causal_mask = ngram_attention_bias( + self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype + ) + predict_causal_mask = predict_causal_mask[:, :seq_length] + return predict_causal_mask + + x = torch.randn(6, 2) + y = torch.randn(4, 1) + self.run_test(ScriptModel(), x, input_names=['x'], + dynamic_axes={'x': {0: 'seq_length', 1: 'batch_size'}}, test_with_inputs=[y]) + @skipIfUnsupportedMinOpsetVersion(11) def test_copy_(self): class CopyModel(torch.nn.Module): diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index 162e596eb6a7..bc26183a25bb 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -271,6 +271,93 @@ std::vector ReshapeToAdvancedIndexingFormat( return indices; } +// Register index_put inputs/outputs through the blocks. +// Eg. 
The IR before updating: +// = prim::Loop(%10, %27) +// block0(%stream_idx.1 : int): +// = prim::Loop(%9, %27) +// block0(%i.1 : int): +// %36 : Tensor = aten::select(%bias.1, %26, %stream_idx.1) +// %41 : Tensor = aten::copy_(%37, %40, %25) +// -> (%27) +// -> (%27) +// After updating: +// %62 : Tensor = prim::Loop(%10, %27, %bias.2) +// block0(%stream_idx.1 : int, %bias.3 : Tensor): +// %61 : Tensor = prim::Loop(%9, %27, %bias.3) +// block0(%i.1 : int, %bias.1 : Tensor): +// %36 : Tensor = aten::select(%bias.1, %26, %stream_idx.1) +// %59 : Tensor?[] = prim::ListConstruct(%55, %58) +// %60 : Tensor = aten::index_put(%bias.1, %59, %45, %25) +// -> (%27, %60) +// -> (%27, %61) +void RegisterIndexPutInBlocks( + Value* orig_data, + Value* new_index_put, + Node* block_node, + Block* outer_block, + Node* next_node) { + auto cur_node = next_node; + while (nullptr != cur_node) { + if (cur_node->kind() != prim::Loop) + return; + cur_node = cur_node->owningBlock()->owningNode(); + } + + for (auto block_input : outer_block->inputs()) { + if (block_input->debugName() == orig_data->debugName()) { + AT_ERROR( + "More than one aten::index_put in a subblock are not supported."); + } + } + + // Register index_put outputs through the blocks. + for (auto block_output : outer_block->outputs()) { + if (block_output->debugName() == new_index_put->debugName()) + return; + } + outer_block->registerOutput(new_index_put); + std::vector> node_list = { + std::make_pair(outer_block, next_node)}; + next_node->addOutput()->copyMetadata(new_index_put); + auto next_block = next_node->owningBlock(); + while (nullptr != next_block->owningNode()) { + outer_block = next_block; + outer_block->registerOutput(next_node->output(0)); + next_node = outer_block->owningNode(); + next_node->addOutput()->copyMetadata(new_index_put); + next_block = next_node->owningBlock(); + node_list.emplace_back(std::make_pair(outer_block, next_node)); + } + + // Register index_put inputs through the blocks. + auto next_data = orig_data; + while (!node_list.empty()) { + auto cur_pair = node_list.back(); + // Add input to current node. + cur_pair.second->addInput(next_data); + // Add input to current block. + auto cur_input = cur_pair.first->addInput(); + cur_input->copyMetadata(next_data); + next_data = cur_input; + node_list.pop_back(); + } + // Update index_put inputs inside the inner most block. + auto prev_data = block_node->input(0); + for (auto node : block_node->owningBlock()->nodes()) { + size_t idx = 0; + for (auto inputs_ : node->inputs()) { + if (inputs_ == prev_data) { + node->replaceInput(idx, next_data); + idx++; + break; + } + } + } + orig_data->replaceAllUsesAfterNodeWith( + next_node->output(0)->node(), next_node->output(0)); +} + // Trace back all the slice & select nodes associated with the index_put node, // and convert them to associated indices. // E.g. 
The IR for x[1:3, 0] = update @@ -336,7 +423,16 @@ void SquashSliceAndSelect(Node* index_put_node) { new_index_put->copyMetadata(index_put_node->output()); index_put_node->output()->replaceAllUsesWith(new_index_put); - orig_data->replaceAllUsesAfterNodeWith(new_index_put->node(), new_index_put); + auto block_node = new_index_put->node(); + auto outer_block = block_node->owningBlock(); + auto next_node = outer_block->owningNode(); + if (nullptr == next_node) { + orig_data->replaceAllUsesAfterNodeWith( + new_index_put->node(), new_index_put); + return; + } + RegisterIndexPutInBlocks( + orig_data, new_index_put, block_node, outer_block, next_node); } void PrepareCopyForONNX(Block* block) { diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 14ebcfcd8a6c..c5847f548b74 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2315,6 +2315,9 @@ def log2(g, self): def prim_shape(g, self): return g.op('Shape', self) +def prim_max(g, self, other): + return g.op('Max', self, other) + def prim_data(g, self): return self From efe1fc21fcaeb3e0ada88dcdba3ccb1c652435d7 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Mon, 4 Jan 2021 15:43:19 -0800 Subject: [PATCH 44/89] Dont inlinine intermediates on cpu (#49565) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49565 Test Plan: Imported from OSS Reviewed By: Krovatkin, ZolotukhinM Differential Revision: D25688271 Pulled By: eellison fbshipit-source-id: 9ea7858e2db4fb31292e04440fc72ee04623c688 --- test/cpp/tensorexpr/test_kernel.cpp | 74 +++++++++++++++++++- test/cpp/tensorexpr/test_loopnest.cpp | 39 ----------- test/elias.py | 12 ---- torch/csrc/jit/tensorexpr/kernel.cpp | 10 +-- torch/csrc/jit/tensorexpr/loopnest.cpp | 94 ++++++++++++++++++++++---- torch/csrc/jit/tensorexpr/loopnest.h | 7 +- torch/csrc/jit/testing/file_check.cpp | 6 +- 7 files changed, 164 insertions(+), 78 deletions(-) delete mode 100644 test/elias.py diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index cf658ad488f6..902c2a701197 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -19,6 +19,65 @@ namespace jit { using namespace torch::indexing; using namespace torch::jit::tensorexpr; +TEST(Kernel, InliningIntermediates) { + // here, each mul has only one use, so it should be completely inlined + { + const auto graph_string = R"IR( + graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), + %1 : Float(5, 3, strides=[3, 1], device=cpu)): + %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1) + %one : int = prim::Constant[value=1]() + %4 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2) + %5: Float(5, 3, strides=[3, 1]) = aten::add(%4, %1, %one) + return (%5))IR"; + KernelScope kernel_scope; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + TensorExprKernel k(graph); + auto stmt = k.getCodeGenStmt(); + std::ostringstream oss; + oss << *stmt; + torch::jit::testing::FileCheck().check_not("aten_mul")->run(oss.str()); + } + { + const auto graph_template = R"IR( + graph(%0 : Float(5, 3, strides=[3, 1], device=${device}), + %1 : Float(5, 3, strides=[3, 1], device=${device})): + %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1) + %one : int = prim::Constant[value=1]() + %3 : Float(5, 3, strides=[3, 1]) = aten::sub(%0, %2, %one) + %4 : Float(5, 3, strides=[3, 1]) = aten::add(%3, %0, %one) + %5 : Float(5, 3, strides=[3, 1]) = aten::div(%3, %0) + return (%4, %5))IR"; + for (bool use_cuda : {false, true}) { + if 
(!torch::cuda::is_available() && use_cuda) { + continue; + } + + KernelScope kernel_scope; + TemplateEnv env; + env.s("device", use_cuda ? "cuda:0" : "cpu"); + const auto graph_string = format(graph_template, env); + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + auto device = use_cuda ? kCUDA : kCPU; + TensorExprKernel k(graph); + auto stmt = k.getCodeGenStmt(); + std::ostringstream oss; + oss << *stmt; + // aten_mul only has one use, inlined completely + torch::jit::testing::FileCheck().check_not("aten_mul")->run(oss.str()); + + // aten_sub should be removed in cuda, exist in cpu + // 5 uses: allocate, initialize, free and two reads + size_t num_out1_uses = use_cuda ? 0 : 5; + torch::jit::testing::FileCheck() + .check_count("aten_sub", num_out1_uses, /*exactly*/ true) + ->run(oss.str()); + } + } +} + TEST(Kernel, _1) { KernelScope kernel_scope; @@ -714,7 +773,10 @@ TEST(Kernel, Softmax2D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verication sting temporarily disabled until + // inlining of exp() is benchmarked and determined + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); @@ -789,7 +851,10 @@ TEST(Kernel, Softmax3D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verication sting temporarily disabled until + // inlining of exp() is benchmarked and determined + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); @@ -870,7 +935,10 @@ TEST(Kernel, Softmax4D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verication sting temporarily disabled until + // inlining of exp() is benchmarked and determined + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index af95f9971513..7294aa53c803 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3649,45 +3649,6 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { torch::jit::testing::FileCheck().run(expected_ir2, oss.str()); } -TEST(LoopNest, InlineOutputBuffers) { - KernelScope kernel_scope; - const int M = 4; - const int N = 5; - const int K = 6; - Placeholder a_buf("a", kFloat, {M, N}); - Placeholder b_buf("b", kFloat, {N, K}); - Tensor* c = Compute( - "broadcast_add", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf.load(m, n) + b_buf.load(n, k); - }); - Tensor* out1 = Compute( - "out1", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->call(m, n, k) + 1; - }); - - Tensor* out2 = Compute( - "out2", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return out1->call(m, n, k) / c->call(m, n, k) * 4; - }); - 
for (const bool inline_outputs : {true, false}) { - LoopNest l({out1, out2}); - l.inlineIntermediateBufs(inline_outputs); - Stmt* stmt1 = l.root_stmt(); - std::ostringstream oss; - oss << *stmt1; - size_t num_out1_uses = inline_outputs ? 1 : 2; - torch::jit::testing::FileCheck() - .check_count("out1", num_out1_uses, /*exactly*/ true) - ->run(oss.str()); - } -} - TEST(LoopNest, CompoundTensorSimple) { KernelScope kernel_scope; diff --git a/test/elias.py b/test/elias.py deleted file mode 100644 index 74dbc3cbaa09..000000000000 --- a/test/elias.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch - -@torch.jit.script -def foo(x): - return x + x + x - -torch._C._jit_override_can_fuse_on_cpu(True) - -foo(torch.rand([2], requires_grad=False)) -foo(torch.rand([2], requires_grad=False)) -foo(torch.rand([2], requires_grad=False)) -print(torch.jit.last_executed_optimized_graph()) diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 0145014ee8f5..e6e31ba4d96c 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1532,12 +1532,12 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { root_stmt->accept(block_analysis.get()); } - // inlining output buffers duplicates computation. it slows down - // cpu code generation but is enabled on gpu because it avoids difficult - // synchronization logic across blocks. - bool inline_output_buffers = + // inlining output & intermediate buffers can duplicate computation. + // it slows down cpu code generation but is enabled on gpu because it avoids + // difficult synchronization logic across blocks. + bool allow_duplicated_work = (backendType == kCudaCodeGen || backendType == kBlockCodeGen); - l.inlineIntermediateBufs(inline_output_buffers); + l.inlineIntermediateBufs(allow_duplicated_work); if (backendType == kCudaCodeGen) { for (auto tensor : tensorOutputs_) { diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 2fed242cf4c6..c2b274a3c9bb 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,28 @@ namespace torch { namespace jit { namespace tensorexpr { +class FunctionCallUseCount : public IRVisitor { + public: + std::unordered_map findUses(Stmt* s) { + s->accept(this); + return uses_; + } + + private: + void visit(const FunctionCall* v) override { + if (function_calls_[v->tensor()->buf()].insert(v).second) { + uses_[v->tensor()->buf()] = uses_[v->tensor()->buf()] + 1; + } + IRVisitor::visit(v); + } + + std::unordered_map uses_; + + // Sets of FunctionCalls in order to keep the results unique + std::unordered_map> + function_calls_; +}; + class IndexFlattener : public IRMutator { public: Stmt* flatten(Stmt* s) { @@ -751,28 +774,67 @@ bool LoopNest::computeInline(const Buf* b) { return true; } -void LoopNest::inlineIntermediateBufs(bool inline_output_buffers) { +// inlining buffers with multiple uses can create duplicated work, which can +// slow down cpu code generation but is enabled on gpu because it avoids +// difficult synchronization logic across blocks. Inlining trivial reads does +// not duplicate work +void LoopNest::inlineIntermediateBufs(bool allow_duplicated_work) { // We need to collect all intermediate buffers as the buffers to be inlined // before calling 'computeInline' since the buffers that are inlined are // erased from the set 'intermediate_bufs_' in that function. 
- std::unordered_set bufs_to_inline( - intermediate_bufs_.begin(), intermediate_bufs_.end()); + std::unordered_set bufs_to_inline; + + if (allow_duplicated_work) { + bufs_to_inline.insert(intermediate_bufs_.begin(), intermediate_bufs_.end()); + } else { + FunctionCallUseCount fcu; + auto function_call_uses = fcu.findUses(root_stmt_); + auto buf_load_store_uses = findLoadOrStoreUses(root_stmt_); + auto input_bufs = getInputBufs(); + + for (auto buf : intermediate_bufs_) { + TORCH_INTERNAL_ASSERT(buf_load_store_uses.count(buf)); + std::vector& uses = buf_load_store_uses[buf]; + auto stores = c10::filter( + uses, [](const BufLoadOrStoreUse& use) { return use.isStore; }); + + // if the intermediate is the buffer formed from reading in the input + // tensors, always inline, bc we are not duplicating any work + // and avoiding an intermediary buffer + if (stores.size() == 1) { + auto store = dynamic_cast(stores[0].s); + auto input_as_load = dynamic_cast(store->value()); + if (input_as_load && input_bufs.count(input_as_load->buf())) { + bufs_to_inline.insert(buf); + continue; + } + } - // inlining output buffers duplicates computation. it slows down - // cpu code generation but is enabled on gpu because it avoids difficult - // synchronization logic across blocks. - if (inline_output_buffers) { + // all bufs will have at least one store (if they have > 1 they cant be + // inlined anyway) + size_t reads = uses.size() - 1; + size_t function_call_reads = function_call_uses[buf]; + // if only one read, we can inline it without duplicating work + if ((reads + function_call_reads) <= 1) { + bufs_to_inline.insert(buf); + } + } + } + + if (allow_duplicated_work) { bufs_to_inline.insert(output_bufs_.begin(), output_bufs_.end()); } + for (auto b : bufs_to_inline) { computeInline(b); } } // TODO: Unify with DepTracker -class UseFinder : public IRVisitor { +class LoadOrStoreUseFinder : public IRVisitor { public: - std::unordered_map> findUses(Stmt* s) { + std::unordered_map> findUses( + Stmt* s) { uses_.clear(); s->accept(this); return uses_; @@ -794,15 +856,16 @@ class UseFinder : public IRVisitor { } Stmt* last_stmt_ = nullptr; - std::unordered_map> uses_; + std::unordered_map> uses_; // Sets of loads and stores in order to keep the results unique std::unordered_map> loads_; std::unordered_map> stores_; }; -std::unordered_map> findUses(Stmt* s) { - UseFinder uf; +std::unordered_map> +findLoadOrStoreUses(Stmt* s) { + LoadOrStoreUseFinder uf; return uf.findUses(s); } @@ -828,7 +891,7 @@ class ContainedStmtsFinder : public IRVisitor { std::unordered_set contained_; }; -bool containsAll(const std::vector& uses, Block* b) { +bool containsAll(const std::vector& uses, Block* b) { std::unordered_set not_found; for (auto use : uses) { not_found.insert(use.s); @@ -852,7 +915,7 @@ Block* findParentBlock(Stmt* s) { return nullptr; } -Block* findLowestContainingBlock(const std::vector& uses) { +Block* findLowestContainingBlock(const std::vector& uses) { // TODO: we're not using the most efficient algorithm here for simplicity. // Replace with something more performant in case it becomes a bottleneck. Block* b = findParentBlock(uses[0].s); @@ -872,7 +935,8 @@ Stmt* LoopNest::insertAllocFree(Stmt* stmt) { b = new Block({stmt}); } - std::unordered_map> uses = findUses(stmt); + std::unordered_map> uses = + findLoadOrStoreUses(stmt); // Insert allocations and frees for temporary buffers in the innermost // possible scope. 
for (const Buf* buf : intermediate_bufs_) { diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 0a588f6a95e4..962d69f0458d 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -53,7 +53,7 @@ class TORCH_API LoopNest { bool computeInline(Stmt* s); bool computeInline(const Buf* b); - void inlineIntermediateBufs(bool inline_output_buffers); + void inlineIntermediateBufs(bool allow_duplicated_work); static void splitWithTail(For* f, int factor); static void splitWithTail( @@ -141,7 +141,7 @@ TORCH_API Stmt* FlattenIndexes(Stmt* s); // TODO: Revisit this once we decide on how dependencies analysis should look // like. Maybe we would choose to use a different API and BufUse would be // removed, or if we decide to keep it we need to properly document its API. -struct BufUse { +struct BufLoadOrStoreUse { Stmt* s; bool isStore; }; @@ -152,7 +152,8 @@ struct BufUse { * in the vectors reflects the order in which the uses appear in the given * statement. */ -std::unordered_map> findUses(Stmt* s); +std::unordered_map> +findLoadOrStoreUses(Stmt* s); } // namespace tensorexpr } // namespace jit diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index 83b54397b01c..f5e96a501bfd 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -548,7 +548,11 @@ FileCheck* FileCheck::check_count( const std::string& str, size_t count, bool exactly) { - fcImpl->addCheck(CHECK_COUNT, str, count); + TORCH_INTERNAL_ASSERT( + count != 0 || exactly, "Count == 0 && !exactly doesn't do anything"); + if (count) { + fcImpl->addCheck(CHECK_COUNT, str, count); + } if (exactly) { fcImpl->addCheck(CHECK_NOT, str); } From 5acb1cc1df8b2a36564415d71ee3cb1412aa8664 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 4 Jan 2021 16:07:09 -0800 Subject: [PATCH 45/89] Drop unused imports from scripts (#49956) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49956 From ``` ./python/libcst/libcst codemod remove_unused_imports.RemoveUnusedImportsWithGlean --no-format caffe2/ ``` Test Plan: Standard sandcastle tests Reviewed By: xush6528 Differential Revision: D25727347 fbshipit-source-id: 74d0a08aa0cfd0f492688a2b8278a0c65fd1deba --- scripts/model_zoo/update-models-from-caffe2.py | 4 ---- scripts/release_notes/categorize.py | 3 +-- scripts/release_notes/commitlist.py | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/scripts/model_zoo/update-models-from-caffe2.py b/scripts/model_zoo/update-models-from-caffe2.py index fb582a047bc6..d3e46e449d8a 100644 --- a/scripts/model_zoo/update-models-from-caffe2.py +++ b/scripts/model_zoo/update-models-from-caffe2.py @@ -6,15 +6,12 @@ import caffe2.python.workspace as c2_workspace import glob import json -import math import numpy as np import onnx import caffe2.python.onnx.frontend import caffe2.python.onnx.backend import os import shutil -import subprocess -import sys import tarfile import tempfile @@ -25,7 +22,6 @@ from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory from caffe2.proto import caffe2_pb2 from onnx import numpy_helper -from filechunkio import FileChunkIO """A script converting Caffe2 models to ONNX, and updating ONNX model zoos. 
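Note: the check_count change to file_check.cpp in #49565 above can be exercised in isolation. The following is a minimal sketch only, not part of this patch series; the function name is invented, the include path is assumed from the location of file_check.cpp, and the call pattern mirrors the one already used in test_kernel.cpp.

#include <string>
#include <torch/csrc/jit/testing/file_check.h>

void expect_no_mul(const std::string& ir) {
  // With the new assert, a requested count of 0 is only meaningful together
  // with exactly == true, in which case it reduces to a single CHECK_NOT.
  torch::jit::testing::FileCheck()
      .check_count("aten_mul", 0, /*exactly*/ true)
      ->run(ir);

  // check_count("aten_mul", 0, /*exactly*/ false) would now hit the new
  // TORCH_INTERNAL_ASSERT instead of silently doing nothing.
}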
diff --git a/scripts/release_notes/categorize.py b/scripts/release_notes/categorize.py index b72eb9094b7b..985d11f2e2bd 100644 --- a/scripts/release_notes/categorize.py +++ b/scripts/release_notes/categorize.py @@ -1,8 +1,7 @@ -import json import argparse import os import textwrap -from common import dict_to_features, categories, topics, get_features, CommitDataCache +from common import categories, topics, CommitDataCache from commitlist import CommitList class Categorizer: diff --git a/scripts/release_notes/commitlist.py b/scripts/release_notes/commitlist.py index fda7c913addd..0a76f896f217 100644 --- a/scripts/release_notes/commitlist.py +++ b/scripts/release_notes/commitlist.py @@ -1,6 +1,6 @@ import argparse from common import run, topics -from collections import namedtuple, defaultdict +from collections import defaultdict import os import csv import pprint From a5339b9d7cb52588b608d2d4ae61a3b571eb3b35 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 4 Jan 2021 16:30:16 -0800 Subject: [PATCH 46/89] Drop unused imports from leftovers (#49953) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49953 From ``` ./python/libcst/libcst codemod remove_unused_imports.RemoveUnusedImportsWithGlean --no-format caffe2/ ``` Test Plan: Standard sandcastle tests Reviewed By: xush6528 Differential Revision: D25727348 fbshipit-source-id: b3feef80b9b4b535f1bd4060dace5b1a50bd5e69 --- setup.py | 2 +- tools/setup_helpers/cmake.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8289b57e93be..50983a89ad55 100644 --- a/setup.py +++ b/setup.py @@ -186,7 +186,7 @@ python_min_version_str)) sys.exit(-1) -from setuptools import setup, Extension, distutils, find_packages +from setuptools import setup, Extension, find_packages from collections import defaultdict from distutils import core from distutils.core import Distribution diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 46b3befde9f4..f1809552cd40 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -7,7 +7,6 @@ import re from subprocess import check_call, check_output import sys -import distutils import distutils.sysconfig from distutils.version import LooseVersion From d0369aabe1be99d4c6c64b85c338cf0dbb50972b Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 4 Jan 2021 17:28:56 -0800 Subject: [PATCH 47/89] Clean up some type annotations in caffe2/contrib/aten/gen_op (#49945) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49945 Upgrades type annotations from Python2 to Python3 Test Plan: Sandcastle tests Reviewed By: xush6528 Differential Revision: D25717502 fbshipit-source-id: 718d93e8614e9d050f4da1c6bd4ac892bab98154 --- caffe2/contrib/aten/gen_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index ba29ab933da9..769f9d59c856 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -236,11 +236,11 @@ def emit_assignments(o, env): decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader) factory_methods = find_factory_methods(decls) filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded, factory_methods)] - top_env = { + top_env: Dict[str, List] = { 'mappings': [], 'implementations': [], 'cases': [], - } # type: Dict[str, List] + } seen: Set[str] = set() key = 0 for o in filtered: From 65122173ab2a2d315bede50b85cdd64d56fb1fb5 Mon 
Sep 17 00:00:00 2001 From: shubhambhokare1 Date: Mon, 4 Jan 2021 18:08:28 -0800 Subject: [PATCH 48/89] [ONNX] Modified var_mean symbolic to support more combinations of dims (#48949) Summary: Based on existing implementation of var_mean, values of dim have to be sequential and start with zero. The formats listed below are cause scenarios with incompatible dimension for the Sub node. -> dim[1, 2] -> dim[0, 2] -> dim[2, 0] The changes in this PR allow such formats to be supported in var_mean Pull Request resolved: https://github.com/pytorch/pytorch/pull/48949 Reviewed By: houseroad Differential Revision: D25540272 Pulled By: SplitInfinity fbshipit-source-id: 59813a77ff076d138655cc8c17953358f62cf137 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 25 ++++++++++++++++++++++ torch/onnx/symbolic_opset9.py | 4 +++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index c2e4865cee89..26896bc17863 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2134,6 +2134,31 @@ def forward(self, input): model = VarianceUnbiased() self.run_test(model, x) + def test_var_mean_mixed_dims(self): + class ReverseDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(2, 1), unbiased=False) + + x = torch.randn(2, 3, 4) + model = ReverseDims() + self.run_test(model, x) + + class SkipDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(0, 2), unbiased=False) + + x = torch.randn(2, 3, 4) + model = SkipDims() + self.run_test(model, x) + + class NonZeroDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(1, 2), unbiased=False) + + x = torch.randn(2, 3, 4) + model = NonZeroDims() + self.run_test(model, x) + def test_var_mean_keepdim(self): class Variance(torch.nn.Module): def forward(self, input): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index c5847f548b74..a9edb9ca32ed 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2368,14 +2368,16 @@ def gather(g, self, dim, index, sparse_grad=False): def _var_mean(g, input, dim, unbiased, keepdim): if dim is None: mean = g.op("ReduceMean", input, keepdims_i=0) + t_mean = mean num_elements = numel(g, input) else: mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=keepdim) + t_mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=1) redudced_dims = g.op("Shape", input) # dim could contain one or multiple dimensions redudced_dims = g.op("Gather", redudced_dims, g.op("Constant", value_t=torch.tensor(dim)), axis_i=0) num_elements = g.op("ReduceProd", redudced_dims, keepdims_i=0) - sub_v = g.op("Sub", input, mean) + sub_v = g.op("Sub", input, t_mean) sqr_sub = g.op("Mul", sub_v, sub_v) keepdim_mean = 0 if dim is None else keepdim var = g.op("ReduceMean", sqr_sub, axes_i=dim, keepdims_i=keepdim_mean) From 9e0b4a96e48132190220820684033a77a92e8a33 Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Mon, 4 Jan 2021 19:15:27 -0800 Subject: [PATCH 49/89] introduce a flag to disable aten::cat in TE (#49579) Summary: introduce a flag to disable aten::cat in TE Pull Request resolved: https://github.com/pytorch/pytorch/pull/49579 Reviewed By: eellison Differential Revision: D25763758 Pulled By: Krovatkin fbshipit-source-id: c4f4a8220964813202369a3383057e77e7f10cb0 --- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 041471bfa077..73361f8f3415 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -15,6 +15,10 @@ #include #include +C10_DEFINE_bool( + torch_jit_disable_cat, + false, + "disable aten::cat in TE fusion groups"); namespace torch { namespace jit { @@ -202,6 +206,10 @@ bool isSupported(Node* node) { } } + if (FLAGS_torch_jit_disable_cat && node->kind() == aten::cat) { + return false; + } + return true; } From 8fb5f1693164cdba410a4548e7a2bdbf6159dd57 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Mon, 4 Jan 2021 19:42:04 -0800 Subject: [PATCH 50/89] Complex backward for indexing, slicing, joining, and mutating ops (#49552) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49552 This PR: 1. Migrates independent autograd test for `hstack`, `dstack`, `vstack`, `movedim`, `moveaxis` from `test_autograd.py` to the new `OpInfo` based tests. 2. Migrates autograd test for `gather`, `index_select` from the method_tests to the new `OpInfo` based tests. 2. Enables complex backward for `stack, gather, index_select, index_add_` and adds tests for complex autograd for all the above mentioned ops. Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D25682511 Pulled By: anjali411 fbshipit-source-id: 5d8f89db4a9ec340ab99a6196987d44a23e2c6c6 --- .../ATen/native/TensorAdvancedIndexing.cpp | 4 +- aten/src/ATen/native/cuda/Indexing.cu | 2 +- test/test_autograd.py | 91 +--------- test/test_torch.py | 4 - test/test_unary_ufuncs.py | 8 - tools/autograd/gen_variable_type.py | 4 +- .../_internal/common_methods_invocations.py | 166 ++++++++++++++++-- 7 files changed, 156 insertions(+), 123 deletions(-) diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 2d79a4e3713f..3ced0cf5eb52 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -471,7 +471,7 @@ Tensor& index_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const T // explicitly capture all required variables to work around windows build // TODO: fix this when windows can correctly capture variables in nested lambda - AT_DISPATCH_ALL_TYPES(self.scalar_type(), "index_add_", [&self, &source, &dim, &index_contig, &numel] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "index_add_", [&self, &source, &dim, &index_contig, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto source_stride = source.dim() == 0 ? 1 : source.stride(dim); // TODO: Maybe TensorAccessor can beused here? @@ -682,7 +682,7 @@ Tensor & index_select_out_cpu_(Tensor & result, const Tensor & self, int64_t dim TORCH_CHECK(result.dim() <= 1, "result.dim() (", result.dim(), ") must one or zero for given self.dim() (", self.dim(), ")"); // explicitly capture all required variables to work around windows build // TODO: fix this when windows can correctly capture variables in nested lambda - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "index_select", + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, self.scalar_type(), "index_select", [&index_contig, &self, &result, &dim, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto result_stride = result.dim() == 0 ? 
1 : result.stride(dim); diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index e372f8bdb697..2dc04ed4ddef 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -505,7 +505,7 @@ Tensor& index_add_cuda_(Tensor & self, int64_t dim, const Tensor & index, const if (cuda::detail::canUse32BitIndexMath(self) && cuda::detail::canUse32BitIndexMath(source) && cuda::detail::canUse32BitIndexMath(index)) { - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { cuda::detail::TensorInfo selfInfo = cuda::detail::getTensorInfo(self_); int selfAddDim = selfInfo.collapseDims(dim); diff --git a/test/test_autograd.py b/test/test_autograd.py index 34c38eefa342..9f5925212757 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1946,60 +1946,6 @@ def test_slice_expanded_v(self): expected[3:5] = v_expanded self.assertEqual(result, expected) - def test_stack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.stack([x, y, z], 0) - grad = torch.randn(3, 10, 10) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0]) - self.assertEqual(y.grad, grad[1]) - self.assertEqual(z.grad, grad[2]) - - def test_hstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.hstack([x, y, z]) - grad = torch.randn(10, 30) - stacked.backward(grad) - self.assertEqual(x.grad, grad[:, 0:10]) - self.assertEqual(y.grad, grad[:, 10:20]) - self.assertEqual(z.grad, grad[:, 20:30]) - - x = torch.randn(10, requires_grad=True) - y = torch.randn(10, requires_grad=True) - z = torch.randn(10, requires_grad=True) - stacked = torch.hstack([x, y, z]) - grad = torch.randn(30) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0:10]) - self.assertEqual(y.grad, grad[10:20]) - self.assertEqual(z.grad, grad[20:30]) - - def test_vstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.vstack([x, y, z]) - grad = torch.randn(30, 10) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0:10]) - self.assertEqual(y.grad, grad[10:20]) - self.assertEqual(z.grad, grad[20:30]) - - def test_dstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.dstack([x, y, z]) - grad = torch.randn(10, 10, 3) - stacked.backward(grad) - self.assertEqual(x.grad, grad[:, :, 0]) - self.assertEqual(y.grad, grad[:, :, 1]) - self.assertEqual(z.grad, grad[:, :, 2]) - def test_unbind(self): stacked = torch.randn(3, 10, 10, requires_grad=True) x, y, z = stacked.unbind() @@ -5007,14 +4953,6 @@ def test_linalg_qr_r(self): "linalg_qr_backward: cannot compute backward"): b.backward() - -def index_variable(shape, max_indices): - if not isinstance(shape, tuple): - shape = (shape,) - index = torch.rand(*shape).mul_(max_indices).floor_().long() - return index - - def index_perm_variable(shape, max_indices): if not isinstance(shape, tuple): shape = (shape,) @@ -5022,20 +4960,6 @@ def index_perm_variable(shape, max_indices): index = 
torch.randperm(max_indices).narrow(0, 0, reduce(mul, shape)).view(shape) return index - -def gather_variable(shape, index_dim, max_indices, duplicate=False): - assert len(shape) == 2 - assert index_dim < 2 - batch_dim = 1 - index_dim - index = torch.LongTensor(*shape) - for i in range(shape[index_dim]): - index.select(index_dim, i).copy_( - torch.randperm(max_indices)[:shape[batch_dim]]) - if duplicate: - index.select(batch_dim, 0).copy_(index.select(batch_dim, 1)) - return index - - def bernoulli_scalar(): return torch.tensor(0, dtype=torch.uint8).bernoulli_() @@ -5111,7 +5035,8 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle', 'tanh', 'fill_', 'sub', 'exp', 'mean', 'inverse', 'triangular_solve', 'solve', 'addcmul', - 'addcdiv', 'linalg.tensorinv', 'matrix_exp', 'qr', ] + separate_complex_tests + 'addcdiv', 'linalg.tensorinv', 'matrix_exp', 'qr', + 'narrow', 'swapaxes', 'swapdims', 'tensor_split', 'tile'] + separate_complex_tests def add_test( name, @@ -7383,18 +7308,6 @@ def test_strided_leaf_grad_layout(self, device): (c * d).sum().backward() self.assertEqual(c.grad.stride(), (2, 1)) - def test_movedim(self, device): - for fn in [torch.movedim, torch.moveaxis]: - x = torch.randn(4, 3, 2, 1, dtype=torch.double, device=device, requires_grad=True) - - # Positive axis - gradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - gradgradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - - # Negative axis - gradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) - gradgradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) - def _test_atleast(self, device, torch_fn): # 0-dim s = torch.tensor(0.5, dtype=torch.double, requires_grad=True) diff --git a/test/test_torch.py b/test/test_torch.py index 8872516ddd28..1f85ed2fff54 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -937,10 +937,6 @@ def test_index_add_all_dtypes(self): # index_add calls atomicAdd on cuda. zeros = torch.zeros(size, dtype=dtype, device=device) - # index_add is not supported for complex dtypes on cuda yet - if device.startswith('cuda') and dtype.is_complex: - continue - added = zeros.index_add(0, torch.arange(0, size[0], dtype=idx_dtype, device=device), tensor) self.assertEqual(added, tensor) diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 1daecc24f79f..960991a4820b 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -642,14 +642,6 @@ def test_sign_complex_assert_raises(self, device): size = [5, 5] tensor = torch.rand(size, dtype=dtype, device=device) - # index_add calls atomicAdd on cuda. - zeros = torch.zeros(size, dtype=dtype, device=device) - - # index_add is not supported for complex dtypes on cuda yet - if device.startswith('cuda') and dtype.is_complex: - self.assertRaises(RuntimeError, - lambda: zeros.index_add(0, torch.arange(0, size[0], dtype=torch.long, device=device), tensor)) - with self.assertRaisesRegex(RuntimeError, (r'Unlike NumPy, torch.sign is not intended to support complex numbers\. 
' r'Please use torch.sgn instead\.')): diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 7346bfeb34c7..72be5b993f44 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,8 +78,8 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'linalg_qr', 'svd', - '_fft_c2c', '_fft_r2c', 'linalg_solve', 'sqrt' + 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'linalg_qr', 'svd', '_fft_c2c', '_fft_r2c', + 'linalg_solve', 'sqrt', 'stack', 'gather', 'index_select', 'index_add_' } # Some operators invalidate the grad_accumulator. Let's reset it. diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 0c5a5a6353df..0557ccc9560f 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -340,6 +340,77 @@ def sample_inputs_broadcast_to(op_info, device, dtype, requires_grad): requires_grad=requires_grad), shape)) for size, shape in test_cases) +def sample_inputs_stack(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad)), kwargs=dict(idx=0)),) + +def sample_inputs_hstack_dstack_vstack(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad))),) + +def sample_inputs_gather(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((M, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, gather_variable((S, S), 1, M, True, device=device))), + SampleInput((make_tensor((M, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 1, gather_variable((M, S // 2), 0, S, True, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor([0], dtype=torch.int64, device=device))), + SampleInput((make_tensor((S,), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + ) + + +def sample_inputs_index_select(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, index_variable(2, S, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor([0], dtype=torch.int64, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + ) + +def 
sample_movedim_moveaxis(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((4, 3, 2, 1), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + (0, 1, 2, 3), (3, 2, 1, 0))), + SampleInput((make_tensor((4, 3, 2, 1), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + (0, -1, -2, -3), (-3, -2, -1, -0)))) + def np_unary_ufunc_integer_promotion_wrapper(fn): # Wrapper that passes PyTorch's default scalar # type as an argument to the wrapped NumPy @@ -1069,6 +1140,75 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): supports_tensor_out=False, sample_inputs_func=sample_inputs_pinverse, decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), + OpInfo('gather', + dtypes=all_types_and_complex_and(torch.bool, torch.float16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + sample_inputs_func=sample_inputs_gather), + OpInfo('index_select', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + skips=( + # https://github.com/pytorch/pytorch/issues/49707 + SkipInfo('TestCommon', 'test_variant_consistency_eager', + dtypes=[torch.float16, torch.bfloat16]), + SkipInfo('TestCommon', 'test_variant_consistency_jit', dtypes=[torch.float16, torch.bfloat16]), + ), + sample_inputs_func=sample_inputs_index_select), + OpInfo('stack', + # gradcheck expects the input arguments as a flat list + op=lambda *args, idx: torch.stack([*args], idx), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_stack), + OpInfo('hstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.hstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('vstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.vstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('dstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.dstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('movedim', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_movedim_moveaxis), + OpInfo('moveaxis', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, 
torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_movedim_moveaxis), ] if TEST_SCIPY: @@ -1171,10 +1311,10 @@ def reference_sigmoid(x): spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True] -def index_variable(shape, max_indices): +def index_variable(shape, max_indices, device=torch.device('cpu')): if not isinstance(shape, tuple): shape = (shape,) - index = torch.rand(*shape).mul_(max_indices).floor_().long() + index = torch.rand(*shape, device=device).mul_(max_indices).floor_().long() return index @@ -1186,14 +1326,14 @@ def index_perm_variable(shape, max_indices): return index -def gather_variable(shape, index_dim, max_indices, duplicate=False): +def gather_variable(shape, index_dim, max_indices, duplicate=False, device=torch.device('cpu')): assert len(shape) == 2 assert index_dim < 2 batch_dim = 1 - index_dim - index = torch.LongTensor(*shape) + index = torch.zeros(*shape, dtype=torch.long, device=device) for i in range(shape[index_dim]): index.select(index_dim, i).copy_( - torch.randperm(max_indices)[:shape[batch_dim]]) + torch.randperm(max_indices, device=device)[:shape[batch_dim]]) if duplicate: index.select(batch_dim, 0).copy_(index.select(batch_dim, 1)) return index @@ -1879,10 +2019,10 @@ def method_tests(): ('diagonal', (M, M, M), (1, 1, 2), '3d_1'), ('diagonal', (M, M, M), (2, 0, 1), '3d_2'), ('diagonal', (M, M, M), (-2, 0, 1), '3d_3'), - ('tile', (S, S, S), ([S, S, S, S],), 'more_reps_dims', (False,)), - ('tile', (S, S, S), ([S, S, S],), 'same_reps_dims', (False,)), - ('tile', (S, S, S), ([S, M],), 'less_reps_dims', (False,)), - ('tile', (S, S, S), ([S, S, 0],), 'zero_rep_dim', (False,)), + ('tile', (2, 2), ([2, 2, 2],), 'more_reps_dims', (False,)), + ('tile', (2, 2), ([2, 2],), 'same_reps_dims', (False,)), + ('tile', (2, 2), ([2, 3],), 'less_reps_dims', (False,)), + ('tile', (2, 2, 2), ([2, 2, 0],), 'zero_rep_dim', (False,)), ('tile', (), ([S, S, S],), 'empty_tensor', (False,)), ('tril', (M, M), NO_ARGS), ('tril', (M, M), (2,), 'idx'), @@ -1897,9 +2037,6 @@ def method_tests(): ('trace', (M, M), NO_ARGS), ('cross', (S, 3), ((S, 3),)), ('cross', (S, 3, S), ((S, 3, S), 1), 'dim'), - ('index_select', (S, S, S), (0, index_variable(2, S)), 'dim', (), [0]), - ('index_select', (), (0, torch.tensor([0], dtype=torch.int64)), 'scalar_mixed_dim', (), [0]), - ('index_select', (), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_dim', (), [0]), ('index_add', (S, S), (0, index_variable(2, S), (2, S)), 'dim', (), [0]), ('index_add', (), (0, torch.tensor([0], dtype=torch.int64), (1,)), 'scalar_input_dim', (), [0]), ('index_add', (), (0, torch.tensor(0, dtype=torch.int64), ()), 'scalar_all_dim', (), [0]), @@ -2098,11 +2235,6 @@ def method_tests(): ('tensor_split', (S, S, S), (3, 1), 'sections_dim', (False,), [1]), ('tensor_split', (S, S, S), ([2, 4],), 'indices', (False,)), ('tensor_split', (S, S, S), ([2, 4], 1), 'indices_dim', (False,), [1]), - ('gather', (M, S), (0, gather_variable((S, S), 1, M, True)), 'dim0', (), [0]), - ('gather', (M, S), (1, gather_variable((M, S // 2), 0, S, True)), 'dim1', (), [0]), - ('gather', (), (0, torch.tensor([0], dtype=torch.int64)), 'scalar_input', (), [0]), - ('gather', (S,), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_index', (), [0]), - ('gather', (), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_both', (), [0]), ('scatter', (M, S), (0, gather_variable((S, S), 1, M), (S, 
S)), 'dim0', (), [0]), ('scatter', (M, S), (1, gather_variable((M, S // 2), 0, S), (M, S // 2)), 'dim1', (), [0]), ('scatter', (), (0, torch.tensor(0, dtype=torch.int64), ()), 'scalartensor_all_dim0', (), [0]), From c51455a7bbeb71956023c49116f428097dc223af Mon Sep 17 00:00:00 2001 From: Brandon Lin Date: Mon, 4 Jan 2021 19:54:01 -0800 Subject: [PATCH 51/89] [FX] fix Graph python_code return type annotation (#49931) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49931 This fixes #49932. The `maybe_return_annotation` was not being passed by reference, so it was never getting modified. Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D25725582 Pulled By: esqu1 fbshipit-source-id: 4136ff169a269d6b98f0b8e14d95d19e7c7cfa71 --- test/test_fx.py | 13 +++++++++++++ torch/fx/graph.py | 8 +++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 5e285039a6dd..65d5aa3f0101 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1196,5 +1196,18 @@ def forward(self, x): input = torch.rand(3, 4) self.assertEqual(traced(input), Pair(input, input)) + def test_return_type_exists(self): + class ReturnTypeModule(torch.nn.Module): + def other(self, x: List[str]) -> List[str]: + return x + + def forward(self, x: List[str]) -> List[str]: + return self.other(x) + + traced = symbolic_trace(ReturnTypeModule()) + self.assertIn("-> typing.List[str]", traced._code) + scripted = torch.jit.script(traced) + self.assertIn("-> List[str]", scripted.code) + if __name__ == '__main__': run_tests() diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 8f07f42529aa..fd0087dca398 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -577,7 +577,9 @@ def python_code(self, root_module: str) -> str: free_vars: List[str] = [] modules_used : Set[str] = set() body: List[str] = [] - maybe_return_annotation : str = '' + + # Wrap string in list to pass by reference + maybe_return_annotation : List[str] = [''] def register_modules_used(qualified_name : str): if '.' 
in qualified_name: @@ -675,7 +677,7 @@ def emit_node(node : Node): return elif node.op == 'output': if node.type is not None: - maybe_return_annotation = f" -> {type_repr(node.type)}" + maybe_return_annotation[0] = f" -> {type_repr(node.type)}" body.append(f'return {repr(node.args[0])}') return raise NotImplementedError(f'node: {node.op} {node.target}') @@ -695,7 +697,7 @@ def emit_node(node : Node): code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' fn_code = f"""\ {import_block} -def forward(self, {', '.join(free_vars)}){maybe_return_annotation}: +def forward(self, {', '.join(free_vars)}){maybe_return_annotation[0]}: {code} """ From ee80b458433627ea8e4f48e7b50043dca1906bb1 Mon Sep 17 00:00:00 2001 From: "Wei Wang (Server LLVM)" Date: Mon, 4 Jan 2021 23:17:04 -0800 Subject: [PATCH 52/89] [TensorExpr] Fix LLVM 10 build after LLVM API changes Summary: Use `llvm::CodeGenFileType` for llvm-10+ Test Plan: local build Reviewed By: asuhan Differential Revision: D25694990 fbshipit-source-id: c35d973ef2669929715a94da5dd46e4a0457c4e8 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index c5f94f16783d..adc3be984216 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -14,7 +14,13 @@ #include #include #include + +#if LLVM_VERSION_MAJOR >= 10 +#include +#else #include +#endif + #include #include #include @@ -533,7 +539,11 @@ void LLVMCodeGenImpl::emitKernel( PM, asmStream, nullptr, +#if LLVM_VERSION_MAJOR >= 10 + llvm::CodeGenFileType::CGFT_AssemblyFile); +#else llvm::TargetMachine::CodeGenFileType::CGFT_AssemblyFile); +#endif PM.run(*module_); } GRAPH_DEBUG( From 6e6231f9cd23e94db811a57f5ca50f7bdc5e8feb Mon Sep 17 00:00:00 2001 From: Hector Yuen Date: Tue, 5 Jan 2021 00:25:40 -0800 Subject: [PATCH 53/89] unit test for fc parallelization aot (#50056) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50056 buck test //caffe2/caffe2/contrib/fakelowp/test:test_chunkingnnpi -- --fallback-classic Test Plan: https://our.intern.facebook.com/intern/testinfra/testrun/7036874446100155 Reviewed By: venkatacrc Differential Revision: D25731079 fbshipit-source-id: 4aa4ffc641659cd90bf4670d28cb43e43ae76dcd --- caffe2/contrib/fakelowp/test/test_chunking.py | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 caffe2/contrib/fakelowp/test/test_chunking.py diff --git a/caffe2/contrib/fakelowp/test/test_chunking.py b/caffe2/contrib/fakelowp/test/test_chunking.py new file mode 100644 index 000000000000..306b5c3b3f02 --- /dev/null +++ b/caffe2/contrib/fakelowp/test/test_chunking.py @@ -0,0 +1,142 @@ +# Must happen before importing caffe2.python.* +import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime +import numpy as np +from hypothesis import given, settings, example +from hypothesis import strategies as st +from caffe2.python import core, workspace +from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net +from caffe2.python.fakelowp.test_utils import print_test_debug_info +import caffe2.python.serialized_test.serialized_test_util as serial + +# Test that parallel chunks behave the same way as the serial one + +workspace.GlobalInit( + [ + "caffe2", + "--glow_global_fp16=1", + "--glow_global_fused_scale_offset_fp16=1", + "--glow_global_force_sls_fp16_accum=1", + "--glow_nnpi_num_parallel_chunks=2", + "--glow_use_dag_optimizer=false", + 
"--glow_dump_graph=true", + ] +) + +class Fusions(serial.SerializedTestCase): + def _get_scale_zp(self, tensor): + tensor_max = np.max(tensor) + tensor_min = min(0, np.min(tensor)) + scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0)) + if scale < 1e-6: + scale = 1e-6 + zero_point = 0 - tensor_min / scale + zero_point = int(round(np.clip(zero_point, 0, 255.0))) + return (scale, zero_point) + + @given( + scale=st.floats(1e-4, 1e2), + zp=st.integers(-128, 128), + rand_seed=st.integers(0, 65534), + m=st.integers(32, 64), + k=st.integers(1000, 6000), + n=st.integers(200, 600), + ) + # @example(m=64, k=5423, n=553, scale=1e-3, zp=120, rand_seed=1) + @settings(deadline=datetime.timedelta(seconds=1000), max_examples=1) + def test_ParallelFC(self, m, k, n, scale, zp, rand_seed): + np.random.seed(rand_seed) + workspace.ResetWorkspace() + + # Y = W_T * X + b + X_fp32 = np.random.uniform(-1, 1, size=(m, k)).astype(np.float16) \ + .astype(np.float32) + + W_fp32 = np.random.uniform(-1, 1, size=(n, k)).astype(np.float32) + b_fp32 = np.zeros((n,), dtype=np.float32) + + X_scale, X_zero_point = self._get_scale_zp(X_fp32) + + workspace.FeedBlob("X", X_fp32) + workspace.FeedBlob("W", W_fp32) + workspace.FeedBlob("b", b_fp32) + + workspace.RunOperatorOnce( + core.CreateOperator( + "Int8FCPackWeight", + ["W"], + ["W_int8"], + engine="DNNLOWP", + save_unpacked_weights=True, + in_scale=X_scale, + ) + ) + + ref_net = core.Net("net") + ref_net.Int8QuantizeNNPI( + ["X"], + ["X_int8"], + Y_scale=X_scale, + Y_zero_point=X_zero_point + ) + ref_net.Int8FCFakeAcc32NNPI( + ["X_int8", "W_int8", "b"], + ["Y_int8"], + Y_scale=X_scale, + Y_zero_point=X_zero_point, + ) + ref_net.Int8Relu( + ["Y_int8"], + ["Y_relu"], + Y_zero_point=X_zero_point, + Y_scale=X_scale, + ) + ref_net.Int8DequantizeNNPI( + ["Y_relu"], + ["Y"] + ) + ref_net.Proto().external_output.append("Y") + + # run ref_net + workspace.RunNetOnce(ref_net) + Y_fbgemm = workspace.FetchBlob("Y") + + # run onnxifi net + ref_net.Proto().op[0].type = "Int8Quantize" + ref_net.Proto().op[1].type = "Int8FC" + ref_net.Proto().op[2].type = "Int8Relu" + ref_net.Proto().op[3].type = "Int8Dequantize" + net_onnxified = onnxifi_caffe2_net( + ref_net.Proto(), + {}, + debug=True, + adjust_batch=False, + use_onnx=False, + weight_names=["W_int8", "b"], + ) + num_onnxified_ops = sum( + 1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op + ) + print(net_onnxified) + np.testing.assert_equal(num_onnxified_ops, 1) + workspace.CreateNet(net_onnxified) + workspace.RunNet(net_onnxified.name) + Y_glow = workspace.FetchBlob("Y") + + if not np.allclose(Y_glow, Y_fbgemm): + diff_Y = np.abs(Y_glow - Y_fbgemm) + print_test_debug_info( + "int8_fc", + { + "seed": rand_seed, + "n": n, + "X": X_fp32, + "W": W_fp32, + "b": b_fp32, + "Y_fbgemm": Y_fbgemm, + "Y_glow": Y_glow, + "diff": diff_Y, + "maxdiff": diff_Y.max(axis=1), + }, + ) + assert 0 From e4d596c575018eddab9d9b6be0e42319de51224b Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Tue, 5 Jan 2021 06:59:00 -0800 Subject: [PATCH 54/89] Fix return value of _vmap_internals._get_name (#49951) Summary: This appears to have been a copy-paste error. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49951 Reviewed By: mrshenli Differential Revision: D25757099 Pulled By: zou3519 fbshipit-source-id: e47cc3b0694645bd0025326bfe45852ef0266adf --- torch/_vmap_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_vmap_internals.py b/torch/_vmap_internals.py index 67e2ec1a2cd9..26f32cfd9ffd 100644 --- a/torch/_vmap_internals.py +++ b/torch/_vmap_internals.py @@ -137,7 +137,7 @@ def _get_name(func: Callable): # Not all callables have __name__, in fact, only static functions/methods do. # A callable created via functools.partial or an nn.Module, to name some # examples, don't have a __name__. - fn_name = repr(func) + return repr(func) # vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors, # sends those into func, and then unwraps the output BatchedTensors. Operations From 8c66aec435a5d3e6a79497b8b5187068e9b483da Mon Sep 17 00:00:00 2001 From: yqtianust <41743566+yqtianust@users.noreply.github.com> Date: Tue, 5 Jan 2021 08:12:46 -0800 Subject: [PATCH 55/89] Fix grammar typo in readme.md (#50000) Summary: missing ` Pull Request resolved: https://github.com/pytorch/pytorch/pull/50000 Reviewed By: ezyang Differential Revision: D25759608 Pulled By: mrshenli fbshipit-source-id: 4dbe06b8978ae5b2b9b66cde163dab4bd8ee2257 --- .jenkins/pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/pytorch/README.md b/.jenkins/pytorch/README.md index ea6c6dd40f68..86c50f550400 100644 --- a/.jenkins/pytorch/README.md +++ b/.jenkins/pytorch/README.md @@ -38,5 +38,5 @@ mechanisms we use: build scripts. - We reroute well known paths like `/usr/bin/gcc` to alternate - implementations with `update-alternatives, instead of setting + implementations with `update-alternatives`, instead of setting `CC` and `CXX` in our implementations. From 3845770349a5894860f1aa80a59823c2f966b833 Mon Sep 17 00:00:00 2001 From: yqtianust <41743566+yqtianust@users.noreply.github.com> Date: Tue, 5 Jan 2021 08:16:56 -0800 Subject: [PATCH 56/89] Fixing error in Readme.md. (#50033) Summary: Fix incorrect command in readme. Fix incorrect url in readme. Add url for dockerfile. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50033 Reviewed By: ezyang Differential Revision: D25759567 Pulled By: mrshenli fbshipit-source-id: 2a3bc88c8717a3890090ddd0d6657f49d14ff05a --- .jenkins/pytorch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.jenkins/pytorch/README.md b/.jenkins/pytorch/README.md index 86c50f550400..9fd68ecf7f15 100644 --- a/.jenkins/pytorch/README.md +++ b/.jenkins/pytorch/README.md @@ -10,9 +10,9 @@ it is very easy to run these tests yourself: ``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``, where ``$BUILD_ENVIRONMENT`` is one of the build environments enumerated in - [pytorch-dockerfiles](https://github.com/pietern/pytorch-dockerfiles/blob/master/build.sh) + [pytorch-dockerfiles](https://github.com/pytorch/pytorch/blob/master/.circleci/docker/build.sh). The dockerfile used by jenkins can be found under the `.circle` [directory](https://github.com/pytorch/pytorch/blob/master/.circleci/docker) -2. Run ``docker -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and +2. Run ``docker run -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and run one of the scripts in this directory. 
The Docker images are designed so that any "reasonable" build commands From ace78ddb6a2bdbf03f08c69767eba57306dd69ed Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Tue, 5 Jan 2021 08:44:12 -0800 Subject: [PATCH 57/89] Revert D25763758: [pytorch][PR] introduce a flag to disable aten::cat in TE Test Plan: revert-hammer Differential Revision: D25763758 (https://github.com/pytorch/pytorch/commit/9e0b4a96e48132190220820684033a77a92e8a33) Original commit changeset: c4f4a8220964 fbshipit-source-id: 98775ad9058b81541a010e646b0cf4864854be3e --- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 73361f8f3415..041471bfa077 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -15,10 +15,6 @@ #include #include -C10_DEFINE_bool( - torch_jit_disable_cat, - false, - "disable aten::cat in TE fusion groups"); namespace torch { namespace jit { @@ -206,10 +202,6 @@ bool isSupported(Node* node) { } } - if (FLAGS_torch_jit_disable_cat && node->kind() == aten::cat) { - return false; - } - return true; } From 52933b9923af3cabd830a915ae8f3a0aaa8cbab5 Mon Sep 17 00:00:00 2001 From: Andrii Grynenko Date: Tue, 5 Jan 2021 09:06:04 -0800 Subject: [PATCH 58/89] Patch death tests/fork use after D25292667 (part 3) Summary: (Note: this ignores all push blocking failures!) Test Plan: unit tests Differential Revision: D25775357 fbshipit-source-id: 0ae3c59181bc123d763ed9c0d05c536998ae5ca0 --- caffe2/contrib/gloo/gloo_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index fbca9b8fe64c..5ae066f5e3ca 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -27,7 +27,6 @@ op_engine = 'GLOO' - class TemporaryDirectory: def __enter__(self): self.tmpdir = tempfile.mkdtemp() From e35b822d7d3c9caffe5c619c402cb7afc23d4f0b Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Tue, 5 Jan 2021 09:19:15 -0800 Subject: [PATCH 59/89] fixes indices computation for trilinear interpolate backwards (#50084) Summary: https://github.com/pytorch/pytorch/issues/48675 had some typos in indices computations so that results for trilinear interpolation where height is not equal to width were wrong. This PR fixes it. 
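For intuition on the fix: the backward kernel linearizes (N*C, D, H, W) offsets through an `idx_3d` helper, and the bug was passing `width1`/`height1` in swapped order. The sketch below is an illustrative Python stand-in (the exact linearization formula of the CUDA helper is an assumption here); it shows why the swap only corrupts offsets once height and width differ:

```
# Illustrative stand-in for the flat-index helper used by the backward kernel.
def idx_3d(nc, depth, height, width, t, h, w):
    return ((nc * depth + t) * height + h) * width + w

# With H=4, W=5 the correct and swapped argument orders disagree:
print(idx_3d(0, 3, 4, 5, 1, 2, 3))  # height, width: ((0*3+1)*4+2)*5+3 = 33
print(idx_3d(0, 3, 5, 4, 1, 2, 3))  # width, height: ((0*3+1)*5+2)*4+3 = 31
```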
cc xwang233 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50084 Reviewed By: BIT-silence Differential Revision: D25777083 Pulled By: ngimel fbshipit-source-id: 71be545628735fe875b7ea30bf6a09df4f2fae5c --- aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu | 14 +++++++------- torch/testing/_internal/common_nn.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 834c000fdb05..8ac7abca1824 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -168,43 +168,43 @@ __global__ void upsample_trilinear3d_backward_out_frame( true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p), i_numel, static_cast(t0lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1), i_numel, static_cast(t0lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p), i_numel, static_cast(t0lambda * h1lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1), i_numel, static_cast(t1lambda * h0lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p), i_numel, static_cast(t1lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1), i_numel, static_cast(t1lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p), i_numel, static_cast(t1lambda * h1lambda * w1lambda * d2val), true); diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index c588f69c2875..714361497d94 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2988,7 +2988,7 @@ def fractional_max_pool3d_test(test_case): .scale_factor(std::vector({3., 3., 3.})) .mode(torch::kTrilinear) .align_corners(false)''', - input_size=(1, 2, 3, 4, 4), + input_size=(1, 2, 3, 4, 5), fullname='interpolate_trilinear_scale_3d', # See https://github.com/pytorch/pytorch/issues/5006 precision=3e-4, From b7bfc723d3245bffce92d1b0a2cef68f48426a5e Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Tue, 5 Jan 2021 09:22:01 -0800 Subject: [PATCH 60/89] Run mypy on more test files (#49658) Summary: Improves one annotation for `augment_model_with_bundled_inputs` Also add a comment to not work on caffe2 type annotations, that's not worth the effort - those ignores can stay as they are. 
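As a side note on the `augment_model_with_bundled_inputs` annotation change (a sketch with hypothetical function names, not code from this PR): `Sequence` is the more permissive parameter type, since a caller passing a tuple of sample tuples satisfies `Sequence` but not `List`, which mypy would reject:

```
from typing import Any, List, Optional, Sequence, Tuple

def takes_list(inputs: Optional[List[Tuple[Any, ...]]] = None) -> None: ...
def takes_seq(inputs: Optional[Sequence[Tuple[Any, ...]]] = None) -> None: ...

samples = ((1,), (2,))   # a tuple of sample-input tuples
takes_list(samples)      # mypy error: a tuple is not a List
takes_seq(samples)       # accepted: Sequence covers both lists and tuples
```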
xref gh-16574 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49658 Reviewed By: heitorschueroff Differential Revision: D25757721 Pulled By: ezyang fbshipit-source-id: 44c396d8da9ef3f41b97f9c46a528f0431c4b463 --- mypy.ini | 11 +++++++++++ test/test_bundled_inputs.py | 4 +++- test/test_expecttest.py | 3 ++- test/test_numpy_interop.py | 18 +++++++++--------- torch/testing/_internal/expecttest.py | 4 +++- torch/utils/bundled_inputs.py | 4 ++-- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/mypy.ini b/mypy.ini index 7d6161bddd17..bab4ce5dfd42 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,8 +17,13 @@ check_untyped_defs = True files = torch, caffe2, + test/test_bundled_images.py, + test/test_bundled_inputs.py, test/test_complex.py, + test/test_dataset.py, + test/test_expecttest.py, test/test_futures.py, + test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, test/test_type_info.py @@ -119,6 +124,12 @@ ignore_errors = True [mypy-torch.overrides] ignore_errors = True +# +# Adding type annotations to caffe2 is probably not worth the effort +# only work on this if you have a specific reason for it, otherwise +# leave these ignores as they are. +# + [mypy-caffe2.python.*] ignore_errors = True diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py index f57407c9b1d1..e12339f3acea 100644 --- a/test/test_bundled_inputs.py +++ b/test/test_bundled_inputs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 import io +from typing import List + import torch import torch.utils.bundled_inputs from torch.testing._internal.common_utils import TestCase, run_tests @@ -27,7 +29,7 @@ def forward(self, arg): sm = torch.jit.script(SingleTensorModel()) original_size = model_size(sm) - get_expr = [] + get_expr : List[str] = [] samples = [ # Tensor with small numel and small storage. (torch.tensor([1]),), diff --git a/test/test_expecttest.py b/test/test_expecttest.py index 652a33c41869..5e2461797705 100644 --- a/test/test_expecttest.py +++ b/test/test_expecttest.py @@ -4,6 +4,7 @@ import string import textwrap import doctest +from typing import Dict, Any import hypothesis from hypothesis.strategies import text, integers, composite, sampled_from, booleans @@ -38,7 +39,7 @@ def test_replace_string_literal_roundtrip(self, t, raw, quote): r3 = {r}{quote}placeholder3{quote} """.format(r='r' if raw else '', quote=quote * 3) new_prog = expecttest.replace_string_literal(textwrap.dedent(prog), 2, t)[0] - ns = {} + ns : Dict[str, Any] = {} exec(new_prog, ns) msg = "program was:\n{}".format(new_prog) self.assertEqual(ns['r'], 'placeholder', msg=msg) # noqa: F821 diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 35ac4eb94889..81c385ae90a2 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -47,10 +47,8 @@ def get_castable_tensor(shape, dtype): else: # can't directly use min and max, because for int64_t, max - min # is greater than int64_t range and triggers UB. 
- dtype_info = torch.iinfo(dtype) - low = max(dtype_info.min, int(-1e10)) - high = min(dtype_info.max, int(1e10)) - dtype_info = torch.iinfo(dtype) + low = max(torch.iinfo(dtype).min, int(-1e10)) + high = min(torch.iinfo(dtype).max, int(1e10)) t = torch.empty(shape, dtype=torch.int64).random_(low, high) return t.to(dtype) @@ -272,10 +270,12 @@ def test_numpy_array_interface(self, device): ] for tp, dtype in zip(types, dtypes): if np.dtype(dtype).kind == 'u': - x = torch.Tensor([1, 2, 3, 4]).type(tp) + # .type expects a XxxTensor, which have no type hints on + # purpose, so ignore during mypy type checking + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) else: - x = torch.Tensor([1, -2, 3, -4]).type(tp) + x = torch.Tensor([1, -2, 3, -4]).type(tp) # type: ignore array = np.array([1, -2, 3, -4], dtype=dtype) # Test __array__ w/o dtype argument @@ -309,7 +309,7 @@ def test_numpy_array_interface(self, device): float_types = [torch.DoubleTensor, torch.FloatTensor] float_dtypes = [np.float64, np.float32] for tp, dtype in zip(float_types, float_dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) for func in ['sin', 'sqrt', 'ceil']: ufunc = getattr(np, func) @@ -321,7 +321,7 @@ def test_numpy_array_interface(self, device): # Test functions with boolean return value for tp, dtype in zip(types, dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) geq2_x = np.greater_equal(x, 2) geq2_array = np.greater_equal(array, 2).astype('uint8') @@ -360,7 +360,7 @@ def test_parse_numpy_int(self, device): self.assertEqual(torch.ones([2, 2, 2, 2]).mean(scalar), torch.ones([2, 2, 2, 2]).mean(np_val)) # numpy integral type parses like a python int in custom python bindings: - self.assertEqual(torch.Storage(np_val).size(), scalar) + self.assertEqual(torch.Storage(np_val).size(), scalar) # type: ignore tensor = torch.tensor([2], dtype=torch.int) tensor[0] = np_val diff --git a/torch/testing/_internal/expecttest.py b/torch/testing/_internal/expecttest.py index 9e46a9a84a37..4dae7ebf03dc 100644 --- a/torch/testing/_internal/expecttest.py +++ b/torch/testing/_internal/expecttest.py @@ -3,6 +3,7 @@ import traceback import os import string +from typing import Tuple # This file implements expect tests (also known as "golden" tests). @@ -139,7 +140,8 @@ def ok_for_raw_triple_quoted_string(s, quote): r"(?Pr?)", re.DOTALL) -def replace_string_literal(src, lineno, new_string): +def replace_string_literal(src : str, lineno : int, + new_string : str) -> Tuple[str, int]: r""" Replace a triple quoted string literal with new contents. Only handles printable ASCII correctly at the moment. 
This diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py index c5d603885e4a..741c0841778a 100644 --- a/torch/utils/bundled_inputs.py +++ b/torch/utils/bundled_inputs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union +from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union, Sequence import textwrap import torch from torch._C import TupleType, OptionalType, ListType @@ -17,7 +17,7 @@ class InflatableArg(NamedTuple): def augment_model_with_bundled_inputs( model: torch.jit.ScriptModule, - inputs: Optional[List[Tuple[Any, ...]]] = None, + inputs: Optional[Sequence[Tuple[Any, ...]]] = None, _receive_inflate_expr: Optional[List[str]] = None, # For debugging. ) -> None: """Add bundled sample inputs to a model. From c86cfcd81da46b5e8226441edb58f0b11a97f215 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Tue, 5 Jan 2021 09:25:57 -0800 Subject: [PATCH 61/89] Run mypy over test/test_utils.py (#49654) Summary: This caught one incorrect annotation in `cpp_extension.load`. xref gh-16574. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49654 Reviewed By: heitorschueroff Differential Revision: D25757691 Pulled By: ezyang fbshipit-source-id: 145ce3ae532cc585d9ca3bbd5381401bad0072e2 --- mypy.ini | 3 +- test/test_utils.py | 56 +++++++++++++++++++----------------- torch/utils/cpp_extension.py | 4 +-- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/mypy.ini b/mypy.ini index bab4ce5dfd42..a74ed7979eac 100644 --- a/mypy.ini +++ b/mypy.ini @@ -26,7 +26,8 @@ files = test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, - test/test_type_info.py + test/test_type_info.py, + test/test_utils.py # Minimum version supported - variable annotations were introduced diff --git a/test/test_utils.py b/test/test_utils.py index 5f1e693ab12f..9733ae036d6e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -10,6 +10,7 @@ import torch import torch.nn as nn import torch.utils.data +from torch.utils.data import DataLoader import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential import torch.utils.cpp_extension @@ -28,7 +29,7 @@ from torch.testing._internal.common_utils import TestCase, run_tests -class RandomDatasetMock(object): +class RandomDatasetMock(torch.utils.data.Dataset): def __getitem__(self, index): return torch.tensor([torch.rand(1).item(), random.uniform(0, 1)]) @@ -190,7 +191,7 @@ def forward(self, a, b): b = torch.randn(1, 100, requires_grad=True) with self.assertRaises(TypeError): - checkpoint_sequential(model, 1, a, b) + checkpoint_sequential(model, 1, a, b) # type: ignore[call-arg] def test_checkpoint_sequential_deprecated_no_args(self): class Noop(nn.Module): @@ -200,7 +201,7 @@ def forward(self): model = nn.Sequential(Noop()) with self.assertRaises(TypeError): - checkpoint_sequential(model, 1) + checkpoint_sequential(model, 1) # type: ignore[call-arg] def test_checkpoint_rng_cpu(self): for _ in range(5): @@ -277,7 +278,7 @@ def run_fn(tensor1, tensor2): out = checkpoint(run_fn, input_var, input_var2) out[0].sum().backward() - def run_fn(tensor1, tensor2): + def run_fn2(tensor1, tensor2): return tensor1 input_var = torch.randn(1, 4, requires_grad=False) input_var2 = torch.randn(1, 4, requires_grad=True) @@ -285,7 +286,7 @@ def run_fn(tensor1, tensor2): RuntimeError, r"none of output has requires_grad=True, this checkpoint\(\) is not necessary" ): - out = checkpoint(run_fn, input_var, input_var2) + out = 
checkpoint(run_fn2, input_var, input_var2) out.sum().backward() class TestDataLoader(TestCase): @@ -308,35 +309,38 @@ def run(): self.assertEqual(x1, x2) def test_single_keep(self): - dataloader = torch.utils.data.DataLoader(self.dataset, - batch_size=self.batch_size, - num_workers=0, - drop_last=False) + # self.dataset is a Tensor here; technically not a valid input because + # not a Dataset subclass, but needs to stay working so add ignore's + # for type checking with mypy + dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=False) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_single_drop(self): - dataloader = torch.utils.data.DataLoader(self.dataset, - batch_size=self.batch_size, - num_workers=0, - drop_last=True) + dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=True) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) @unittest.skip("FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN") def test_multi_keep(self): - dataloader = torch.utils.data.DataLoader(self.dataset, - batch_size=self.batch_size, - num_workers=2, - drop_last=False) + dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=False) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_multi_drop(self): - dataloader = torch.utils.data.DataLoader(self.dataset, - batch_size=self.batch_size, - num_workers=2, - drop_last=True) + dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=True) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) @@ -347,7 +351,7 @@ def test_multi_drop(self): class TestFFI(TestCase): def test_deprecated(self): with self.assertRaisesRegex(ImportError, "torch.utils.ffi is deprecated. 
Please use cpp extensions instead."): - from torch.utils.ffi import create_extension # noqa: F401 + from torch.utils.ffi import create_extension # type: ignore # noqa: F401 @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') @@ -364,9 +368,9 @@ def _run(self, command, timeout=30): p.kill() output, err = p.communicate() rc = p.returncode - output = output.decode("ascii") - err = err.decode("ascii") - return (rc, output, err) + output_str = output.decode("ascii") + err_str = err.decode("ascii") + return (rc, output_str, err_str) def _run_bottleneck(self, test_file, scriptargs=''): curdir = os.path.dirname(os.path.abspath(__file__)) @@ -661,7 +665,7 @@ def forward(self, x): # data can be passed without errors x = torch.randn(4, 4).fill_(1.0) ms(x) - with self.assertRaisesRegex(torch.jit.Error, "foo"): + with self.assertRaisesRegex(torch.jit.Error, "foo"): # type: ignore[type-var] ms(torch.tensor([False], dtype=torch.bool)) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 7ca849d48de0..93a8f403bc4a 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -17,7 +17,7 @@ from ._cpp_extension_versioner import ExtensionVersioner from .hipify import hipify_python from .hipify.hipify_python import get_hip_file_path, GeneratedFileCleaner -from typing import List, Optional +from typing import List, Optional, Union from setuptools.command.build_ext import build_ext from pkg_resources import packaging # type: ignore @@ -980,7 +980,7 @@ def library_paths(cuda: bool = False) -> List[str]: def load(name, - sources: List[str], + sources: Union[str, List[str]], extra_cflags=None, extra_cuda_cflags=None, extra_ldflags=None, From 72306378b4be2bb87060dc2ce32ab870e80b1b49 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 5 Jan 2021 09:28:48 -0800 Subject: [PATCH 62/89] quant: ensure observers do not crash for empty Tensors (#49800) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49800 Ensures that having a Tensor with 0 elements does not crash observers. Note: it's illegal to pass Tensors with 0 elements to reductions such as min and max, so we gate this out before the logic hits min/max. This should not be hit often in practice, but it's coming up during debugging of some RCNN models with test inputs. 
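As a rough usage sketch (not part of the diff; the observer type and shapes are arbitrary), the guarded observers can now be fed a 0-element tensor without tripping the min/max reductions:

```
import torch
from torch.quantization import MinMaxObserver

obs = MinMaxObserver()
obs(torch.Tensor())      # 0 elements: input is returned unchanged, no reduction runs
obs(torch.randn(2, 3))   # non-empty inputs still update the running min/max
print(obs.calculate_qparams())
```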
Test Plan: ``` python test/test_quantization.py TestObserver.test_zero_numel ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25693230 fbshipit-source-id: d737559697c98bd923356edacba895835060bb38 --- test/quantization/test_workflow_module.py | 15 +++++++++++++++ torch/quantization/observer.py | 10 ++++++++++ 2 files changed, 25 insertions(+) diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 22751697cd1d..8a70ae149c29 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -10,6 +10,7 @@ PlaceholderObserver, NoopObserver, FakeQuantize, + FixedQParamsFakeQuantize, default_debug_qconfig, default_observer, default_per_channel_weight_observer, @@ -504,6 +505,20 @@ def test_observer_qparams_respects_device_affinity(self): self.assertEqual(x.device, scale.device) self.assertEqual(x.device, zero_point.device) + def test_zero_numel(self): + obs_list = [MinMaxObserver, MovingAverageMinMaxObserver, + PerChannelMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, HistogramObserver, + FakeQuantize, FixedQParamsFakeQuantize] + for obs_cls in obs_list: + if obs_cls is FixedQParamsFakeQuantize: + obs = obs_cls(0.1, 0) + else: + obs = obs_cls() + x = torch.Tensor() + # verify no crash + x = obs(x) + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py index 7addaa622962..2cc579f66087 100644 --- a/torch/quantization/observer.py +++ b/torch/quantization/observer.py @@ -390,6 +390,8 @@ def __init__(self, dtype=torch.quint8, qscheme=torch.per_tensor_affine, def forward(self, x_orig): r"""Records the running minimum and maximum of ``x``.""" + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val_cur, max_val_cur = torch._aminmax(x) @@ -463,6 +465,8 @@ def __init__(self, averaging_constant=0.01, dtype=torch.quint8, quant_max=quant_max) def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val = self.min_val @@ -532,6 +536,8 @@ def forward(self, x_orig): return self._forward(x_orig) def _forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape min_vals = self.min_vals max_vals = self.max_vals @@ -638,6 +644,8 @@ def __init__(self, averaging_constant=0.01, ch_axis=0, dtype=torch.quint8, self.averaging_constant = averaging_constant def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_vals.dtype) min_vals = self.min_vals @@ -878,6 +886,8 @@ def _combine_histograms(self, return orig_hist def forward(self, x_orig: torch.Tensor) -> torch.Tensor: + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() min_val = self.min_val max_val = self.max_val From 44c17b28c693c044daea169426c8e111e1e148fc Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 5 Jan 2021 09:34:07 -0800 Subject: [PATCH 63/89] quant: nice error message on convtranspose with per-channel weight (#49899) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49899 Per channel weights observer in conv transpose is not supported yet. Adding an error message which fails instantly instead of making the user wait until after calibration/training finishes. 
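A minimal eager-mode sketch of the new behavior, mirroring the added test (the fbgemm qconfig is simply one whose default weight observer is per-channel); the failure now surfaces at `prepare()` instead of after a full calibration run:

```
import torch

m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1))
m.qconfig = torch.quantization.get_default_qconfig('fbgemm')  # per-channel weight observer
try:
    torch.quantization.prepare(m)
except AssertionError as e:
    print(e)  # per channel weight observer is not supported yet for ConvTranspose{n}d
```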
Test Plan: ``` python test/test_quantization.py TestPostTrainingStatic.test_convtranspose_per_channel_fails_early python test/test_quantization.py TestQuantizeFx.test_convtranspose_per_channel_fails_early ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25717151 fbshipit-source-id: 093e5979030ec185e3e0d56c45d7ce7338bf94b6 --- test/quantization/test_quantize.py | 14 ++++++++++++++ test/quantization/test_quantize_fx.py | 15 +++++++++++++++ torch/quantization/qconfig.py | 17 +++++++++++++++++ torch/quantization/quantize.py | 2 ++ 4 files changed, 48 insertions(+) diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 067c35bd3c64..c47982f0c0cc 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -726,6 +726,20 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with per-Channel + weight observers fails in the prepare step, as opposed to the convert step. + """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.qconfig = torch.quantization.get_default_qconfig('fbgemm') + with self.assertRaises(AssertionError) as context: + mp = torch.quantization.prepare(m) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 545e70a2c5e6..d014bd31f02e 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1278,6 +1278,21 @@ def test_fp32_input_fp32_output(self): self._test_quantized_inputs_outputs( prepare_custom_config_dict, prepare_count_check, convert_count_check) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with per-Channel + weight observers fails in the prepare step, as opposed to the convert step. 
+ """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.eval() + qconfig_dict = {'': torch.quantization.get_default_qconfig('fbgemm')} + with self.assertRaises(AssertionError) as context: + mp = prepare_fx(m, qconfig_dict) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py index 8da4ad6bb182..2d91d8ab6b3e 100644 --- a/torch/quantization/qconfig.py +++ b/torch/quantization/qconfig.py @@ -3,6 +3,8 @@ from .fake_quantize import * import torch.nn as nn +from typing import Union + class QConfig(namedtuple('QConfig', ['activation', 'weight'])): """ Describes how to quantize a layer or a part of the network by providing @@ -109,3 +111,18 @@ def get_default_qat_qconfig(backend='fbgemm'): else: qconfig = default_qat_qconfig return qconfig + +def assert_valid_qconfig(qconfig: Union[QConfig, QConfigDynamic], + mod: torch.nn.Module) -> None: + is_conv_transpose_mod = ( + isinstance(mod, torch.nn.ConvTranspose1d) or + isinstance(mod, torch.nn.ConvTranspose2d) or + isinstance(mod, torch.nn.ConvTranspose3d)) + if is_conv_transpose_mod: + example_observer = qconfig.weight() + is_per_channel = ( + isinstance(example_observer, torch.quantization.PerChannelMinMaxObserver) or + isinstance(example_observer, torch.quantization.MovingAveragePerChannelMinMaxObserver) + ) + assert not is_per_channel, \ + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.' diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 1be867e0a299..a57a4ea6bcb8 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -50,6 +50,8 @@ def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, module_qconfig = qconfig_dict.get(prefix, module_qconfig) module_qconfig = getattr(module, 'qconfig', module_qconfig) + torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module) + module.qconfig = module_qconfig for name, child in module.named_children(): module_prefix = prefix + '.' + name if prefix else name From 113b7623d6bd406670b7e6c8f69f7441454c5fe7 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 5 Jan 2021 10:52:49 -0800 Subject: [PATCH 64/89] quant: throw a nice error message for allclose with quantized inputs (#49802) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49802 Currently `torch.allclose` is not supported with quantized inputs. Throw a nice error message instead of a cryptic one. 
Test Plan: ``` torch.allclose(x_fp32, y_fp32) torch.allclose(x_int8, y_int8) ``` Imported from OSS Reviewed By: supriyar Differential Revision: D25693538 fbshipit-source-id: 8958628433adfca3ae6ce215f3e3ec3c5e29994c --- aten/src/ATen/native/TensorCompare.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index b27a995962b4..5435f5042ce0 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -38,6 +38,8 @@ Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type()); TORCH_CHECK(!(self.is_complex() && equal_nan), "isclose with equal_nan=True is not supported for complex inputs."); + TORCH_CHECK(!(self.is_quantized() || other.is_quantized()), + "isclose is not supported for quantized inputs."); // Checks that rtol and atol are non-negative // Note: consistent with Python's isclose but divergent from NumPy's, which From 04e86be1a202b755e296fbe759366f62e2b1ea43 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 5 Jan 2021 10:52:49 -0800 Subject: [PATCH 65/89] eager quant: fix error with removing forward hooks (#49813) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49813 https://github.com/pytorch/pytorch/issues/49739 reports a crash where removing forward hooks results in a ``` RuntimeError: OrderedDict mutated during iteration ``` Unfortunately I cannot repro this inside the PyTorch module, but the issue author has a good point and and we should not mutate the dict inside of the iteration. Test Plan: ``` // test plan from https://github.com/pytorch/pytorch/pull/46871 which // originally added this python test/test_quantization.py TestEagerModeQATOps ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25698725 fbshipit-source-id: 13069d0d5017a84038c8f7be439a3ed537938ac6 --- torch/quantization/quantize.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index a57a4ea6bcb8..77752a8af9c9 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -258,9 +258,12 @@ def _remove_activation_post_process(module): delattr(module, 'activation_post_process') # remove activation_post_proceess hook + handle_ids_to_remove = set() for handle_id, hook_fn in module._forward_hooks.items(): if hook_fn is _observer_forward_hook: - module._forward_hooks.pop(handle_id) + handle_ids_to_remove.add(handle_id) + for handle_id in handle_ids_to_remove: + module._forward_hooks.pop(handle_id) # TODO: rename to something more general def _remove_qconfig(module): From bbae6774c16f8e7f55b66006172e2ffe85e92c04 Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Tue, 5 Jan 2021 10:53:55 -0800 Subject: [PATCH 66/89] [JIT] Remove buffer metadata serialization forward-compat gate (#49990) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49990 **Summary** This commit removes the forward-compatibility gate for buffer metadata serialization. It was introduced to allow versions of fbcode binaries statically linked against older versions of PyTorch (without buffer metadata in JIT) to deserialize archives produced by new versions of PyTorch. Enough time has probably passed that these old binaries don't exist anymore, so it should be safe to remove the gate. **Test Plan** Internal tests. 
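For context, a minimal sketch of the metadata in question (module and buffer names here are illustrative): after this change the TorchScript source emitted into a saved archive always records the module's buffer names via a `__buffers__ = [...]` entry, with no fbcode-only exception.

```
import torch

class Counter(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # a buffer: serialized module state that is not a Parameter
        self.register_buffer("count", torch.zeros(1))

    def forward(self, x):
        return x + self.count

m = torch.jit.script(Counter())
# The python source written into the archive's code/ directory now always
# lists "count" under __buffers__ for this module.
torch.jit.save(m, "counter.pt")
```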
Test Plan: Imported from OSS Reviewed By: xw285cornell Differential Revision: D25743199 Pulled By: SplitInfinity fbshipit-source-id: 58d82ab4362270b309956826e36c8bf9d620f081 --- torch/csrc/jit/serialization/python_print.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index e203a03a2e24..c86cbc460c9c 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -1339,15 +1339,13 @@ struct PythonPrintImpl { body_ << "\"" << param << "\", "; } body_ << "]\n"; -#ifndef FBCODE_CAFFE2 - // Note: Forward compat gated. TODO: @voznesenskym to remove when ready. + indent(); body_ << "__buffers__ = ["; for (const auto& buffer : buffers) { body_ << "\"" << buffer << "\", "; } body_ << "]\n"; -#endif } for (size_t i = 0; i < numAttrs; i++) { From 3cd2f1f3a74b0101240f194d42ecb5af17375d94 Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Tue, 5 Jan 2021 11:00:35 -0800 Subject: [PATCH 67/89] Add an option to disable aten::cat in TE (re-revert) (#50101) Summary: This reverts commit ace78ddb6a2bdbf03f08c69767eba57306dd69ed. Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/50101 Reviewed By: eellison Differential Revision: D25784785 Pulled By: Krovatkin fbshipit-source-id: cbb3d377e03303f6c8c71f4c59c6d90ab40d55f7 --- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 041471bfa077..e8091957ba65 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -15,6 +15,11 @@ #include #include +// NOLINTNEXTLINE +C10_DEFINE_bool( + torch_jit_disable_cat, + false, + "disable aten::cat in TE fusion groups"); namespace torch { namespace jit { @@ -202,6 +207,10 @@ bool isSupported(Node* node) { } } + if (FLAGS_torch_jit_disable_cat && node->kind() == aten::cat) { + return false; + } + return true; } From c115957df086fd795abaae18c6b9bc1e80928a18 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Tue, 5 Jan 2021 11:25:00 -0800 Subject: [PATCH 68/89] [distributed] Provide parameter to pass GPU ID in barrier function (#49069) Summary: For a multi GPU node, rank and corresponding GPU mapping can be different. Provide optional parameter to specify the GPU device number for the allreduce operation in barrier function. Add test cases to validate barrier device_ids. 
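For illustration, a hedged usage sketch of the new argument (assumes a NCCL process group has already been initialized and that each rank owns one GPU; the rank-to-GPU mapping is application specific):

```
import torch.distributed as dist

# e.g. dist.init_process_group(backend="nccl", rank=rank,
#                              world_size=world_size, store=store)
# has already been called on every process.

# Pin the barrier's underlying allreduce to this rank's GPU instead of
# letting the backend guess which device to use.
dist.barrier(device_ids=[dist.get_rank()])

# device_ids must be a List[int] and is only accepted by the NCCL backend;
# a bare int, or any device_ids with a Gloo/MPI group, raises RuntimeError
# (see the tests added in this patch).
```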
Signed-off-by: Jagadish Krishnamoorthy Fixes https://github.com/pytorch/pytorch/issues/48110 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49069 Reviewed By: mrshenli Differential Revision: D25658528 Pulled By: rohan-varma fbshipit-source-id: 418198b6224c8c1fd95993b80c072a8ff8f02eec --- test/distributed/test_c10d.py | 37 +++++++++++++++++++++++++++ torch/_C/_distributed_c10d.pyi | 1 + torch/csrc/distributed/c10d/init.cpp | 1 + torch/distributed/distributed_c10d.py | 23 ++++++++++++++--- torch/lib/c10d/ProcessGroupNCCL.cpp | 8 +++++- torch/lib/c10d/Types.hpp | 1 + 6 files changed, 67 insertions(+), 4 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 5ffd4b4fb088..93e26be7ee98 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -4641,6 +4641,43 @@ def test_nccl_barrier_timeout_new_group_non_member(self): with self.assertRaisesRegex(RuntimeError, "Timed out initializing process group"): c10d.new_group([0], timeout=timedelta(seconds=1)) + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + c10d.barrier(device_ids=[self.rank]) + + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids_function_argument(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "Invalid function argument"): + c10d.barrier(device_ids=self.rank) + + @requires_gloo() + def test_gloo_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="gloo", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "device_ids not supported"): + c10d.barrier(device_ids=[self.rank]) + if __name__ == "__main__": assert ( not torch.cuda._initialized diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index cd9a0f7d46a9..5ac2c0a8315d 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -77,6 +77,7 @@ class ReduceScatterOptions: timeout: timedelta class BarrierOptions: + device_ids: List[int] timeout: timedelta class AllToAllOptions: diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index b31d44a1d295..76b466c91f10 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -345,6 +345,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::class_<::c10d::BarrierOptions>(module, "BarrierOptions") .def(py::init<>()) + .def_readwrite("device_ids", &::c10d::BarrierOptions::device_ids) .def_readwrite("timeout", &::c10d::BarrierOptions::timeout); py::class_<::c10d::AllToAllOptions>(module, "AllToAllOptions") diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index a8517a4bb394..51065ca9edc8 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -17,6 +17,7 @@ AllreduceOptions, AllreduceCoalescedOptions, AllToAllOptions, + BarrierOptions, BroadcastOptions, FileStore, GatherOptions, @@ -2370,8 +2371,11 @@ def all_to_all(output_tensor_list, work.wait() + def barrier(group=GroupMember.WORLD, - async_op=False): + async_op=False, + 
device_ids=None): + """ Synchronizes all processes. @@ -2382,6 +2386,8 @@ def barrier(group=GroupMember.WORLD, group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. async_op (bool, optional): Whether this op should be an async op + device_ids ([int], optional): List of device/GPU ids. + Valid only for NCCL backend. Returns: Async work handle, if async_op is set to True. @@ -2390,11 +2396,22 @@ def barrier(group=GroupMember.WORLD, if _rank_not_in_group(group): return + opts = BarrierOptions() + if device_ids is not None: + if get_backend(group) != Backend.NCCL: + raise RuntimeError("Function argument device_ids not supported " + "for the selected backend {}".format(get_backend(group))) + if isinstance(device_ids, list): + opts.device_ids = device_ids + else: + raise RuntimeError("Invalid function argument: " + "device_ids type should be List[int]") + if group is None: default_pg = _get_default_group() - work = default_pg.barrier() + work = default_pg.barrier(opts=opts) else: - work = group.barrier() + work = group.barrier(opts=opts) if async_op: return work diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 01ce71afd388..b9ac5aa77150 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -1409,7 +1409,13 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( c10::intrusive_ptr ProcessGroupNCCL::barrier( const BarrierOptions& opts) { std::vector devices; - if (usedDeviceIdxs_.empty()) { + + // Use user defined GPU device ids if provided + if (!opts.device_ids.empty()) { + for (auto device : opts.device_ids) { + devices.push_back(at::Device(at::DeviceType::CUDA, device)); + } + } else if (usedDeviceIdxs_.empty()) { // This means there is not yet a NCCL collective being called // Here we have to use the best guesses and will use a single GPU to call // allreduce to achieve barrier. diff --git a/torch/lib/c10d/Types.hpp b/torch/lib/c10d/Types.hpp index 03b2e59e4295..a5a0d5fa20df 100644 --- a/torch/lib/c10d/Types.hpp +++ b/torch/lib/c10d/Types.hpp @@ -62,6 +62,7 @@ struct AllToAllOptions { }; struct BarrierOptions { + std::vector device_ids; std::chrono::milliseconds timeout = kUnsetTimeout; }; From e868825eb6423d5bf29ca1b1eb4d1dddde74ca68 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 5 Jan 2021 11:45:55 -0800 Subject: [PATCH 69/89] [RPC] Relax some profiling tests (#49983) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49983 We have observed very rare flakiness in some profiling tests recently, i.e.: . However, we were not able to reproduce these even with thousands of runs on the CI machines where the failure was originally reported. As a result, relaxing these tests and re-enabling them to reduce failure rates. 
ghstack-source-id: 119352019 Test Plan: CI Reviewed By: mrshenli Differential Revision: D25739416 fbshipit-source-id: 4dbb6b30f20d3af94ba39f4a7ccf4fb055e440bc --- .../_internal/distributed/rpc/rpc_test.py | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 8eec8100270b..ede2471aa3a2 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1335,7 +1335,11 @@ def convert_remote_to_local(event_name): for event in events if convert_remote_to_local(event.name) in EXPECTED_REMOTE_EVENTS ] - self.assertEqual(remote_events_list, EXPECTED_REMOTE_EVENTS) + self.assertEqual( + set(remote_events_list), + set(EXPECTED_REMOTE_EVENTS), + f"Mismatch between profiled events: {set(remote_events_list)} and expected events: {set(EXPECTED_REMOTE_EVENTS)}", + ) @dist_init def test_profiler_remote_events_profiled(self): @@ -1579,8 +1583,8 @@ def _profiler_test_with_rpc(self, rpc_exec_mode, func, args, use_record_function scope_event = get_function_event(events, "foo") # Since RPC call is within the scope, its CPU interval should be # contained within foo's interval. - self.assertTrue(scope_event.time_range.start < rpc_event.time_range.start) - self.assertTrue(scope_event.time_range.end > rpc_event.time_range.end) + self.assertLessEqual(scope_event.time_range.start, rpc_event.time_range.start) + self.assertGreaterEqual(scope_event.time_range.end, rpc_event.time_range.end) # the sender, dest worker, function run, and type of RPC should all # be recorded. self_worker_name = worker_name(self.rank) @@ -1776,7 +1780,13 @@ def _assert_top_level_events(self, process_global_events, expected_top_level_eve if time_range.start > last_end_time: top_level_event_names.append(event_name) last_end_time = time_range.end - self.assertEqual(sorted(top_level_event_names), sorted(expected_top_level_event_names)) + top_level_event_names = sorted(top_level_event_names) + expected_top_level_event_names = sorted(expected_top_level_event_names) + self.assertEqual( + top_level_event_names, + expected_top_level_event_names, + f"Expected events {expected_top_level_event_names}, but got {top_level_event_names}", + ) @dist_init def test_server_process_global_profiler(self): @@ -1799,9 +1809,12 @@ def test_server_process_global_profiler(self): outer_profile_rref.rpc_sync().__exit__(None, None, None) inner_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (inner_profile_rref,)) - self._assert_top_level_events(inner_events, ['aten::sub']) + expected_inner_events = ['aten::sub'] + expected_outer_events = expected_inner_events + ['aten::add'] + + self._assert_top_level_events(inner_events, expected_inner_events) outer_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (outer_profile_rref,)) - self._assert_top_level_events(outer_events, ['aten::add', 'aten::sub']) + self._assert_top_level_events(outer_events, expected_outer_events) inner_profile_rref.rpc_sync().key_averages() outer_profile_rref.rpc_sync().key_averages() From 12ee7b61e75caa0c57b53ee1627e3075686d2510 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Tue, 5 Jan 2021 12:31:01 -0800 Subject: [PATCH 70/89] support building with conda installed libraries (#50080) Summary: This should fix a bunch of share library compilation error when installed in conda lib, lib64 folder. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50080 Reviewed By: seemethere Differential Revision: D25781923 Pulled By: walterddr fbshipit-source-id: 78a74925981d65243b98bb99a65f1f2766e87a2f --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ba862b5a4d5f..e346087c0cdb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -316,7 +316,7 @@ set(OP_DEPENDENCY "" CACHE STRING # symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk # https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu if(LINUX) - set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed") + set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed ${CMAKE_SHARED_LINKER_FLAGS}") endif() if(MSVC) From 16e5af41dab3a64072d88d76003dd9c2aa49d4bc Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Tue, 5 Jan 2021 12:44:37 -0800 Subject: [PATCH 71/89] Fix store based barrier to only use 'add'. (#49930) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49930 Certain store implementations don't work well when we use get() and add() on the same key. To avoid this issue, we only use add() in the store based barrier. The buggy store implementations can't be properly fixed due to legacy reasons. Test Plan: 1) unit tests. 2) waitforbuildbot Reviewed By: osalpekar Differential Revision: D25725386 fbshipit-source-id: 1535e2629914de7f78847b730f8764f92cde67e7 --- torch/distributed/distributed_c10d.py | 53 ++++++++++++++------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 51065ca9edc8..5b300452f6d3 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,8 +1,8 @@ +import contextlib +import logging import pickle import torch import warnings -import contextlib -import sys import time from torch._six import string_classes from datetime import timedelta @@ -19,7 +19,6 @@ AllToAllOptions, BarrierOptions, BroadcastOptions, - FileStore, GatherOptions, PrefixStore, ProcessGroup, @@ -28,15 +27,8 @@ ReduceScatterOptions, ScatterOptions, Store, - TCPStore, ) -if sys.platform != 'win32': - from torch._C._distributed_c10d import ( - HashStore, - ) - - _MPI_AVAILABLE = True _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True @@ -192,16 +184,35 @@ def _store_based_barrier(rank, store, timeout): """ store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _group_count) store.add(store_key, 1) + logging.info('Added key: {} to store for rank: {}'.format(store_key, rank)) # Now wait for all workers to check in with the store. world_size = get_world_size() - worker_count = int(store.get(store_key)) + # Use 'add' instead of 'get' since for some store implementations 'add' + # doesn't work well with 'get'. Ideally the store implementations should + # be fixed, but for backward compatiblity reasons it is risky to change + # the store implementations. Once, we completely migrate away from these + # legacy stores, we can use 'get' here instead. + worker_count = store.add(store_key, 0) start = time.time() + log_time = time.time() while worker_count != world_size: time.sleep(0.01) - worker_count = int(store.get(store_key)) + worker_count = store.add(store_key, 0) + + # Print status periodically to keep track. 
+ if timedelta(seconds=(time.time() - log_time)) > timedelta(seconds=10): + logging.info( + "Waiting in store based barrier to initialize process group for " + "rank: {}, key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) + log_time = time.time() + if timedelta(seconds=(time.time() - start)) > timeout: - raise RuntimeError("Timed out initializing process group") + raise RuntimeError( + "Timed out initializing process group in store based barrier on " + "rank: {}, for key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) def _rank_not_in_group(group: ProcessGroup): """ @@ -505,12 +516,8 @@ def init_process_group(backend, # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(store, TCPStore) or - isinstance(store, FileStore) or - (sys.platform != 'win32' and isinstance(store, HashStore)) - ): - # MPI doesn't have store. + if backend == Backend.MPI: + # MPI backend doesn't use store. barrier() else: # Use store based barrier here since barrier() used a bunch of @@ -2508,16 +2515,12 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None): # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(default_store, TCPStore) or - isinstance(default_store, FileStore) or - (sys.platform != 'win32' and isinstance(default_store, HashStore)) - ): + if backend == Backend.MPI: # MPI doesn't have store. barrier() else: # Use store based barrier here since barrier() used a bunch of # default devices and messes up NCCL internal state. - _store_based_barrier(group_rank, default_store, timeout) + _store_based_barrier(global_rank, default_store, timeout) return pg From eee849be8cc6cce5ce867afb7d95aec521b16dce Mon Sep 17 00:00:00 2001 From: James Donald Date: Tue, 5 Jan 2021 13:09:58 -0800 Subject: [PATCH 72/89] [caffe2][a10] Move down pragma pop to properly suppress warning 4522 (#49233) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49233 As the comments on line 160, say we should suppress this overly aggressive warning with MSVC: ``` caffe2\tensorbody.h_ovrsource#header-mode-symlink-tree-only,headers\aten\core\tensorbody.h(1223): warning C4522: 'at::Tensor': multiple assignment operators specified ``` However, in order to remove the warning, the closing brace of the class must be between the`#pragma warning` push and its corresponding pop. Move the pop down to ensure that. 
Test Plan: Built locally using clang for Windows without buck cache, confirmed the warning resolved Reviewed By: bhosmer Differential Revision: D25422447 fbshipit-source-id: c1e1c66fb8513af5f9d4e3c1dc48d0070c4a1f84 --- aten/src/ATen/templates/TensorBody.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 1c0a04a318d0..0dfef701c51b 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -208,10 +208,6 @@ class TORCH_API Tensor { Tensor& operator=(const Tensor&) &&; Tensor& operator=(Tensor&&) &&; - #ifdef _MSC_VER - #pragma warning( pop ) - #endif - bool is_same(const Tensor& other) const noexcept { return impl_ == other.impl_; } @@ -761,6 +757,12 @@ class TORCH_API Tensor { c10::intrusive_ptr impl_; }; +// For "multiple ... operators specified" warnings, closing brace of class +// declaration must be included between pragma push & pop +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + int64_t get_device(Tensor self); template From 9945fd725371926181f51c01dc2b388266d6e1d4 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 5 Jan 2021 13:16:02 -0800 Subject: [PATCH 73/89] Drop unused imports from caffe2/python (#49980) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49980 From ``` ./python/libcst/libcst codemod remove_unused_imports.RemoveUnusedImportsWithGlean --no-format caffe2/ ``` Test Plan: Standard sandcastle tests Reviewed By: xush6528 Differential Revision: D25727359 fbshipit-source-id: c4f60005b10546423dc093d31d46deb418352286 --- caffe2/python/_import_c_extension.py | 10 ---------- .../benchmarks/sparse_lengths_sum_nbit_benchmark.py | 2 +- caffe2/python/convert.py | 3 --- caffe2/python/convert_test.py | 4 +--- caffe2/python/core_gradients_test.py | 1 - caffe2/python/dataio_test.py | 2 -- caffe2/python/ideep/conv_op_test.py | 1 - caffe2/python/ideep/convfusion_op_test.py | 3 +-- caffe2/python/ideep/dropout_op_test.py | 2 -- caffe2/python/ideep/order_switch_op_test.py | 1 - caffe2/python/ideep/shape_op_test.py | 1 - caffe2/python/ideep/spatial_bn_op_test.py | 3 +-- caffe2/python/ideep/test_ideep_net.py | 1 - caffe2/python/ideep/transform_ideep_net.py | 1 - caffe2/python/ideep/transpose_op_test.py | 1 - caffe2/python/ideep_test_util.py | 1 - caffe2/python/layer_model_helper.py | 1 - caffe2/python/mkl/mkl_LRN_op_test.py | 2 +- caffe2/python/mkl/mkl_LRN_speed_test.py | 2 +- caffe2/python/mkl/mkl_conv_op_test.py | 2 +- caffe2/python/mkl/mkl_fc_op_test.py | 2 +- caffe2/python/mkl/mkl_fc_speed_test.py | 2 +- caffe2/python/mkl/mkl_fill_op_test.py | 3 +-- caffe2/python/mkl/mkl_pool_speed_test.py | 2 +- caffe2/python/mkl/mkl_sbn_op_test.py | 2 +- caffe2/python/mkl/mkl_sbn_speed_test.py | 2 +- caffe2/python/mkl/mkl_speed_test.py | 2 +- caffe2/python/mkl/rewrite_graph.py | 1 - caffe2/python/nomnigraph_test.py | 2 +- caffe2/python/onnx/backend.py | 12 +----------- caffe2/python/onnx/bin/conversion.py | 3 +-- caffe2/python/onnx/frontend.py | 9 +++------ caffe2/python/onnx/helper.py | 3 --- caffe2/python/onnx/onnxifi.py | 2 -- caffe2/python/onnx/test_onnxifi.py | 4 +--- caffe2/python/onnx/tests/c2_ref_test.py | 3 +-- caffe2/python/onnx/tests/onnx_backend_test.py | 2 +- caffe2/python/onnx/tests/ssa_test.py | 3 +-- caffe2/python/onnx/tests/test_utils.py | 1 - caffe2/python/operator_fp_exceptions_test.py | 1 - caffe2/python/operator_test/blobs_queue_db_test.py | 1 - caffe2/python/operator_test/boolean_mask_test.py | 1 - 
caffe2/python/operator_test/bucketize_op_test.py | 3 +-- caffe2/python/operator_test/concat_split_op_test.py | 3 +-- caffe2/python/operator_test/conv_test.py | 1 - .../cosine_embedding_criterion_op_test.py | 1 - caffe2/python/operator_test/crf_test.py | 1 - .../python/operator_test/cross_entropy_ops_test.py | 1 - .../operator_test/ctc_beam_search_decoder_op_test.py | 1 - caffe2/python/operator_test/cudnn_recurrent_test.py | 1 - caffe2/python/operator_test/deform_conv_test.py | 1 - .../python/operator_test/depthwise_3x3_conv_test.py | 2 +- caffe2/python/operator_test/distance_op_test.py | 1 - .../operator_test/elementwise_linear_op_test.py | 1 - caffe2/python/operator_test/elementwise_ops_test.py | 1 - .../python/operator_test/enforce_finite_op_test.py | 1 - caffe2/python/operator_test/expand_op_test.py | 2 +- caffe2/python/operator_test/feature_maps_ops_test.py | 2 +- caffe2/python/operator_test/glu_op_test.py | 2 +- caffe2/python/operator_test/group_conv_test.py | 1 - caffe2/python/operator_test/gru_test.py | 1 - caffe2/python/operator_test/hyperbolic_ops_test.py | 1 - caffe2/python/operator_test/im2col_col2im_test.py | 3 --- caffe2/python/operator_test/instance_norm_test.py | 1 - caffe2/python/operator_test/jsd_ops_test.py | 1 - caffe2/python/operator_test/layer_norm_op_test.py | 1 - caffe2/python/operator_test/lengths_pad_op_test.py | 1 - .../lengths_reducer_fused_nbit_rowwise_ops_test.py | 2 +- caffe2/python/operator_test/lengths_tile_op_test.py | 1 - caffe2/python/operator_test/loss_ops_test.py | 1 - caffe2/python/operator_test/matmul_op_test.py | 2 -- caffe2/python/operator_test/mean_op_test.py | 2 -- caffe2/python/operator_test/moments_op_test.py | 1 - caffe2/python/operator_test/numpy_tile_op_test.py | 2 +- caffe2/python/operator_test/onnx_while_test.py | 2 +- .../operator_test/pack_rnn_sequence_op_test.py | 1 - caffe2/python/operator_test/pad_test.py | 2 -- caffe2/python/operator_test/percentile_op_test.py | 2 +- .../operator_test/rand_quantization_op_test.py | 1 - .../python/operator_test/recurrent_network_test.py | 3 --- caffe2/python/operator_test/reduce_ops_test.py | 1 - caffe2/python/operator_test/reduction_ops_test.py | 1 - .../operator_test/roi_align_rotated_op_test.py | 1 - caffe2/python/operator_test/sequence_ops_test.py | 1 - caffe2/python/operator_test/spatial_bn_op_test.py | 1 - .../operator_test/square_root_divide_op_test.py | 1 - caffe2/python/optimizer_test_util.py | 1 - caffe2/python/rnn/lstm_comparison.py | 1 - caffe2/python/rnn_cell.py | 1 - caffe2/python/scope_test.py | 1 - caffe2/python/test/executor_test_util.py | 1 - caffe2/python/test/inference_lstm_op_test.py | 3 +-- caffe2/python/test/python_protobuf_test.py | 3 --- caffe2/python/trt/test_pt_onnx_trt.py | 4 ---- caffe2/python/trt/test_trt.py | 3 +-- caffe2/python/trt/transform.py | 4 +--- 96 files changed, 37 insertions(+), 155 deletions(-) diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index d6754adc20fd..32b9ec34d1f8 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -5,16 +5,6 @@ import sys from caffe2.python import extension_loader -# NOTE: we have to import python protobuf here **before** we load cpp extension. -# Otherwise it breaks under certain build conditions if cpp implementation of -# protobuf is used. Presumably there's some registry in protobuf library and -# python side has to initialize the dictionary first, before static -# initialization in python extension does so. 
Otherwise, duplicated protobuf -# descriptors will be created and it can lead to obscure errors like -# "Parameter to MergeFrom() must be instance of same class: -# expected caffe2.NetDef got caffe2.NetDef." -import caffe2.proto - # We will first try to load the gpu-enabled caffe2. If it fails, we will then # attempt to load the cpu version. The cpu backend is the minimum required, so # if that still fails, we will exit loud. diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index 1b683be0d51e..b4cb8f2da0b4 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py +++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -5,7 +5,7 @@ import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace def benchmark_sparse_lengths_sum( diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 18033661a69e..b4b37811de10 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -5,6 +5,3 @@ -from caffe2.proto import caffe2_pb2, torch_pb2 - -import caffe2.python._import_c_extension as C diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index a1dc52aad2d9..d9d82bf5e6c4 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -3,10 +3,8 @@ -from caffe2.python import convert, workspace -from caffe2.proto import caffe2_pb2, torch_pb2 +from caffe2.python import workspace import unittest -import numpy as np class TestOperator(unittest.TestCase): def setUp(self): diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 3674b7aa4585..293eccca0dd4 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -3,7 +3,6 @@ -from future.utils import bytes_to_native_str from hypothesis import given, settings import hypothesis.strategies as st import unittest diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 0c45fb50aed9..ac1c72284fbf 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -6,7 +6,6 @@ from caffe2.python.dataio import ( CompositeReader, CompositeReaderBuilder, - Reader, ReaderBuilder, ReaderWithDelay, ReaderWithLimit, @@ -29,7 +28,6 @@ import shutil import unittest import tempfile -import time def make_source_dataset(ws, size=100, offset=0, name=None): diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index ae4473ea4864..7c5a0026c113 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -4,7 +4,6 @@ import unittest -import sys import hypothesis.strategies as st from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index 18ce574b623b..a0a782ab8a03 100644 --- a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import copy +from hypothesis import given import numpy as np import math from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index 33b0a52a7421..5b07333758dd 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -7,8 +7,6 @@ from hypothesis import 
given import hypothesis.strategies as st import numpy as np - -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index a259e01bab10..39ede0d214fe 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ b/caffe2/python/ideep/order_switch_op_test.py @@ -10,7 +10,6 @@ import caffe2.python.ideep_test_util as mu from hypothesis import given, settings -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index 47114832f85d..1beb24bc8803 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 618a0e7fbfc3..97efafa72057 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -7,9 +7,8 @@ import hypothesis.strategies as st import numpy as np import unittest -from caffe2.python import brew, core, workspace +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -from caffe2.python.model_helper import ModelHelper import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index aa1c5bc260fa..42feeed00122 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ b/caffe2/python/ideep/test_ideep_net.py @@ -9,7 +9,6 @@ import numpy as np import argparse import time -import os.path def GetArgumentParser(): diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 962d4051718b..2d0f35a7406f 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -6,7 +6,6 @@ import argparse import copy import json -import os.path import numpy as np diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index 8b324ed964ae..f8b784822a07 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index 7129ed14ba74..0cc643317c93 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -14,7 +14,6 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace from caffe2.python import hypothesis_test_util as hu cpu_do = hu.cpu_do diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 9d825f3827b9..6a5a3c82dd30 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -17,7 +17,6 @@ from caffe2.python.optimizer import get_param_device, 
Optimizer from caffe2.python.regularizer import Regularizer, RegularizationBy from caffe2.python.layers import layers -from caffe2.proto import caffe2_pb2 from future.utils import viewitems, viewvalues import logging diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 2b084bea591b..fddb20e6bb14 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index ae42902d9102..c192137dc28c 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index f1fe7b062318..74c4f2c6cde9 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01786d55c337..180d93f26570 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 85f5605e9676..243e49c2f8f8 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index 26a9b7131b0b..f233275786f7 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np +from hypothesis import given from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index b25e0f915cc7..aa43aed97a09 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, 
"Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 2ac9080ce670..86856b130d63 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index 3b3b71d1c997..05885ceca575 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 9a7310a484d1..ab2e4428519a 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index 3a88a3deeccc..b52501584064 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -6,7 +6,6 @@ import copy from caffe2.proto import caffe2_pb2 from caffe2.python import core -import caffe2.python._import_c_extension as C def rewrite_init_net_simple(net): diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 3d9adc696486..bd9d10fcbae1 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, test_util +from caffe2.python import core, test_util from caffe2.proto import caffe2_pb2 import caffe2.python.nomnigraph as ng diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 5d445576b32c..2c80fadafaee 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,14 +5,7 @@ To run this, you will need to have Caffe2 installed as well. """ - - - - - -import os import collections -from subprocess import Popen, PIPE import sys import zipfile import itertools @@ -23,8 +16,6 @@ # importing onnx first, which will cause it to go out and pick up the # system protobuf. 
import onnx.backend - -import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper @@ -32,7 +23,7 @@ import caffe2.python.utils import numpy as np import onnx -from onnx import checker, GraphProto, TensorProto, AttributeProto, ModelProto +from onnx import TensorProto import onnx.numpy_helper import onnx.defs import onnx.optimizer @@ -42,7 +33,6 @@ from caffe2.python.onnx.workspace import Workspace from caffe2.python.onnx.backend_rep import Caffe2Rep -from caffe2.python.onnx.backend_cpp_rep import Caffe2CppRep import caffe2.python._import_c_extension as C diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index 126eef8a8470..7e469e514a73 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -9,8 +9,7 @@ from caffe2.proto import caffe2_pb2 import click -import numpy as np -from onnx import checker, ModelProto +from onnx import ModelProto from caffe2.python.onnx.backend import Caffe2Backend as c2 import caffe2.python.onnx.frontend as c2_onnx diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index ee3c30949ff7..bb2778d1a991 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -17,15 +17,12 @@ from caffe2.python import core as caffe2_core from caffe2.python.compatibility import container_abcs -from caffe2.proto import caffe2_legacy_pb2 -from enum import Enum -from onnx import (defs, checker, helper, numpy_helper, mapping, - ModelProto, GraphProto, NodeProto, AttributeProto, TensorProto, OperatorSetIdProto) -from onnx.helper import make_tensor, make_tensor_value_info, make_attribute, make_model +from onnx import (checker, helper, numpy_helper, mapping, + GraphProto, NodeProto, TensorProto, OperatorSetIdProto) +from onnx.helper import make_tensor_value_info, make_model import numpy as np from caffe2.python.onnx.helper import c2_native_run_net -from caffe2.python.onnx.error import Unsupported import caffe2.python._import_c_extension as C diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index 7f8f1a6d346a..6e73a5d5c95d 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -9,9 +9,6 @@ from onnx.backend.base import namedtupledict from caffe2.python.onnx.workspace import Workspace -import caffe2.python._import_c_extension as C - -import io import logging import time diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index a04e7e4554b9..3e67c4948b1f 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -11,9 +11,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace import caffe2.python._import_c_extension as C -import numpy as np def onnxifi_caffe2_net( diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index 7eafccaec9e4..4316149d5bf6 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -3,16 +3,14 @@ -import json import numpy as np -import os import time import unittest import onnx import onnx.defs from onnx.backend.base import namedtupledict -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from caffe2.python.models.download import 
ModelDownloader diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d2efcc79823e..aab5a04a169c 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -6,7 +6,6 @@ -import json import os import unittest @@ -17,7 +16,7 @@ from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from onnx import defs, mapping +from onnx import mapping import caffe2.python.onnx.frontend as c2_onnx import caffe2.python.onnx.backend as c2 diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index 5166ec3c5083..e8b718a5a2be 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -13,7 +13,7 @@ import caffe2.python.onnx.backend as c2 -from caffe2.python import core, workspace +from caffe2.python import core core.SetEnginePref({}, {}) # This is a pytest magic variable to load extra plugins diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index d34d4a0e5287..96f954037178 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -7,11 +7,10 @@ import copy -import onnx import numpy as np from caffe2.proto import caffe2_pb2 from caffe2.python import core -from onnx import helper, TensorProto +from onnx import TensorProto import caffe2.python.onnx.frontend as c2_onnx from caffe2.python.onnx.helper import c2_native_run_net diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index d224daf05ba3..bebfc1012957 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -6,7 +6,6 @@ -import os import unittest import numpy as np diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 3a1ebcd4ec67..f039ef09f637 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -3,7 +3,6 @@ from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6cf8170b34f8..88197d16d70b 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ -3,7 +3,6 @@ -import unittest import numpy as np import caffe2.proto.caffe2_pb2 as caffe2_pb2 diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 05b8212242e4..38fe43899990 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -2,7 +2,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index bf9af112a5b0..2eb2acf87902 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -2,10 +2,9 @@ -from caffe2.python import core, dyndep +from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util 
as hu -import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index 1927b4eac78f..ac83681f08bf 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,8 +3,7 @@ -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index ae54cd37a91d..e600aa2c9ee9 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -2,7 +2,6 @@ import collections import functools -import os import unittest import caffe2.python._import_c_extension as C diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 04bfbbe6f4f6..d979407321a4 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -3,7 +3,6 @@ -from hypothesis import given import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index b75e7b7b1a10..4d7b90c431a6 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -9,7 +9,6 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st from hypothesis import given, settings -import unittest class TestCRFOp(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index d1852e7dd9e8..c88f93503a15 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -9,7 +9,6 @@ import numpy as np import unittest -import os def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 1dda7166e65a..29440c00a4b3 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from caffe2.python.test_util import caffe2_flaky from collections import defaultdict, Counter from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index db1b826cfe41..ef4433a41a18 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -4,7 +4,6 @@ from caffe2.python import model_helper, workspace, core, rnn_cell -from caffe2.proto import caffe2_pb2 from future.utils import viewitems import numpy as np diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index f6ad0e38e73c..67289de5e924 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,6 +1,5 @@ -import os import unittest import 
caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index 2d6d6429f833..cdfffce288dd 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -5,7 +5,7 @@ import numpy as np import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, dyndep, utils, workspace +from caffe2.python import core, utils from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index e948fdae9673..5b46548e072b 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -6,7 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index ac0dc3dd0975..2bd85625a3d9 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 8dbfdc1871e8..31f70086de7b 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -10,7 +10,6 @@ import numpy as np import unittest -import os class TestElementwiseOps(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index b843bfdc95b9..8150977945a2 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -8,7 +8,6 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st class TestEnforceFinite(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 0d198b1aff14..aba2c1106da3 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace +from caffe2.python import core from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 19fa329c9389..5a20b63166be 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -2,7 +2,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f38df09ec9fb..7b7a33dcd90a 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ 
b/caffe2/python/operator_test/glu_op_test.py @@ -6,7 +6,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import assume, given, settings, HealthCheck +from hypothesis import given, settings import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 62aba236d5ba..8e864bb42152 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -12,7 +12,6 @@ import caffe2.python.hypothesis_test_util as hu import unittest -import os class TestGroupConvolution(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99444f39ac26..1a7db2634989 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -16,7 +16,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import os def gru_unit(*args, **kwargs): diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 90a8197e7ccf..c0a1e8f49f5a 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 760228382bc6..42cb1deaf8ae 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -10,9 +10,6 @@ import hypothesis.strategies as st import numpy as np -import unittest -import os - class TestReduceFrontSum(hu.HypothesisTestCase): @given(batch_size=st.integers(1, 3), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index fb4f3c935ba8..efce9d7001fe 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -11,7 +11,6 @@ import caffe2.python.serialized_test.serialized_test_util as serial import unittest -import os class TestInstanceNorm(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 6ed2db2e88c2..f205d8e650b2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 62e94afe9e7d..d402cce4c4f9 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -13,7 +13,6 @@ import hypothesis.strategies as st import numpy as np -import os import torch import unittest diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index 626ec0542b7d..cda2f7da323e 100644 --- 
a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index fc4e89e2545b..49b0ba7ec22c 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -3,7 +3,7 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index e0a5f9609588..441fcc747835 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index 24cb65ac96f8..f6a07ead3cf9 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index b8cef19b24df..8b4001a574ac 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -9,8 +9,6 @@ from hypothesis import assume, given, settings import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 5830089f8e9b..ee2c6fc8fbf7 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -6,8 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index 3b270df254ce..bee44e360e3f 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index 
a202581f808c..c32aa99470db 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -9,7 +9,7 @@ import hypothesis.strategies as st import unittest -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 4cff53b87d6e..5ad9c277239d 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -3,7 +3,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 9a76e6b847a5..eceb1e5ba6a9 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 6d4e6bbdcd08..788c4035dd5f 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -5,8 +5,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index d81b0a963185..40c4192e21e9 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import numpy as np diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index e244f77149e1..a702ab41577f 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -6,7 +6,6 @@ import numpy as np import struct import unittest -import os from hypothesis import given, example import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 13650e6cad4e..33ada4d6881c 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -11,9 +11,6 @@ import hypothesis.strategies as st import numpy as np -import os -import unittest - class RecurrentNetworkTest(serial.SerializedTestCase): @given(T=st.integers(1, 4), n=st.integers(1, 5), diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index 727631befe89..7b79b3b81aed 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ 
b/caffe2/python/operator_test/reduce_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import itertools as it -import unittest class TestReduceOps(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 7d4287df6609..6a99f2b27d42 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import assume, given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py index c74157a039b0..ea835acead61 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 4609473f91f0..65c0669abfb0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import os def _gen_test_add_padding(with_pad_data=True, diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 35f7bd2a5e29..21a530346329 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace import caffe2.python.hip_test_util as hiputl import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 5bd6cb1d08f8..51f328c95f5f 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -5,7 +5,6 @@ from caffe2.python import core from functools import partial -from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index 02276b08c176..beb8a3781832 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -8,7 +8,6 @@ import unittest import numpy as np from caffe2.python import brew, core, workspace, cnn, optimizer -from caffe2.proto import caffe2_pb2 from caffe2.python.modeling.initializers import ( Initializer, PseudoFP16Initializer) diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index dee96413dbe5..34fddbc1a66e 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -2,7 +2,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 9c85d0efd2a5..f6da5e126119 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -7,7 +7,6 @@ import functools import inspect -import itertools import 
logging import numpy as np import random diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index 9bd69eb32902..bf3c8e9a0d06 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -4,7 +4,6 @@ from caffe2.python import scope, core, workspace -from caffe2.proto import caffe2_pb2 import unittest import threading diff --git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index ba10247eaa2e..abf63626a7fa 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -14,7 +14,6 @@ import time import numpy as np -from hypothesis import settings CI_MAX_EXAMPLES = 2 diff --git a/caffe2/python/test/inference_lstm_op_test.py b/caffe2/python/test/inference_lstm_op_test.py index 20caab9ba78b..768827bd8876 100644 --- a/caffe2/python/test/inference_lstm_op_test.py +++ b/caffe2/python/test/inference_lstm_op_test.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -import inspect import hypothesis.strategies as st import numpy as np import torch -from caffe2.python import core, workspace +from caffe2.python import core from caffe2.python.test_util import TestCase from hypothesis import given, settings from torch import nn diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 7790e0f6d8f5..a407f33fe253 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -5,9 +5,6 @@ # make sure we use cpp implementation of protobuf import os os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp" - -# import cpp extension first -from caffe2.python import core # then import protobuf from caffe2.proto import caffe2_pb2, metanet_pb2 diff --git a/caffe2/python/trt/test_pt_onnx_trt.py b/caffe2/python/trt/test_pt_onnx_trt.py index 96f1ad76f6b7..5e6abb5c4d0b 100644 --- a/caffe2/python/trt/test_pt_onnx_trt.py +++ b/caffe2/python/trt/test_pt_onnx_trt.py @@ -15,17 +15,13 @@ import os import unittest -from typing import List, Any from PIL import Image import numpy as np import torch -from torch.onnx import OperatorExportTypes import torchvision.models as models import pycuda.driver as cuda -# This import causes pycuda to automatically manage CUDA context creation and cleanup. 
-import pycuda.autoinit import tensorrt as trt TRT_LOGGER = trt.Logger(trt.Logger.WARNING) diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index 39d37ca9fa0a..2782cca7c13f 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -7,7 +7,7 @@ from caffe2.python import core, workspace import onnx import onnx.defs -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from onnx.backend.base import namedtupledict from caffe2.python.models.download import ModelDownloader import caffe2.python.onnx.backend as c2 @@ -16,7 +16,6 @@ from caffe2.python.onnx.tests.test_utils import TestCase import numpy as np import os.path -import json import time import unittest import tarfile diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index 0936941aac03..1b201007daab 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -12,9 +12,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from caffe2.python import core, workspace -import caffe2.python.onnx.frontend as c2_front +from caffe2.python import workspace import caffe2.python._import_c_extension as C import numpy as np From e442ac1e3f4e6b40ef3ffad48e5ec29fb113192f Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Tue, 5 Jan 2021 13:28:48 -0800 Subject: [PATCH 74/89] Update MultiHeadAttention docstring (#49950) Summary: Fixes MultiHeadAttention docstring. Currently, https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html#torch.nn.MultiheadAttention is Screen Shot 2020-12-29 at 21 06 43 and with the fix will be Screen Shot 2020-12-29 at 22 41 35 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49950 Reviewed By: mrshenli Differential Revision: D25732573 Pulled By: zhangguanheng66 fbshipit-source-id: b362f3f617ab26b0dd25c3a0a7d4117e522e620c --- torch/nn/modules/activation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 0c5258615bfd..073c95c28619 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -848,8 +848,9 @@ class MultiheadAttention(Module): kdim: total number of features in key. Default: None. vdim: total number of features in value. Default: None. - Note: if kdim and vdim are None, they will be set to embed_dim such that - query, key, and value have the same number of features. + Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set + to :attr:`embed_dim` such that query, key, and value have the same + number of features. 
Examples:: From e3c56ddde67ca1a49159ffa886d889b6e65c7033 Mon Sep 17 00:00:00 2001 From: "Rong Rong (AI Infra)" Date: Tue, 5 Jan 2021 13:34:04 -0800 Subject: [PATCH 75/89] Revert D25757691: [pytorch][PR] Run mypy over test/test_utils.py Test Plan: revert-hammer Differential Revision: D25757691 (https://github.com/pytorch/pytorch/commit/c86cfcd81da46b5e8226441edb58f0b11a97f215) Original commit changeset: 145ce3ae532c fbshipit-source-id: 3dfd68f0c42fc074cde15c6213a630b16e9d8879 --- mypy.ini | 3 +- test/test_utils.py | 56 +++++++++++++++++------------------- torch/utils/cpp_extension.py | 4 +-- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/mypy.ini b/mypy.ini index a74ed7979eac..bab4ce5dfd42 100644 --- a/mypy.ini +++ b/mypy.ini @@ -26,8 +26,7 @@ files = test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, - test/test_type_info.py, - test/test_utils.py + test/test_type_info.py # Minimum version supported - variable annotations were introduced diff --git a/test/test_utils.py b/test/test_utils.py index 9733ae036d6e..5f1e693ab12f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -10,7 +10,6 @@ import torch import torch.nn as nn import torch.utils.data -from torch.utils.data import DataLoader import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential import torch.utils.cpp_extension @@ -29,7 +28,7 @@ from torch.testing._internal.common_utils import TestCase, run_tests -class RandomDatasetMock(torch.utils.data.Dataset): +class RandomDatasetMock(object): def __getitem__(self, index): return torch.tensor([torch.rand(1).item(), random.uniform(0, 1)]) @@ -191,7 +190,7 @@ def forward(self, a, b): b = torch.randn(1, 100, requires_grad=True) with self.assertRaises(TypeError): - checkpoint_sequential(model, 1, a, b) # type: ignore[call-arg] + checkpoint_sequential(model, 1, a, b) def test_checkpoint_sequential_deprecated_no_args(self): class Noop(nn.Module): @@ -201,7 +200,7 @@ def forward(self): model = nn.Sequential(Noop()) with self.assertRaises(TypeError): - checkpoint_sequential(model, 1) # type: ignore[call-arg] + checkpoint_sequential(model, 1) def test_checkpoint_rng_cpu(self): for _ in range(5): @@ -278,7 +277,7 @@ def run_fn(tensor1, tensor2): out = checkpoint(run_fn, input_var, input_var2) out[0].sum().backward() - def run_fn2(tensor1, tensor2): + def run_fn(tensor1, tensor2): return tensor1 input_var = torch.randn(1, 4, requires_grad=False) input_var2 = torch.randn(1, 4, requires_grad=True) @@ -286,7 +285,7 @@ def run_fn2(tensor1, tensor2): RuntimeError, r"none of output has requires_grad=True, this checkpoint\(\) is not necessary" ): - out = checkpoint(run_fn2, input_var, input_var2) + out = checkpoint(run_fn, input_var, input_var2) out.sum().backward() class TestDataLoader(TestCase): @@ -309,38 +308,35 @@ def run(): self.assertEqual(x1, x2) def test_single_keep(self): - # self.dataset is a Tensor here; technically not a valid input because - # not a Dataset subclass, but needs to stay working so add ignore's - # for type checking with mypy - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, - drop_last=False) + dataloader = torch.utils.data.DataLoader(self.dataset, + batch_size=self.batch_size, + num_workers=0, + drop_last=False) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_single_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, 
- drop_last=True) + dataloader = torch.utils.data.DataLoader(self.dataset, + batch_size=self.batch_size, + num_workers=0, + drop_last=True) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) @unittest.skip("FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN") def test_multi_keep(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=False) + dataloader = torch.utils.data.DataLoader(self.dataset, + batch_size=self.batch_size, + num_workers=2, + drop_last=False) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_multi_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=True) + dataloader = torch.utils.data.DataLoader(self.dataset, + batch_size=self.batch_size, + num_workers=2, + drop_last=True) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) @@ -351,7 +347,7 @@ def test_multi_drop(self): class TestFFI(TestCase): def test_deprecated(self): with self.assertRaisesRegex(ImportError, "torch.utils.ffi is deprecated. Please use cpp extensions instead."): - from torch.utils.ffi import create_extension # type: ignore # noqa: F401 + from torch.utils.ffi import create_extension # noqa: F401 @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') @@ -368,9 +364,9 @@ def _run(self, command, timeout=30): p.kill() output, err = p.communicate() rc = p.returncode - output_str = output.decode("ascii") - err_str = err.decode("ascii") - return (rc, output_str, err_str) + output = output.decode("ascii") + err = err.decode("ascii") + return (rc, output, err) def _run_bottleneck(self, test_file, scriptargs=''): curdir = os.path.dirname(os.path.abspath(__file__)) @@ -665,7 +661,7 @@ def forward(self, x): # data can be passed without errors x = torch.randn(4, 4).fill_(1.0) ms(x) - with self.assertRaisesRegex(torch.jit.Error, "foo"): # type: ignore[type-var] + with self.assertRaisesRegex(torch.jit.Error, "foo"): ms(torch.tensor([False], dtype=torch.bool)) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 93a8f403bc4a..7ca849d48de0 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -17,7 +17,7 @@ from ._cpp_extension_versioner import ExtensionVersioner from .hipify import hipify_python from .hipify.hipify_python import get_hip_file_path, GeneratedFileCleaner -from typing import List, Optional, Union +from typing import List, Optional from setuptools.command.build_ext import build_ext from pkg_resources import packaging # type: ignore @@ -980,7 +980,7 @@ def library_paths(cuda: bool = False) -> List[str]: def load(name, - sources: Union[str, List[str]], + sources: List[str], extra_cflags=None, extra_cuda_cflags=None, extra_ldflags=None, From 093aca082e2878f3a28defe9075e7334dfceac70 Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Tue, 5 Jan 2021 13:55:10 -0800 Subject: [PATCH 76/89] Enable distribution validation if __debug__ (#48743) Summary: Fixes https://github.com/pytorch/pytorch/issues/47123 Follows https://github.com/pyro-ppl/pyro/pull/2701 This turns on `Distribution` validation by default. The motivation is to favor beginners by providing helpful error messages. 
Advanced users focused on speed can disable validation by calling ```py torch.distributions.Distribution.set_default_validate_args(False) ``` or by disabling individual distribution validation via `MyDistribution(..., validate_args=False)`. In practice I have found many beginners forget or do not know about validation. Therefore I have [enabled it by default](https://github.com/pyro-ppl/pyro/pull/2701) in Pyro. I believe PyTorch could also benefit from this change. Indeed validation caught a number of bugs in `.icdf()` methods, in tests, and in PPL benchmarks, all of which have been fixed in this PR. ## Release concerns - This may slightly slow down some models. Concerned users may disable validation. - This may cause new `ValueErrors` in models that rely on unsupported behavior, e.g. `Categorical.log_prob()` applied to continuous-valued tensors (only {0,1}-valued tensors are supported). We should clearly note this change in release notes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48743 Reviewed By: heitorschueroff Differential Revision: D25304247 Pulled By: neerajprad fbshipit-source-id: 8d50f28441321ae691f848c55f71aa80cb356b41 --- .../ppl_models.py | 7 +-- test/distributions/test_distributions.py | 43 ++++++++++--------- torch/distributions/cauchy.py | 2 - torch/distributions/constraints.py | 27 ++++++++++++ torch/distributions/continuous_bernoulli.py | 2 - torch/distributions/distribution.py | 13 +++++- torch/distributions/exponential.py | 2 - torch/distributions/laplace.py | 2 - torch/distributions/negative_binomial.py | 4 +- torch/distributions/normal.py | 2 - torch/distributions/one_hot_categorical.py | 2 +- torch/distributions/uniform.py | 2 - 12 files changed, 69 insertions(+), 39 deletions(-) diff --git a/benchmarks/functional_autograd_benchmark/ppl_models.py b/benchmarks/functional_autograd_benchmark/ppl_models.py index 906ebac5d41b..94ba6698a91d 100644 --- a/benchmarks/functional_autograd_benchmark/ppl_models.py +++ b/benchmarks/functional_autograd_benchmark/ppl_models.py @@ -24,8 +24,9 @@ def forward(beta_value: Tensor) -> Tensor: mu = X.mm(beta_value) # We need to compute the first and second gradient of this score with respect - # to beta_value. - score = dist.Bernoulli(logits=mu).log_prob(Y).sum() + beta_prior.log_prob(beta_value).sum() + # to beta_value. We disable Bernoulli validation because Y is a relaxed value. 
+ score = (dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum() + + beta_prior.log_prob(beta_value).sum()) return score return forward, (beta_value.to(device),) @@ -40,7 +41,7 @@ def get_robust_regression(device: torch.device) -> GetterReturnType: Y = torch.rand(N, 1, device=device) # Predefined nu_alpha and nu_beta, nu_alpha.shape: (1, 1), nu_beta.shape: (1, 1) - nu_alpha = torch.randn(1, 1, device=device) + nu_alpha = torch.rand(1, 1, device=device) nu_beta = torch.rand(1, 1, device=device) nu = dist.Gamma(nu_alpha, nu_beta) diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b057d12a285d..8c927f35fd2e 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -727,7 +727,7 @@ def _gradcheck_log_prob(self, dist_ctor, ctor_params): # performs gradient checks on log_prob distribution = dist_ctor(*ctor_params) s = distribution.sample() - if s.is_floating_point(): + if not distribution.support.is_discrete: s = s.detach().requires_grad_() expected_shape = distribution.batch_shape + distribution.event_shape @@ -1422,7 +1422,7 @@ def test_uniform(self): self.assertEqual(Uniform(0.0, 1.0).sample((1,)).size(), (1,)) # Check log_prob computation when value outside range - uniform = Uniform(low_1d, high_1d) + uniform = Uniform(low_1d, high_1d, validate_args=False) above_high = torch.tensor([4.0]) below_low = torch.tensor([-1.0]) self.assertEqual(uniform.log_prob(above_high).item(), -inf) @@ -1517,7 +1517,7 @@ def test_halfcauchy(self): def test_halfnormal(self): std = torch.randn(5, 5).abs().requires_grad_() - std_1d = torch.randn(1, requires_grad=True) + std_1d = torch.randn(1).abs().requires_grad_() std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(HalfNormal(std).sample().size(), (5, 5)) self.assertEqual(HalfNormal(std).sample((7,)).size(), (7, 5, 5)) @@ -1978,6 +1978,8 @@ def gradcheck_func(samples, mu, sigma, prec, scale_tril): sigma = 0.5 * (sigma + sigma.transpose(-1, -2)) # Ensure symmetry of covariance if prec is not None: prec = 0.5 * (prec + prec.transpose(-1, -2)) # Ensure symmetry of precision + if scale_tril is not None: + scale_tril = scale_tril.tril() return MultivariateNormal(mu, sigma, prec, scale_tril).log_prob(samples) gradcheck(gradcheck_func, (mvn_samples, mean, covariance, precision, scale_tril), raise_exception=True) @@ -2643,7 +2645,7 @@ def test_cdf_log_prob(self): for i, param in enumerate(params): dist = Dist(**param) samples = dist.sample() - if samples.dtype.is_floating_point: + if not dist.support.is_discrete: samples.requires_grad_() try: cdfs = dist.cdf(samples) @@ -3050,11 +3052,9 @@ def setUp(self): self.scalar_sample = 1 self.tensor_sample_1 = torch.ones(3, 2) self.tensor_sample_2 = torch.ones(3, 2, 3) - Distribution.set_default_validate_args(True) def tearDown(self): super(TestDistributionShapes, self).tearDown() - Distribution.set_default_validate_args(False) def test_entropy_shape(self): for Dist, params in EXAMPLES: @@ -3186,23 +3186,23 @@ def test_one_hot_categorical_shape(self): self.assertEqual(dist.sample().size(), torch.Size((3,))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_1) - simplex_sample = self.tensor_sample_2 / self.tensor_sample_2.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 2,))) + sample = torch.tensor([0., 1., 0.]).expand(3, 2, 3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 
2,))) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((3,))) - simplex_sample = torch.ones(3, 3) / 3 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.eye(3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) # batched dist = OneHotCategorical(torch.tensor([[0.6, 0.3], [0.6, 0.3], [0.6, 0.3]])) self.assertEqual(dist._batch_shape, torch.Size((3,))) self.assertEqual(dist._event_shape, torch.Size((2,))) self.assertEqual(dist.sample().size(), torch.Size((3, 2))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3, 2))) - simplex_sample = self.tensor_sample_1 / self.tensor_sample_1.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.tensor([0., 1.]) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_2) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((2, 3))) - simplex_sample = torch.ones(3, 1, 2) / 2 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 3))) + sample = torch.tensor([0., 1.]).expand(3, 1, 2) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 3))) def test_cauchy_shape_scalar_params(self): cauchy = Cauchy(0, 1) @@ -3531,12 +3531,15 @@ def __init__(self, probs): [0.2, 0.7, 0.1], [0.33, 0.33, 0.34], [0.2, 0.2, 0.6]]) - pareto = pairwise(Pareto, [2.5, 4.0, 2.5, 4.0], [2.25, 3.75, 2.25, 3.75]) + pareto = (Pareto(torch.tensor([2.5, 4.0, 2.5, 4.0]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4)), + Pareto(torch.tensor([2.25, 3.75, 2.25, 3.8]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4))) poisson = pairwise(Poisson, [0.3, 1.0, 5.0, 10.0]) - uniform_within_unit = pairwise(Uniform, [0.15, 0.95, 0.2, 0.8], [0.1, 0.9, 0.25, 0.75]) + uniform_within_unit = pairwise(Uniform, [0.1, 0.9, 0.2, 0.75], [0.15, 0.95, 0.25, 0.8]) uniform_positive = pairwise(Uniform, [1, 1.5, 2, 4], [1.2, 2.0, 3, 7]) uniform_real = pairwise(Uniform, [-2., -1, 0, 2], [-1., 1, 1, 4]) - uniform_pareto = pairwise(Uniform, [6.5, 8.5, 6.5, 8.5], [7.5, 7.5, 9.5, 9.5]) + uniform_pareto = pairwise(Uniform, [6.5, 7.5, 6.5, 8.5], [7.5, 8.5, 9.5, 9.5]) continuous_bernoulli = pairwise(ContinuousBernoulli, [0.1, 0.2, 0.5, 0.9]) # These tests should pass with precision = 0.01, but that makes tests very expensive. 
@@ -4148,8 +4151,8 @@ def test_lazy_logits_initialization(self): probs = param.pop('probs') param['logits'] = probs_to_logits(probs) dist = Dist(**param) - shape = (1,) if not dist.event_shape else dist.event_shape - dist.log_prob(torch.ones(shape)) + # Create new instance to generate a valid sample + dist.log_prob(Dist(**param).sample()) message = 'Failed for {} example 0/{}'.format(Dist.__name__, len(params)) self.assertFalse('probs' in vars(dist), msg=message) try: @@ -4455,7 +4458,6 @@ def test_stack_transform(self): class TestValidation(TestCase): def setUp(self): super(TestCase, self).setUp() - Distribution.set_default_validate_args(True) def test_valid(self): for Dist, params in EXAMPLES: @@ -4475,7 +4477,6 @@ def test_invalid(self): def tearDown(self): super(TestValidation, self).tearDown() - Distribution.set_default_validate_args(False) class TestJit(TestCase): diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index 50be941e073a..63181a2a6733 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -69,8 +69,6 @@ def cdf(self, value): return torch.atan((value - self.loc) / self.scale) / math.pi + 0.5 def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return torch.tan(math.pi * (value - 0.5)) * self.scale + self.loc def entropy(self): diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 630c192ffed0..87d72d52d26b 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -3,13 +3,17 @@ - ``constraints.boolean`` - ``constraints.cat`` +- ``constraints.corr_cholesky`` - ``constraints.dependent`` - ``constraints.greater_than(lower_bound)`` +- ``constraints.greater_than_eq(lower_bound)`` - ``constraints.integer_interval(lower_bound, upper_bound)`` - ``constraints.interval(lower_bound, upper_bound)`` +- ``constraints.less_than(upper_bound)`` - ``constraints.lower_cholesky`` - ``constraints.lower_triangular`` - ``constraints.nonnegative_integer`` +- ``constraints.one_hot`` - ``constraints.positive`` - ``constraints.positive_definite`` - ``constraints.positive_integer`` @@ -57,6 +61,8 @@ class Constraint(object): A constraint object represents a region over which a variable is valid, e.g. within which a variable can be optimized. """ + is_discrete = False + def check(self, value): """ Returns a byte tensor of `sample_shape + batch_shape` indicating @@ -103,14 +109,30 @@ class _Boolean(Constraint): """ Constrain to the two values `{0, 1}`. """ + is_discrete = True + def check(self, value): return (value == 0) | (value == 1) +class _OneHot(Constraint): + """ + Constrain to one-hot vectors. + """ + is_discrete = True + + def check(self, value): + is_boolean = (value == 0) | (value == 1) + is_normalized = value.sum(-1).eq(1) + return is_boolean.all(-1) & is_normalized + + class _IntegerInterval(Constraint): """ Constrain to an integer interval `[lower_bound, upper_bound]`. """ + is_discrete = True + def __init__(self, lower_bound, upper_bound): self.lower_bound = lower_bound self.upper_bound = upper_bound @@ -128,6 +150,8 @@ class _IntegerLessThan(Constraint): """ Constrain to an integer interval `(-inf, upper_bound]`. """ + is_discrete = True + def __init__(self, upper_bound): self.upper_bound = upper_bound @@ -144,6 +168,8 @@ class _IntegerGreaterThan(Constraint): """ Constrain to an integer interval `[lower_bound, inf)`. 
""" + is_discrete = True + def __init__(self, lower_bound): self.lower_bound = lower_bound @@ -358,6 +384,7 @@ def check(self, value): dependent = _Dependent() dependent_property = _DependentProperty boolean = _Boolean() +one_hot = _OneHot() nonnegative_integer = _IntegerGreaterThan(0) positive_integer = _IntegerGreaterThan(1) integer_interval = _IntegerInterval diff --git a/torch/distributions/continuous_bernoulli.py b/torch/distributions/continuous_bernoulli.py index 180fbd8187ee..5d3d48840203 100644 --- a/torch/distributions/continuous_bernoulli.py +++ b/torch/distributions/continuous_bernoulli.py @@ -168,8 +168,6 @@ def cdf(self, value): torch.where(torch.ge(value, 1.0), torch.ones_like(value), unbounded_cdfs)) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) cut_probs = self._cut_probs() return torch.where( self._outside_unstable_region(), diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index f16eb154e2dd..bc61e0b0584e 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -12,10 +12,21 @@ class Distribution(object): has_rsample = False has_enumerate_support = False - _validate_args = False + _validate_args = __debug__ @staticmethod def set_default_validate_args(value): + """ + Sets whether validation is enabled or disabled. + + The default behavior mimics Python's ``assert`` statement: validation + is on by default, but is disabled if Python is run in optimized mode + (via ``python -O``). Validation may be expensive, so you may want to + disable it once a model is working. + + Args: + value (bool): Whether to enable validation. + """ if value not in [True, False]: raise ValueError Distribution._validate_args = value diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index 41d7cd9f9787..ac18980c778b 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -68,8 +68,6 @@ def cdf(self, value): return 1 - torch.exp(-self.rate * value) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return -torch.log(1 - value) / self.rate def entropy(self): diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index d7ec01c65b35..a505d60c8f38 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -75,8 +75,6 @@ def cdf(self, value): return 0.5 - 0.5 * (value - self.loc).sign() * torch.expm1(-(value - self.loc).abs() / self.scale) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) term = value - 0.5 return self.loc - self.scale * (term).sign() * torch.log1p(-2 * term.abs()) diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py index 051725db19ca..4a8babb34a7c 100644 --- a/torch/distributions/negative_binomial.py +++ b/torch/distributions/negative_binomial.py @@ -77,8 +77,10 @@ def param_shape(self): @lazy_property def _gamma(self): + # Note we avoid validating because self.total_count can be zero. 
return torch.distributions.Gamma(concentration=self.total_count, - rate=torch.exp(-self.logits)) + rate=torch.exp(-self.logits), + validate_args=False) def sample(self, sample_shape=torch.Size()): with torch.no_grad(): diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 2468e2f225dc..1f14f0ae015f 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -82,8 +82,6 @@ def cdf(self, value): return 0.5 * (1 + torch.erf((value - self.loc) * self.scale.reciprocal() / math.sqrt(2))) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return self.loc + self.scale * torch.erfinv(2 * value - 1) * math.sqrt(2) def entropy(self): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index c661a245f716..64f696802d76 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -29,7 +29,7 @@ class OneHotCategorical(Distribution): """ arg_constraints = {'probs': constraints.simplex, 'logits': constraints.real} - support = constraints.simplex + support = constraints.one_hot has_enumerate_support = True def __init__(self, probs=None, logits=None, validate_args=None): diff --git a/torch/distributions/uniform.py b/torch/distributions/uniform.py index b212c52695c2..edaf5abf77a5 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -81,8 +81,6 @@ def cdf(self, value): return result.clamp(min=0, max=1) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) result = value * (self.high - self.low) + self.low return result From a272a7eeabe2cd80ca4cac25bca94bb9afa9ff2f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 5 Jan 2021 14:00:01 -0800 Subject: [PATCH 77/89] [PyTorch] Avoid heap allocations in inferUnsqueezeGeometry (#49497) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49497 Noticed this thing spending relatively most of its time in malloc in perf. Optimize for typical tensor sizes. ghstack-source-id: 119318388 Test Plan: perf profile internal benchmark; saw inferUnsqueezeGeometry go from 0.30% exclusive 0.47% inclusive to 0.11% exclusive 0.16% inclusive. Differential Revision: D25596549 fbshipit-source-id: 3bbd2031645a4b9fe6f49a77d41db46826d0f632 --- aten/src/ATen/native/TensorShape.cpp | 31 +++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index f8ba5527e5a9..09d50356abd9 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1467,15 +1468,25 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { return std::make_tuple(sizes, strides); } -std::tuple, std::vector > +namespace { +// Named type instead of a pair/tuple so that we can be sure to +// construct the vectors in place and get NRVO. +struct InferUnsqueezeGeometryResult { + c10::SmallVector sizes; + c10::SmallVector strides; + InferUnsqueezeGeometryResult(IntArrayRef tensor_sizes, IntArrayRef tensor_strides) + : sizes(tensor_sizes.begin(), tensor_sizes.end()) + , strides(tensor_strides.begin(), tensor_strides.end()) {} +}; +} +InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - auto sizes = tensor.sizes().vec(); - auto strides = tensor.strides().vec(); - int64_t new_stride = dim >= tensor.dim() ? 
1 : sizes[dim] * strides[dim]; - sizes.insert(sizes.begin() + dim, 1); - strides.insert(strides.begin() + dim, new_stride); + InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); + int64_t new_stride = dim >= tensor.dim() ? 1 : result.sizes[dim] * result.strides[dim]; + result.sizes.insert(result.sizes.begin() + dim, 1); + result.strides.insert(result.strides.begin() + dim, new_stride); - return std::make_tuple(sizes, strides); + return result; } Tensor squeeze_qtensor(const Tensor& self) { @@ -1624,7 +1635,7 @@ Tensor unsqueeze_qtensor(const Tensor& self, int64_t dim) { axis, quantizer->scalar_type()); } - return make_qtensor(self, std::get<0>(g), std::get<1>(g), quantizer); + return make_qtensor(self, g.sizes, g.strides, quantizer); } Tensor unsqueeze(const Tensor& self, int64_t dim) { @@ -1636,7 +1647,7 @@ Tensor unsqueeze(const Tensor& self, int64_t dim) { return unsqueeze_qtensor(self, dim); } else { auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided(std::get<0>(g), std::get<1>(g)); + return self.as_strided(g.sizes, g.strides); } } @@ -1644,7 +1655,7 @@ Tensor & unsqueeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim() + 1); auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided_(std::get<0>(g), std::get<1>(g)); + return self.as_strided_(g.sizes, g.strides); } Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { From e71a13e8a38760ba1f28630154a379fb265cd554 Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Tue, 5 Jan 2021 14:00:02 -0800 Subject: [PATCH 78/89] [pytorch][codegen] migrate gen_variable_type to new data model (#49735) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49735 This is the final wave of autograd codegen data model migration. After this PR: - autograd codegen no longer depends on Declarations.yaml; - autograd codegen sources are fully type annotated and pass mypy-strict check; To avoid potential merge conflicts with other pending PRs, some structural changes are intentionally avoided, e.g. didn't move inner methods out, didn't change all inner methods to avoid reading outer function's variables, and etc. 
Confirmed byte-for-byte compatible with the old codegen: ``` Run it before and after this PR: .jenkins/pytorch/codegen-test.sh .jenkins/pytorch/codegen-test.sh Then run diff to compare the generated files: diff -Naur ``` Confirmed clean mypy-strict run: ``` mypy --config mypy-strict.ini ``` Test Plan: Imported from OSS Reviewed By: ezyang, bhosmer Differential Revision: D25678879 Pulled By: ljk53 fbshipit-source-id: ba6e2eb6b9fb744208f7f79a922d933fcc3bde9f --- mypy-strict.ini | 2 + tools/autograd/gen_autograd.py | 106 +--- tools/autograd/gen_trace_type.py | 2 +- tools/autograd/gen_variable_type.py | 712 +++++++++++----------- tools/codegen/api/autograd.py | 33 + tools/codegen/api/cpp.py | 2 +- tools/codegen/api/types.py | 13 +- tools/codegen/gen.py | 9 +- tools/codegen/model.py | 12 +- tools/codegen/selective_build/selector.py | 15 + tools/jit/gen_unboxing_wrappers.py | 65 +- 11 files changed, 507 insertions(+), 464 deletions(-) diff --git a/mypy-strict.ini b/mypy-strict.ini index 42fc73abf1cc..7cc6fff83577 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -31,9 +31,11 @@ strict_equality = True files = tools/codegen/gen.py, tools/autograd/gen_annotated_fn_args.py, + tools/autograd/gen_autograd.py, tools/autograd/gen_python_functions.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, + tools/autograd/gen_variable_type.py, tools/autograd/load_derivatives.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 88c00e0ba71a..b930aca504df 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -23,9 +23,6 @@ import argparse import os -import yaml -import re -from .utils import YamlLoader, op_name_with_overload from tools.codegen.selective_build.selector import SelectiveBuilder # See NOTE [ Autograd View Variables ] in variable.h for details. 
@@ -89,84 +86,14 @@ 'tensor_split', 'swapdims', 'swapaxes' }) -def format_return_type(returns): - if len(returns) == 0: - return 'void' - elif len(returns) == 1: - return returns[0]['type'] - else: - return_types = [r['type'] for r in returns] - return 'std::tuple<{}>'.format(','.join(return_types)) - - -def get_simple_type(arg): - simple_type = arg['type'] - simple_type = simple_type.replace(' &', '').replace('const ', '') - simple_type = simple_type.replace('Generator *', 'Generator') - - opt_match = re.match(r'c10::optional<(.+)>', simple_type) - if opt_match: - simple_type = '{}?'.format(opt_match.group(1)) - return simple_type - -def has_tensoroptions_argument(declaration): - for argument in declaration['arguments']: - if 'TensorOptions' == argument['dynamic_type']: - return True - return False - - -def load_aten_declarations(path): - with open(path, 'r') as f: - declarations = yaml.load(f, Loader=YamlLoader) - - # enrich declarations with additional information - selected_declarations = [] - for declaration in declarations: - if declaration.get('deprecated'): - continue - - for arg in declaration['arguments']: - arg['simple_type'] = get_simple_type(arg) - for arg in declaration['schema_order_arguments']: - arg['simple_type'] = get_simple_type(arg) - for ret in declaration['returns']: - ret['simple_type'] = get_simple_type(ret) - - declaration['formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['arguments']] - declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['schema_order_arguments']] - declaration['args'] = [arg['name'] for arg in declaration['arguments']] - declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - declaration['api_name'] = declaration['name'] - if declaration.get('overload_name'): - declaration['type_wrapper_name'] = "{}_{}".format( - declaration['name'], declaration['overload_name']) - else: - declaration['type_wrapper_name'] = declaration['name'] - declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] - declaration['return_type'] = format_return_type(declaration['returns']) - - declaration['base_name'] = declaration['name'] - selected_declarations.append(declaration) - - return selected_declarations - - -def gen_autograd(aten_path, native_functions_path, out, autograd_dir, operator_selector: SelectiveBuilder, disable_autograd=False): - full_aten_decls = load_aten_declarations(aten_path) - - def filter_decls(aten_decls, operator_selector): - def is_operator_selected_for_training(decl): - op_name = op_name_with_overload(decl) - return operator_selector.is_operator_selected_for_training(op_name) - - return [decl for decl in aten_decls if is_operator_selected_for_training(decl)] - - aten_decls = filter_decls(full_aten_decls, operator_selector) - +def gen_autograd( + aten_path: str, + native_functions_path: str, + out: str, + autograd_dir: str, + operator_selector: SelectiveBuilder, + disable_autograd: bool = False, +) -> None: # Parse and load derivatives.yaml from .load_derivatives import load_derivatives differentiability_infos = load_derivatives( @@ -175,13 +102,13 @@ def is_operator_selected_for_training(decl): template_path = os.path.join(autograd_dir, 'templates') # Generate VariableType.h/cpp + from .gen_trace_type import gen_trace_type + from .gen_variable_type import gen_variable_type if not disable_autograd: 
- from .gen_variable_type import gen_variable_type - gen_variable_type(out, aten_decls, differentiability_infos, template_path) + gen_variable_type(out, native_functions_path, differentiability_infos, template_path, operator_selector) - from . import gen_trace_type # operator filter not applied as tracing sources are excluded in selective build - gen_trace_type.gen_trace_type(out, native_functions_path, template_path) + gen_trace_type(out, native_functions_path, template_path) # Generate Functions.h/cpp from .gen_autograd_functions import gen_autograd_functions_lib @@ -193,7 +120,12 @@ def is_operator_selected_for_training(decl): gen_variable_factories(out, native_functions_path, template_path) -def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir): +def gen_autograd_python( + aten_path: str, + native_functions_path: str, + out: str, + autograd_dir: str, +) -> None: from .load_derivatives import load_derivatives differentiability_infos = load_derivatives( os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path) @@ -212,7 +144,7 @@ def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir): out, native_functions_path, deprecated_path, template_path) -def main(): +def main() -> None: parser = argparse.ArgumentParser( description='Generate autograd C++ files script') parser.add_argument('declarations', metavar='DECL', diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 31eb8aacf296..d8d42762e4fb 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -422,7 +422,7 @@ def gen_trace_type_shard( fm: FileManager, native_functions: Sequence[NativeFunction], suffix: str ) -> None: fm.write_with_template('TraceType%s.cpp' % suffix, 'TraceType.cpp', lambda: { - 'generated_comment': f'@generated from {fm.template_dir}/TraceType.cpp', + 'generated_comment': '@' + f'generated from {fm.template_dir}/TraceType.cpp', 'trace_method_definitions': list(mapMaybe(method_definition, native_functions)), 'trace_wrapper_registrations': list(mapMaybe(method_registration, native_functions)), }) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 72be5b993f44..f49f5e15845b 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,20 +22,24 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. 
# +from dataclasses import dataclass -from .utils import CodeTemplate, nested_dict, write, make_out_api_name_faithful from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT from .gen_autograd_functions import uses_single_grad -from .gen_trace_type import MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD +from .gen_trace_type import ( + MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD, + declare_returned_variables, tie_return_values, get_return_value, type_wrapper_name, +) from tools.codegen.api.types import * from tools.codegen.api.autograd import * import tools.codegen.api.cpp as cpp -import tools.codegen.api.python as python -from tools.codegen.gen import with_native_function +from tools.codegen.code_template import CodeTemplate +from tools.codegen.gen import with_native_function, parse_native_yaml, FileManager, mapMaybe from tools.codegen.model import * -from typing import Dict, Optional, List, Sequence, Any, Callable +from tools.codegen.selective_build.selector import SelectiveBuilder +from typing import Callable, List, Optional, Sequence, Tuple, Union # We don't set or modify grad_fn on these methods. Generally, they return # tensors that have requires_grad=False. In-place functions listed here will @@ -209,9 +213,6 @@ UNPACK_TENSOR = CodeTemplate("""\ auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") -LEGACY_WRAP_OPTIONS = CodeTemplate("""\ -auto ${arg_name}_ = TensorOptions(${arg_name});""") - DECLARE_GRAD_FN = CodeTemplate("""\ std::shared_ptr<${op}> grad_fn; """) @@ -304,49 +305,18 @@ #endif """) -# Methods shared by TraceType and VariableType to handle return variable declaration, tie and tuple. -def format_return_variables(declaration): - name = declaration['name'] - arguments = declaration['arguments'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - - def declare_returned_variables(): - if modifies_arguments: - return '' - if len(declaration['returns']) == 1: - return '' - # TODO: this will be ugly - names = [ret['type'] + ' ' + ret['name'] + ';' for ret in declaration['returns']] - return '\n'.join(names) - - def tie_return_values(): - if len(declaration['returns']) == 1: - return 'auto {}'.format(declaration['returns'][0]['name']) - names = [ret['name'] for ret in declaration['returns']] - return 'std::tie({})'.format(', '.join(names)) - - def get_return_value(): - if inplace: - return 'self' - if is_out_fn: - return_names = [arg['name'] for arg in arguments - if arg.get('output', False)] - if len(return_names) == 1: - return return_names[0] - return 'std::forward_as_tuple({})'.format(', '.join(return_names)) - - returns = declaration['returns'] - if len(returns) == 1: - return returns[0]['name'] - moved = ['std::move({})'.format(r['name']) for r in returns] - return 'std::make_tuple({})'.format(', '.join(moved)) - - return (declare_returned_variables(), tie_return_values(), get_return_value()) +@dataclass(frozen=True) +class NativeFunctionWithDifferentiabilityInfo: + func: NativeFunction + info: Optional[DifferentiabilityInfo] - -def gen_variable_type(out, aten_declarations, differentiability_infos, template_path): +def gen_variable_type( + out: str, + native_yaml_path: str, + differentiability_infos: Sequence[DifferentiabilityInfo], + template_path: str, + operator_selector: SelectiveBuilder, +) -> None: """VariableType.h and VariableType.cpp body @@ -354,154 +324,202 @@ 
def gen_variable_type(out, aten_declarations, differentiability_infos, template_ implementation of each function dispatches to the base tensor type to compute the output. The grad_fn is attached to differentiable functions. """ + fns = list(sorted(filter( + operator_selector.is_native_function_selected_for_training, + parse_native_yaml(native_yaml_path)), key=lambda f: cpp.name(f.func))) + fns_with_infos = match_differentiability_info(fns, differentiability_infos) - aten_declarations = list(sorted(aten_declarations, key=lambda decl: decl['name'])) - match_declarations_with_differentiability_info(aten_declarations, differentiability_infos) - - gen_variable_type_shard(out, aten_declarations, template_path, None, True) + fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.h', 'VariableType.h') # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. num_shards = 5 - shards = [[] for _ in range(num_shards)] + shards: List[List[NativeFunctionWithDifferentiabilityInfo]] = [[] for _ in range(num_shards)] # functions are assigned arbitrarily but stably to a file based on hash - for decl in aten_declarations: - x = sum(ord(c) for c in decl['name']) % num_shards - shards[x].append(decl) + for fn in fns_with_infos: + x = sum(ord(c) for c in cpp.name(fn.func.func)) % num_shards + shards[x].append(fn) for i, shard in enumerate(shards): - gen_variable_type_shard(out, shard, template_path, '_%d' % i, False) - gen_variable_type_shard(out, aten_declarations, template_path, 'Everything', False) + gen_variable_type_shard(fm, shard, 'VariableType.cpp', f'VariableType_{i}.cpp') + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.cpp', 'VariableTypeEverything.cpp') -def gen_variable_type_shard(out, aten_declarations, template_path, suffix, header): - VARIABLE_TYPE_H = CodeTemplate.from_file(template_path + '/VariableType.h') - VARIABLE_TYPE_CPP = CodeTemplate.from_file(template_path + '/VariableType.cpp') - - type_declarations = [] - type_definitions = [] - wrapper_registrations = [] - - for declaration in aten_declarations: - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - formals = declaration['schema_order_formals'] - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - formals = declaration['formals'] - type_declarations.append(METHOD_DECLARATION.substitute(declaration, formals=formals)) - strategy = dispatch_strategy(declaration) - if declaration['name'] not in MANUAL_AUTOGRAD and strategy == 'use_derived': - body = emit_body(declaration) +@with_native_function +def gen_formals(f: NativeFunction) -> str: + if f.use_c10_dispatcher.dispatcher_uses_new_style(): + formals = ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) + else: + sig_group = CppSignatureGroup.from_native_function(f, method=False) + formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) + return formals +@with_native_function +def gen_wrapper_registration(f: NativeFunction) -> str: + if f.use_c10_dispatcher.dispatcher_uses_new_style(): + return WRAPPER_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) + else: + return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( + 
unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) + +def gen_variable_type_shard( + fm: FileManager, + fns_with_infos: List[NativeFunctionWithDifferentiabilityInfo], + template_name: str, + output_name: str, +) -> None: + type_declarations: List[str] = [] + type_definitions: List[str] = [] + wrapper_registrations: List[str] = [] + + for fn in fns_with_infos: + f = fn.func + name = cpp.name(f.func) + formals = gen_formals(f) + + type_declarations.append(METHOD_DECLARATION.substitute( + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + formals=formals, + )) + + if name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == 'use_derived': type_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=body, formals=formals)) - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( - declaration, class_type='VariableType')) - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - wrapper_registrations.append(UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - declaration, class_type='VariableType')) + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + type_definition_body=emit_body(fn), + formals=formals, + )) + wrapper_registrations.append(gen_wrapper_registration(f)) # See Note [Manual Backend kernels] - assert (declaration['name'] in MANUAL_BACKEND) == declaration['manual_kernel_registration'] + assert (name in MANUAL_BACKEND) == f.manual_kernel_registration # If you want to register a kernel to Autograd, you must make the op abstract. # In other words, this op must have dispatch section in native_functions.yaml. - if declaration['name'] in MANUAL_AUTOGRAD_AND_TRACER or declaration['derivative']: - msg = (f'There\'s a formula for {declaration["name"]}(or its functional variant) in derivatives.yaml. ' + if name in MANUAL_AUTOGRAD_AND_TRACER or (fn.info and fn.info.has_derivatives): + msg = (f'There\'s a formula for {name}(or its functional variant) in derivatives.yaml. ' f'It\'s required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA ' f'or DefaultBackend in native_functions.yaml. 
Please see ' f'https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword ' f'for instructions to choose the right dispatch keyword.') - assert declaration['abstract'], msg + assert f.is_abstract, msg - env = { + fm.write_with_template(output_name, template_name, lambda: { + 'generated_comment': '@' + f'generated from {fm.template_dir}/{template_name}', 'type_derived_method_declarations': type_declarations, 'type_derived_method_definitions': type_definitions, 'wrapper_registrations': wrapper_registrations, - } - if header: - write(out, 'VariableType.h', VARIABLE_TYPE_H, env) - else: - write(out, 'VariableType%s.cpp' % suffix, VARIABLE_TYPE_CPP, env) - - -def emit_body(declaration): - assert dispatch_strategy(declaration) == 'use_derived' - - arguments = declaration['arguments'] - returns = declaration['returns'] - func = declaration['derivative'] - name = declaration['name'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - returns_void = len(returns) == 0 - - base_name = name[:-1] if inplace else name[:-4] if is_out_fn else name + }) + +def emit_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: + assert dispatch_strategy(fn) == 'use_derived' + f = fn.func + info = fn.info + + name = cpp.name(f.func) + inplace = f.func.kind() == SchemaKind.inplace + is_out_fn = f.func.kind() == SchemaKind.out + returns_void = len(f.func.returns) == 0 + base_name = f.func.name.name.base # TODO: should be str(f.func.name.name)? view_info = VIEW_FUNCTIONS.get(base_name, None) if view_info is None and base_name in RETURNS_VIEWS_OF_INPUT: view_info = "self" - def is_differentiable(arg): - if 'TensorOptions' in arg['type']: - return False - if 'Tensor' not in arg['type']: - return False - if arg['name'] in declaration.get('non_differentiable_arg_names', []): - return False - return True - - def find_args_with_derivatives(differentiable_inputs): + def is_differentiable(name: str, type: Type) -> bool: + return type.is_tensor_like() and (info is None or name not in info.non_differentiable_arg_names) + + def gen_differentiable_input( + arg: Union[Argument, SelfArgument, TensorOptionsArguments] + ) -> Optional[DifferentiableInput]: + if isinstance(arg, TensorOptionsArguments): + return None + a: Argument = arg.argument if isinstance(arg, SelfArgument) else arg + + # TODO: `cpp_type` is only to keep it byte-for-byte compatible with the old codegen, should remove. + # NB: This is not a clone of cpp.argument() - TensorOptionsArguments / faithful / binds are + # not handled properly as they are irrelevant for this codegen. 
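# [Editor's note: illustrative sketch only, not part of the patch; `self_arg` and
#  `tensors_arg` below are hypothetical Argument objects.]
# The string computed on the next line is just the plain C++ parameter type,
# e.g. for an ordinary `Tensor self` argument and a `Tensor[] tensors` argument:
#
#     cpp.argument_type(self_arg, binds='self').cpp_type()        # 'const Tensor &'
#     cpp.argument_type(tensors_arg, binds='tensors').cpp_type()  # 'TensorList'
#
# As the TODO above notes, the value is carried along only to keep the generated
# output byte-for-byte identical to the old Declarations.yaml-based codegen.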
+ cpp_type = cpp.argument_type(a, binds=a.name).cpp_type() + + if not is_differentiable(a.name, a.type): + return None + return DifferentiableInput( + name=a.name, + type=a.type, + cpp_type=cpp_type, + ) + + @with_native_function + def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]: + return list(mapMaybe(gen_differentiable_input, f.func.arguments.non_out)) + + def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) -> List[DifferentiableInput]: """Find arguments that have derivative definitions""" - if func is None: + if info is None or not info.has_derivatives: return differentiable_inputs - names = set(name for d in func.derivatives for name in d.var_names) - differentiable = [arg for arg in differentiable_inputs if arg['name'] in names] + names = set(name for d in info.derivatives for name in d.var_names) + differentiable = [arg for arg in differentiable_inputs if arg.name in names] if len(differentiable) != len(names): - missing = names - set(arg['name'] for arg in differentiable) - raise RuntimeError(f'Missing arguments for derivatives: {missing} in {func.name}') + missing = names - set(arg.name for arg in differentiable) + raise RuntimeError(f'Missing arguments for derivatives: {missing} in {info.name}') return differentiable - inputs = [arg for arg in arguments if not arg.get('output', False)] - differentiable_inputs = list(filter(is_differentiable, inputs)) + def gen_differentiable_outputs(f: NativeFunction) -> List[DifferentiableOutput]: + outputs: List[DifferentiableOutput] = [ + DifferentiableOutput(name=name, type=ret.type, cpp_type=cpp.return_type(ret)) + for name, ret in zip(cpp.return_names(f), f.func.returns)] + + output_differentiability = info.output_differentiability if info else None + if output_differentiability is not None: + differentiable_outputs: List[DifferentiableOutput] = [] + if False in output_differentiability and f.func.kind() == SchemaKind.inplace: + raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") + for differentiable, output in zip(output_differentiability, outputs): + if differentiable: + differentiable_outputs.append(output) + return differentiable_outputs + + candidate_differentiable_outputs = list(filter(lambda r: is_differentiable(r.name, r.type), outputs)) + + if uses_single_grad(info): + return candidate_differentiable_outputs[:1] + else: + return candidate_differentiable_outputs + + differentiable_inputs = gen_differentiable_inputs(f) args_with_derivatives = find_args_with_derivatives(differentiable_inputs) - non_differentiable_arg_names = declaration.get('non_differentiable_arg_names', []) - candidate_differentiable_outputs = list(filter(is_differentiable, returns)) - - if declaration['output_differentiability'] is not None: - differentiable_outputs = [] - output_differentiability = declaration['output_differentiability'] - if False in output_differentiability and inplace: - raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") - for differentiable, output in zip(output_differentiability, returns): - if differentiable: - differentiable_outputs.append(output) - elif uses_single_grad(func): - differentiable_outputs = candidate_differentiable_outputs[:1] - else: - differentiable_outputs = candidate_differentiable_outputs + differentiable_outputs = gen_differentiable_outputs(f) requires_derivative = ( base_name not in DONT_REQUIRE_DERIVATIVE and name not in DONT_REQUIRE_DERIVATIVE 
and len(differentiable_inputs) > 0 and len(differentiable_outputs) > 0) - if func is not None and not requires_derivative: - raise RuntimeError('ERROR: derivative ignored for {} -- specified an autograd function without derivative' - .format(name)) + if info is not None and info.has_derivatives and not requires_derivative: + raise RuntimeError(f'ERROR: derivative ignored for {name} -- specified an autograd function without derivative') - def emit_save_inputs(): - setup = [] - if func is None: + def emit_save_inputs() -> List[str]: + setup: List[str] = [] + if info is None or not info.has_derivatives: return setup - has_tensorlist_arg = \ - any(arg.type in ['TensorList', 'const c10::List> &'] for arg in func.args_with_derivatives) + has_tensorlist_arg = any(is_tensor_list_type(arg.type) for arg in args_with_derivatives) # We don't want to save tensors if we know that they will never be used # when computing the derivative, so we add guards to those statements def guard_for(arg: SavedAttribute) -> Optional[str]: + assert info is not None + # It's hard to determine the edge offset if we have TensorLists if has_tensorlist_arg: return None @@ -512,12 +530,12 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # require_grad if the backward function even gets executed. I don't # have any good ideas for detecting those cases, so I simply disabled the # checks. - if 'backward' in func.name: + if 'backward' in info.name: return None # If there's a single derivative we could compute, we already have # a requires_grad check that is sufficient - if len(func.args_with_derivatives) <= 1: + if len(args_with_derivatives) <= 1: return None # We really only care about trimming down the amount of tensors we save @@ -526,7 +544,7 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # We want to emit simple guards, so we only allow that if checking one # input is enough to determine whether we need that value - used_in = [d for d in func.derivatives if arg in d.saved_inputs] + used_in = [d for d in info.derivatives if arg in d.saved_inputs] assert len(used_in) > 0 if len(used_in) != 1: return None @@ -536,75 +554,76 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: derivative_var_name = derivative.var_names[0] # Figure out the offset of the edge that uses this variable - for edge_off, arg in enumerate(func.args_with_derivatives): - if arg.name == derivative_var_name: + for edge_off, a in enumerate(args_with_derivatives): + if a.name == derivative_var_name: break else: raise AssertionError() return f'grad_fn->should_compute_output({edge_off})' - setup.extend(save_variables(func.all_saved_inputs, False, guard_for)) - for arg in func.args_with_derivatives: - if arg.type in ['TensorList', 'const c10::List> &']: + setup.extend(save_variables(info.all_saved_inputs, False, guard_for)) + for arg in args_with_derivatives: + if is_tensor_list_type(arg.type): setup.append(f'grad_fn->{arg.name}_size_ = {arg.name}.size();') return setup - def setup_derivative(differentiable_inputs): - env = {} - env['args_with_derivatives'] = [arg['name'] for arg in args_with_derivatives] - env['op'] = func.op if func is not None else 'NotImplemented' - env['op_ctor'] = '' if func is not None else '"{}"'.format(declaration['api_name']) - + def setup_derivative(differentiable_inputs: List[DifferentiableInput]) -> List[str]: + body: List[str] = [] if is_out_fn: # For out functions, ensure that no input or output requires grad - body = [] body.append(DECLARE_GRAD_FN.substitute(op='Node')) 
body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_inputs])) + args_to_check=[arg.name for arg in differentiable_inputs])) body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_outputs])) + args_to_check=[arg.name for arg in differentiable_outputs])) return body + op = info.op if info is not None and info.has_derivatives else 'NotImplemented' setup = [] - setup.extend(ASSIGN_GRAD_FN.substitute(env).split('\n')) + setup.extend(ASSIGN_GRAD_FN.substitute( + op=op, + op_ctor='' if info is not None and info.has_derivatives else f'"{cpp.name(f.func)}"', + args_with_derivatives=[arg.name for arg in args_with_derivatives], + ).split('\n')) setup.extend(emit_save_inputs()) - body = [] body.extend(emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives)) - body.append(DECLARE_GRAD_FN.substitute(env)) + body.append(DECLARE_GRAD_FN.substitute(op=op)) body.append(SETUP_DERIVATIVE.substitute(setup=setup)) return body - def emit_check_if_in_complex_autograd_allowlist(): - body = [] + def emit_check_if_in_complex_autograd_allowlist() -> List[str]: + body: List[str] = [] if base_name in GRADIENT_IMPLEMENTED_FOR_COMPLEX: return body for arg in differentiable_outputs: - name = arg['name'] - if arg['type'] in ['Tensor', 'TensorList', 'const c10::List> &']: - body.append('throw_error_for_complex_autograd({}, "{}");'.format(name, base_name)) + name = arg.name + # TODO: should be `arg.type.is_tensor_like()`? + if arg.cpp_type in ['Tensor', 'TensorList', 'const c10::List> &']: + body.append(f'throw_error_for_complex_autograd({name}, "{base_name}");') return body - def emit_check_no_requires_grad(tensor_args, args_with_derivatives): + def emit_check_no_requires_grad( + tensor_args: List[DifferentiableInput], + args_with_derivatives: List[DifferentiableInput], + ) -> List[str]: """Checks that arguments without derivatives don't require grad""" - body = [] + body: List[str] = [] for arg in tensor_args: if arg in args_with_derivatives: continue - name = arg['name'] - if name in non_differentiable_arg_names: + name = arg.name + if info and name in info.non_differentiable_arg_names: continue if name == 'output': # Double-backwards definitions sometimes take in 'input' and # 'output', but only define the derivative for input. 
continue - if arg['dynamic_type'] in {'IndexTensor', 'ByteTensor', 'BoolTensor'}: - continue - body.append('check_no_requires_grad({}, "{}");'.format(name, name)) + body.append(f'check_no_requires_grad({name}, "{name}");') return body def save_variables( @@ -644,42 +663,40 @@ def save_variables( stmts.append('}') return stmts - def emit_dispatch_call(api_name, input_base, unpacked_args): + def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str: """ Dispatch call via function in a namespace or method on Tensor.""" - if 'namespace' in declaration['method_of']: - if declaration['use_c10_dispatcher'] in ['hacky_wrapper_for_legacy_signatures', 'full']: - dispatcher_api_name = make_out_api_name_faithful(api_name) - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - dispatcher_api_name = api_name + if Variant.function in f.variants: call = CALL_DISPATCH_VIA_NAMESPACE.substitute( - api_name=dispatcher_api_name, + api_name=cpp.name( + f.func, + faithful_name_for_out_overloads=f.use_c10_dispatcher.dispatcher_uses_new_style(), + ), unpacked_args=unpacked_args) else: call = CALL_DISPATCH_VIA_METHOD.substitute( - api_name=api_name, + api_name=cpp.name(f.func), var=input_base, unpacked_method_args=unpacked_args[1:]) return call - def emit_view_lambda(): + def emit_view_lambda(unpacked_bindings: List[Binding]) -> str: """ Generate an additional lambda function to recover views in backward when as_strided is not supported. See Note [View + Inplace update for base tensor] and [View + Inplace update for view tensor] for more details.""" input_base = 'input_base' replay_view_func = '' - updated_unpacked_args = [] - combined = nested_dict(env, declaration) - known_view_arg_simple_types = ['int64_t', 'int64_t?', 'bool', 'IntArrayRef'] - for arg in combined['unpacked_args']: + updated_unpacked_args: List[str] = [] + known_view_arg_simple_types: List[str] = ['int64_t', 'c10::optional', 'bool', 'IntArrayRef'] + for unpacked_binding in unpacked_bindings: + arg, arg_type = unpacked_binding.name, unpacked_binding.type if arg == 'self_': updated_unpacked_args.append(input_base) continue - arg_type = combined['unpacked_args_simple_type'][arg] if arg_type not in known_view_arg_simple_types: - raise TypeError('You are adding an {} {} argument to op {} in addition to known types: {}. ' - 'Please update the list or materialize it so that it can be closed over by value, ' - 'also add a test in pytorch/xla/test/test_operations.py where this code is exercised.' - .format(arg_type, arg, declaration['name'], ', '.join(known_view_arg_simple_types))) + known_types_str = ', '.join(known_view_arg_simple_types) + raise TypeError(f'You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: ' + f'{known_types_str}. Please update the list or materialize it so that it can be closed ' + 'over by value, also add a test in pytorch/xla/test/test_operations.py where this code ' + 'is exercised.') if arg_type == 'IntArrayRef': # It's not safe to close over IntArrayRef by value, since this is a @@ -687,7 +704,7 @@ def emit_view_lambda(): arg_vec = arg + '_vec' replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec) updated_unpacked_args.append(arg_vec) - elif arg_type == 'int64_t?': + elif arg_type == 'c10::optional': # Materialize int64_t? 
to int64_t arg_value = arg + '_val' replay_view_func += OPTIONAL_TO_VAL.substitute(arg=arg, val=arg_value, default='0') @@ -695,7 +712,7 @@ def emit_view_lambda(): else: updated_unpacked_args.append(arg) - replay_view_call = emit_dispatch_call(combined['api_name'], input_base, updated_unpacked_args) + replay_view_call = emit_dispatch_call(f, input_base, updated_unpacked_args) replay_view_func += REPLAY_VIEW_LAMBDA_FUNC.substitute( input_base=input_base, replay_view_call=replay_view_call) @@ -706,17 +723,17 @@ def emit_view_lambda(): is_view_with_metadata_change=is_view_with_metadata_change, replay_view_func=replay_view_func) - def wrap_output(return_values, var): + def wrap_output(f: NativeFunction, unpacked_bindings: List[Binding], var: str) -> str: call = '' - rhs_value = None - if 'Tensor' not in declaration['return_type']: + rhs_value: Optional[str] = None + if not any(r.type.is_tensor_like() for r in f.func.returns): rhs_value = var elif view_info is not None: # See NOTE [ Autograd View Variables ] in variable.h for details. - differentiable_output_vars = {r['name'] for r in differentiable_outputs} + differentiable_output_vars = {r.name for r in differentiable_outputs} if not isinstance(view_info, str): - raise TypeError("The view info should be a string for {}, but it is: {}".format(base_name, view_info)) + raise TypeError(f'The view info should be a string for {base_name}, but it is: {view_info}') if len(differentiable_output_vars) == 0: # no output is differentiable (.indices() for SparseTensors for example) @@ -725,54 +742,55 @@ def wrap_output(return_values, var): # Single differentiable output (Tensor or Tensor[]) return_info = differentiable_outputs[0] # We only support simple Tensor or a TensorList for functions that return views - if not return_info['dynamic_type'] in ['Tensor', 'TensorList']: - raise RuntimeError("{} that return differentiable views can only return Tensor or Tensor[]".format(base_name)) + if not is_tensor_type(return_info.type) and not is_tensor_list_type(return_info.type): + raise RuntimeError(f'{base_name} that return differentiable views can only return Tensor or Tensor[]') # Only allow rebasing of the history if we return a single Tensor # If we are in a no grad block, raise a warning # See NOTE [ View + Inplace detection ] for more details about this logic - if return_info['dynamic_type'] in ['TensorList', 'const c10::List> &']: + if is_tensor_list_type(return_info.type): if base_name in MULTI_OUTPUT_SAFE_FUNCTIONS: - creation_meta = "CreationMeta::MULTI_OUTPUT_SAFE" + creation_meta = 'CreationMeta::MULTI_OUTPUT_SAFE' else: - creation_meta = "CreationMeta::MULTI_OUTPUT_NODE" - call += ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* creation_meta */ {});").format(view_info, var, creation_meta) - rhs_value = 'std::move({})'.format(var) + creation_meta = 'CreationMeta::MULTI_OUTPUT_NODE' + call += (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* creation_meta */ {creation_meta});') + rhs_value = f'std::move({var})' else: - call += emit_view_lambda() - creation_meta = "GradMode::is_enabled() ? 
CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE" - rhs_value = ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* view_func */ func, /* creation_meta */ {})").format(view_info, var, creation_meta) + call += emit_view_lambda(unpacked_bindings) + creation_meta = 'GradMode::is_enabled() ? CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE' + rhs_value = (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* view_func */ func, /* creation_meta */ {creation_meta})') else: # This could be supported but we don't need it at the moment, so keeping things simple. - raise RuntimeError("Function that return multiple differentiable output " - "when at least one of them is view is not supported.") + raise RuntimeError('Function that return multiple differentiable output ' + 'when at least one of them is view is not supported.') else: - rhs_value = 'std::move({})'.format(var) + rhs_value = f'std::move({var})' assert rhs_value is not None - call += ASSIGN_RETURN_VALUE.substitute(return_values=return_values, + call += ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f), rhs_value=rhs_value) return call - def enforce_same_tensorimpl_and_storage(env, call): - save_ptrs_stmts = [] - enforce_same_ptrs_stmts = [] - if declaration['name'] not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE: - for arg in env.get('unpacked_args', []): - simple_type = env['unpacked_args_simple_type'][arg] - if simple_type == 'TensorList': + def enforce_same_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> str: + save_ptrs_stmts: List[str] = [] + enforce_same_ptrs_stmts: List[str] = [] + if cpp.name(f.func) not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE: + for unpacked_binding in unpacked_bindings: + arg = unpacked_binding.name + noref_cpp_type = unpacked_binding.ctype.cpp_type(strip_ref=True) + if noref_cpp_type == 'TensorList': save_ptrs_stmts += [SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] - elif simple_type == 'c10::List>': + elif noref_cpp_type == 'c10::List>': save_ptrs_stmts += [SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] - elif simple_type == 'Tensor': + elif noref_cpp_type == 'Tensor': save_ptrs_stmts += [SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), SAVE_TENSOR_IMPL.substitute(tensor_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=arg), @@ -784,74 +802,69 @@ def enforce_same_tensorimpl_and_storage(env, call): RUN_ONLY_IN_DEBUG_MODE.substitute(statements=enforce_same_ptrs_stmts) return call - def emit_call(env, tie_return_values): - combined = nested_dict(env, declaration) + def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: # We only care about adding `at::AutoNonVariableTypeMode` guard for non-variable dispatch # (which corresponds to 'use_derived' strategy). 
The purpose of this guard is to make sure # the baseType operations still dispatch to non-Variable type, even if the arguments passed # in are now Variables. # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details. - base_type_call = emit_dispatch_call(combined['api_name'], 'self_', combined['unpacked_args']) - if not modifies_arguments and not returns_void: + unpacked_args = [b.name for b in unpacked_bindings] + base_type_call = emit_dispatch_call(f, 'self_', unpacked_args) + if not modifies_arguments(f) and not returns_void: call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( base_type_call=base_type_call) - call += wrap_output(tie_return_values, 'tmp') + call += wrap_output(f, unpacked_bindings, 'tmp') else: call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( base_type_call=base_type_call) - call = enforce_same_tensorimpl_and_storage(env, call) + call = enforce_same_tensorimpl_and_storage(call, unpacked_bindings) return call - def emit_history(): - fn = 'rebase' if modifies_arguments and view_info is None else 'set' - output_names = [r['name'] for r in differentiable_outputs] + def emit_history() -> str: + fn = 'rebase' if modifies_arguments(f) and view_info is None else 'set' + output_names = [r.name for r in differentiable_outputs] # TODO: flatten allocates a std::vector, which could be expensive outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=output_names) return SET_HISTORY.substitute(fn=fn, differentiable_outputs=outs) - def emit_save_outputs(): + def emit_save_outputs() -> str: if is_out_fn: # out functions don't currently support differentiation return '' - func = declaration['derivative'] - if func is not None: - stmts = save_variables(func.all_saved_outputs, True) + if info is not None and info.has_derivatives: + stmts = save_variables(info.all_saved_outputs, True) if len(stmts) == 0: return '' return CONDITIONAL.substitute(cond='grad_fn', statements=stmts) return '' - def emit_any_requires_grad(): + def emit_any_requires_grad() -> List[str]: return [SETUP_ANY_REQUIRES_GRAD.substitute( - args_with_derivatives=[arg['name'] for arg in args_with_derivatives]), ] + args_with_derivatives=[arg.name for arg in args_with_derivatives]), ] - def emit_check_inplace(): + def emit_check_inplace() -> List[str]: if not inplace: return [] - return ['check_inplace({}, _any_requires_grad);'.format(arg['name']) for arg in differentiable_outputs] + return [f'check_inplace({arg.name}, _any_requires_grad);' for arg in differentiable_outputs] - def emit_increment_version(): - if not modifies_arguments: + def emit_increment_version(f: NativeFunction) -> List[str]: + if not modifies_arguments(f): return [] - return ['increment_version({});'.format(arg['name']) for arg in returns] + return [f'increment_version({r});' for r in cpp.return_names(f)] - env = {} - combined = nested_dict(env, declaration) + body: List[str] = [] + unpack_args_stats, unpacked_bindings = unpack_args(f) - body = [] - - declare_returned_variables, tie_return_values, get_return_value = format_return_variables(declaration) - - body.extend(unpack_args(env, declaration)) + body.extend(unpack_args_stats) if requires_derivative: body.extend(emit_any_requires_grad()) body.extend(emit_check_inplace()) body.extend(setup_derivative(differentiable_inputs)) - body.append(declare_returned_variables) + body.append(declare_returned_variables(f)) - body.append(emit_call(env, tie_return_values)) - body.extend(emit_increment_version()) + body.append(emit_call(f, 
unpacked_bindings)) + body.extend(emit_increment_version(f)) if requires_derivative: # set_flags has to appear after version_counter, because rebase_history # requires that the counter is incremented before it is called @@ -866,56 +879,54 @@ def emit_increment_version(): assert inplace body.append('reset_grad_accumulator(self);') if not returns_void: - body.append('return {};'.format(get_return_value)) + body.append(f'return {get_return_value(f)};') return body - -def unpack_args(env, declaration): - def requires_unpack(arg): - return 'Tensor' in arg['dynamic_type'] and 'c10::optional' not in arg['type'] - - body = [] - unpacked_args = [] - unpacked_args_simple_type = {} - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - arguments = declaration['schema_order_arguments'] +@with_native_function +def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: + body: List[str] = [] + unpacked_bindings: List[Binding] = [] + + if f.use_c10_dispatcher.dispatcher_uses_new_style(): + bindings = [r for a in f.func.schema_order_arguments() + for r in cpp.argument(a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False)] else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - arguments = declaration['arguments'] - for i, arg in enumerate(arguments): - if not requires_unpack(arg): - unpacked_args.append(arg['name']) - unpacked_args_simple_type[arg['name']] = arg['simple_type'] - continue + sig_group = CppSignatureGroup.from_native_function(f, method=False) + bindings = list(sig_group.signature.arguments()) - dynamic_type = arg['dynamic_type'] - if 'TensorOptions' not in dynamic_type: - is_nullable = arg.get('is_nullable', False) - ref = (not is_nullable) and dynamic_type != 'TensorList' - suffix = '_opt' if is_nullable and dynamic_type != 'TensorList' else '' - body.append(UNPACK_TENSOR.substitute( - arg_name=arg['name'], - arg_pos=i, - suffix=suffix, - ref='&' if ref else '', - )) - else: - # Okay, we are abusing the definition of 'unpack' here a bit, - # although it's still getting the non-variable from the variable - # (in this case via TensorOptions rather than Variable/Tensor). 
- assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper', \ - "VariableKernel shouldn't take TensorOptions if the op is c10-full" - body.append(LEGACY_WRAP_OPTIONS.substitute(arg_name=arg['name'])) - - unpacked_args.append(arg['name'] + '_') - unpacked_args_simple_type[arg['name'] + '_'] = arg['simple_type'] - - env['unpacked_args'] = unpacked_args - env['unpacked_args_simple_type'] = unpacked_args_simple_type - return body + for i, binding in enumerate(bindings): + assert not isinstance(binding.argument, SelfArgument) + if isinstance(binding.argument, TensorOptionsArguments): + raise RuntimeError("VariableKernel shouldn't take TensorOptions") + is_nullable = binding.argument.type.is_nullable() + if not binding.argument.type.is_tensor_like() or is_nullable: + unpacked_bindings.append(binding) + continue -def dispatch_strategy(declaration): + is_tensor_list = is_tensor_list_type(binding.argument.type) + ref = (not is_nullable) and not is_tensor_list + suffix = '_opt' if is_nullable and not is_tensor_list else '' + body.append(UNPACK_TENSOR.substitute( + arg_name=binding.name, + arg_pos=i, + suffix=suffix, + ref='&' if ref else '', + )) + unpacked_bindings.append(Binding( + name=binding.name + '_', + ctype=binding.ctype, + argument=binding.argument, + default=binding.default, + )) + + return body, unpacked_bindings + +def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str: """How are we going to call the underlying implementation of a declaration? There are two strategies: @@ -935,7 +946,7 @@ def dispatch_strategy(declaration): get dispatched back to VariableType (which will ensure that they are differentiable.) """ - if declaration['abstract'] or declaration['derivative'] is not None: + if fn.func.is_abstract or (fn.info is not None and fn.info.has_derivatives): # If the function is abstract (not implemented on at::Type), we must # call the implementation on the derived type with unpacked tensors. @@ -959,62 +970,47 @@ def dispatch_strategy(declaration): # assumption might not hold, but then you'll see gradcheck fail.) return 'use_type' -def get_decl_signature(declaration: Dict[Any, Any], use_base_variant: bool = False) -> str: - name = declaration['name'] - arguments = declaration['arguments'] - if use_base_variant: - if declaration['inplace']: - assert name.endswith('_') - name = name[:-1] - elif name.endswith('_out'): - name = name[:-4] - arguments = [arg for arg in arguments if not arg.get('output', False)] - simple_types = ', '.join(arg['simple_type'] for arg in arguments) - return f'{name}({simple_types})' +def is_tensor_type(t: Type) -> bool: + # TODO: Should handle optional here? + return t.is_tensor_like() and t.is_list_like() is None -@with_native_function -def get_func_signature(f: NativeFunction) -> str: - args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments() - types = ', '.join(python.argument_type_str(a.argument.type, simple_type=True) - if isinstance(a.argument, Argument) else 'TensorOptions' - for a in args) - return f'{cpp.name(f.func)}({types})' - -def match_declarations_with_differentiability_info( - declarations: Dict[Any, Any], +def is_tensor_list_type(t: Type) -> bool: + # TODO: Should handle optional here? 
+ return t.is_tensor_like() and t.is_list_like() is not None + +def modifies_arguments(f: NativeFunction) -> bool: + return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] + +def match_differentiability_info( + native_functions: List[NativeFunction], differentiability_infos: Sequence[DifferentiabilityInfo], -) -> None: +) -> List[NativeFunctionWithDifferentiabilityInfo]: """Sets the "derivative" key on declarations to matching autograd function In-place functions will use the out-of-place derivative definition if there is no in-place specific derivative. """ - info_by_signature = {get_func_signature(info.func): info for info in differentiability_infos} + info_by_schema = {info.func.func: info for info in differentiability_infos} + functional_info_by_signature = { + info.func.func.signature(strip_default=True): info + for info in differentiability_infos + if info.func.func.kind() == SchemaKind.functional} - def find_info(declaration: Dict[Any, Any]) -> Optional[DifferentiabilityInfo]: - signature = get_decl_signature(declaration) - if signature in info_by_signature: - return info_by_signature[signature] + def find_info(f: NativeFunction) -> Tuple[Optional[DifferentiabilityInfo], bool]: + if f.func in info_by_schema: + return info_by_schema[f.func], True # if there is no exact match look for the out-of-place signature. # i.e mul() for mul_() or mul_out() - signature = get_decl_signature(declaration, use_base_variant=True) - return info_by_signature.get(signature) - - for declaration in declarations: - info = find_info(declaration) - declaration['derivative'] = info if info and info.args_with_derivatives else None - - # Currently, the '.strides()' to 'strides_or_error' replacement does not support - # 'self' derivatives of an inplace function, so we must check for this case. - if declaration['inplace'] and (info is not None): - for derivative in info.derivatives: - if 'self' in derivative.var_names: - for saved_input in derivative.saved_inputs: - assert 'strides_or_error' not in saved_input.expr, ( - "Calling '.strides()' in the 'self' derivative formula of an " - f"in-place function is not supported: {declaration['name']}") - - declaration['non_differentiable_arg_names'] = info.non_differentiable_arg_names if info else [] - declaration['output_differentiability'] = info.output_differentiability if info else None + return functional_info_by_signature.get(f.func.signature(strip_default=True)), False + + result: List[NativeFunctionWithDifferentiabilityInfo] = [] + for f in native_functions: + info, is_exact_match = find_info(f) + result.append(NativeFunctionWithDifferentiabilityInfo( + func=f, + info=info, + )) + + return result diff --git a/tools/codegen/api/autograd.py b/tools/codegen/api/autograd.py index 58fb75bb7c07..6f58eea6d1ea 100644 --- a/tools/codegen/api/autograd.py +++ b/tools/codegen/api/autograd.py @@ -87,3 +87,36 @@ class DifferentiabilityInfo: # Raw data read from derivatives.yaml. output_differentiability: Optional[List[bool]] + + @property + def has_derivatives(self) -> bool: + return len(self.args_with_derivatives) > 0 + +# Represents a differentiable `Argument`. +# How is it different from the `Argument` type? +# - It's processed Arguments which are differentiable and only used in the +# context of the autograd codegen; +# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument; +@dataclass(frozen=True) +class DifferentiableInput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. 
+ cpp_type: str + +# Represents a differentiable `Return`. +# How it it different from the `Return` type? +# - The name in `Return` is optional. Here it is always populated using the same +# `cpp.return_names()` method. +# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant? +# - It's processed Returns which are differentiable, in compliance with the +# `output_differentiability` field defined in derivatives.yaml (if specified), +# and are only used in the context of the autograd codegen; +@dataclass(frozen=True) +class DifferentiableOutput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. + cpp_type: str diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index 29a29e215f4f..8a1d2a5272f5 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -106,7 +106,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: return BaseCType("DimnameList", binds) elif str(t.elem) == 'Tensor?': if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return BaseCType("const c10::List> &", binds) + return ConstRefCType(BaseCType("c10::List>", binds)) else: return BaseCType("TensorList", binds) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index ea03a1799cfb..39fb8bef3846 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -31,14 +31,16 @@ class BaseCType: type: str name: ArgName - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: return self.type @dataclass(frozen=True) class ConstRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'const {self.elem.cpp_type()} &' @property @@ -49,7 +51,9 @@ def name(self) -> ArgName: class MutRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'{self.elem.cpp_type()} &' @property @@ -60,7 +64,8 @@ def name(self) -> ArgName: class OptionalCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. 
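# [Editor's note: illustrative sketch only, not part of the patch; a minimal check
#  of the new `strip_ref` flag, assuming the repo layout of this patch series.]
#
#     from tools.codegen.api.types import BaseCType, ConstRefCType
#     ct = ConstRefCType(BaseCType('Tensor', 'self'))
#     assert ct.cpp_type() == 'const Tensor &'
#     assert ct.cpp_type(strip_ref=True) == 'Tensor'
#
# enforce_same_tensorimpl_and_storage() in gen_variable_type.py (earlier in this
# patch) relies on exactly this when it compares the reference-stripped type
# against 'Tensor' / 'TensorList' / 'c10::List<...>'.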
return f'c10::optional<{self.elem.cpp_type()}>' @property diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 782d8b919e7e..8f521e6651bc 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -203,8 +203,7 @@ class RegisterSchema: @method_with_native_function def __call__(self, f: NativeFunction) -> Optional[str]: - op_name = f"aten::{f.func.name}" - if not self.selector.is_operator_selected(op_name): + if not self.selector.is_native_function_selected(f): return None return f'm.def({cpp_string(str(f.func))});\n' @@ -399,8 +398,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: e.expr for e in translate(functional_sig.arguments(), dispatcher.arguments(functional_func), method=False) ) - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None k = f.func.kind() @@ -480,8 +478,7 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: if f.manual_kernel_registration: return None - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None name = native.name(f.func) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index ea667a0922cf..9c8a0d73e815 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -567,7 +567,7 @@ def kind(self) -> SchemaKind: else: return SchemaKind.functional - def signature(self) -> 'FunctionSchema': + def signature(self, *, strip_default: bool = False) -> 'FunctionSchema': """ Certain schemas are 'related', in that they are simply inplace/out/functional versions of the same function. 
This method @@ -582,11 +582,13 @@ def signature(self) -> 'FunctionSchema': - Out arguments are stripped - Mutability annotations are stripped (this is sound because you cannot overload on mutability annotation) + - Return names are stripped since they are not overloadable and + some variants have return names but some not """ def strip_ret_annotation(r: Return) -> Return: return Return( - name=r.name, + name=None, type=r.type, annotation=None, ) @@ -600,7 +602,7 @@ def strip_ret_annotation(r: Return) -> Return: ), overload_name="", # stripped ), - arguments=self.arguments.signature(), + arguments=self.arguments.signature(strip_default=strip_default), returns=tuple(map(strip_ret_annotation, self.returns)), ) @@ -983,14 +985,14 @@ def kwarg_only(self) -> Sequence[Union[Argument, TensorOptionsArguments]]: ret.extend(self.post_tensor_options_kwarg_only) return ret - def signature(self) -> 'Arguments': + def signature(self, *, strip_default: bool = False) -> 'Arguments': # dataclasses.replace could be used here, but it is less # type safe so for now I've opted to type everything out def strip_arg_annotation(a: Argument) -> Argument: return Argument( name=a.name, type=a.type, - default=a.default, # hmmm + default=a.default if not strip_default else None, annotation=None, ) diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py index 24e387128b6c..3e80e168d31c 100644 --- a/tools/codegen/selective_build/selector.py +++ b/tools/codegen/selective_build/selector.py @@ -3,6 +3,7 @@ from dataclasses import dataclass +from tools.codegen.model import NativeFunction from tools.codegen.selective_build.operator import * # A SelectiveBuilder holds information extracted from the selective build @@ -96,6 +97,10 @@ def is_operator_selected(self, name: str) -> bool: name = strip_operator_overload_name(name) return name in self.operators and self.operators[name].include_all_overloads + def is_native_function_selected(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected(op_name) + def is_operator_selected_for_training(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -123,6 +128,10 @@ def is_operator_selected_for_training(self, name: str) -> bool: (base_op.include_all_overloads and base_op.is_used_for_training) ) + def is_native_function_selected_for_training(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected_for_training(op_name) + def is_root_operator(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -158,3 +167,9 @@ def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) operators = merge_operator_dicts(lhs.operators, rhs.operators) return SelectiveBuilder(include_all_operators, debug_info, operators) + + +def op_name_from_native_function(f: NativeFunction) -> str: + # This was originally read from the 'operator_name_with_overload' field in the + # declaration dict, which was the part before the first '(' in 'schema_string'. 
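# [Editor's note: illustrative sketch only, not part of the patch.]
# For example, given a schema like `add.out(Tensor self, Tensor other, *,
# Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)`, `f.func.name` stringifies to
# 'add.out', so this helper returns 'aten::add.out' -- the same
# operator-name-with-overload string that used to be read from the declaration
# dict (the part of `schema_string` before the first '(').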
+ return f'aten::{f.func.name}' diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py index 267b5a3b221a..a52c109c603f 100644 --- a/tools/jit/gen_unboxing_wrappers.py +++ b/tools/jit/gen_unboxing_wrappers.py @@ -22,9 +22,10 @@ import re from itertools import groupby from functools import reduce -from ..autograd.gen_autograd import load_aten_declarations +import yaml + from ..autograd.gen_autograd import RETURNS_VIEWS_OF_INPUT -from ..autograd.utils import CodeTemplate, write, is_out_variant, op_name_with_overload +from ..autograd.utils import CodeTemplate, YamlLoader, write, is_out_variant, op_name_with_overload from tools.codegen.selective_build.selector import SelectiveBuilder # JIT has a type system of @@ -279,6 +280,66 @@ def argument_order(decl): return decl.get('jit_argument_order') or list(range(len(decl['arguments']))) +def format_return_type(returns): + if len(returns) == 0: + return 'void' + elif len(returns) == 1: + return returns[0]['type'] + else: + return_types = [r['type'] for r in returns] + return 'std::tuple<{}>'.format(','.join(return_types)) + + +def get_simple_type(arg): + simple_type = arg['type'] + simple_type = simple_type.replace(' &', '').replace('const ', '') + simple_type = simple_type.replace('Generator *', 'Generator') + + opt_match = re.match(r'c10::optional<(.+)>', simple_type) + if opt_match: + simple_type = '{}?'.format(opt_match.group(1)) + return simple_type + + +def load_aten_declarations(path): + with open(path, 'r') as f: + declarations = yaml.load(f, Loader=YamlLoader) + + # enrich declarations with additional information + selected_declarations = [] + for declaration in declarations: + if declaration.get('deprecated'): + continue + + for arg in declaration['arguments']: + arg['simple_type'] = get_simple_type(arg) + for arg in declaration['schema_order_arguments']: + arg['simple_type'] = get_simple_type(arg) + for ret in declaration['returns']: + ret['simple_type'] = get_simple_type(ret) + + declaration['formals'] = [arg['type'] + ' ' + arg['name'] + for arg in declaration['arguments']] + declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] + for arg in declaration['schema_order_arguments']] + declaration['args'] = [arg['name'] for arg in declaration['arguments']] + declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] + declaration['api_name'] = declaration['name'] + if declaration.get('overload_name'): + declaration['type_wrapper_name'] = "{}_{}".format( + declaration['name'], declaration['overload_name']) + else: + declaration['type_wrapper_name'] = declaration['name'] + declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] + declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] + declaration['return_type'] = format_return_type(declaration['returns']) + + declaration['base_name'] = declaration['name'] + selected_declarations.append(declaration) + + return selected_declarations + + def gen_unboxing_wrappers( declarations, out, From abe1fa49e983e5abe7caab1e3b008316dc9529dc Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Tue, 5 Jan 2021 14:06:16 -0800 Subject: [PATCH 79/89] [JIT] Add `__prepare_scriptable__` duck typing to allow replacing nn.modules with scriptable preparations (#45645) (#49242) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49242 Fixes https://github.com/pytorch/pytorch/issues/45072 As discussed with zdevito gchanan cpuhrsch 
and suo, this change allows developers to create custom preparations for their modules before scripting. This is done by adding a `__prepare_scriptable__` method to a module which returns the prepared scriptable module out-of-place. It does not expand the API surface for end users. Prior art by jamesr66a: https://github.com/pytorch/pytorch/pull/42244 Test Plan: Imported from OSS Reviewed By: dongreenberg Differential Revision: D25500303 fbshipit-source-id: d3ec9005de27d8882fc29d02f0d08acd2a5c6b2c --- test/jit/test_recursive_script.py | 53 +++++++++++++++++++++++++++++++ test/jit/test_torchbind.py | 26 +++++++++++++++ torch/jit/_script.py | 38 ++++++++++++++++++++++ 3 files changed, 117 insertions(+) diff --git a/test/jit/test_recursive_script.py b/test/jit/test_recursive_script.py index bd9a2bb32b89..a0dc99a4e463 100644 --- a/test/jit/test_recursive_script.py +++ b/test/jit/test_recursive_script.py @@ -495,6 +495,59 @@ def forward(self, x): self.checkModule(M(), (torch.randn(5, 5),)) + def test_prepare_scriptable_basic(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + t = torch.randn(5, 5) + m = SeluButReluWhenScripted() + sm = torch.jit.script(m) + eager_out = m(t) + script_out = sm(t) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_iterable_modules(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + shared = SeluButReluWhenScripted() + self.sequential = nn.Sequential( + SeluButReluWhenScripted(), + SeluButReluWhenScripted(), + nn.Sequential(SeluButReluWhenScripted(), shared, SeluButReluWhenScripted()), + shared, + ) + self.module_list = nn.ModuleList([SeluButReluWhenScripted(), + shared, + SeluButReluWhenScripted()]) + + def forward(self, x): + for mod in self.module_list: + x += mod(x) + x += self.sequential(x) + return x + + t = torch.randn(5, 5) + m = M() + eager_out = m(t.clone()) + sm = torch.jit.script(m) + script_out = sm(t.clone()) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_cycle(self): + t = torch.randn(5, 5) + c = torch.nn.Module() + p = torch.nn.Module() + c.__dict__["_p"] = p + p.__dict__["_c"] = c + + sm = torch.jit.script(p) + def test_attributes(self): @torch.jit.script class Inner2(object): diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index 31eec81d480a..7f43b31fe6ec 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -62,6 +62,32 @@ def f(): return ss1.pop() + ss2.pop() test_equality(f, lambda x: x) + # test nn module with prepare_scriptable function + class NonJitableClass(object): + def __init__(self, int1, int2): + self.int1 = int1 + self.int2 = int2 + + def return_vals(self): + return self.int1, self.int2 + + class CustomWrapper(torch.nn.Module): + def __init__(self, foo): + super(CustomWrapper, self).__init__() + self.foo = foo + + def forward(self) -> None: + self.foo.increment(1) + return + + def __prepare_scriptable__(self): + int1, int2 = self.foo.return_vals() + foo = torch.classes._TorchScriptTesting._Foo(int1, int2) + return CustomWrapper(foo) + + foo = CustomWrapper(NonJitableClass(1, 2)) + jit_foo = torch.jit.script(foo) + def test_torchbind_take_as_arg(self): global StackString # see [local resolution in python] StackString = torch.classes._TorchScriptTesting._StackString diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 
cc84877e5267..8bc8c6117c1b 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -741,6 +741,43 @@ class RecursiveScriptModule(ScriptModule): # type: ignore def __init__(self, arg=None): super().__init__() +def call_prepare_scriptable_func_impl(obj, memo): + if not isinstance(obj, torch.nn.Module): + return obj + + obj_id = id(obj) + + # If obj_id is in memo, obj has already been prepared or is being + # prepared in another call up the stack. + if obj_id in memo: + return memo[id(obj)] + + obj = obj.__prepare_scriptable__() if hasattr(obj, '__prepare_scriptable__') else obj # type: ignore + # Record obj in memo to avoid infinite recursion in the case of cycles in the module + # hierarchy when recursing below. + memo[obj_id] = obj + + new_obj_dict = {} + + for name in obj.__dict__: + sub_module = obj.__dict__.get(name) + if name == '_modules': + for k, v in sub_module.items(): + sub_module[k] = call_prepare_scriptable_func_impl(v, memo) + new_obj_dict[name] = sub_module + elif isinstance(sub_module, torch.nn.Module) and not isinstance(sub_module, ScriptModule): + new_obj_dict[name] = call_prepare_scriptable_func_impl(sub_module, memo) + else: + new_obj_dict[name] = sub_module + + for k, v in new_obj_dict.items(): + obj.__dict__[name] = v + + return obj + +def call_prepare_scriptable_func(obj): + memo: Dict[int, torch.nn.Module] = {} + return call_prepare_scriptable_func_impl(obj, memo) def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" @@ -894,6 +931,7 @@ def forward(self, input): return obj if isinstance(obj, torch.nn.Module): + obj = call_prepare_scriptable_func(obj) return torch.jit._recursive.create_script_module( obj, torch.jit._recursive.infer_methods_to_compile ) From d1a56fcd9d68e3d3706a190c950a777d3104750f Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Tue, 5 Jan 2021 14:49:14 -0800 Subject: [PATCH 80/89] [docs] add docstring in torch.cuda.get_device_properties (#49792) Summary: Fixes https://github.com/pytorch/pytorch/issues/49737 Added docstring in `torch.cuda.get_device_properties` Added the `Returns` in `torch.cuda.get_device_name` Pull Request resolved: https://github.com/pytorch/pytorch/pull/49792 Reviewed By: mruberry Differential Revision: D25784046 Pulled By: ngimel fbshipit-source-id: f88da02147f92c889398957fcaf22961d3bb1062 --- torch/cuda/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 8ee83fa81fe7..5535cef78395 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -271,6 +271,9 @@ def get_device_name(device: Optional[_device_t] = None) -> str: name. This function is a no-op if this argument is a negative integer. It uses the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). + + Returns: + str: the name of the device """ return get_device_properties(device).name @@ -293,6 +296,15 @@ def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int] def get_device_properties(device: _device_t) -> _CudaDeviceProperties: + r"""Gets the properties of a device. + + Args: + device (torch.device or int or str): device for which to return the + properties of the device. 
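# [Editor's note: illustrative sketch only, not part of the patch; a usage example
#  for the API whose docstring is being added here.]
#
#     props = torch.cuda.get_device_properties(0)
#     props.name                # device name string (what get_device_name() returns)
#     props.total_memory        # total device memory, in bytes
#     props.major, props.minor  # compute capability (what get_device_capability() returns)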
+ + Returns: + _CudaDeviceProperties: the properties of the device + """ _lazy_init() # will define _get_device_properties device = _get_device_index(device, optional=True) if device < 0 or device >= device_count(): From 9529ae377629f4253af0720cb01f09db6fcad4c9 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Tue, 5 Jan 2021 15:16:12 -0800 Subject: [PATCH 81/89] Revert D25757721: [pytorch][PR] Run mypy on more test files Test Plan: revert-hammer Differential Revision: D25757721 (https://github.com/pytorch/pytorch/commit/b7bfc723d3245bffce92d1b0a2cef68f48426a5e) Original commit changeset: 44c396d8da9e fbshipit-source-id: 58437d719285a4fecd8c05e487cc86fc2cebadff --- mypy.ini | 11 ----------- test/test_bundled_inputs.py | 4 +--- test/test_expecttest.py | 3 +-- test/test_numpy_interop.py | 18 +++++++++--------- torch/testing/_internal/expecttest.py | 4 +--- torch/utils/bundled_inputs.py | 4 ++-- 6 files changed, 14 insertions(+), 30 deletions(-) diff --git a/mypy.ini b/mypy.ini index bab4ce5dfd42..7d6161bddd17 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,13 +17,8 @@ check_untyped_defs = True files = torch, caffe2, - test/test_bundled_images.py, - test/test_bundled_inputs.py, test/test_complex.py, - test/test_dataset.py, - test/test_expecttest.py, test/test_futures.py, - test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, test/test_type_info.py @@ -124,12 +119,6 @@ ignore_errors = True [mypy-torch.overrides] ignore_errors = True -# -# Adding type annotations to caffe2 is probably not worth the effort -# only work on this if you have a specific reason for it, otherwise -# leave these ignores as they are. -# - [mypy-caffe2.python.*] ignore_errors = True diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py index e12339f3acea..f57407c9b1d1 100644 --- a/test/test_bundled_inputs.py +++ b/test/test_bundled_inputs.py @@ -1,7 +1,5 @@ #!/usr/bin/env python3 import io -from typing import List - import torch import torch.utils.bundled_inputs from torch.testing._internal.common_utils import TestCase, run_tests @@ -29,7 +27,7 @@ def forward(self, arg): sm = torch.jit.script(SingleTensorModel()) original_size = model_size(sm) - get_expr : List[str] = [] + get_expr = [] samples = [ # Tensor with small numel and small storage. (torch.tensor([1]),), diff --git a/test/test_expecttest.py b/test/test_expecttest.py index 5e2461797705..652a33c41869 100644 --- a/test/test_expecttest.py +++ b/test/test_expecttest.py @@ -4,7 +4,6 @@ import string import textwrap import doctest -from typing import Dict, Any import hypothesis from hypothesis.strategies import text, integers, composite, sampled_from, booleans @@ -39,7 +38,7 @@ def test_replace_string_literal_roundtrip(self, t, raw, quote): r3 = {r}{quote}placeholder3{quote} """.format(r='r' if raw else '', quote=quote * 3) new_prog = expecttest.replace_string_literal(textwrap.dedent(prog), 2, t)[0] - ns : Dict[str, Any] = {} + ns = {} exec(new_prog, ns) msg = "program was:\n{}".format(new_prog) self.assertEqual(ns['r'], 'placeholder', msg=msg) # noqa: F821 diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 81c385ae90a2..35ac4eb94889 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -47,8 +47,10 @@ def get_castable_tensor(shape, dtype): else: # can't directly use min and max, because for int64_t, max - min # is greater than int64_t range and triggers UB. 
- low = max(torch.iinfo(dtype).min, int(-1e10)) - high = min(torch.iinfo(dtype).max, int(1e10)) + dtype_info = torch.iinfo(dtype) + low = max(dtype_info.min, int(-1e10)) + high = min(dtype_info.max, int(1e10)) + dtype_info = torch.iinfo(dtype) t = torch.empty(shape, dtype=torch.int64).random_(low, high) return t.to(dtype) @@ -270,12 +272,10 @@ def test_numpy_array_interface(self, device): ] for tp, dtype in zip(types, dtypes): if np.dtype(dtype).kind == 'u': - # .type expects a XxxTensor, which have no type hints on - # purpose, so ignore during mypy type checking - x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore + x = torch.Tensor([1, 2, 3, 4]).type(tp) array = np.array([1, 2, 3, 4], dtype=dtype) else: - x = torch.Tensor([1, -2, 3, -4]).type(tp) # type: ignore + x = torch.Tensor([1, -2, 3, -4]).type(tp) array = np.array([1, -2, 3, -4], dtype=dtype) # Test __array__ w/o dtype argument @@ -309,7 +309,7 @@ def test_numpy_array_interface(self, device): float_types = [torch.DoubleTensor, torch.FloatTensor] float_dtypes = [np.float64, np.float32] for tp, dtype in zip(float_types, float_dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore + x = torch.Tensor([1, 2, 3, 4]).type(tp) array = np.array([1, 2, 3, 4], dtype=dtype) for func in ['sin', 'sqrt', 'ceil']: ufunc = getattr(np, func) @@ -321,7 +321,7 @@ def test_numpy_array_interface(self, device): # Test functions with boolean return value for tp, dtype in zip(types, dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore + x = torch.Tensor([1, 2, 3, 4]).type(tp) array = np.array([1, 2, 3, 4], dtype=dtype) geq2_x = np.greater_equal(x, 2) geq2_array = np.greater_equal(array, 2).astype('uint8') @@ -360,7 +360,7 @@ def test_parse_numpy_int(self, device): self.assertEqual(torch.ones([2, 2, 2, 2]).mean(scalar), torch.ones([2, 2, 2, 2]).mean(np_val)) # numpy integral type parses like a python int in custom python bindings: - self.assertEqual(torch.Storage(np_val).size(), scalar) # type: ignore + self.assertEqual(torch.Storage(np_val).size(), scalar) tensor = torch.tensor([2], dtype=torch.int) tensor[0] = np_val diff --git a/torch/testing/_internal/expecttest.py b/torch/testing/_internal/expecttest.py index 4dae7ebf03dc..9e46a9a84a37 100644 --- a/torch/testing/_internal/expecttest.py +++ b/torch/testing/_internal/expecttest.py @@ -3,7 +3,6 @@ import traceback import os import string -from typing import Tuple # This file implements expect tests (also known as "golden" tests). @@ -140,8 +139,7 @@ def ok_for_raw_triple_quoted_string(s, quote): r"(?Pr?)", re.DOTALL) -def replace_string_literal(src : str, lineno : int, - new_string : str) -> Tuple[str, int]: +def replace_string_literal(src, lineno, new_string): r""" Replace a triple quoted string literal with new contents. Only handles printable ASCII correctly at the moment. 
This diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py index 741c0841778a..c5d603885e4a 100644 --- a/torch/utils/bundled_inputs.py +++ b/torch/utils/bundled_inputs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union, Sequence +from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union import textwrap import torch from torch._C import TupleType, OptionalType, ListType @@ -17,7 +17,7 @@ class InflatableArg(NamedTuple): def augment_model_with_bundled_inputs( model: torch.jit.ScriptModule, - inputs: Optional[Sequence[Tuple[Any, ...]]] = None, + inputs: Optional[List[Tuple[Any, ...]]] = None, _receive_inflate_expr: Optional[List[str]] = None, # For debugging. ) -> None: """Add bundled sample inputs to a model. From 4a6c178f73f523410f237dc16da4eb1ed34a427d Mon Sep 17 00:00:00 2001 From: Leon Voland <31935319+leonvol@users.noreply.github.com> Date: Tue, 5 Jan 2021 15:22:35 -0800 Subject: [PATCH 82/89] Improve documentation and warning message for creation of a tensor with from_numpy() (#49516) Summary: Implements very simple changes suggested in the short discussion of the issue. Updated documentation to inform user that creation of tensor with memory mapped read only numpy arrays will probably cause a crash of the program. The displayed warning message was also updated to contain the information about issues concerning the use of a memory mapped read only numpy array. Closes https://github.com/pytorch/pytorch/issues/46741. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49516 Reviewed By: mrshenli Differential Revision: D25746115 Pulled By: mruberry fbshipit-source-id: 3e534a8f2bc1f083a2835440d324bd6f30798ad4 --- torch/_torch_docs.py | 3 +++ torch/csrc/utils/tensor_numpy.cpp | 11 ++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 4a1c36df7497..7d34b080e252 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3080,6 +3080,9 @@ def merge_dicts(*dicts): ``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``, and ``numpy.bool``. +.. warning:: + Writing to a tensor created from a read-only NumPy array is not supported and + will result in undefined behavior. Example:: >>> a = numpy.array([1, 2, 3]) diff --git a/torch/csrc/utils/tensor_numpy.cpp b/torch/csrc/utils/tensor_numpy.cpp index c2a67f8df06b..01471d3b62ac 100644 --- a/torch/csrc/utils/tensor_numpy.cpp +++ b/torch/csrc/utils/tensor_numpy.cpp @@ -136,11 +136,12 @@ at::Tensor tensor_from_numpy(PyObject* obj, bool warn_if_not_writeable/*=true*/) if (!PyArray_ISWRITEABLE(array) && warn_if_not_writeable) { TORCH_WARN_ONCE( "The given NumPy array is not writeable, and PyTorch does " - "not support non-writeable tensors. This means you can write to the " - "underlying (supposedly non-writeable) NumPy array using the tensor. " - "You may want to copy the array to protect its data or make it writeable " - "before converting it to a tensor. This type of warning will be " - "suppressed for the rest of this program."); + "not support non-writeable tensors. Writing to this tensor is not " + "supported and will result in undefined behavior. " + "You may want to copy the " + "array to protect its data or make it writeable before converting it to " + "a tensor. 
" + "This type of warning will be suppressed for the rest of this program."); } From 5e1c8f24d49efdde093c02bf6259a0f97d859c20 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Tue, 5 Jan 2021 15:28:04 -0800 Subject: [PATCH 83/89] Make stft (temporarily) warn (#50102) Summary: When continuing the deprecation process for stft it was made to throw an error when `use_complex` was not explicitly set by the user. Unfortunately this PR missed a model relying on the historic stft functionality. Before re-enabling the error we'll need to write an upgrader for that model. This PR turns the error back into a warning to allow that model to continue running as before. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50102 Reviewed By: ngimel Differential Revision: D25784325 Pulled By: mruberry fbshipit-source-id: 825fb38af39b423ce11b376ad3c4a8b21c410b95 --- aten/src/ATen/native/SpectralOps.cpp | 22 ++++++++++++---------- test/test_spectral_ops.py | 6 ++++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index c8eb3cc99a01..7f5211be9095 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -469,18 +469,20 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop const bool return_complex = return_complexOpt.value_or( self.is_complex() || (window.defined() && window.is_complex())); if (!return_complex) { - TORCH_CHECK(return_complexOpt.has_value(), - "stft requires the return_complex parameter be given for real inputs." - "You should pass return_complex=True to opt-in to complex dtype returns " - "(which will be required in a future pytorch release). " + if (!return_complexOpt.has_value()) { + TORCH_WARN_ONCE( + "stft will soon require the return_complex parameter be given for real inputs, " + "and will further require that return_complex=True in a future PyTorch release." ); + } - TORCH_WARN_ONCE( - "stft with return_complex=False is deprecated. In a future pytorch " - "release, stft will return complex tensors for all inputs, and " - "return_complex=False will raise an error.\n" - "Note: you can still call torch.view_as_real on the complex output to " - "recover the old return format."); + + // TORCH_WARN_ONCE( + // "stft with return_complex=False is deprecated. 
In a future pytorch " + // "release, stft will return complex tensors for all inputs, and " + // "return_complex=False will raise an error.\n" + // "Note: you can still call torch.view_as_real on the complex output to " + // "recover the old return format."); } if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) { diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 6192d6c4d6b6..9082668e8596 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -1066,10 +1066,12 @@ def test_complex_stft_onesided(self, device): with self.assertRaisesRegex(RuntimeError, 'complex'): x.stft(10, pad_mode='constant', onesided=True) + # stft is currently warning that it requires return-complex while an upgrader is written def test_stft_requires_complex(self, device): x = torch.rand(100) - with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): - y = x.stft(10, pad_mode='constant') + y = x.stft(10, pad_mode='constant') + # with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): + # y = x.stft(10, pad_mode='constant') @skipCUDAIfRocm @skipCPUIfNoMkl From dcc83868c5fc15c44be698ad1fae3248e87016e7 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Tue, 5 Jan 2021 15:50:56 -0800 Subject: [PATCH 84/89] [PyTorch Mobile] Mark xnnpack operators selective Summary: The remaining operator registrations that are not marked as selective. The size save is -12.2 KB for igios and -14 KB for fbios. Test Plan: CI Reviewed By: dhruvbird Differential Revision: D25742543 fbshipit-source-id: 3e58789d36d216a52340c00b53e2f783ea2c9414 --- .../native/xnnpack/RegisterOpContextClass.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp index e8442a64d0ad..da13fb9574d5 100644 --- a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp +++ b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp @@ -73,21 +73,21 @@ TORCH_LIBRARY(xnnpack, m) { } TORCH_LIBRARY(prepacked, m) { - m.def("linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext"); - m.def("linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext"); - m.def("conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext"); - m.def("conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y"); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_prepack(Tensor W, Tensor? 
B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y")); } TORCH_LIBRARY_IMPL(prepacked, CPU, m) { - m.impl("linear_clamp_prepack", TORCH_FN(createLinearClampPrePackOpContext)); - m.impl("linear_clamp_run", TORCH_FN(internal::linear::linear_clamp_run)); - m.impl("conv2d_clamp_prepack", TORCH_FN(createConv2dClampPrePackOpContext)); - m.impl("conv2d_transpose_clamp_prepack", TORCH_FN(createConv2dTransposeClampPrePackOpContext)); - m.impl("conv2d_clamp_run", TORCH_FN(internal::convolution2d::conv2d_clamp_run)); - m.impl("conv2d_transpose_clamp_run", TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_prepack"), TORCH_FN(createLinearClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_run"), TORCH_FN(internal::linear::linear_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_prepack"), TORCH_FN(createConv2dClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_prepack"), TORCH_FN(createConv2dTransposeClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); } } // namespace xnnpack From 7fe25af59dc5307498d8ea00be76239708b9cb57 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Tue, 5 Jan 2021 16:17:29 -0800 Subject: [PATCH 85/89] Revert D25746115: [pytorch][PR] Improve documentation and warning message for creation of a tensor with from_numpy() Test Plan: revert-hammer Differential Revision: D25746115 (https://github.com/pytorch/pytorch/commit/4a6c178f73f523410f237dc16da4eb1ed34a427d) Original commit changeset: 3e534a8f2bc1 fbshipit-source-id: 12c921cf2d062794ce45afcaed1fbedc28dcdd01 --- torch/_torch_docs.py | 3 --- torch/csrc/utils/tensor_numpy.cpp | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 7d34b080e252..4a1c36df7497 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3080,9 +3080,6 @@ def merge_dicts(*dicts): ``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``, and ``numpy.bool``. -.. warning:: - Writing to a tensor created from a read-only NumPy array is not supported and - will result in undefined behavior. 
Example:: >>> a = numpy.array([1, 2, 3]) diff --git a/torch/csrc/utils/tensor_numpy.cpp b/torch/csrc/utils/tensor_numpy.cpp index 01471d3b62ac..c2a67f8df06b 100644 --- a/torch/csrc/utils/tensor_numpy.cpp +++ b/torch/csrc/utils/tensor_numpy.cpp @@ -136,12 +136,11 @@ at::Tensor tensor_from_numpy(PyObject* obj, bool warn_if_not_writeable/*=true*/) if (!PyArray_ISWRITEABLE(array) && warn_if_not_writeable) { TORCH_WARN_ONCE( "The given NumPy array is not writeable, and PyTorch does " - "not support non-writeable tensors. Writing to this tensor is not " - "supported and will result in undefined behavior. " - "You may want to copy the " - "array to protect its data or make it writeable before converting it to " - "a tensor. " - "This type of warning will be suppressed for the rest of this program."); + "not support non-writeable tensors. This means you can write to the " + "underlying (supposedly non-writeable) NumPy array using the tensor. " + "You may want to copy the array to protect its data or make it writeable " + "before converting it to a tensor. This type of warning will be " + "suppressed for the rest of this program."); } From d1c375f0711090bb45aad7735ae3d2e346120403 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Tue, 5 Jan 2021 16:36:54 -0800 Subject: [PATCH 86/89] fix fork formatting (#49436) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49436 Test Plan: Imported from OSS Reviewed By: tugsbayasgalan Differential Revision: D25788166 Pulled By: eellison fbshipit-source-id: e368b473ad64a1168be01fc674625415a07ff31c --- torch/jit/_async.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/torch/jit/_async.py b/torch/jit/_async.py index 26bc6eeada67..ae9684a0e229 100644 --- a/torch/jit/_async.py +++ b/torch/jit/_async.py @@ -17,7 +17,7 @@ def fork(func, *args, **kwargs): - """ + r""" Creates an asynchronous task executing `func` and a reference to the value of the result of this execution. `fork` will return immediately, so the return value of `func` may not have been computed yet. To force completion @@ -42,7 +42,8 @@ def fork(func, *args, **kwargs): Example (fork a free function): - .. testcode:: + .. code-block:: python + import torch from torch import Tensor def foo(a : Tensor, b : int) -> Tensor: @@ -60,16 +61,17 @@ def bar(a): Example (fork a module method): - .. testcode:: + .. code-block:: python + import torch from torch import Tensor - class SubMod(torch.nn.Module): + class AddMod(torch.nn.Module): def forward(self, a: Tensor, b : int): return a + b class Mod(torch.nn.Module): def __init__(self): super(self).__init__() - self.mod = SubMod() + self.mod = AddMod() def forward(self, input): fut = torch.jit.fork(self.mod, a, b=2) return torch.jit.wait(fut) @@ -81,7 +83,7 @@ def forward(self, input): def wait(future): - """ + r""" Forces completion of a `torch.jit.Future[T]` asynchronous task, returning the result of the task. See :func:`~fork` for docs and examples. 
Args: From 5d93e2b8180a495b6e78467ac470758134aac983 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Tue, 5 Jan 2021 16:50:02 -0800 Subject: [PATCH 87/89] torch.flip and torch.flip{lr, ud}: Half support for CPU and BFloat16 support for CPU & CUDA (#49895) Summary: Fixes https://github.com/pytorch/pytorch/issues/49889 Also adds BFloat16 support for CPU and CUDA Pull Request resolved: https://github.com/pytorch/pytorch/pull/49895 Reviewed By: mrshenli Differential Revision: D25746272 Pulled By: mruberry fbshipit-source-id: 0b6a9bc13ae60c22729a0aea002ed857c36f14ff --- .../src/ATen/native/TensorTransformations.cpp | 2 +- .../ATen/native/cuda/TensorTransformations.cu | 4 +- test/test_shape_ops.py | 20 ++++++-- .../_internal/common_methods_invocations.py | 49 ++++++++++++++++--- 4 files changed, 60 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index fdee519c4bd0..5c6ab40b0ad4 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -73,7 +73,7 @@ Tensor flip_cpu(const Tensor& self, IntArrayRef dims) { ); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, in_tensor.scalar_type(), "flip_cpu", [&] { flip_cpu_kernel( diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index a435c7060f45..9dfa4e8759cf 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -87,7 +87,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work if (flip_dims_size == 1 && in_tensor.is_contiguous() && (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::Bool, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { auto in_tensor_info = cuda::detail::getTensorInfo(in_tensor); auto out_tensor_info = cuda::detail::getTensorInfo(out_tensor); int flip_dim = in_tensor_info.collapseDims(flip_dims[0]); @@ -123,7 +123,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { } } - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { flip_cuda_kernel<<>>( in_tensor.data_ptr(), out_tensor.data_ptr(), N, flip_dims_t.cuda().data_ptr(), diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 43321508e0e2..f7da08eb24d7 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -378,21 +378,31 @@ def test_flip(self, device): self.assertEqual(size, list(data.flip(ds).size())) # test rectangular case - data = torch.tensor([1, 2, 3, 4, 5, 6]).view(2, 3).to(device) - flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]]).to(device) - flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]]).to(device) + data = torch.tensor([1, 2, 3, 4, 5, 6], device=device).view(2, 3) + flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]], device=device) + flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]], device=device) self.assertEqual(flip0_result, data.flip(0)) self.assertEqual(flip1_result, data.flip(1)) 
# test empty tensor, should just return an empty tensor of the same shape - data = torch.tensor([]) + data = torch.tensor((), device=device) self.assertEqual(data, data.flip(0)) # test bool tensor - a = torch.tensor([False, True]) + a = torch.tensor([False, True], device=device) self.assertEqual(a.flip(0), torch.tensor([True, False])) + # case: dims=() + a = torch.randn(3, 2, 1, device=device) + if device == 'cpu': + self.assertEqual(a.flip(dims=()), a) + else: + # Reference: https://github.com/pytorch/pytorch/issues/49982 + with self.assertRaisesRegex(IndexError, + "flip dims size out of range, got flip dims size=0"): + a.flip(dims=()) + def _rand_shape(self, dim, min_size, max_size): shape = [] for i in range(dim): diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 0557ccc9560f..a49a5101a649 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -617,6 +617,30 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): return out +def sample_inputs_flip(op_info, device, dtype, requires_grad): + tensors = ( + make_tensor((S, M, S), device, dtype, low=None, high=None, requires_grad=requires_grad), + make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad) + ) + + dims = ((0, 1, 2), (0,), (0, 2), (-1,)) + + # On CUDA, `dims=()` errors out with IndexError + # Reference: https://github.com/pytorch/pytorch/issues/49982 + if device == 'cpu': + dims = dims + ((),) # type: ignore + + samples = [SampleInput(tensor, kwargs={'dims': dim}) for tensor, dim in product(tensors, dims)] + + return samples + +def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): + tensors = ( + make_tensor((S, M, S), device, dtype, low=None, high=None, requires_grad=requires_grad), + make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad) + ) + return [SampleInput(tensor) for tensor in tensors] + # Operator database (sorted alphabetically) op_db: List[OpInfo] = [ # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) @@ -865,6 +889,24 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): default_test_dtypes=floating_and_complex_types(), supports_tensor_out=False, test_inplace_grad=False,), + OpInfo('flip', + op=torch.flip, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_flip, + test_inplace_grad=False, + supports_tensor_out=False), + OpInfo('fliplr', + op=torch.fliplr, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_fliplr_flipud, + test_inplace_grad=False, + supports_tensor_out=False), + OpInfo('flipud', + op=torch.flipud, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_fliplr_flipud, + test_inplace_grad=False, + supports_tensor_out=False), UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), @@ -1537,13 +1579,6 @@ def method_tests(): ('reshape_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)),)), ('reshape_as', (), (non_differentiable(torch.tensor(42.)),), 'scalar'), ('reshape_as', (), (non_differentiable(torch.rand(1, 1)),), 'scalar_to_dims'), - ('flip', (S, S, S), ([0],), 'd0'), - ('flip', (S, S, S), ([0, 1, 2],), 'd012'), - ('flip', (S, S, S), ([0, 2],), 'd02'), - ('flip', (S, S, S), ([2, 0],), 'd20'), - 
('flip', (S, S, S), ([-1],), 'neg_d'), - ('fliplr', (S, S, S), ()), - ('flipud', (S, S, S), ()), ('roll', (S, S, S), (0, 0), 'd0'), ('roll', (S, S, S), (1, 2), 'd12'), ('roll', (S, S, S), (0, 2,), 'd02'), From 26391143b6578c24131b652753e12d617008d0f0 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Tue, 5 Jan 2021 17:15:37 -0800 Subject: [PATCH 88/89] Support out argument in torch.fft ops (#49335) Summary: Ref https://github.com/pytorch/pytorch/issues/42175 This adds out argument support to all functions in the `torch.fft` namespace except for `fftshift` and `ifftshift` because they rely on `at::roll` which doesn't have an out argument version. Note that there's no general way to do the transforms directly into the output since both cufft and mkl-fft only support single batch dimensions. At a minimum, the output may need to be re-strided which I don't think is expected from `out` arguments normally. So, on cpu this just copies the result into the out tensor. On cuda, the normalization is changed to call `at::mul_out` instead of an inplace multiply. If it's desirable, I could add a special case to transform into the output when `out.numel() == 0` since there's no expectation to preserve the strides in that case anyway. But that would lead to the slightly odd situation where `out` having the correct shape follows a different code path from `out.resize_(0)`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49335 Reviewed By: mrshenli Differential Revision: D25756635 Pulled By: mruberry fbshipit-source-id: d29843f024942443c8857139a2abdde09affd7d6 --- aten/src/ATen/native/SpectralOps.cpp | 249 +++++++++++++++--- aten/src/ATen/native/cuda/SpectralOps.cu | 52 +++- aten/src/ATen/native/mkl/SpectralOps.cpp | 48 ++++ aten/src/ATen/native/native_functions.yaml | 101 +++++++ caffe2/CMakeLists.txt | 1 + test/test_spectral_ops.py | 46 +++- .../templates/python_fft_functions.cpp | 2 + .../templates/python_torch_functions.cpp | 33 +-- tools/build_variables.bzl | 1 + torch/csrc/utils/out_types.cpp | 39 +++ torch/csrc/utils/out_types.h | 14 + .../_internal/common_methods_invocations.py | 20 +- 12 files changed, 511 insertions(+), 95 deletions(-) create mode 100644 torch/csrc/utils/out_types.cpp create mode 100644 torch/csrc/utils/out_types.h diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 7f5211be9095..289d1128d2f9 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -102,9 +102,12 @@ Tensor resize_fft_input(Tensor x, IntArrayRef dims, IntArrayRef sizes) { } // Complex to real FFT -Tensor fft_c2r(Tensor input, c10::optional n_opt, +Tensor fft_c2r(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(!out.defined() || out.is_floating_point(), function_name, + " expects a floating point output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input, /*require_complex=*/true); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -118,14 +121,22 @@ Tensor fft_c2r(Tensor input, c10::optional n_opt, // FIXME: _fft does not support complex_output=false with inverse=false input = at::conj(input); } - return at::_fft_c2r(input, dim, static_cast(norm), n); + if (out.defined()) { + return at::_fft_c2r_out(out, input, dim, static_cast(norm), n); + } else { + return at::_fft_c2r(input, dim, static_cast(norm), n); + } } // Real to complex FFT -Tensor 
fft_r2c(Tensor input, c10::optional n_opt, +Tensor fft_r2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward, bool onesided) { - TORCH_CHECK(!input.is_complex(), "Expected a real input tensor to FFT"); + TORCH_CHECK(!input.is_complex(), function_name, + " expects a real input tensor, but got ", input.scalar_type()); + TORCH_CHECK(!out.defined() || out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -136,19 +147,29 @@ Tensor fft_r2c(Tensor input, c10::optional n_opt, } const auto norm = norm_from_string(norm_str, forward); - auto out = at::_fft_r2c(input, dim, static_cast(norm), onesided); + + Tensor ret; + if (out.defined() && forward) { + ret = at::_fft_r2c_out(out, input, dim, static_cast(norm), onesided); + } else { + ret = at::_fft_r2c(input, dim, static_cast(norm), onesided); + } + if (!forward) { // FIXME: _fft_r2c doesn't support native r2c IFFT - out = at::conj(out); + return out.defined() ? at::conj_out(out, ret) : at::conj(ret); + } else { + return ret; } - return out; } // Complex to complex FFT -Tensor fft_c2c(Tensor input, c10::optional n_opt, +Tensor fft_c2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + TORCH_CHECK(input.is_complex(), function_name, + " expects a complex input tensor, but got ", input.scalar_type()); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); const auto n = n_opt.value_or(input.sizes()[dim]); @@ -157,7 +178,13 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, input = resize_fft_input(input, dim, n); } const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(input, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, input, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(input, dim, static_cast(norm), forward); + } } // Dimensions to transform, and the signal shape in those dimensions @@ -230,12 +257,18 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args( // Complex to complex n-dimensional fft Tensor fftn_c2c( - const Tensor& input, IntArrayRef shape, IntArrayRef dim, - c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + c10::string_view function_name, + Tensor out, const Tensor& input, IntArrayRef shape, + IntArrayRef dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got", input.scalar_type()); Tensor x = resize_fft_input(input, dim, shape); const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(x, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, x, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(x, dim, static_cast(norm), forward); + } } } // namespace (anonymous) @@ -244,35 +277,79 @@ Tensor fftn_c2c( Tensor fft_fft(const Tensor& self, 
c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? - fft_c2c(self, n, dim, norm, /*forward=*/true) : - fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + fft_c2c("fft", {}, self, n, dim, norm, /*forward=*/true) : + fft_r2c("fft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); +} + +Tensor& fft_fft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("fft", out, self, n, dim, norm, /*forward=*/true); + } else { + fft_r2c("fft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + } + return out; } Tensor fft_ifft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? - fft_c2c(self, n, dim, norm, /*forward=*/false) : - fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + fft_c2c("ifft", {}, self, n, dim, norm, /*forward=*/false) : + fft_r2c("ifft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); +} + +Tensor& fft_ifft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("ifft", out, self, n, dim, norm, /*forward=*/false); + } else { + fft_r2c("ifft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + } + return out; } Tensor fft_rfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return fft_r2c("rfft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); +} + +Tensor& fft_rfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("rfft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return out; } Tensor fft_irfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/false); + return fft_c2r("irfft", {}, self, n, dim, norm, /*forward=*/false); +} + +Tensor& fft_irfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("irfft", out, self, n, dim, norm, /*forward=*/false); + return out; } Tensor fft_hfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/true); + return fft_c2r("hfft", {}, self, n, dim, norm, /*forward=*/true); +} + +Tensor& fft_hfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("hfft", out, self, n, dim, norm, /*forward=*/true); + return out; } Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return fft_r2c("ihfft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); +} + +Tensor& fft_ihfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("ihfft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return out; } Tensor fft_fftn(const Tensor& self, c10::optional s, @@ -281,7 +358,18 @@ Tensor fft_fftn(const Tensor& self, c10::optional s, auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); + return fftn_c2c("fftn", {}, input, desc.shape, 
desc.dim, norm, /*forward=*/true); +} + +Tensor& fft_fftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("fftn", out, input, desc.shape, desc.dim, norm, /*forward=*/true); + return out; } Tensor fft_ifftn(const Tensor& self, c10::optional s, @@ -289,24 +377,55 @@ Tensor fft_ifftn(const Tensor& self, c10::optional s, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/false); + return fftn_c2c("ifftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/false); } -Tensor fft_rfftn(const Tensor& self, c10::optional s, - c10::optional dim, - c10::optional norm_str) { +Tensor& fft_ifftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("ifftn", out, input, desc.shape, desc.dim, norm, /*forward=*/false); + return out; +} + +static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); Tensor input = promote_tensor_fft(self, /*require_complex=*/false); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/true); - return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), "rfftn expects a complex-valued output tensor, but got ", out.scalar_type()); + return at::_fft_r2c_out(out, x, desc.dim, static_cast(norm), /*onesided=*/true); + } else { + return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + } } -Tensor fft_irfftn(const Tensor& self, c10::optional s, +Tensor fft_rfftn(const Tensor& self, c10::optional s, c10::optional dim, c10::optional norm_str) { + return fft_rfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_rfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_rfftn_impl(out, self, s, dim, norm_str); + return out; +} + +static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); @@ -323,7 +442,27 @@ Tensor fft_irfftn(const Tensor& self, c10::optional s, Tensor input = promote_tensor_fft(self, /*require_complex=*/true); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/false); - return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + if (out.defined()) { + TORCH_CHECK(out.is_floating_point(), "irfftn expects a floating point output tensor, but got ", out.scalar_type()); + return at::_fft_c2r_out(out, x, desc.dim, static_cast(norm), 
last_dim_size); + } else { + return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + } +} + +Tensor fft_irfftn(const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + return fft_irfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_irfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_irfftn_impl(out, self, s, dim, norm_str); + return out; } Tensor fft_fft2(const Tensor& self, c10::optional s, @@ -331,41 +470,69 @@ Tensor fft_fft2(const Tensor& self, c10::optional s, return native::fft_fftn(self, s, dim, std::move(norm)); } +Tensor& fft_fft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_fftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_ifft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_ifftn(self, s, dim, std::move(norm)); } +Tensor& fft_ifft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_ifftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_rfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_rfftn(self, s, dim, std::move(norm)); } +Tensor& fft_rfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_rfftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_irfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_irfftn(self, s, dim, std::move(norm)); } -Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor& fft_irfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_irfftn_out(out, self, s, dim, std::move(norm)); +} + +Tensor& fft_fftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "fftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n, options); - auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(out, n); + auto right_slice = out.slice(0, (n + 1) / 2, 0); at::arange_out(right_slice, -(n/2), 0, 1); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) } -Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n}, options); + return native::fft_fftfreq_out(out, n, d); +} + +Tensor& fft_rfftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "rfftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n/2 + 1, options); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + native::arange_out(out, n/2 + 1); + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n/2 + 1}, options); + 
return native::fft_rfftfreq_out(out, n, d); } // If an array dim is specified, wraps them according to self.dim(). diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index db3e853a9321..e5e91cea4ccc 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -439,10 +440,10 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, // Calculates the normalization constant and applies it in-place to self // sizes is the sizes of a twosided tensor and dims are all transformed dims -void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { +double _fft_normalization_scale(int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { auto norm = static_cast(normalization); if (norm == fft_norm_mode::none) { - return; + return 1.0; } int64_t signal_numel = 1; @@ -451,7 +452,17 @@ void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArra } const double scale_denom = (norm == fft_norm_mode::by_root_n) ? std::sqrt(signal_numel) : static_cast(signal_numel); - self.div_(scale_denom); + return 1.0 / scale_denom; +} + +const Tensor& _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return (scale == 1.0) ? self : self.mul_(scale); +} + +Tensor& _fft_apply_normalization_out(Tensor& out, const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return at::mul_out(out, self, c10::scalar_to_tensor(scale)); } } // namespace (anonymous) @@ -522,6 +533,23 @@ Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization return output; } +Tensor& _fft_r2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool onesided) { + auto result = _fft_r2c_cufft(self, dim, static_cast(fft_norm_mode::none), /*onesided=*/true); + if (onesided) { + return _fft_apply_normalization_out(out, result, normalization, self.sizes(), dim); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + _fft_apply_normalization_out(out_slice, result, normalization, self.sizes(), dim); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to real IFFT Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t lastdim) { TORCH_CHECK(self.is_complex()); @@ -544,8 +572,13 @@ Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization // TODO: could transform up to 2 other dims in the same cuFFT operation auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); _exec_fft(output, temp, out_sizes, dim.back(), /*forward=*/false); - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2r_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, int64_t lastdim) { + auto result = _fft_c2r_cufft(self, dim, static_cast(fft_norm_mode::none), lastdim); + return _fft_apply_normalization_out(out, 
result, normalization, result.sizes(), dim); } // n-dimensional complex to complex FFT/IFFT @@ -586,8 +619,13 @@ Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization } } - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool forward) { + auto result = _fft_c2c_cufft(self, dim, static_cast(fft_norm_mode::none), forward); + return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); } diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 8fca9ad9ecdf..d5a39e45941b 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -21,6 +22,21 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, AT_ERROR("fft: ATen not compiled with MKL support"); } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + }} #else // AT_MKL_ENABLED @@ -381,6 +397,13 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, input, out_sizes, dim, normalization, /*forward=*/false); } +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + auto result = _fft_c2r_mkl(self, dim, normalization, last_dim_size); + resize_output(out, result.sizes()); + return out.copy_(result); +} + // n-dimensional real to complex FFT Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { TORCH_CHECK(self.is_floating_point()); @@ -402,6 +425,24 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return out; } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + auto result = _fft_r2c_mkl(self, dim, normalization, /*onesided=*/true); + if (onesided) { + resize_output(out, result.sizes()); + return out.copy_(result); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + out_slice.copy_(result); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to complex FFT/IFFT Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); @@ -410,6 +451,13 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, self, self.sizes(), sorted_dims, normalization, forward); } +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + auto result = _fft_c2c_mkl(self, dim, normalization, forward); + resize_output(out, 
result.sizes()); + return out.copy_(result); +} + }} // namespace at::native #endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index da5f2d10cfd3..215ca70bfbae 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2197,6 +2197,13 @@ CPU: _fft_r2c_mkl CUDA: _fft_r2c_cufft +- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU: _fft_r2c_mkl_out + CUDA: _fft_r2c_cufft_out + # Complex to real inverse FFT - func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor use_c10_dispatcher: full @@ -2205,6 +2212,13 @@ CPU: _fft_c2r_mkl CUDA: _fft_c2r_cufft +- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU: _fft_c2r_mkl_out + CUDA: _fft_c2r_cufft_out + # Standard complex to complex FFT (forward or backward) - func: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor use_c10_dispatcher: full @@ -2213,6 +2227,13 @@ CPU: _fft_c2c_mkl CUDA: _fft_c2c_cufft +- func: _fft_c2c.out(Tensor self, int[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU: _fft_c2c_mkl_out + CUDA: _fft_c2c_cufft_out + - func: _cufft_get_plan_cache_size(int device_index) -> int use_c10_dispatcher: full @@ -9889,81 +9910,161 @@ python_module: fft variants: function +- func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? 
norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function +- func: fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function +- func: fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor use_c10_dispatcher: full python_module: fft diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4fcf86be55e2..191a7ca26835 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -479,6 +479,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) # This one needs to be unconditionally added as Functions.cpp is also unconditionally added list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/autograd/FunctionsManual.cpp + ${TORCH_SRC_DIR}/csrc/utils/out_types.cpp ) if(NOT INTERN_DISABLE_AUTOGRAD) diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 9082668e8596..085af5294a04 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -225,13 +225,13 @@ def test_empty_fft(self, device, dtype): def test_fft_invalid_dtypes(self, device): t = torch.randn(64, device=device, dtype=torch.complex128) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "rfft expects a real input tensor"): torch.fft.rfft(t) with self.assertRaisesRegex(RuntimeError, "rfftn expects a real-valued input tensor"): torch.fft.rfftn(t) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "ihfft expects a real input tensor"): torch.fft.ihfft(t) @skipCUDAIfRocm @@ -332,6 +332,27 @@ def test_fft_backward(self, device, dtype): args = args[1:] self._fft_grad_check_helper(fname, input, args) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + def test_fft_invalid_out_types(self, device): + + complex_fft_funcs = [torch.fft.fft, torch.fft.ifft, torch.fft.fftn, torch.fft.ifftn, + torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] + real_fft_funcs = [torch.fft.irfft, torch.fft.irfftn, torch.fft.hfft] + fft_funcs = complex_fft_funcs + real_fft_funcs + + # Test errors on invalid out dtypes + x = torch.rand(10, device=device, dtype=torch.float32) + for out_dtype, funcs in [(torch.int16, fft_funcs), + (torch.float32, complex_fft_funcs), + (torch.complex64, real_fft_funcs)]: + out = torch.empty((), device=device, dtype=out_dtype) + + for func in funcs: + with self.assertRaisesRegex(RuntimeError, "expects a .* output tensor"): + func(x, out=out) + # nd-fft tests @skipCPUIfNoMkl @@ -463,10 +484,10 @@ def test_fftn_invalid(self, device): torch.fft.rfftn, torch.fft.irfftn) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 1, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): 
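[Editor's note] The `*_out` kernels registered above for MKL and cuFFT all reduce to the same "run the functional kernel, resize the user-supplied tensor, copy the result" shape. The free-standing sketch below restates that pattern against the public ATen API so it can be compiled outside the PyTorch tree; the helper name `copy_into_out`, the use of `Tensor::resize_` in place of the internal `at::native::resize_output` helper, and the FFT-enabled CPU build it assumes are illustrative choices, not code from this patch.

#include <ATen/ATen.h>
#include <iostream>

// Minimal sketch of the "compute, resize, copy" shape shared by the new
// _fft_*_mkl_out / _fft_*_cufft_out kernels. The in-tree kernels call the
// internal at::native::resize_output(out, sizes), which also warns when a
// non-empty out tensor has to be resized; Tensor::resize_ is used here only
// so the example links against a stock libtorch.
at::Tensor& copy_into_out(at::Tensor& out, const at::Tensor& result) {
  if (!out.sizes().equals(result.sizes())) {
    out.resize_(result.sizes());
  }
  out.copy_(result);  // copy_ enforces that the dtypes are copy-compatible
  return out;
}

int main() {
  at::Tensor x = at::randn({8});                       // real input
  at::Tensor out = at::empty({0}, at::kComplexFloat);  // user-supplied out tensor
  copy_into_out(out, at::fft_fft(x));                  // at::fft_fft is the functional kernel
  std::cout << out.sizes() << "\n";                    // [8]
  return 0;
}

Because sizes are handled by the resize step, the new test_fft_invalid_out_types above only has to assert on dtypes: an out tensor of the wrong kind is rejected with the "expects a .* output tensor" error the test matches.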
@@ -578,10 +599,10 @@ def test_fft2_invalid(self, device): torch.fft.rfft2, torch.fft.irfft2) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): @@ -623,6 +644,19 @@ def test_fftfreq_numpy(self, device, dtype): actual = torch_fn(*args, device=device, dtype=dtype) self.assertEqual(actual, expected, exact_dtype=False) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double) + def test_fftfreq_out(self, device, dtype): + for func in (torch.fft.fftfreq, torch.fft.rfftfreq): + expect = func(n=100, d=.5, device=device, dtype=dtype) + actual = torch.empty((), device=device, dtype=dtype) + with self.assertWarnsRegex(UserWarning, "out tensor will be resized"): + func(n=100, d=.5, out=actual) + self.assertEqual(actual, expect) + + @skipCPUIfNoMkl @skipCUDAIfRocm @onlyOnCPUAndCUDA diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 49be92d30d35..a77547a6cc07 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -8,6 +8,7 @@ #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" #include "torch/csrc/autograd/generated/variable_factories.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" @@ -30,6 +31,7 @@ using at::TensorList; using at::Dimname; using at::DimnameList; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; namespace torch { namespace autograd { diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index e05e6fbe1975..c42a869b3a98 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -19,6 +19,7 @@ #include "torch/csrc/Dtype.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pybind.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" @@ -53,43 +54,13 @@ using at::Dimname; using at::DimnameList; using at::ArrayRef; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; namespace torch { namespace autograd { static PyObject* THPVariableFunctionsModule = NULL; -static void check_out_type_matches(Tensor result, - ScalarType scalarType, bool scalarType_is_none, - c10::optional layout, - const Device& device, bool device_is_none) { - if (scalarType_is_none && !layout && device_is_none) { // common case - return; - } - if (!scalarType_is_none && result.scalar_type() != scalarType) { - AT_ERROR( - "dtype ", scalarType, - " does not match dtype of out parameter (", result.scalar_type(), ")"); - } - auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; - auto device_type_arg = device_is_none ? 
result.device().type() : device.type(); - if (result.scalar_type() != scalarType_arg) { - AT_ERROR( - "scalar type ", scalarType_arg, - " does not match scalar type of out parameter (", result.scalar_type(), ")"); - } - if (layout && result.layout() != *layout) { - AT_ERROR( - "layout ", *layout, - " does not match layout of out parameter (", result.layout(), ")"); - } - if (result.device().type() != device_type_arg) { - AT_ERROR( - "device type ", device_type_arg, - " does not match device type of out parameter (", result.device().type(), ")"); - } -} - inline Tensor dispatch_arange(Scalar end, Tensor result) { pybind11::gil_scoped_release no_gil; return at::arange_out(result, end); diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 8eeffe724c8e..dc05ace7c542 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -351,6 +351,7 @@ libtorch_extra_sources = libtorch_core_jit_sources + [ "torch/csrc/jit/serialization/export_module.cpp", "torch/csrc/jit/serialization/import_legacy.cpp", "torch/csrc/utils/byte_order.cpp", + "torch/csrc/utils/out_types.cpp", ] def libtorch_sources(gencode_pattern = ":generate-code[{}]"): diff --git a/torch/csrc/utils/out_types.cpp b/torch/csrc/utils/out_types.cpp new file mode 100644 index 000000000000..0ceeb43bd1f8 --- /dev/null +++ b/torch/csrc/utils/out_types.cpp @@ -0,0 +1,39 @@ +#include + +namespace torch { +namespace utils { + +// Used by python binding codegen to ensure any TensorOptions arguments are consistent +// with the out tensor's options +void check_out_type_matches(const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none) { + if (scalarType_is_none && !layout && device_is_none) { // common case + return; + } + if (!scalarType_is_none && result.scalar_type() != scalarType) { + AT_ERROR( + "dtype ", scalarType, + " does not match dtype of out parameter (", result.scalar_type(), ")"); + } + auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; + auto device_type_arg = device_is_none ? 
result.device().type() : device.type(); + if (result.scalar_type() != scalarType_arg) { + AT_ERROR( + "scalar type ", scalarType_arg, + " does not match scalar type of out parameter (", result.scalar_type(), ")"); + } + if (layout && result.layout() != *layout) { + AT_ERROR( + "layout ", *layout, + " does not match layout of out parameter (", result.layout(), ")"); + } + if (result.device().type() != device_type_arg) { + AT_ERROR( + "device type ", device_type_arg, + " does not match device type of out parameter (", result.device().type(), ")"); + } +} + +}} diff --git a/torch/csrc/utils/out_types.h b/torch/csrc/utils/out_types.h new file mode 100644 index 000000000000..adc3686a6b97 --- /dev/null +++ b/torch/csrc/utils/out_types.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace torch { +namespace utils { + +TORCH_API void check_out_type_matches( + const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none); + +}} diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index a49a5101a649..119750396f1e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -812,7 +812,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.fftn', aten_name='fft_fftn', @@ -820,7 +820,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False, decorators=[precisionOverride( {torch.float: 1e-4, torch.cfloat: 1e-4})],), @@ -830,7 +830,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.rfft', aten_name='fft_rfft', @@ -838,7 +838,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=False, dtypes=all_types_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.rfftn', aten_name='fft_rfftn', @@ -846,7 +846,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=True, dtypes=all_types_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False, decorators=[precisionOverride({torch.float: 1e-4})],), SpectralFuncInfo('fft.ifft', @@ -855,7 +855,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.ifftn', aten_name='fft_ifftn', @@ -863,7 +863,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=True, 
dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.ihfft', aten_name='fft_ihfft', @@ -871,7 +871,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=False, dtypes=all_types_and(torch.bool), default_test_dtypes=floating_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.irfft', aten_name='fft_irfft', @@ -879,7 +879,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.irfftn', aten_name='fft_irfftn', @@ -887,7 +887,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), OpInfo('flip', op=torch.flip, From 70734f12607bdbb649e8d1b3d982781cbb879393 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 5 Jan 2021 18:08:10 -0800 Subject: [PATCH 89/89] Kill AT_SKIP_BFLOAT16_IF_NOT_ROCM (#48810) Summary: Dependency: https://github.com/pytorch/pytorch/pull/48809 https://github.com/pytorch/pytorch/pull/48807 https://github.com/pytorch/pytorch/pull/48806 https://github.com/pytorch/pytorch/pull/48805 https://github.com/pytorch/pytorch/pull/48801 https://github.com/pytorch/pytorch/pull/44994 https://github.com/pytorch/pytorch/pull/44848 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48810 Reviewed By: mruberry Differential Revision: D25772955 Pulled By: ngimel fbshipit-source-id: 353f130eb701f8b338a826d2edaea69e6e644ee9 --- aten/src/ATen/Dispatch.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 41252609953f..2e663b4f48dd 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -93,26 +93,6 @@ inline constexpr bool should_include_kernel_dtype( return __VA_ARGS__(); \ } -// This macro should be used to skip bfloat16 dispatch on non-ROCm platforms and -// should be removed once the bfloat16 bringup is complete on other platforms. -// This is supposed to be used as a wrapper around the lambda function passed to -// the dispatch macro and will conditionally dispatch ops with bfloat16 type -// only on ROCm. -#if !defined(__HIP_PLATFORM_HCC__) -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) \ - if (std::is_same::value) { \ - AT_ERROR( \ - #NAME, \ - " not implemented for '", \ - toString(at::ScalarType::BFloat16), \ - "'"); \ - } else { \ - return __VA_ARGS__(); \ - } -#else -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) return __VA_ARGS__() -#endif - namespace detail { inline at::ScalarType scalar_type(at::ScalarType s) {
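[Editor's note] With AT_SKIP_BFLOAT16_IF_NOT_ROCM removed by the patch above, a kernel that wants BFloat16 coverage simply names the type in the dispatch macro instead of guarding the lambda per platform. The sketch below shows the general shape of such a call site; the kernel `demo_double` and its element-wise body are invented for illustration and are not code from this patch.

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <iostream>

// With the skip macro gone, BFloat16 is dispatched unconditionally, exactly
// like the other dtypes the macro lists, on both ROCm and non-ROCm builds.
at::Tensor demo_double(const at::Tensor& self) {
  at::Tensor input = self.contiguous();
  at::Tensor out = at::empty(input.sizes(), input.options());
  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "demo_double", [&] {
    const scalar_t* in_ptr = input.data_ptr<scalar_t>();
    scalar_t* out_ptr = out.data_ptr<scalar_t>();
    for (int64_t i = 0; i < input.numel(); ++i) {
      // Route the arithmetic through float so the same body works for
      // float, double and c10::BFloat16 without extra overloads.
      out_ptr[i] = static_cast<scalar_t>(static_cast<float>(in_ptr[i]) * 2.0f);
    }
  });
  return out;
}

int main() {
  std::cout << demo_double(at::ones({4}, at::kBFloat16)) << "\n";  // four 2s in bfloat16
  return 0;
}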