From acd72e79a314da0ec6905088f1141a77f868ae4c Mon Sep 17 00:00:00 2001 From: mattip Date: Tue, 15 Dec 2020 21:45:18 -0800 Subject: [PATCH 01/34] update breathe (#49407) Summary: Fixes https://github.com/pytorch/pytorch/issues/47462, but not completely. Update breathe to the latest version to get fixes for the "Unable to resolve..." issues. There are still some build errors, but much fewer than before. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49407 Reviewed By: izdeby Differential Revision: D25562163 Pulled By: glaringlee fbshipit-source-id: 91bfd9e9ac70723816309f489022d72853f5fdc5 --- docs/cpp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cpp/requirements.txt b/docs/cpp/requirements.txt index 452aa3eadad0..731a0475be79 100644 --- a/docs/cpp/requirements.txt +++ b/docs/cpp/requirements.txt @@ -1,5 +1,5 @@ sphinx==3.1.2 -breathe==4.19.2 +breathe==4.25.0 exhale==0.2.3 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme bs4 From cbeb4c25e53e90b179f0ded89cafb4cbe140f236 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Tue, 15 Dec 2020 23:07:53 -0800 Subject: [PATCH 02/34] [StaticRuntime] Permute_out (#49447) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49447 Adding an out variant for `permute`. It's better than fixing the copy inside contiguous because 1) we can leverage the c2 math library, 2) contiguous creates a tensor inside the function which isn't managed by the MemoryPlanner in StaticRuntime Test Plan: Benchmark: ``` After: I1214 12:35:32.218775 991920 PyTorchPredictorBenchLib.cpp:209] PyTorch run finished. Milliseconds per iter: 0.0902339. Iters per second: 11082.3 Before: I1214 12:35:43.368770 992620 PyTorchPredictorBenchLib.cpp:209] PyTorch run finished. Milliseconds per iter: 0.0961521. Iters per second: 10400.2 ``` Reviewed By: yinghai Differential Revision: D25541666 fbshipit-source-id: 013ed0d4080cd01de4d3e1b031ab51e5032e6651 --- torch/csrc/jit/runtime/static/ops.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 57db79699e07..11fb5dae2d6c 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -32,7 +32,6 @@ bool canRunNatively(Node* n) { const static std::unordered_set native_nodes{ "aten::flatten", "aten::narrow", - "aten::permute", "aten::reshape", "aten::slice", "aten::transpose", From 94e328c0385d2e915d2bc38446d1da2f1cd94068 Mon Sep 17 00:00:00 2001 From: lixinyu Date: Tue, 15 Dec 2020 23:39:50 -0800 Subject: [PATCH 03/34] fix optimizer.pyi typo 'statue'->'state' (#49388) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49388 Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D25553672 Pulled By: glaringlee fbshipit-source-id: e9f2233bd678a90768844af2d8d5e2994d59e304 --- torch/optim/optimizer.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/optim/optimizer.pyi b/torch/optim/optimizer.pyi index aa50a6fd1027..6202050f3493 100644 --- a/torch/optim/optimizer.pyi +++ b/torch/optim/optimizer.pyi @@ -10,7 +10,7 @@ class Optimizer: param_groups: List[dict] def __init__(self, params: _params_t, default: dict) -> None: ... - def __setstate__(self, statue: dict) -> None: ... + def __setstate__(self, state: dict) -> None: ... def state_dict(self) -> dict: ... def load_state_dict(self, state_dict: dict) -> None: ... def zero_grad(self, set_to_none: Optional[bool]=...) 
-> None: ... From 8954eb3f7296c7f98954adb150e885bbdf4791e0 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Wed, 16 Dec 2020 00:32:27 -0800 Subject: [PATCH 04/34] [StaticRuntime] Fusion pass for ClipRanges/GatherRanges/LengthsToOffsets (#49113) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49113 Reviewed By: ajyu Differential Revision: D25388512 fbshipit-source-id: 3daa5b9387a3a10b6c220688df06540c4d844aea --- torch/csrc/jit/runtime/static/passes.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index a75d187b2a49..f264423fdec2 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -71,11 +71,30 @@ void ConcatBatchMatMulBatchGather(std::shared_ptr& graph) { fuse.runOnGraph(graph); } +void ClipRangesGatherRangesLengthsToOffsets( + std::shared_ptr& graph) { + // TODO:: check restrictions for inputs; outputs not used elsewhere + std::string pattern = R"IR( + graph(%a, %b, %c, %d): + %y0 : Tensor = fb::clip_ranges(%b, %c) + %y1 : Tensor, %y2 : Tensor = fb::gather_ranges(%a, %y0) + %y3 : Tensor = fb::lengths_to_offsets(%y2, %d) + return (%y3, %y1))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d): + %y0 : Tensor, %y1 : Tensor = fb::clip_ranges_gather_lengths_to_offsets(%a, %b, %c, %d) + return (%y1, %y0))IR"; + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); +} + void FuseInferenceOpsForSparseNN(std::shared_ptr& graph) { #ifdef FBCODE_CAFFE2 ConcatAddMulReplaceNaNClip(graph); CastedBatchOneHotLengths(graph); ConcatBatchMatMulBatchGather(graph); + ClipRangesGatherRangesLengthsToOffsets(graph); #endif } From a9137aeb06bd2bfd3c9ec8aa7ea29fc754096341 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 16 Dec 2020 01:25:04 -0800 Subject: [PATCH 05/34] quantized tensor: add preliminary support for advanced indexing, try 2 (#49346) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49346 This is less ambitious redo of https://github.com/pytorch/pytorch/pull/49129/. We make the ``` xq_slice = xq[:, [0], :, :] ``` indexing syntax work if `xq` is a quantized Tensor. For now, we are making the code not crash, with an in efficient `dq -> index -> q` implementation. A future PR can optimize performance by removing the unnecessary memory copies (which will require some non-trivial changes to TensorIterator). 
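For orientation, here is a hedged sketch of what the `dq -> index -> q` fallback described above is equivalent to at the Python surface (not the ATen kernel itself; the shape, scale and zero point are illustrative):

```
import torch

xq = torch.quantize_per_tensor(torch.randn(1, 4, 4, 4), 0.1, 0, torch.quint8)
xq_slice = xq[:, [0], :, :]   # advanced indexing on a quantized tensor now works

# reference path: dequantize, index, re-quantize with the same qparams
ref = torch.quantize_per_tensor(
    xq.dequantize()[:, [0], :, :], xq.q_scale(), xq.q_zero_point(), torch.quint8)
assert torch.equal(xq_slice.int_repr(), ref.int_repr())
```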
Test Plan: ``` python test/test_quantization.py TestQuantizedOps.test_advanced_indexing ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25539365 fbshipit-source-id: 98485875aaaf5743e1a940e170258057691be4fa --- .../ATen/native/TensorAdvancedIndexing.cpp | 21 ++++++++++ aten/src/ATen/native/native_functions.yaml | 1 + c10/core/TensorOptions.h | 2 + test/quantization/test_quantized_op.py | 39 +++++++++++++++++++ 4 files changed, 63 insertions(+) diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index f3147bdf78aa..1d9f9d9d2a12 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -290,6 +290,27 @@ Tensor index(const Tensor & self, TensorList indices) { return iter.output(); } +Tensor quantized_index(const Tensor & self, TensorList indices) { + TORCH_INTERNAL_ASSERT( + self.qscheme() == c10::kPerTensorAffine || + self.qscheme() == c10::kPerTensorSymmetric, + "Indexing is only supported for per-Tensor quantized Tensors."); + + // For now, this is a naive implementation which does dq -> index -> q. + // TODO(future PR): improve performance by removing the copies. + const auto& self_dq = self.dequantize(); + + TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + + auto info = make_info(self_dq, indices); + auto iter = make_index_iterator(info); + index_stub(iter.device_type(), iter, info.indexed_sizes, info.indexed_strides); + at::Tensor res = iter.output(); + + return at::quantize_per_tensor( + res, self.q_scale(), self.q_zero_point(), self.scalar_type()); +} + Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); at::assert_no_internal_overlap(result); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 09b7c5f7e762..715fdccc9691 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2200,6 +2200,7 @@ variants: function, method dispatch: CPU, CUDA: index + QuantizedCPU: quantized_index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 347df066cc90..34e17c37f774 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -691,6 +691,8 @@ inline DeviceType computeDeviceType(DispatchKey tid) { return DeviceType::Vulkan; } else if (tid == DispatchKey::Metal) { return DeviceType::Metal; + } else if (tid == DispatchKey::QuantizedCPU) { + return DeviceType::CPU; } else { AT_ASSERTM(false, "Unknown DispatchKey: ", tid); } diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 1c66c8fb986f..f1e52fc38d32 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -2274,6 +2274,45 @@ def test_empty_batch(self): result = torch.ops.quantized.linear_dynamic(X, w_packed) self.assertEqual(result.shape, (0, 2)) + def test_advanced_indexing(self): + """ + Verifies that the x[:, [0], :, :] syntax works for quantized tensors. 
+ """ + for dtype in (torch.qint8, torch.quint8, torch.qint32): + scale = 0.1 + zp = 0 + x_q = torch.quantize_per_tensor( + torch.randn(1, 4, 4, 4), scale, zp, dtype) + # reference + x_fp32 = x_q.dequantize() + + # single dim, single index + x_q_s1 = x_q[:, [0], :, :] + x_fp32_s1 = x_fp32[:, [0], :, :] + x_fp32_s1_ref = \ + torch.quantize_per_tensor(x_fp32_s1, scale, zp, dtype) + self.assertEqual(x_q_s1, x_fp32_s1_ref) + + # multiple dim, single index + x_q_s2 = x_q[:, [0], [2], :] + x_fp32_s2 = x_fp32[:, [0], [2], :] + x_fp32_s2_ref = \ + torch.quantize_per_tensor(x_fp32_s2, scale, zp, dtype) + self.assertEqual(x_q_s2, x_fp32_s2_ref) + + # single dim, multiple indices + x_q_s3 = x_q[:, [2, 0, 1], :, :] + x_fp32_s3 = x_fp32[:, [2, 0, 1], :, :] + x_fp32_s3_ref = \ + torch.quantize_per_tensor(x_fp32_s3, scale, zp, dtype) + self.assertEqual(x_q_s3, x_fp32_s3_ref) + + # multiple dim, multiple indices + x_q_s4 = x_q[:, [2, 0, 1], :, [1]] + x_fp32_s4 = x_fp32[:, [2, 0, 1], :, [1]] + x_fp32_s4_ref = \ + torch.quantize_per_tensor(x_fp32_s4, scale, zp, dtype) + self.assertEqual(x_q_s4, x_fp32_s4_ref) class TestDynamicQuantizedLinear(TestCase): From 40d7c1091fff8350ce7d9877f9080079edba1836 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 16 Dec 2020 01:38:38 -0800 Subject: [PATCH 06/34] Unescape string in RPC error message (#49373) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49373 Unescaping the string in RPC error message to provide better error msg Test Plan: CI Reviewed By: xush6528 Differential Revision: D25511730 fbshipit-source-id: 054f46d5ffbcb1350012362a023fafb1fe57fca1 --- torch/distributed/rpc/internal.py | 2 +- .../_internal/distributed/rpc/rpc_test.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/torch/distributed/rpc/internal.py b/torch/distributed/rpc/internal.py index a41c0e454f24..42847896c136 100644 --- a/torch/distributed/rpc/internal.py +++ b/torch/distributed/rpc/internal.py @@ -201,7 +201,7 @@ def _run_function(python_udf): def _handle_exception(result): if isinstance(result, RemoteException): - raise result.exception_type(result.msg) + raise result.exception_type(result.msg.encode("utf-8").decode("unicode_escape")) def _build_rpc_profiling_key( diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 46dbacc3c2eb..a149c541a090 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -317,6 +317,10 @@ def my_script_func(tensor): def raise_func(): raise ValueError(expected_err) +expected_err_escape = "\nFirst line of error \n next line of error \n last line of error" +def raise_func_escape(): + raise ValueError(expected_err_escape) + global_rref = None @@ -1982,6 +1986,20 @@ def test_py_raise_in_user_func(self): stderr_lines = err.getvalue() self.assertTrue(expected_err in stderr_lines) + @dist_init + def test_py_raise_in_user_func_escaped_str(self): + n = self.rank + 1 + dst_rank = n % self.world_size + fut = rpc.rpc_async(worker_name(dst_rank), raise_func_escape) + try: + fut.wait() + except ValueError as e: + msg = str(e) + # Ensure newlines are unescaped to provide a better repr of error. 
+ self.assertEqual(msg, msg.encode("utf-8").decode("unicode_escape")) + else: + self.assertTrue(False, "expected raise_func_escape to raise ValueError.") + @dist_init def test_nested_rpc(self): n = self.rank + 1 From ed04b71651f29cd7727e36b51a2d723bc68b6a3e Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Wed, 16 Dec 2020 01:43:55 -0800 Subject: [PATCH 07/34] [StaticRuntime][ATen] Add out variant for narrow_copy (#49449) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49449 Similar to permute_out, add the out variant of `aten::narrow` (slice in c2) which does an actual copy. `aten::narrow` creates a view, however, an copy is incurred when we call `input.contiguous` in the ops that follow `aten::narrow`, in `concat_add_mul_replacenan_clip`, `casted_batch_one_hot_lengths`, and `batch_box_cox`. {F351263599} Test Plan: Unit test: ``` buck test //caffe2/aten:native_test ``` Benchmark with the adindexer model: ``` bs = 1 is neutral Before: I1214 21:32:51.919239 3285258 PyTorchPredictorBenchLib.cpp:209] PyTorch run finished. Milliseconds per iter: 0.0886948. Iters per second: 11274.6 After: I1214 21:32:52.492352 3285277 PyTorchPredictorBenchLib.cpp:209] PyTorch run finished. Milliseconds per iter: 0.0888019. Iters per second: 11261 bs = 20 shows more gains probably because the tensors are bigger and therefore the cost of copying is higher Before: I1214 21:20:19.702445 3227229 PyTorchPredictorBenchLib.cpp:209] PyTorch run finished. Milliseconds per iter: 0.527563. Iters per second: 1895.51 After: I1214 21:20:20.370173 3227307 PyTorchPredictorBenchLib.cpp:209] PyTorch run finished. Milliseconds per iter: 0.508734. Iters per second: 1965.67 ``` Reviewed By: bwasti Differential Revision: D25554109 fbshipit-source-id: 6bae62e6ce3456ff71559b635cc012fdcd1fdd0e --- aten/src/ATen/native/TensorShape.cpp | 83 +++++++++++++++++++++- aten/src/ATen/native/native_functions.yaml | 6 +- aten/src/ATen/test/native_test.cpp | 11 +++ torch/csrc/jit/runtime/static/ops.cpp | 24 ++++++- 4 files changed, 121 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index eda688ad6e1d..784dd2927fba 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -753,8 +753,89 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_ return newTensor._coalesced_(self.is_coalesced()); } +Tensor& narrow_copy_dense_out( + Tensor& output, const Tensor& self, int64_t dim, int64_t start, int64_t length +) { + if (self.is_cuda()) { + return output.copy_(self.narrow(dim, start, length)); + } + TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); + TORCH_CHECK(self.dtype() == output.dtype()); + + Tensor self_contig = self.contiguous(); + const auto self_sizes = self_contig.sizes(); + + // wrap dim if negative and do bound check + if (dim < 0) { + dim = at::maybe_wrap_dim(dim, self_sizes.size()); + } else { + TORCH_CHECK(dim < self_sizes.size()); + } + + // wrap start and do bound check + const auto cur_size = self_sizes[dim]; + if (start != cur_size && start < 0) { // start being the end is valid, but + // not a valid dim specification. 
+ start = at::maybe_wrap_dim(start, cur_size); + } + TORCH_CHECK( + length >= 0 && start <= cur_size - length, + "start (", + start, + ") + length (", + length, + ") exceeds dimension size (", + cur_size, + ")."); + + // resize output + auto output_sizes = self_sizes.vec(); + output_sizes[dim] = length; + at::native::resize_(output, output_sizes); + + const int64_t unit = c10::size_from_dim_(dim + 1, self_sizes); + const int64_t num_blocks = c10::size_to_dim_(dim, self_sizes); + + const auto itemsize = self_contig.dtype().itemsize(); + size_t src_nbytes = itemsize * self_contig.numel(); + size_t dst_nbytes = itemsize * output.numel(); + + size_t src_block_size = unit * self_sizes[dim]; + size_t dst_block_size = unit * length; + + if (num_blocks == 0 || dst_block_size == 0) { + return output; + } + + char* src_bytes = static_cast(self_contig.data_ptr()); + char* dst_bytes = static_cast(output.data_ptr()); + + size_t src_block_size_bytes = itemsize * src_block_size; + size_t dst_block_size_bytes = itemsize * dst_block_size; + size_t src_offset = unit * start; + + char* src_offset_bytes = src_bytes + itemsize * src_offset; + char* dst_offset_bytes = dst_bytes; + + for (size_t i = 0; i < num_blocks; ++i) { + char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; + char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + static_cast(local_src_offset_bytes + dst_block_size_bytes) <= + static_cast(src_bytes + src_nbytes)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + static_cast(local_dst_offset_bytes + dst_block_size_bytes) <= + static_cast(dst_bytes + dst_nbytes)); + + memcpy( + local_dst_offset_bytes, local_src_offset_bytes, dst_block_size_bytes); + } + return output; +} + Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ - return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous); + auto output = at::empty_like(self); + return narrow_copy_dense_out(output, self, dim, start, length); } Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 715fdccc9691..8885e06e9ef6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3065,11 +3065,15 @@ - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor use_c10_dispatcher: full - variants: method + variants: function, method dispatch: CPU, CUDA: narrow_copy_dense SparseCPU, SparseCUDA: narrow_copy_sparse +- func: narrow_copy.out(Tensor self, int dim, int start, int length, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: narrow_copy_dense_out + - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) use_c10_dispatcher: full variants: function, method diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index b32a0b081042..4c53fd6b6620 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -64,6 +64,16 @@ void TestStack(TensorOptions T, Tensor& t) { } } +void TestNarrow(TensorOptions T, Tensor& t) { + auto x = rand({5, 8, 3}); + for (int64_t dim = 0; dim < 3; ++dim) { + const int64_t start = 1, length = 2; + auto y_ref = x.narrow(dim, start, length); + auto y_test = at::native::narrow_copy_dense(x, dim, start, length); + ASSERT_EQUAL(y_ref, y_test); + } +} + // size / stride void TestSize(TensorOptions T, Tensor& t) { auto scalar = randn({}, T); @@ -199,6 +209,7 @@ void test(TensorOptions T, TensorOptions AccT) { TestSplit(T, t); TestChunk(T, t); TestStack(T, t); + TestNarrow(T, t); TestSize(T, t); TestMatmul(T, t, AccT); TestStandardGammaGrad(T, t); diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 11fb5dae2d6c..4d38b8b0a97d 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -31,7 +31,6 @@ bool canRunNatively(Node* n) { // In alphabetical order const static std::unordered_set native_nodes{ "aten::flatten", - "aten::narrow", "aten::reshape", "aten::slice", "aten::transpose", @@ -303,6 +302,29 @@ REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { }; }); +// The out variant takes precedence over native +REGISTER_OPERATOR_FUNCTOR(aten::narrow, aten_narrow, [](Node* n) -> SROperator { + return [](const ProcessedNode* p_node, std::vector& reg) { + auto self = p_node->Input(0, reg).toTensor(); // self + auto dim = p_node->Input(1, reg).toInt(); // dim + int64_t start = 0; + if (p_node->Input(2, reg).isScalar()) { + start = p_node->Input(2, reg).toInt(); + } else { + auto t = p_node->Input(2, reg).toTensor(); + start = t.item(); + } + auto length = p_node->Input(3, reg).toInt(); // length + + if (p_node->Output(0, reg).isNone()) { + p_node->Output(0, reg) = create_empty_from(self); + } + auto output = p_node->Output(0, reg).toTensor(); + output.resize_({0}); + at::native::narrow_copy_dense_out(output, self, dim, start, length); + }; +}); + std::function&)> getOutOfPlaceOperation(Node* n) { auto op_name = n->kind().toQualString(); From 306bab220ece9f57c9a01e5b5822aeb79c904d20 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Wed, 16 Dec 2020 02:42:40 -0800 Subject: [PATCH 08/34] Revert D25554109: [StaticRuntime][ATen] Add out variant for narrow_copy Test Plan: revert-hammer Differential Revision: D25554109 (https://github.com/pytorch/pytorch/commit/ed04b71651f29cd7727e36b51a2d723bc68b6a3e) Original commit changeset: 6bae62e6ce34 fbshipit-source-id: bfa038e150166d0116bcae8f7a6415d98d4146de --- aten/src/ATen/native/TensorShape.cpp | 83 +--------------------- aten/src/ATen/native/native_functions.yaml | 6 +- aten/src/ATen/test/native_test.cpp | 11 --- torch/csrc/jit/runtime/static/ops.cpp | 24 +------ 4 files changed, 3 insertions(+), 121 deletions(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 784dd2927fba..eda688ad6e1d 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -753,89 +753,8 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_ return 
newTensor._coalesced_(self.is_coalesced()); } -Tensor& narrow_copy_dense_out( - Tensor& output, const Tensor& self, int64_t dim, int64_t start, int64_t length -) { - if (self.is_cuda()) { - return output.copy_(self.narrow(dim, start, length)); - } - TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); - TORCH_CHECK(self.dtype() == output.dtype()); - - Tensor self_contig = self.contiguous(); - const auto self_sizes = self_contig.sizes(); - - // wrap dim if negative and do bound check - if (dim < 0) { - dim = at::maybe_wrap_dim(dim, self_sizes.size()); - } else { - TORCH_CHECK(dim < self_sizes.size()); - } - - // wrap start and do bound check - const auto cur_size = self_sizes[dim]; - if (start != cur_size && start < 0) { // start being the end is valid, but - // not a valid dim specification. - start = at::maybe_wrap_dim(start, cur_size); - } - TORCH_CHECK( - length >= 0 && start <= cur_size - length, - "start (", - start, - ") + length (", - length, - ") exceeds dimension size (", - cur_size, - ")."); - - // resize output - auto output_sizes = self_sizes.vec(); - output_sizes[dim] = length; - at::native::resize_(output, output_sizes); - - const int64_t unit = c10::size_from_dim_(dim + 1, self_sizes); - const int64_t num_blocks = c10::size_to_dim_(dim, self_sizes); - - const auto itemsize = self_contig.dtype().itemsize(); - size_t src_nbytes = itemsize * self_contig.numel(); - size_t dst_nbytes = itemsize * output.numel(); - - size_t src_block_size = unit * self_sizes[dim]; - size_t dst_block_size = unit * length; - - if (num_blocks == 0 || dst_block_size == 0) { - return output; - } - - char* src_bytes = static_cast(self_contig.data_ptr()); - char* dst_bytes = static_cast(output.data_ptr()); - - size_t src_block_size_bytes = itemsize * src_block_size; - size_t dst_block_size_bytes = itemsize * dst_block_size; - size_t src_offset = unit * start; - - char* src_offset_bytes = src_bytes + itemsize * src_offset; - char* dst_offset_bytes = dst_bytes; - - for (size_t i = 0; i < num_blocks; ++i) { - char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; - char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes; - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - static_cast(local_src_offset_bytes + dst_block_size_bytes) <= - static_cast(src_bytes + src_nbytes)); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - static_cast(local_dst_offset_bytes + dst_block_size_bytes) <= - static_cast(dst_bytes + dst_nbytes)); - - memcpy( - local_dst_offset_bytes, local_src_offset_bytes, dst_block_size_bytes); - } - return output; -} - Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ - auto output = at::empty_like(self); - return narrow_copy_dense_out(output, self, dim, start, length); + return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous); } Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8885e06e9ef6..715fdccc9691 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3065,15 +3065,11 @@ - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor use_c10_dispatcher: full - variants: function, method + variants: method dispatch: CPU, CUDA: narrow_copy_dense SparseCPU, SparseCUDA: narrow_copy_sparse -- func: narrow_copy.out(Tensor self, int dim, int start, int length, *, Tensor(a!) out) -> Tensor(a!) 
- dispatch: - CPU, CUDA: narrow_copy_dense_out - - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) use_c10_dispatcher: full variants: function, method diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 4c53fd6b6620..b32a0b081042 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -64,16 +64,6 @@ void TestStack(TensorOptions T, Tensor& t) { } } -void TestNarrow(TensorOptions T, Tensor& t) { - auto x = rand({5, 8, 3}); - for (int64_t dim = 0; dim < 3; ++dim) { - const int64_t start = 1, length = 2; - auto y_ref = x.narrow(dim, start, length); - auto y_test = at::native::narrow_copy_dense(x, dim, start, length); - ASSERT_EQUAL(y_ref, y_test); - } -} - // size / stride void TestSize(TensorOptions T, Tensor& t) { auto scalar = randn({}, T); @@ -209,7 +199,6 @@ void test(TensorOptions T, TensorOptions AccT) { TestSplit(T, t); TestChunk(T, t); TestStack(T, t); - TestNarrow(T, t); TestSize(T, t); TestMatmul(T, t, AccT); TestStandardGammaGrad(T, t); diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 4d38b8b0a97d..11fb5dae2d6c 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -31,6 +31,7 @@ bool canRunNatively(Node* n) { // In alphabetical order const static std::unordered_set native_nodes{ "aten::flatten", + "aten::narrow", "aten::reshape", "aten::slice", "aten::transpose", @@ -302,29 +303,6 @@ REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { }; }); -// The out variant takes precedence over native -REGISTER_OPERATOR_FUNCTOR(aten::narrow, aten_narrow, [](Node* n) -> SROperator { - return [](const ProcessedNode* p_node, std::vector& reg) { - auto self = p_node->Input(0, reg).toTensor(); // self - auto dim = p_node->Input(1, reg).toInt(); // dim - int64_t start = 0; - if (p_node->Input(2, reg).isScalar()) { - start = p_node->Input(2, reg).toInt(); - } else { - auto t = p_node->Input(2, reg).toTensor(); - start = t.item(); - } - auto length = p_node->Input(3, reg).toInt(); // length - - if (p_node->Output(0, reg).isNone()) { - p_node->Output(0, reg) = create_empty_from(self); - } - auto output = p_node->Output(0, reg).toTensor(); - output.resize_({0}); - at::native::narrow_copy_dense_out(output, self, dim, start, length); - }; -}); - std::function&)> getOutOfPlaceOperation(Node* n) { auto op_name = n->kind().toQualString(); From d69d42db78c514b8baefbf42ec1d8126efde52a6 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 16 Dec 2020 02:51:55 -0800 Subject: [PATCH 09/34] Making ops c10 full: optional out arguments (#49083) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49083 We have some (but very few) ops that take optional out arguments `Tensor(a!)? out`. This PR makes them non-optional mandatory arguments and enables c10-fullness for them. There is only a very small number of ops affected by this. Putting this up for discussion. Alternatives considered: If we keep them optional, we run into lots of issues in the dispatcher. We have to decide what the dispatcher calling convention for this argument type should be. 1) If we keep passing them in as `Tensor&` arguments and return them as `tuple`, so basically same as currently, then the schema inference check will say "Your kernel function got inferred to have a `Tensor` argument but your native_functions.yaml declaration says `Tensor?`. This is a mismatch, you made an error". 
We could potentially disable that check, but that would open the door for real mistakes to not be reported anymore in the future. This sounds bad. 2) If we change them to a type that schema inference could differentiate from `Tensor`, say we pass them in as `const optional&` and return them as `tuple&, const optional&, const optional&>`, then our boxing logic fails because it can't recognize those as out overloads anymore and shortcut the return value as it is doing right now. We might be able to rewrite the boxing logic, but that could be difficult and could easily develop into a rabbit hole of having to clean up `Tensor&` references throughout the system where we use them. Furthermore, having optional out arguments in C++ doesn't really make sense. the C++ API puts them to the front of the argument list, so you can't omit them anyways when calling an op. You would be able to omit them when calling from Python with out kwargs, but not sure if we want that discrepancy between the c++ and python API. ghstack-source-id: 118660075 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25422197 fbshipit-source-id: 3cb25c5a3d93f9eb960d70ca014bae485be9f058 --- aten/src/ATen/native/native_functions.yaml | 15 ++++++++++----- .../check_backward_compatibility.py | 5 +++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 715fdccc9691..2e775993a8b5 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9566,7 +9566,8 @@ CPU: slow_conv_transpose2d_cpu CUDA: slow_conv_transpose2d_cuda -- func: slow_conv_transpose2d_backward.grad_output(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +- func: slow_conv_transpose2d_backward.grad_output(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: nn dispatch: CPU: slow_conv_transpose2d_backward_out_cpu @@ -9593,7 +9594,8 @@ CPU: slow_conv_transpose3d_cpu CUDA: slow_conv_transpose3d_cuda -- func: slow_conv_transpose3d_backward.grad_output(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +- func: slow_conv_transpose3d_backward.grad_output(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) 
grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: nn dispatch: CPU: slow_conv_transpose3d_backward_out_cpu @@ -9628,7 +9630,8 @@ CPU: slow_conv2d_forward_cpu CUDA: legacy::cuda::_thnn_conv2d_forward -- func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +- func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: nn dispatch: CPU: slow_conv2d_backward_out_cpu @@ -9661,7 +9664,8 @@ dispatch: CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward -- func: thnn_conv_depthwise2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight) -> (Tensor(a!), Tensor(b!)) +- func: thnn_conv_depthwise2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) grad_input, Tensor(b!) grad_weight) -> (Tensor(a!), Tensor(b!)) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: nn dispatch: CUDA: thnn_conv_depthwise2d_backward_out @@ -9692,7 +9696,8 @@ dispatch: CPU: slow_conv3d_forward_cpu -- func: slow_conv3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +- func: slow_conv3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) 
grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: nn dispatch: CPU: slow_conv3d_backward_out_cpu diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index deb7a161e1d3..fa2e54844935 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -190,6 +190,11 @@ ("aten::quantile", datetime.date(2021, 1, 31)), ("aten::nanquantile", datetime.date(2021, 1, 31)), ("aten::_fft_with_size", datetime.date(2021, 1, 31)), + ("aten::thnn_conv_depthwise2d_backward", datetime.date(2021, 1, 31)), + ("aten::slow_conv3d_backward", datetime.date(2021, 1, 31)), + ("aten::thnn_conv2d_backward", datetime.date(2021, 1, 31)), + ("aten::slow_conv_transpose3d_backward", datetime.date(2021, 1, 31)), + ("aten::slow_conv_transpose2d_backward", datetime.date(2021, 1, 31)), ] def allow_listed(schema, allow_list): From ec8e9d31cf3a9c2bc5f31d500389b48a5024917d Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 16 Dec 2020 02:51:55 -0800 Subject: [PATCH 10/34] Making ops c10-full: optional lists (#49088) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49088 We had special case logic to support `int[]?` and `double[]?` but nothing for `DimnameList[]?`. This PR generalizes the logic to support optional lists so it should now work with all types. It also enables c10-fullness for ops that were blocked by this. Note that using these arguments in a signature was always and still is expensive because the whole list needs to be copied. We should probably consider alternatives in the future like for example using `torch::List` instead of `ArrayRef`, that could work without copying the list. 
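As a surface-level illustration (hedged, this shows what an optional-list argument looks like to callers, not the dispatcher plumbing itself), a signature such as `Dimname[]? names` accepts either `None` or a list:

```
import torch

a = torch.empty(2, 3, names=None)         # optional list passed as None: unnamed tensor
b = torch.empty(2, 3, names=['N', 'C'])   # optional list provided: named dimensions
print(b.names)                            # ('N', 'C')
```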
ghstack-source-id: 118660071 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25423901 fbshipit-source-id: dec58dc29f3bb4cbd89e2b95c42da204a9da2e0a --- aten/src/ATen/VmapModeRegistrations.cpp | 8 +-- .../impl/make_boxed_from_unboxed_functor.h | 21 +++----- aten/src/ATen/core/ivalue.h | 8 --- aten/src/ATen/core/ivalue_inl.h | 54 ++++++++++--------- aten/src/ATen/native/native_functions.yaml | 11 ++++ 5 files changed, 52 insertions(+), 50 deletions(-) diff --git a/aten/src/ATen/VmapModeRegistrations.cpp b/aten/src/ATen/VmapModeRegistrations.cpp index 6bf0f027cf7c..ab4556c8c415 100644 --- a/aten/src/ATen/VmapModeRegistrations.cpp +++ b/aten/src/ATen/VmapModeRegistrations.cpp @@ -79,15 +79,15 @@ TORCH_LIBRARY_IMPL(aten, VmapMode, m) { m.impl("rand", unsupportedRandomOp); m.impl("rand.generator", unsupportedRandomOp, TENSOROPTIONS>); - m.impl_UNBOXED("rand.names", unsupportedRandomOp, const TensorOptions&>); - m.impl_UNBOXED("rand.generator_with_names", unsupportedRandomOp, optional, const TensorOptions&>); + m.impl("rand.names", unsupportedRandomOp, TENSOROPTIONS>); + m.impl("rand.generator_with_names", unsupportedRandomOp, optional, TENSOROPTIONS>); m.impl("rand.out", unsupportedRandomOp_); m.impl("rand.generator_out", unsupportedRandomOp_, Tensor&>); m.impl("randn", unsupportedRandomOp); m.impl("randn.generator", unsupportedRandomOp, TENSOROPTIONS>); - m.impl_UNBOXED("randn.names", unsupportedRandomOp, const TensorOptions&>); - m.impl_UNBOXED("randn.generator_with_names", unsupportedRandomOp, optional, const TensorOptions&>); + m.impl("randn.names", unsupportedRandomOp, TENSOROPTIONS>); + m.impl("randn.generator_with_names", unsupportedRandomOp, optional, TENSOROPTIONS>); m.impl("randn.out", unsupportedRandomOp_); m.impl("randn.generator_out", unsupportedRandomOp_, Tensor&>); diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index ab603a09c86b..7bdb0d996a13 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -265,20 +265,13 @@ namespace impl { return ivalue_to_arg, AllowDeprecatedTypes>::call(std::move(v)); } }; - template - struct ivalue_to_arg>, AllowDeprecatedTypes> final { - // If an argument is optional>, convert the IValue to a optional> and pass that - // to the operator. - static OptionalArray call(IValue&& v) { - return std::move(v).toOptionalIntArray(); - } - }; - template - struct ivalue_to_arg>, AllowDeprecatedTypes> final { - // If an argument is optional>, convert the IValue to a optional> and pass that - // to the operator. - static OptionalArray call(IValue&& v) { - return std::move(v).toOptionalDoubleArray(); + template + struct ivalue_to_arg>, AllowDeprecatedTypes> final { + // If an argument is optional>, convert the IValue to an optional> and pass that + // to the operator. OptionalArray is basically a optional> but impliticly convertible + // to optional>. + static OptionalArray call(IValue&& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(std::move(v)); } }; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index d2e72933b532..5ab5a9c0a501 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -705,14 +705,6 @@ struct CAFFE2_API IValue final { template optional toOptional(); - /// @private [doxygen private] - /// Only for use in generated code. 
- OptionalArray toOptionalIntArray(); - - /// @private [doxygen private] - /// Only for use in generated code. - OptionalArray toOptionalDoubleArray(); - /// @private [doxygen private] /// this is a shallow comparison of two IValues to test the object identity bool isSameIdentity(const IValue& rhs) const; diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index b3b53aed994c..46bde6103043 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -861,6 +861,36 @@ c10::List generic_to(IValue ivalue, _fake_type>) { return impl::toTypedList(std::move(ivalue).toList()); } +template +static std::vector createVectorFromList(const c10::detail::ListImpl* impl) { + std::vector result; + result.reserve(impl->list.size()); + for (size_t i = 0, N = impl->list.size(); i < N; ++i) { + result.push_back(impl->list[i].to()); + } + return result; +} + +template +static std::vector createVectorFromList(const c10::List& impl) { + std::vector result; + result.reserve(impl.size()); + for (size_t i = 0, N = impl.size(); i < N; ++i) { + result.push_back(impl[i]); + } + return result; +} + +template +OptionalArray generic_to(IValue ivalue, _fake_type>) { + if (ivalue.isNone()) { + return {}; + } + return createVectorFromList( + std::move(ivalue).to>() + ); +} + namespace detail { template std::array generic_to_array( @@ -952,16 +982,6 @@ inline T IValue::to() const& { return generic_to(*this, _fake_type{}); } -template -static std::vector createVectorFromList(const c10::detail::ListImpl* impl) { - std::vector result; - result.reserve(impl->list.size()); - for (size_t i = 0, N = impl->list.size(); i < N; ++i) { - result.push_back(impl->list[i].to()); - } - return result; -} - inline c10::List IValue::toIntList() && { AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind()); return c10::List(moveToIntrusivePtr()); @@ -1211,20 +1231,6 @@ inline optional IValue::toOptional() { return this->to(); } -inline OptionalArray IValue::toOptionalIntArray() { - if (this->isNone()) { - return {}; - } - return this->toIntVector(); -} - -inline OptionalArray IValue::toOptionalDoubleArray() { - if (this->isNone()) { - return {}; - } - return this->toDoubleVector(); -} - inline bool IValue::isCustomClass() const { return torch::isCustomClass(*this); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2e775993a8b5..2bbde22c9389 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -106,9 +106,11 @@ variants: method - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a) + use_c10_dispatcher: full variants: method - func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) @@ -1738,6 +1740,7 @@ use_c10_dispatcher: full - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor @@ -1942,6 +1945,7 @@ variants: function, method - func: unflatten.int(Tensor(a) self, int dim, int[] sizes, Dimname[]? 
names=None) -> Tensor(a) + use_c10_dispatcher: full variants: method - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) @@ -2023,6 +2027,7 @@ CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -3165,6 +3170,7 @@ variants: function - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -3311,9 +3317,11 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: rand.generator_with_names(int[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -3368,9 +3376,11 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: randn.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: randn.generator_with_names(int[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: randn.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) @@ -4443,6 +4453,7 @@ variants: function - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor From 76d09ec33eabd11b09f0df6e8abc2d6a54e0f254 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 16 Dec 2020 07:05:20 -0800 Subject: [PATCH 11/34] [PyTorch] Avoid move-constructing a List in listConstruct (#49355) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49355 List's move ctor is a little bit more expensive than you might expect, but we can easily avoid it. ghstack-source-id: 118624596 Test Plan: Roughly 1% improvement on internal benchmark. 
Reviewed By: hlu1 Differential Revision: D25542190 fbshipit-source-id: 08532642c7d1f1604e16c8ebefd1ed3e56f7c919 --- torch/csrc/jit/runtime/vararg_functions.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/torch/csrc/jit/runtime/vararg_functions.cpp b/torch/csrc/jit/runtime/vararg_functions.cpp index e61676b83eca..44bc56206eaf 100644 --- a/torch/csrc/jit/runtime/vararg_functions.cpp +++ b/torch/csrc/jit/runtime/vararg_functions.cpp @@ -208,13 +208,20 @@ void listConstruct( Stack& stack, const at::ListTypePtr& type, size_t num_inputs) { - c10::List vals(type->getElementType()); - vals.reserve(num_inputs); - for (size_t i = stack.size() - num_inputs; i < stack.size(); ++i) { - vals.emplace_back(std::move(stack[i])); - } - drop(stack, num_inputs); - push(stack, std::move(vals)); + // Structuring the implementation this way allows NRVO to avoid + // move-constructing vals on its way onto the stack. Moving a List + // isn't free. + auto makeList = + [](Stack& stack, const at::ListTypePtr& type, size_t num_inputs) { + c10::List vals(type->getElementType()); + vals.reserve(num_inputs); + for (size_t i = stack.size() - num_inputs; i < stack.size(); ++i) { + vals.emplace_back(std::move(stack[i])); + } + drop(stack, num_inputs); + return vals; + }; + stack.push_back(makeList(stack, type, num_inputs)); } void dictConstruct( From efc090652e656c518303971def0b2825817f7dbc Mon Sep 17 00:00:00 2001 From: ivannz Date: Wed, 16 Dec 2020 07:09:11 -0800 Subject: [PATCH 12/34] Enhanced generators with grad-mode decorators (#49017) Summary: This PR addresses the feature request outlined in https://github.com/pytorch/pytorch/issues/48713 for two-way communication with enhanced generators from [pep-342](https://www.python.org/dev/peps/pep-0342/). Briefly, the logic of the patch resembles `yield from` [pep-380](https://www.python.org/dev/peps/pep-0380/), which cannot be used, since the generator **must be interacted with from within the grad-mode context**, while yields from the decorator **must take place outside of the context**. Hence any interaction with the wrapped generator, be it via [.send](https://docs.python.org/3/reference/expressions.html?highlight=throw#generator.send), [.throw](https://docs.python.org/3/reference/expressions.html?highlight=throw#generator.throw), and even [.close](https://docs.python.org/3/reference/expressions.html?highlight=throw#generator.close) must be wrapped by a `with` clause. The patch is compatible with `for i in gen: pass` and `next(gen)` use cases and allows two-way communication with the generator via `.send <-> yield` points. ### Logic At lines [L37-L38](https://github.com/ivannz/pytorch/blob/2d40296c0c6617b3980c86762be466c995aa7f8e/torch/autograd/grad_mode.py#L37-L38) we (the decorator) **start the wrapped generator** (coroutine) by issuing `None` into it (equivalently, we can use `next(get)` here). Then we **dispatch responses of the generator** to our ultimate caller and **relay the latter's requests** into the generator in the loop on lines [L39-L52](https://github.com/ivannz/pytorch/blob/2d40296c0c6617b3980c86762be466c995aa7f8e/torch/autograd/grad_mode.py#L39-L52). We yield the most recent response on [L40-L41](https://github.com/ivannz/pytorch/blob/2d40296c0c6617b3980c86762be466c995aa7f8e/torch/autograd/grad_mode.py#L40-L41), at which point we become **paused**, waiting for the next ultimate caller's interaction with us. 
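Before walking through the individual caller-driven cases below, here is a usage-level sketch of the two-way interaction being described (hedged: it assumes this patch is applied; the generator body and tensor values are purely illustrative):

```
import torch

@torch.no_grad()
def accumulate():
    total = torch.zeros(1)
    while True:
        # resumed here with grad mode off, even when driven via .send()
        x = yield total
        total = total + x

with torch.enable_grad():
    gen = accumulate()
    next(gen)                                           # fire up the coroutine
    out = gen.send(torch.ones(1, requires_grad=True))   # send a value, receive the running total
    assert not out.requires_grad                        # computed under no_grad inside the generator
    gen.close()
```

The caller's grad mode (here `enable_grad`) is restored every time control returns through `yield`, while the generator body always resumes under the decorator's mode.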
If the caller **sends us a request**, then we become unpaused and move to [L51-L52](https://github.com/ivannz/pytorch/blob/2d40296c0c6617b3980c86762be466c995aa7f8e/torch/autograd/grad_mode.py#L51-L52) and **forward it into the generator**, at which point we pause, waiting for its response. The response might be a value, an exception or a `StopIteration`. In the case of an exception from the generator, we let it **bubble up** from the immediately surrounding [except clause](https://docs.python.org/3/reference/compound_stmts.html#the-try-statement) to the ultimate caller through the [outer try-except](https://github.com/ivannz/pytorch/blob/2dc287bba87fa6f05c49446c0239ffdcdb1e896e/torch/autograd/grad_mode.py#L36-L54). In the case of a `StopIteration`, we **take it's payload and propagate it** to the caller via [return](https://github.com/ivannz/pytorch/blob/2d40296c0c6617b3980c86762be466c995aa7f8e/torch/autograd/grad_mode.py#L54). In the case of a value, the flow and the loop continues. The caller **throwing an exception at us** is handled much like a proper request, except for the exception playing the role of the request. In this case we **forward it into the generator** on lines [L47-L49](https://github.com/ivannz/pytorch/blob/2d40296c0c6617b3980c86762be466c995aa7f8e/torch/autograd/grad_mode.py#L47-L49) and await its response. We explicitly **advance** the traceback one frame up, in order to indicate the **source of the exception within the generator**. Finally the `GeneratorExit` is handled on lines [L42-L45](https://github.com/ivannz/pytorch/blob/2d40296c0c6617b3980c86762be466c995aa7f8e/torch/autograd/grad_mode.py#L42-L45) and closes the generator. Updates: clarified exception propagation Pull Request resolved: https://github.com/pytorch/pytorch/pull/49017 Reviewed By: izdeby Differential Revision: D25567796 Pulled By: albanD fbshipit-source-id: 801577cccfcb2b5e13a08e77faf407881343b7b0 --- test/test_autograd.py | 181 ++++++++++++++++++++++++++++++++++++ torch/autograd/grad_mode.py | 48 ++++++++-- 2 files changed, 222 insertions(+), 7 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 0d99169f4d65..be276e334df6 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1161,6 +1161,187 @@ def no_grad_context_manager_recursive(depth): enable_grad_context_manager_recursive(10) self.assertFalse(torch.is_grad_enabled()) + def test_set_grad_coroutines(self): + @torch.no_grad() + def coro_no_grad(n=10): + self.assertFalse(torch.is_grad_enabled()) + for i in range(n): + self.assertFalse(torch.is_grad_enabled()) + r = yield i + self.assertFalse(torch.is_grad_enabled()) + self.assertEqual(i, r) + self.assertFalse(torch.is_grad_enabled()) + + @torch.enable_grad() + def coro_enable_grad(n=10): + self.assertTrue(torch.is_grad_enabled()) + for i in range(n): + self.assertTrue(torch.is_grad_enabled()) + r = yield i + self.assertTrue(torch.is_grad_enabled()) + self.assertEqual(i, r) + self.assertTrue(torch.is_grad_enabled()) + + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + coro, r = coro_no_grad(), None + try: + while True: + self.assertTrue(torch.is_grad_enabled()) + r = coro.send(r) + self.assertTrue(torch.is_grad_enabled()) + + except StopIteration: + pass + + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + coro, r = coro_enable_grad(), None + try: + while True: + self.assertFalse(torch.is_grad_enabled()) + r = coro.send(r) + self.assertFalse(torch.is_grad_enabled()) + + except StopIteration: + pass + + def 
test_set_grad_coroutines_benign_exceptions(self): + class RecoverableException(Exception): + pass + + @torch.no_grad() + def coro_no_grad(n=10): + has_raised = False + for i in range(n): + try: + self.assertFalse(torch.is_grad_enabled()) + yield (-i if has_raised else i) + + except RecoverableException: + self.assertFalse(torch.is_grad_enabled()) + has_raised = True + + @torch.enable_grad() + def coro_enable_grad(n=10): + has_raised = False + for i in range(n): + try: + self.assertTrue(torch.is_grad_enabled()) + yield (-i if has_raised else i) + + except RecoverableException: + self.assertTrue(torch.is_grad_enabled()) + has_raised = True + + with torch.enable_grad(): + coro = coro_no_grad() + assert 0 == next(coro) + try: + while True: + r = coro.throw(RecoverableException) + self.assertLess(r, 0) + + except StopIteration: + pass + + with torch.no_grad(): + coro = coro_enable_grad() + assert 0 == next(coro) + try: + while True: + r = coro.throw(RecoverableException) + self.assertLess(r, 0) + + except StopIteration: + pass + + def test_set_grad_coroutines_critical_exceptions(self): + class UnrecoverableException(Exception): + pass + + class SecondaryException(Exception): + pass + + @torch.no_grad() + def coro_no_grad(n=10): + has_raised = False + for i in range(n): + try: + self.assertFalse(torch.is_grad_enabled()) + yield (-i if has_raised else i) + + except UnrecoverableException: + self.assertFalse(torch.is_grad_enabled()) + raise SecondaryException + + @torch.enable_grad() + def coro_enable_grad(n=10): + has_raised = False + for i in range(n): + try: + self.assertTrue(torch.is_grad_enabled()) + yield (-i if has_raised else i) + + except UnrecoverableException: + self.assertTrue(torch.is_grad_enabled()) + raise SecondaryException + + with torch.enable_grad(): + coro = coro_no_grad() + assert 0 == next(coro) + with self.assertRaises(SecondaryException): + coro.throw(UnrecoverableException) + + with torch.no_grad(): + coro = coro_enable_grad() + assert 0 == next(coro) + with self.assertRaises(SecondaryException): + coro.throw(UnrecoverableException) + + def test_set_grad_coroutines_exit(self): + @torch.no_grad() + def coro_no_grad(state): + for i in range(10): + try: + self.assertFalse(torch.is_grad_enabled()) + yield i + + except GeneratorExit: + self.assertFalse(torch.is_grad_enabled()) + state.add('GeneratorExit') + raise + + @torch.enable_grad() + def coro_enable_grad(state): + for i in range(10): + try: + self.assertTrue(torch.is_grad_enabled()) + yield i + + except GeneratorExit: + self.assertTrue(torch.is_grad_enabled()) + state.add('GeneratorExit') + raise + + state = set() + with torch.enable_grad(): + coro = coro_no_grad(state) + for i in range(5): + next(coro) + + coro.close() + self.assertTrue('GeneratorExit' in state) + + state = set() + with torch.no_grad(): + coro = coro_enable_grad(state) + for i in range(5): + next(coro) + + coro.close() + self.assertTrue('GeneratorExit' in state) + def test_no_grad_python_function(self): """Python Functions should respect grad mode.""" x = torch.ones(5, 5, requires_grad=True) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index bbd96e941a54..6e0f6f1469c5 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -1,3 +1,4 @@ +import sys import torch import functools import inspect @@ -31,13 +32,46 @@ def _wrap_generator(self, func): @functools.wraps(func) def generator_context(*args, **kwargs): gen = func(*args, **kwargs) - while True: - try: - with self.__class__(): - x = next(gen) - yield x - 
except StopIteration: - break + + # Generators are suspended and unsuspended at `yield`, hence we + # make sure the grad mode is properly set every time the execution + # flow returns into the wrapped generator and restored when it + # returns through our `yield` to our caller (see PR #49017). + cls = type(self) + try: + # Issuing `None` to a generator fires it up + with cls(): + response = gen.send(None) + + while True: + try: + # Forward the response to our caller and get its next request + request = yield response + + except GeneratorExit: + # Inform the still active generator about its imminent closure + with cls(): + gen.close() + raise + + except BaseException: + # Propagate the exception thrown at us by the caller + with cls(): + response = gen.throw(*sys.exc_info()) + + else: + # Pass the last request to the generator and get its response + with cls(): + response = gen.send(request) + + # We let the exceptions raised above by the generator's `.throw` or + # `.send` methods bubble up to our caller, except for StopIteration + except StopIteration as e: + # The generator informed us that it is done: take whatever its + # returned value (if any) was and indicate that we're done too + # by returning it (see docs for python's return-statement). + return e.value + return generator_context def __enter__(self) -> None: From 6786b2b9660435d78af8df52fe0ba9a7924a5c26 Mon Sep 17 00:00:00 2001 From: lixinyu Date: Wed, 16 Dec 2020 08:32:24 -0800 Subject: [PATCH 13/34] webdataset prototype - ListDirFilesIterableDataset (#48944) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48944 This is a stack PR for webdataset prototype. I am trying to make each stack a separate dataset. To make the implementation simple, each dataset will only support the basic functionality. 
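As a rough sketch of how the first of the datasets listed below is meant to be used (the directory path and mask here are made up for illustration, not taken from the PR):

```python
from torch.utils.data.datasets import ListDirFilesIterableDataset

# Yields the pathname of every file under /data/shards that matches the mask.
dataset = ListDirFilesIterableDataset(root='/data/shards', masks='*.tar')
for pathname in dataset:
    print(pathname)
```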
- [x] ListDirFilesDataset - [x] LoadFilesFromDiskIterableDataset - [x] ReadFilesFromTarIterableDataset - [x] ReadFilesFromZipIterableDataset - [x] RoutedDecoderIterableDataset Test Plan: Imported from OSS Reviewed By: izdeby Differential Revision: D25541277 Pulled By: glaringlee fbshipit-source-id: 9e738f6973493f6be1d5cc1feb7a91513fa5807c --- test/test_dataset.py | 38 +++++++++++++++++++ torch/utils/data/datasets/__init__.py | 3 ++ torch/utils/data/datasets/common.py | 37 ++++++++++++++++++ .../data/datasets/listdirfilesdataset.py | 36 ++++++++++++++++++ 4 files changed, 114 insertions(+) create mode 100644 test/test_dataset.py create mode 100644 torch/utils/data/datasets/__init__.py create mode 100644 torch/utils/data/datasets/common.py create mode 100644 torch/utils/data/datasets/listdirfilesdataset.py diff --git a/test/test_dataset.py b/test/test_dataset.py new file mode 100644 index 000000000000..5f573641efa2 --- /dev/null +++ b/test/test_dataset.py @@ -0,0 +1,38 @@ +import tempfile +import warnings + +from torch.testing._internal.common_utils import (TestCase, run_tests) + +from torch.utils.data.datasets import (ListDirFilesIterableDataset) + +def create_temp_dir_and_files(): + temp_dir = tempfile.TemporaryDirectory() + temp_dir_path = temp_dir.name + temp_file1 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) + temp_file2 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) + temp_file3 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) + + return (temp_dir, temp_file1.name, temp_file2.name, temp_file3.name) + + +class TestIterableDatasetBasic(TestCase): + + def setUp(self): + ret = create_temp_dir_and_files() + self.temp_dir = ret[0] + self.temp_files = ret[1:] + + def tearDown(self): + try: + self.temp_dir.cleanup() + except Exception as e: + warnings.warn("TestIterableDatasetBasic was not able to cleanup temp dir due to {}".format(str(e))) + + def test_listdirfiles_iterable_dataset(self): + temp_dir = self.temp_dir.name + dataset = ListDirFilesIterableDataset(temp_dir, '') + for pathname in dataset: + self.assertTrue(pathname in self.temp_files) + +if __name__ == '__main__': + run_tests() diff --git a/torch/utils/data/datasets/__init__.py b/torch/utils/data/datasets/__init__.py new file mode 100644 index 000000000000..ac5692152012 --- /dev/null +++ b/torch/utils/data/datasets/__init__.py @@ -0,0 +1,3 @@ +from .listdirfilesdataset import ListDirFilesIterableDataset + +__all__ = ['ListDirFilesIterableDataset'] diff --git a/torch/utils/data/datasets/common.py b/torch/utils/data/datasets/common.py new file mode 100644 index 000000000000..2749d0dfa9b4 --- /dev/null +++ b/torch/utils/data/datasets/common.py @@ -0,0 +1,37 @@ +import os +import fnmatch +import warnings +from typing import List, Union, Iterable + +def match_masks(name : str, masks : Union[str, List[str]]) -> bool: + # empty mask matches any input name + if not masks: + return True + + if isinstance(masks, str): + return fnmatch.fnmatch(name, masks) + + for mask in masks: + if fnmatch.fnmatch(name, mask): + return True + return False + +def get_file_pathnames_from_root( + root: str, + masks: Union[str, List[str]], + recursive: bool = False, + abspath: bool = False) -> Iterable[str]: + + # print out an error message and raise the error out + def onerror(err : OSError): + warnings.warn(err.filename + " : " + err.strerror) + raise err + + for path, dirs, files in os.walk(root, onerror=onerror): + if abspath: + path = os.path.abspath(path) + for f in files: + if match_masks(f, masks): + yield 
os.path.join(path, f) + if not recursive: + break diff --git a/torch/utils/data/datasets/listdirfilesdataset.py b/torch/utils/data/datasets/listdirfilesdataset.py new file mode 100644 index 000000000000..376971cc1adc --- /dev/null +++ b/torch/utils/data/datasets/listdirfilesdataset.py @@ -0,0 +1,36 @@ +from torch.utils.data.dataset import IterableDataset +from torch.utils.data.datasets.common import get_file_pathnames_from_root + +from typing import List, Union, Iterator + +class ListDirFilesIterableDataset(IterableDataset): + r""" :class:`ListDirFilesIterableDataset` + + IterableDataset to load file pathname(s) (path + filename), yield pathname from given disk root dir. + args: + root : root dir + mask : a unix style filter string or string list for filtering file name(s) + abspath : whether to return relative pathname or absolute pathname + length : a nominal length of the dataset + """ + + def __init__( + self, + root: str = '.', + masks: Union[str, List[str]] = '*.tar', + *, + abspath: bool = False, + length: int = -1): + super().__init__() + self.root : str = root + self.masks : Union[str, List[str]] = masks + self.abspath : bool = abspath + self.length : int = length + + def __iter__(self) -> Iterator[str] : + yield from get_file_pathnames_from_root(self.root, self.masks, self.abspath) + + def __len__(self): + if self.length == -1: + raise NotImplementedError + return self.length From 001ff3acf6e4bab2f2d6fb7b6cb711b040aebf4d Mon Sep 17 00:00:00 2001 From: lixinyu Date: Wed, 16 Dec 2020 08:32:24 -0800 Subject: [PATCH 14/34] webdataset prototype - LoadFilesFromDiskIterableDataset (#48955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48955 Test Plan: Imported from OSS Reviewed By: izdeby Differential Revision: D25541393 Pulled By: glaringlee fbshipit-source-id: dea6ad64a7ba40abe45612d99f078b14d1da8bbf --- test/test_dataset.py | 21 +++++++++---- torch/utils/data/datasets/__init__.py | 3 +- torch/utils/data/datasets/common.py | 16 ++++++++++ .../data/datasets/loadfilesfromdiskdataset.py | 30 +++++++++++++++++++ 4 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 torch/utils/data/datasets/loadfilesfromdiskdataset.py diff --git a/test/test_dataset.py b/test/test_dataset.py index 5f573641efa2..d984ca1c3837 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -3,14 +3,16 @@ from torch.testing._internal.common_utils import (TestCase, run_tests) -from torch.utils.data.datasets import (ListDirFilesIterableDataset) +from torch.utils.data.datasets import (ListDirFilesIterableDataset, LoadFilesFromDiskIterableDataset) def create_temp_dir_and_files(): - temp_dir = tempfile.TemporaryDirectory() + # The temp dir and files within it will be released and deleted in tearDown(). + # Adding `noqa: P201` to avoid mypy's warning on not releasing the dir handle within this function. 
+ temp_dir = tempfile.TemporaryDirectory() # noqa: P201 temp_dir_path = temp_dir.name - temp_file1 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) - temp_file2 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) - temp_file3 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) + temp_file1 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) # noqa: P201 + temp_file2 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) # noqa: P201 + temp_file3 = tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False) # noqa: P201 return (temp_dir, temp_file1.name, temp_file2.name, temp_file3.name) @@ -34,5 +36,14 @@ def test_listdirfiles_iterable_dataset(self): for pathname in dataset: self.assertTrue(pathname in self.temp_files) + def test_loadfilesfromdisk_iterable_dataset(self): + temp_dir = self.temp_dir.name + dataset1 = ListDirFilesIterableDataset(temp_dir, '') + dataset2 = LoadFilesFromDiskIterableDataset(dataset1) + + for rec in dataset2: + self.assertTrue(rec[0] in self.temp_files) + self.assertTrue(rec[1].read() == open(rec[0], 'rb').read()) + if __name__ == '__main__': run_tests() diff --git a/torch/utils/data/datasets/__init__.py b/torch/utils/data/datasets/__init__.py index ac5692152012..f2c3a8b2c260 100644 --- a/torch/utils/data/datasets/__init__.py +++ b/torch/utils/data/datasets/__init__.py @@ -1,3 +1,4 @@ from .listdirfilesdataset import ListDirFilesIterableDataset +from .loadfilesfromdiskdataset import LoadFilesFromDiskIterableDataset -__all__ = ['ListDirFilesIterableDataset'] +__all__ = ['ListDirFilesIterableDataset', 'LoadFilesFromDiskIterableDataset'] diff --git a/torch/utils/data/datasets/common.py b/torch/utils/data/datasets/common.py index 2749d0dfa9b4..c28e01eb3a83 100644 --- a/torch/utils/data/datasets/common.py +++ b/torch/utils/data/datasets/common.py @@ -3,6 +3,7 @@ import warnings from typing import List, Union, Iterable + def match_masks(name : str, masks : Union[str, List[str]]) -> bool: # empty mask matches any input name if not masks: @@ -16,6 +17,7 @@ def match_masks(name : str, masks : Union[str, List[str]]) -> bool: return True return False + def get_file_pathnames_from_root( root: str, masks: Union[str, List[str]], @@ -35,3 +37,17 @@ def onerror(err : OSError): yield os.path.join(path, f) if not recursive: break + + +def get_file_binaries_from_pathnames(pathnames : Iterable): + + if not isinstance(pathnames, Iterable): + warnings.warn("get_file_binaries_from_pathnames needs the input be an Iterable") + raise TypeError + + for pathname in pathnames: + if not isinstance(pathname, str): + warnings.warn("file pathname must be string type, but got {}".format(type(pathname))) + raise TypeError + + yield (pathname, open(pathname, 'rb')) diff --git a/torch/utils/data/datasets/loadfilesfromdiskdataset.py b/torch/utils/data/datasets/loadfilesfromdiskdataset.py new file mode 100644 index 000000000000..fdf8acb07ca1 --- /dev/null +++ b/torch/utils/data/datasets/loadfilesfromdiskdataset.py @@ -0,0 +1,30 @@ +from torch.utils.data.dataset import IterableDataset +from torch.utils.data.datasets.common import get_file_binaries_from_pathnames + +from typing import Iterable, Iterator + +class LoadFilesFromDiskIterableDataset(IterableDataset): + r""" :class:`LoadFilesFromDiskIterableDataset`. + + IterableDataset to load file binary streams from given pathnames, + yield pathname and binary stream in a tuple. 
+ args: + dataset: Iterable dataset that provides pathnames + length: a nominal length of the dataset + """ + + def __init__( + self, + dataset : Iterable, + length : int = -1): + super().__init__() + self.dataset : Iterable = dataset + self.length : int = length + + def __iter__(self) -> Iterator[tuple] : + yield from get_file_binaries_from_pathnames(self.dataset) + + def __len__(self): + if self.length == -1: + raise NotImplementedError + return self.length From 86902f84bf82173ac15781c2e75feb37cfd88681 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 16 Dec 2020 09:21:45 -0800 Subject: [PATCH 15/34] CUDA BFloat embedding (#44848) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44848 Reviewed By: izdeby Differential Revision: D25574204 Pulled By: ngimel fbshipit-source-id: b35f7253a6ad2b83f7b6b06862a5ab77295373e0 --- aten/src/ATen/native/cuda/Embedding.cu | 50 ++++----- .../native/cuda/EmbeddingBackwardKernel.cu | 100 +++++++++--------- aten/src/ATen/native/cuda/EmbeddingBag.cu | 22 ++-- test/test_nn.py | 1 - 4 files changed, 82 insertions(+), 91 deletions(-) diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index fd97d8ab26b6..80a8bfa5a6e8 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -249,23 +249,21 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice "embedding_backward", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "embedding_backward", [&] { - using accscalar_t = acc_type; - AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { - embedding_backward_feature_kernel - <<>> - (indices_contig.data_ptr(), - grad.data_ptr(), - grad_weight.data_ptr(), - static_cast(num_indices), - static_cast(stride), - static_cast(padding_idx)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - }); + using accscalar_t = acc_type; + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () { + embedding_backward_feature_kernel + <<>> + (indices_contig.data_ptr(), + grad.data_ptr(), + grad_weight.data_ptr(), + static_cast(num_indices), + static_cast(stride), + static_cast(padding_idx)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); }); return grad_weight; } @@ -362,16 +360,14 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, int dim = self.stride(0); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "embedding_backward", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "embedding_backward", [&] { - using accscalar_t = acc_type; - renorm_kernel<<>>( - self.data_ptr(), - unique_indices.data_ptr(), - static_cast(max_norm), - static_cast(norm_type), - dim, self.stride(0), self.stride(1)); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + using accscalar_t = acc_type; + renorm_kernel<<>>( + self.data_ptr(), + unique_indices.data_ptr(), + static_cast(max_norm), + static_cast(norm_type), + dim, self.stride(0), self.stride(1)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); return self; diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index 689db4347067..dd0730a38bcb 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -272,59 +272,57 @@ Tensor embedding_backward_cuda_kernel( AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, grad.scalar_type(), 
"embedding_bag_backward_cuda_compute_grad_weight", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "embedding_bag_backward_cuda_compute_grad_weight", [&] { - // For numerical stability, the dtype of `grad_weight_per_segment` - // should match `acc_type` - using partial_weight_t = acc_type; - TensorOptions op; - if(grad.dtype() == at::kHalf || grad.dtype() == at::kBFloat16) { - op = grad.options().dtype(at::kFloat); - } else { - op = grad.options(); - } - auto grad_weight_per_segment = at::empty({num_of_partial_segments, stride}, op); - // Compute the sum of each partial-segment and handle bags - if (offset2bag.defined()) { - compute_grad_weight_bags<<>>( - orig_indices.data_ptr(), - grad.data_ptr(), - offset2bag.data_ptr(), - count.defined() ? count.data_ptr() : nullptr, numel, stride, - mode_mean, bag_size.data_ptr(), - per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, - per_sample_weights.defined() ? per_sample_weights.stride(0) : 0, - partial_segment_offset.data_ptr(), - num_of_partial_segments, grad_weight_per_segment.data_ptr(), - stride_warped); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } else { - compute_grad_weight<<>>( - orig_indices.data_ptr(), - grad.data_ptr(), - count.defined() ? count.data_ptr() : nullptr, - numel, stride, - partial_segment_offset.data_ptr(), - num_of_partial_segments, - grad_weight_per_segment.data_ptr(), - stride_warped); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - - // Finally, we sum all the partial-sums and scatter them - // into `grad_weight`. - const int grid2 = ceil_div(num_of_segments*stride_warped, block); - sum_and_scatter<<>>( - sorted_indices.data_ptr(), - grad_weight.data_ptr(), - stride, - segment_offsets.data_ptr(), - num_of_segments, grad_weight_per_segment.data_ptr(), - partials_per_segment_offset.data_ptr(), + // For numerical stability, the dtype of `grad_weight_per_segment` + // should match `acc_type` + using partial_weight_t = acc_type; + TensorOptions op; + if(grad.dtype() == at::kHalf || grad.dtype() == at::kBFloat16) { + op = grad.options().dtype(at::kFloat); + } else { + op = grad.options(); + } + auto grad_weight_per_segment = at::empty({num_of_partial_segments, stride}, op); + // Compute the sum of each partial-segment and handle bags + if (offset2bag.defined()) { + compute_grad_weight_bags<<>>( + orig_indices.data_ptr(), + grad.data_ptr(), + offset2bag.data_ptr(), + count.defined() ? count.data_ptr() : nullptr, numel, stride, + mode_mean, bag_size.data_ptr(), + per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, + per_sample_weights.defined() ? per_sample_weights.stride(0) : 0, + partial_segment_offset.data_ptr(), + num_of_partial_segments, grad_weight_per_segment.data_ptr(), + stride_warped); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + compute_grad_weight<<>>( + orig_indices.data_ptr(), + grad.data_ptr(), + count.defined() ? count.data_ptr() : nullptr, + numel, stride, + partial_segment_offset.data_ptr(), num_of_partial_segments, - padding_idx, + grad_weight_per_segment.data_ptr(), stride_warped); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + + // Finally, we sum all the partial-sums and scatter them + // into `grad_weight`. 
+ const int grid2 = ceil_div(num_of_segments*stride_warped, block); + sum_and_scatter<<>>( + sorted_indices.data_ptr(), + grad_weight.data_ptr(), + stride, + segment_offsets.data_ptr(), + num_of_segments, grad_weight_per_segment.data_ptr(), + partials_per_segment_offset.data_ptr(), + num_of_partial_segments, + padding_idx, + stride_warped); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); return grad_weight; diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 651261cf6408..a80de4b45138 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -325,18 +325,16 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices, #endif int grid = 1024; AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, weight.scalar_type(), "embedding_bag_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "embedding_bag_cuda", [&] { - AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_cuda", [&] () { - EmbeddingBag_updateOutputKernel<<>>( - indices.data_ptr(), offsets.data_ptr(), - weight.data_ptr(), output.data_ptr(), - offset2bag.data_ptr(), numIndices, numBags, featureSize, - weight.stride(0), weight.stride(1), mode, bag_size.data_ptr(), - mode == MODE_MAX ? max_indices.data_ptr() : NULL, - per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, - per_sample_weights.defined() ? per_sample_weights.stride(0) : 0); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_cuda", [&] () { + EmbeddingBag_updateOutputKernel<<>>( + indices.data_ptr(), offsets.data_ptr(), + weight.data_ptr(), output.data_ptr(), + offset2bag.data_ptr(), numIndices, numBags, featureSize, + weight.stride(0), weight.stride(1), mode, bag_size.data_ptr(), + mode == MODE_MAX ? max_indices.data_ptr() : NULL, + per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, + per_sample_weights.defined() ? per_sample_weights.stride(0) : 0); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); diff --git a/test/test_nn.py b/test/test_nn.py index 652b4d85cbed..a3d18bc3e49c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -12250,7 +12250,6 @@ def test_embedding_bag_non_contiguous_weight(self, device, dtypes): @onlyCUDA - @skipCUDAIfNotRocm @dtypes(torch.int, torch.long) def test_embedding_bag_bfloat16(self, device, dtype): self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.bfloat16, dtype=dtype, test_backward=True) From f2ee8c624149be1c81c279b274fa8a10eb5c37b8 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Wed, 16 Dec 2020 10:07:18 -0800 Subject: [PATCH 16/34] Instantiate PackedConvWeight to avoid linking error (#49442) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49442 When moving Aten/native to app level, symbols from native/quantized may sit in a target away from some of its call sites. As a result, there are linking errors of missing symbols of instantiations of PackedConvWeight::prepack. The solution is to instantiate PackedConvWeight in the same compilation unit. It's similar to D24941989 (https://github.com/pytorch/pytorch/commit/fe6bb2d287ab039127d2443bbb85b6152fc55bc4). 
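For readers less familiar with the pattern, the fix relies on explicit template instantiation. A minimal sketch with a hypothetical `Widget` type (not the actual PyTorch code):

```cpp
// widget.h
template <int kSpatialDim>
struct Widget {
  int pack();  // defined only in widget.cpp
};

// widget.cpp
template <int kSpatialDim>
int Widget<kSpatialDim>::pack() { return kSpatialDim; }

// Explicit instantiations: force the compiler to emit the Widget<2> and
// Widget<3> member functions into this translation unit, so call sites in
// other targets that only include widget.h can still link against them.
template struct Widget<2>;
template struct Widget<3>;
```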
ghstack-source-id: 118676374 Test Plan: CI Reviewed By: dhruvbird Differential Revision: D25576703 fbshipit-source-id: d6e3d11d51d8172ab8487ce44ec8c042889f0f11 --- aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index a4a9e34b8251..1bd0da28f053 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -170,6 +170,8 @@ c10::intrusive_ptr> PackedConvWeight< return ret_ptr; } +template struct PackedConvWeight<2>; +template struct PackedConvWeight<3>; #endif // USE_FBGEMM #ifdef USE_PYTORCH_QNNPACK From c52f1dc365c55507b987e87a62b4898f5b320941 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Wed, 16 Dec 2020 10:15:45 -0800 Subject: [PATCH 17/34] .circleci: downgrade conda-package-handling to 1.6.0 (#49434) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49434 There was a bug that was introduced in conda-package-handling >= 1.6.1 that makes archives above a certain size fail out when attempting to extract see: https://github.com/conda/conda-package-handling/issues/71 coincides with https://github.com/pytorch/builder/pull/611 Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: xuzhao9, janeyx99, samestep Differential Revision: D25573390 Pulled By: seemethere fbshipit-source-id: 82173804f1b30da6e4b401c4949e2ee52065e149 --- .circleci/scripts/binary_linux_test.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index 6be3a0ddefc7..3cdb676c25a4 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -7,6 +7,11 @@ set -eux -o pipefail python_nodot="\$(echo $DESIRED_PYTHON | tr -d m.u)" +# There was a bug that was introduced in conda-package-handling >= 1.6.1 that makes archives +# above a certain size fail out when attempting to extract +# see: https://github.com/conda/conda-package-handling/issues/71 +conda install -y conda-package-handling=1.6.0 + # Set up Python if [[ "$PACKAGE_TYPE" == conda ]]; then retry conda create -qyn testenv python="$DESIRED_PYTHON" From 4b3f05a471a3bfbf55c60a168141d36b80e103bf Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Wed, 16 Dec 2020 10:16:28 -0800 Subject: [PATCH 18/34] [Docs] Updating init_process_group docs to indicate correct rank range (#49131) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49131 Users frequently assume the correct range of ranks is 1 ... `world_size`. This PR udpates the docs to indicate that the correct rank range users should specify is 0 ... `world_size` - 1. Test Plan: Rendering and Building Docs Reviewed By: mrshenli Differential Revision: D25410532 fbshipit-source-id: fe0f17a4369b533dc98543204a38b8558e68497a --- torch/distributed/distributed_c10d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 387da70403b0..caba506eec07 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -382,7 +382,8 @@ def init_process_group(backend, Mutually exclusive with ``store``. world_size (int, optional): Number of processes participating in the job. Required if ``store`` is specified. - rank (int, optional): Rank of the current process. 
+ rank (int, optional): Rank of the current process (it should be a + number between 0 and ``world_size``-1). Required if ``store`` is specified. store(Store, optional): Key/value store accessible to all workers, used to exchange connection/address information. From 09c741868cac31e9214aa107d07435f539a91f0c Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Wed, 16 Dec 2020 10:16:28 -0800 Subject: [PATCH 19/34] [c10d Store] Store Python Docs Fixes (#49130) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49130 The Python Store API docs had some typos, where boolean value were lower case, which is incorrect Python syntax. This diff fixes those typos. Test Plan: Built and Rendered Docs Reviewed By: mrshenli Differential Revision: D25411492 fbshipit-source-id: fdbf1e6b8f81e9589e638286946cad68eb7c9252 --- torch/csrc/distributed/c10d/init.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 0a7daa3a5b94..689c2e835529 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -383,7 +383,7 @@ value with the new supplied ``value``. Example:: >>> import torch.distributed as dist - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> store.set("first_key", "first_value") >>> # Should return "first_value" >>> store.get("first_key") @@ -411,7 +411,7 @@ when initializing the store, before throwing an exception. Example:: >>> import torch.distributed as dist - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> store.set("first_key", "first_value") >>> # Should return "first_value" >>> store.get("first_key") @@ -435,7 +435,7 @@ in an exception. Example:: >>> import torch.distributed as dist >>> # Using TCPStore as an example, other store types can also be used - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> store.add("first_key", 1) >>> store.add("first_key", 6) >>> # Should return 7 @@ -457,12 +457,12 @@ Deletes the key-value pair associated with ``key`` from the store. Returns key (str): The key to be deleted from the store Returns: - `true` if ``key`` was deleted, otherwise `false`. + `True` if ``key`` was deleted, otherwise `False`. Example:: >>> import torch.distributed as dist >>> # Using TCPStore as an example, HashStore can also be used - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> store.set("first_key") >>> # This should return true >>> store.delete_key("first_key") @@ -489,7 +489,7 @@ the workers using the store. Example:: >>> import torch.distributed as dist >>> # Using TCPStore as an example, HashStore can also be used - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> store.set("first_key", "first_value") >>> # This should return 2 >>> store.num_keys() @@ -508,7 +508,7 @@ Sets the store's default timeout. 
This timeout is used during initialization and Example:: >>> import torch.distributed as dist >>> # Using TCPStore as an example, other store types can also be used - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> store.set_timeout(timedelta(seconds=10)) >>> # This will throw an exception after 10 seconds >>> store.wait(["bad_key"]) @@ -530,7 +530,7 @@ will throw an exception. Example:: >>> import torch.distributed as dist >>> # Using TCPStore as an example, other store types can also be used - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> # This will throw an exception after 30 seconds >>> store.wait(["bad_key"]) )") @@ -553,7 +553,7 @@ if the keys have not been set by the supplied ``timeout``. Example:: >>> import torch.distributed as dist >>> # Using TCPStore as an example, other store types can also be used - >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) >>> # This will throw an exception after 10 seconds >>> store.wait(["bad_key"], timedelta(seconds=10)) )"); @@ -618,8 +618,8 @@ pair, :meth:`~torch.distributed.store.get` to retrieve a key-value pair, etc. Example:: >>> import torch.distributed as dist - >>> server_store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) - >>> client_store = dist.TCPStore("127.0.0.1", 0, false) + >>> server_store = dist.TCPStore("127.0.0.1", 0, True, timedelta(seconds=30)) + >>> client_store = dist.TCPStore("127.0.0.1", 0, False) >>> # Use any of the store methods from either the client or server after initialization >>> server_store.set("first_key", "first_value") >>> client_store.get("first_key") From bbc71435b7bbaee310f488be766b1a37bb9a08ca Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Wed, 16 Dec 2020 10:18:54 -0800 Subject: [PATCH 20/34] Add sinc operator (#48740) Summary: Implements the sinc operator. 
See https://numpy.org/doc/stable/reference/generated/numpy.sinc.html ![image](https://user-images.githubusercontent.com/13428986/101653855-cdffa080-3a0d-11eb-8426-ecc81c152ebd.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/48740 Reviewed By: izdeby Differential Revision: D25564477 Pulled By: soulitzer fbshipit-source-id: 13f36a2b84dadfb4fd1442a2a40a3a3246cbaecb --- aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/native/UnaryOps.cpp | 5 ++++ aten/src/ATen/native/UnaryOps.h | 1 + aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 16 ++++++++++ aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 14 +++++++++ aten/src/ATen/native/native_functions.yaml | 16 ++++++++++ docs/source/tensors.rst | 2 ++ docs/source/torch.rst | 1 + tools/autograd/derivatives.yaml | 3 ++ tools/autograd/gen_variable_type.py | 2 +- torch/_tensor_docs.py | 14 +++++++++ torch/_torch_docs.py | 28 ++++++++++++++++++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 29 +++++++++++++++++++ 14 files changed, 132 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 92952799ec49..7f0b01d95049 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -622,6 +622,7 @@ _(aten, signbit) \ _(aten, silu) \ _(aten, sgn) \ _(aten, sin) \ +_(aten, sinc) \ _(aten, sinh) \ _(aten, size) \ _(aten, sizes) \ diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 9c91821aed80..ab58a8f277a5 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -343,6 +343,10 @@ Tensor& cos_out(Tensor& result, const Tensor& self) { return unary_op_impl_float Tensor cos(const Tensor& self) { return unary_op_impl_float(self, cos_stub); } Tensor& cos_(Tensor& self) { return unary_op_impl_(self, at::cos_out); } +Tensor& sinc_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, sinc_stub); } +Tensor sinc(const Tensor& self) { return unary_op_impl_float(self, sinc_stub); } +Tensor& sinc_(Tensor& self) { return unary_op_impl_(self, at::sinc_out); } + Tensor& sinh_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, sinh_stub); } Tensor sinh(const Tensor& self) { return unary_op_impl_float(self, sinh_stub); } Tensor& sinh_(Tensor& self) { return unary_op_impl_(self, at::sinh_out); } @@ -717,6 +721,7 @@ DEFINE_DISPATCH(sign_stub); DEFINE_DISPATCH(signbit_stub); DEFINE_DISPATCH(sgn_stub); DEFINE_DISPATCH(sin_stub); +DEFINE_DISPATCH(sinc_stub); DEFINE_DISPATCH(sinh_stub); DEFINE_DISPATCH(sqrt_stub); DEFINE_DISPATCH(tan_stub); diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index a6db47f17153..f732cb9a0141 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -55,6 +55,7 @@ DECLARE_DISPATCH(unary_fn, sign_stub); DECLARE_DISPATCH(unary_fn, signbit_stub); DECLARE_DISPATCH(unary_fn, sgn_stub); DECLARE_DISPATCH(unary_fn, sin_stub); +DECLARE_DISPATCH(unary_fn, sinc_stub); DECLARE_DISPATCH(unary_fn, sinh_stub); DECLARE_DISPATCH(unary_fn, sqrt_stub); DECLARE_DISPATCH(unary_fn, tan_stub); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index f7c4f9c34613..8aa5957f4b7e 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -302,6 +302,21 @@ static void sgn_kernel(TensorIterator& iter){ }); } +static 
void sinc_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.common_dtype(), "sinc_cpu", [&]() { + cpu_kernel( + iter, + [=](scalar_t a) -> scalar_t { + if (a == scalar_t(0)) { + return scalar_t(1); + } else { + scalar_t product = scalar_t(M_PI) * a; + return std::sin(product) / product; + } + }); + }); +} + static void sinh_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() { cpu_kernel_vec( @@ -677,6 +692,7 @@ REGISTER_DISPATCH(neg_stub, &neg_kernel); REGISTER_DISPATCH(sign_stub, &sign_kernel); REGISTER_DISPATCH(signbit_stub, &signbit_kernel); REGISTER_DISPATCH(sgn_stub, &sgn_kernel); +REGISTER_DISPATCH(sinc_stub, &sinc_kernel); REGISTER_DISPATCH(sinh_stub, &sinh_kernel); REGISTER_DISPATCH(cosh_stub, &cosh_kernel); REGISTER_DISPATCH(acosh_stub, &acosh_kernel); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 4d676181be79..059da1f49f75 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -103,6 +103,19 @@ void sigmoid_kernel_cuda(TensorIterator& iter) { }); } +void sinc_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "sinc_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + if (a == scalar_t(0)) { + return scalar_t(1); + } else { + scalar_t product = scalar_t(M_PI) * a; + return std::sin(product) / product; + } + }); + }); +} + void logit_kernel_cuda(TensorIterator& iter, Scalar eps_scalar) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, @@ -245,6 +258,7 @@ REGISTER_DISPATCH(i0_stub, &i0_kernel_cuda); REGISTER_DISPATCH(rsqrt_stub, &rsqrt_kernel_cuda); REGISTER_DISPATCH(sqrt_stub, &sqrt_kernel_cuda); REGISTER_DISPATCH(sigmoid_stub, &sigmoid_kernel_cuda); +REGISTER_DISPATCH(sinc_stub, &sinc_kernel_cuda); REGISTER_DISPATCH(logit_stub, &logit_kernel_cuda); REGISTER_DISPATCH(erf_stub, &erf_kernel_cuda); REGISTER_DISPATCH(erfc_stub, &erfc_kernel_cuda); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2bbde22c9389..7c24b79b2758 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3720,6 +3720,22 @@ dispatch: CPU, CUDA: sin_out +- func: sinc(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + DefaultBackend: sinc + +- func: sinc_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + dispatch: + DefaultBackend: sinc_ + +- func: sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinc_out + - func: sinh(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 1c3e02b5c418..5f3ffe43b8d0 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -560,6 +560,8 @@ view of a storage and defines numeric operations on it. .. automethod:: sgn_ .. automethod:: sin .. automethod:: sin_ + .. automethod:: sinc + .. automethod:: sinc_ .. automethod:: sinh .. automethod:: sinh_ .. 
automethod:: asinh diff --git a/docs/source/torch.rst b/docs/source/torch.rst index bc2e9c237a1b..ed5c59a26c02 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -340,6 +340,7 @@ Pointwise Ops sign signbit sin + sinc sinh sqrt square diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 8791dfa7b095..ffd1ad347751 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -961,6 +961,9 @@ - name: sin(Tensor self) -> Tensor self: grad * self.cos().conj() +- name: sinc(Tensor self) -> Tensor + self: grad * ((M_PI * self * (M_PI * self).cos() - (M_PI * self).sin()) / (M_PI * self * self)).conj() + - name: sinh(Tensor self) -> Tensor self: grad * self.cosh().conj() diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index a17e222f8cf1..10d67c668a32 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -70,7 +70,7 @@ 'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose', 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_', - 'ne_', 'add', '__radd__', 'sum', '_conj', 'sin', 'cos', 'mul', 'sinh', + 'ne_', 'add', '__radd__', 'sum', '_conj', 'sin', 'cos', 'mul', 'sinc', 'sinh', 'cosh', '__rmul__', 'sgn', 'asin', 'acos', 'sub', 'div', 'cat', 'view_as_complex', 'neg', 'complex', 'select', '_s_where', 'as_strided', 'slice', 'constant_pad_nd', 'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward', diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 1b1c772c37e9..16284aeffb15 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3268,6 +3268,20 @@ def callable(a, b) -> number In-place version of :meth:`~Tensor.sin` """) +add_docstr_all('sinc', + r""" +sinc() -> Tensor + +See :func:`torch.sinc` +""") + +add_docstr_all('sinc_', + r""" +sinc_() -> Tensor + +In-place version of :meth:`~Tensor.sinc` +""") + add_docstr_all('sinh', r""" sinh() -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 1c7ebadceb0d..61b06fd42c64 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -7552,6 +7552,34 @@ def merge_dicts(*dicts): tensor([-0.5194, 0.1343, -0.4032, -0.2711]) """.format(**common_args)) +add_docstr(torch.sinc, + r""" +sinc(input, *, out=None) -> Tensor + +Computes the normalized sinc of :attr:`input.` + +.. 
math:: + \text{out}_{i} = + \begin{cases} + 1, & \text{if}\ \text{out}_{i}=0 \\ + \sin(\pi \text{input}_{i}) / (\pi \text{input}_{i}), & \text{otherwise} + \end{cases} +""" + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.2252, -0.2948, 1.0267, -1.1566]) + >>> torch.sinc(a) + tensor([ 0.9186, 0.8631, -0.0259, -0.1300]) +""".format(**common_args)) + add_docstr(torch.sinh, r""" sinh(input, *, out=None) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index 2af6e36ea914..79016c9a0e9f 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -771,6 +771,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.signbit: lambda input, out=None: -1, torch.sgn: lambda input, out=None: -1, torch.sin: lambda input, out=None: -1, + torch.sinc: lambda input, out=None: -1, torch.sinh: lambda input, out=None: -1, torch.slogdet: lambda input: -1, torch.smm: lambda input, mat2: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index b88dcaaccb33..6c18c0cbaa6d 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -281,6 +281,15 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad): low=None, high=None, requires_grad=False))),) +def np_sinc_with_fp16_as_fp32(x): + # Wraps numpy's sinc function so that fp16 values are promoted to fp32 + # before sinc is invoked. Context: numpy's sinc returns NaN when evaluated + # at 0 for fp16. + if x.dtype == np.float16: + return np.sinc(x.astype(np.float32)) + else: + return np.sinc(x) + def np_unary_ufunc_integer_promotion_wrapper(fn): # Wrapper that passes PyTorch's default scalar # type as an argument to the wrapped NumPy @@ -702,6 +711,26 @@ def sample_inputs(self, device, dtype, requires_grad=False): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', dtypes=[torch.float], active_if=TEST_WITH_ROCM), )), + UnaryUfuncInfo('sinc', + ref=np_sinc_with_fp16_as_fp32, + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half), + skip_bfloat16_grad=True, + handles_large_floats=False, + handles_complex_extremals=False, + promotes_integers_to_float=True, + decorators=(precisionOverride({torch.bfloat16: 1e-2, + torch.float16: 1e-2}),), + skips=( + # Reference: https://github.com/pytorch/pytorch/issues/49133 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + dtypes=[torch.cfloat]), + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS), + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + dtypes=[torch.float], active_if=TEST_WITH_ROCM), + )), UnaryUfuncInfo('sinh', ref=np_unary_ufunc_integer_promotion_wrapper(np.sinh), dtypesIfCPU=all_types_and_complex_and(torch.bool), From 45b33c83f1434f4d1f4ec4e4499dbefd1d67a050 Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Wed, 16 Dec 2020 10:23:50 -0800 Subject: [PATCH 21/34] Revert "Revert D24923679: Fixed einsum compatibility/performance issues (#46398)" (#49189) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49189 This reverts commit d307601365c3b848072b8b8381208aedc1a0aca5 and fixes the bug with diagonals and ellipsis combined. 
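For illustration, a small sketch of the kind of pattern the fix covers, an ellipsis combined with a repeated (diagonal) subscript; this is not the exact reproducer from the issue:

```python
import torch

x = torch.randn(2, 3, 5, 5)
# The repeated subscript 'i' takes a diagonal over the last two dimensions,
# while '...' carries the leading batch dimensions through unchanged.
d = torch.einsum('...ii->...i', x)
print(d.shape)  # torch.Size([2, 3, 5])
```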
Test Plan: Imported from OSS Reviewed By: glaringlee Differential Revision: D25540722 Pulled By: heitorschueroff fbshipit-source-id: 86d0c9a7dcfda600b546457dad102af2ff33e353 --- aten/src/ATen/native/Linear.cpp | 503 +++++++++++++++++++------------- test/test_linalg.py | 270 ++++++++++++----- torch/functional.py | 171 ++++++----- 3 files changed, 596 insertions(+), 348 deletions(-) diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index c9e03aaa3b6b..b9a9cd5e5ad0 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -136,241 +136,336 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra return result; } -Tensor einsum(std::string eqn, TensorList tensors) { - constexpr size_t number_of_letters = 26; - std::string in_eqn; - size_t pos; - // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis. - // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter - // mapped to an index and the ellipsis ('...') being mapped to a number of consequtive indices. - // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that - // the letter has not been assigned an index yet (because it has not been seen). - // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices). - // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet. - // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below. - - std::array letter_mapping; // map letter to internal (numerical) label - letter_mapping.fill(-1); - int64_t num_ell_idxes = -1; - int64_t first_ell_idx = 0; - - // The internal representation of the left hand side fo the equation (with ellipsis expanded) is stored in input_op_idxes. - // For each operand, we have a vector mapping each dimension to an internal index. - // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and - // of the last occurrence of each index. - std::vector> input_op_idxes; // the parsed operand indices - std::array num_letter_occurrences; // number of occurrence in the equation of this letter - num_letter_occurrences.fill(0); - std::vector last_idx_occurrence; // the last operator (left to right) using this index - - if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side - in_eqn = eqn.substr(0, pos); - } else { - in_eqn = eqn; - } - // remove spaces for einsum compatibility (#9929) - in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); - - // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index - int64_t operand = 0; - std::stringstream eqn_stream(in_eqn); - std::string term; - int64_t num_total_idxes = 0; - while (! eqn_stream.eof()) { - std::getline(eqn_stream, term, ','); // term = string with indices of current term - TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension - - int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' 
- // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions - int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; - int64_t dims_in_term = 0; // dimensions we have seen - std::vector current_op_idxes; // mapping of operand dimensions to indices for current term - for (auto &c : term) { // c = character with a single letter or '.' - if (c == '.') { - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in term ", operand, " of the equation"); - if (ell_char_count == 3) { // this completes the ellipsis - if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size - first_ell_idx = num_total_idxes; - num_ell_idxes = candidate_num_ell_idxes; - num_total_idxes += num_ell_idxes; - } - else { // we have seen an ellipsis before, so we check compatibility - TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, - "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); - } - for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices - current_op_idxes.push_back(first_ell_idx + i); - last_idx_occurrence.push_back(operand); - } - dims_in_term += num_ell_idxes; // keep track of dimensions - } - } else { // a letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; // letter_num = position in letter_mapping - if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping - letter_mapping[letter_num] = num_total_idxes; - num_total_idxes++; - last_idx_occurrence.push_back(operand); - } else { // letter we have already seen - last_idx_occurrence[letter_mapping[letter_num]] = operand; - } - num_letter_occurrences[letter_num]++; - current_op_idxes.push_back(letter_mapping[letter_num]); - dims_in_term++; - } +// There are roughly three parts to compute einsum: +// 1. Parse equation to extract the labels for each input operand and output +// 2. Unsqueeze missing dimensions from input operands and permute to align them +// 3. Compute result by multiplying input operands and summing contraction +// dimensions We do the last part by reducing to bmm. +Tensor einsum(std::string equation, TensorList operands) { + TORCH_CHECK(!operands.empty(), "einsum() must provide at least one operand"); + checkDeviceType("einsum()", operands, operands[0].device().type()); + + // Code used to identify ELLIPSIS ("...") + constexpr int ELLIPSIS = '.'; + + // Find arrow (->) to split equation into lhs and rhs + const auto arrow_pos = equation.find("->"); + const auto lhs = equation.substr(0, arrow_pos); + + const auto num_ops = operands.size(); + + // Convert labels for input operands into an index in [0, 25] and store + // them in op_labels for each operand along with ELLIPSIS if present. + std::vector> op_labels(num_ops); + bool found_ell = false; + std::size_t curr_op = 0; + for (auto i = decltype(lhs.length()){0}; i < lhs.length(); ++i) { + switch (lhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // Only one ellipsis per operand can be given + !found_ell, + "einsum() found \'.\' for operand ", + curr_op, + " for which an ellipsis was already found"); + TORCH_CHECK( + // Ensure it's a valid ellipsis + i + 2 < lhs.length() && lhs[++i] == '.' 
&& lhs[++i] == '.', + "einsum() found \'.\' for operand ", + curr_op, + " that is not part of any ellipsis"); + op_labels[curr_op].push_back(ELLIPSIS); + found_ell = true; + break; + + case ',': + // Move onto next operand + ++curr_op; + TORCH_CHECK( + curr_op < num_ops, + "einsum() fewer operands were provided than specified in the equation"); + found_ell = false; + break; + + default: + // Parse label + TORCH_CHECK( + lhs[i] >= 'a' && lhs[i] <= 'z', + "einsum() operand subscript must be in range [a, z] but found ", + lhs[i], + " for operand ", + curr_op); + // Convert label to index in [0, 25] and store + op_labels[curr_op].push_back(lhs[i] - 'a'); } - TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); - input_op_idxes.push_back(std::move(current_op_idxes)); - operand++; } - // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. - TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); - - // the following parses or infers output (right hand side) - // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the output indices. -1 means that the index has not been assigned a dimension yet - std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions - int64_t num_output_dims = 0; - if (pos != std::string::npos) { // parse the user provided right hand side - int64_t ell_char_count = 0; - for (auto &c : eqn.substr(pos+2)) { - if (c == '.') { // '.' as part of ellipsis - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); - if (ell_char_count == 3) { // ellipsis complete - TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; - } - } - } else if (! isspace(c)) { // letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; - TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, " occurs twice in output"); - idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + curr_op == num_ops - 1, + "einsum() more operands were provided than specified in the equation"); + + // Labels must be within [a, z]. + constexpr int TOTAL_LABELS = 'z' - 'a' + 1; + std::vector label_count(TOTAL_LABELS, 0); + + // The maximum number of dimensions covered by any ellipsis, needed when + // unsqueezing missing dimensions from operands to permute and broadcast + int64_t ell_num_dim = 0; + + // Compute label frequency and number of dimensions covered by ellipsis + // We do this after parsing labels to make it more readable and simpler + // to compute the number of dimensions covered by ellipsis. 
+ for (auto i = decltype(num_ops){0}; i < num_ops; ++i) { + const auto operand = operands[i]; + const auto labels = op_labels[i]; + const int64_t ndims = operand.dim(); + int64_t nlabels = labels.size(); + bool has_ellipsis = false; + + for (const auto& label : labels) { + if (label == ELLIPSIS) { + --nlabels; + has_ellipsis = true; + ell_num_dim = std::max(ell_num_dim, ndims - nlabels); + } else { + ++label_count[label]; } } - } else { // create an inferred right hand side - // the ellipsis (if in the lhs) comes first - if (num_ell_idxes >= 0) { - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + has_ellipsis ? nlabels <= ndims : nlabels == ndims, + "einsum() the number of subscripts in the equation (", + nlabels, + has_ellipsis ? ") is more than the number of dimensions (" + : ") does not match the number of dimensions (", + ndims, + ") for operand ", + i, + has_ellipsis ? "" : " and no ellipsis was given"); + } + + // We want to align the dimensions of every input tensor to have + // shape out_dims + sum_dims. For this, we create a mapping of label + // to index into the permuted shape. + std::vector label_perm_index(TOTAL_LABELS, -1); + + // Current index in the permuted shape + int64_t perm_index = 0; + + // Start index of ellipsis dimensions in the permuted shape + int64_t ell_index = 0; + found_ell = false; + + if (arrow_pos == std::string::npos) { + // Implicit output is ellipsis (...) + labels seen only once + perm_index = ell_num_dim; + found_ell = true; + for (int label = 0; label < TOTAL_LABELS; ++label) { + if (label_count[label] == 1) { + label_perm_index[label] = perm_index++; } } - // then the indices that occur exactly once in alphabetic order - for (size_t idx = 0; idx < number_of_letters; idx++) { - if (num_letter_occurrences[idx] == 1) { - idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; - num_output_dims++; + } else { + // Parse explicit output + const auto rhs = equation.substr(arrow_pos + 2); + for (auto i = decltype(rhs.length()){0}; i < rhs.length(); ++i) { + switch (rhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // There can only be one ellipsis in the output + !found_ell, + "einsum() found \'.\' for output but an ellipsis (...) was already found"); + TORCH_CHECK( + // Ensure ellipsis is correct + i + 2 < rhs.length() && rhs[++i] == '.' && rhs[++i] == '.', + "einsum() found \'.\' for output that is not part of any ellipsis (...)"); + ell_index = perm_index; + perm_index += ell_num_dim; + found_ell = true; + break; + + default: + TORCH_CHECK( + // Labels must be in [a, z] + rhs[i] >= 'a' && rhs[i] <= 'z', + "einsum() subscripts must be in range [a, z] but found ", + rhs[i], + " for the output"); + const auto label = rhs[i] - 'a'; + TORCH_CHECK( + // Ensure label appeared at least once for some input operand and at + // most once for the output + label_count[label] > 0 && label_perm_index[label] == -1, + "einsum() output subscript ", + rhs[i], + label_perm_index[label] > -1 + ? 
" appears more than once in the output" + : " does not appear in the equation for any input operand"); + label_perm_index[label] = perm_index++; } } } - // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the non-output indices - those that are eventually summed over - int64_t position = num_output_dims; - for (int64_t i = 0; i < num_total_idxes; i++) { - if (idxes_to_preprocessed_dims[i]==-1) { - idxes_to_preprocessed_dims[i] = position; - position++; + + // Save output size before adding contraction dims (dims to sum out) + const int64_t out_size = perm_index; + + // If ellipsis is not part of the output, add to contraction dimensions + if (!found_ell) { + ell_index = perm_index; + perm_index += ell_num_dim; + } + + // Add contraction labels (labels not present in output) + for (int label = 0; label < TOTAL_LABELS; ++label) { + if (label_count[label] > 0 && label_perm_index[label] == -1) { + label_perm_index[label] = perm_index++; } } - // we now "homogenize the dimensions", i.e. - // - take diagonals for duplicated indices - // - permute the dimensions to match the order given by idxes_to_preprocessed_dims - // - unsqueeze to create all dimensions for each index in each tensor where they are missing - // we also check that sizes match - // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) - std::vector preprocessed_operands; - std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet - for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { - auto preprocessed_op = tensors[op]; - std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear - std::vector& current_op_input_idxes = input_op_idxes[op]; - int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input - for (size_t i = 0; i < current_op_input_idxes.size(); i++) { - auto idx = current_op_input_idxes[i]; - auto dim_out = idxes_to_preprocessed_dims[idx]; - if (idx_to_dim[dim_out] == -1) { // first appearance - idx_to_dim[dim_out] = dim; - if (size_of_dims[idx] == -1) { // keep track of sizes - size_of_dims[idx] = preprocessed_op.size(dim); - } - else { - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + // Here we unsqueeze missing dimensions to make all operands have the same + // number of dimensions. We take diagonals for repeated labels within the + // same operand. Finally we permute the operands to align dimensions as + // per the perm_out_index we computed above. 
+ std::vector permuted_operands; + for (auto i = decltype(num_ops){0}; i < num_ops; ++i) { + std::vector perm_shape(perm_index, -1); + std::vector label_dim(TOTAL_LABELS, -1); + Tensor operand = operands[i]; + const auto labels = op_labels[i]; + const auto original_sizes = operand.sizes(); + + std::size_t j = 0; + for (const auto& label : labels) { + if (label == ELLIPSIS) { + // Add missing dimensions covered by the ellipsis + const int64_t num_missing_dim = + ell_num_dim - (original_sizes.size() - labels.size() + 1); + for (int64_t k = 0; k < num_missing_dim; ++k) { + operand = operand.unsqueeze(j); } - dim++; - } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); - preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); - // diagonal moves the diagonal dimension to the back - // now we permute the last dim back to idx_to_dim[dim_out] - std::vector perm(preprocessed_op.dim(), 0); - for (int64_t d = 0; d < preprocessed_op.dim(); d++) { - if (d == idx_to_dim[dim_out]) { - perm[d] = preprocessed_op.dim() - 1; - } else { - perm[d] = d - (d > idx_to_dim[dim_out]); - } + for (int64_t k = 0; k < ell_num_dim; ++k) { + perm_shape[ell_index + k] = j++; } - preprocessed_op = preprocessed_op.permute(perm); + } else if (label_dim[label] != -1) { + // Repeated label, take diagonal + const auto dim = label_dim[label]; + TORCH_CHECK( + operand.size(j) == operand.size(dim), + "einsum() subscript ", + char(label + 'a'), + " is repeated for operand ", + i, + " but the sizes don't match, ", + operand.size(j), + " != ", + operand.size(dim)); + operand = operand.diagonal(0, dim, j).movedim(-1, dim); + } else { + // Lookup output index for label + label_dim[label] = j; + perm_shape[label_perm_index[label]] = j++; } } - // now we permute the dimensions in the right order - std::vector permutation; // permutation for this tensor - for (auto &d : idx_to_dim) { - if (d > -1) { - permutation.push_back(d); + + // Add dimensions for missing labels + for (int64_t& index : perm_shape) { + if (index == -1) { + operand = operand.unsqueeze(-1); + index = j++; } } - preprocessed_op = preprocessed_op.permute(permutation); - // finally, we insert dimensions for idxes not in the operand - for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { - if (idx_to_dim[dim] == -1) { - preprocessed_op = preprocessed_op.unsqueeze(dim); + + permuted_operands.push_back(operand.permute(perm_shape)); + } + + // Check if operands broadcast and keep track of last operand with + // dimension size != 1 for optimizing reductions + std::vector dim_last_op(perm_index, 0); + bool has_zero_size_dim = false; + for (int64_t dim = 0; dim < perm_index; ++dim) { + auto broadcast_size = permuted_operands[0].size(dim); + for (auto i = decltype(num_ops){1}; i < num_ops; ++i) { + const auto dim_size = permuted_operands[i].size(dim); + if (broadcast_size != dim_size && broadcast_size != 1 && dim_size != 1) { + std::ostringstream msg; + msg << "einsum() operands do not broadcast with remapped shapes [original->remapped]:"; + for (auto j = decltype(num_ops){0}; j < num_ops; ++j) { + msg << " " << operands[j].sizes() << "->" + << permuted_operands[j].sizes(); + } + TORCH_CHECK(false, msg.str()); + } + if (dim_size != 1) { + broadcast_size = dim_size; + dim_last_op[dim] = i; } } + has_zero_size_dim |= broadcast_size == 
0; + } + + // Compute result + Tensor result = permuted_operands[0]; - preprocessed_operands.push_back(std::move(preprocessed_op)); + // Fast path for when an operand has zero sized dim + if (has_zero_size_dim) { + std::vector out_shape(out_size); + for (int64_t i = 0; i < out_size; ++i) { + out_shape[i] = permuted_operands[dim_last_op[i]].size(i); + } + return at::zeros(out_shape, result.options()); } - // now we reduce the indices from left to right - // numpy allows to optimize the path using various - // algorithms (see eigen_path in numpy docs) - // we start with the leftmost operator and reduce indices that - // appear only there - Tensor result = std::move(preprocessed_operands[0]); - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == 0) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - result = result.sum(idxes_to_preprocessed_dims[idx], true); + // Sum out or squeeze dimensions that are size 1 for all later operands + int64_t dim = out_size; + for (int64_t i = dim; i < perm_index; ++i, ++dim) { + if (dim_last_op[i] == 0) { + if (result.size(dim) == 1) { + result = result.squeeze(dim--); + } else { + result = result.sum(dim--); + } } } - // now we process each tensor using sumproduct_pair - for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { + for (auto i = decltype(num_ops){1}; i < num_ops; ++i) { + Tensor operand = permuted_operands[i]; std::vector sum_dims; - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == i) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - sum_dims.push_back(idxes_to_preprocessed_dims[idx]); + + // Sum out or squeeze dimensions that are size 1 for all later operands + dim = out_size; + for (int64_t j = dim; j < perm_index; ++j, ++dim) { + if (dim_last_op[j] < i) { + operand = operand.squeeze(dim); + --dim; + } else if (dim_last_op[j] == i) { + if (result.size(dim) == 1) { + operand = operand.sum(dim); + result = result.squeeze(dim); + --dim; + } else { + sum_dims.push_back(dim); + } } } - result = at::native::sumproduct_pair(result, std::move(preprocessed_operands[i]), sum_dims, true); - } - // finally, we squeeze out all non-result dimensions - auto sizes = result.sizes().vec(); - for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--) { - sizes.erase(sizes.begin() + dim); + + // Multiply tensors and sum out dimensions in sum_dims + if (sum_dims.empty()) { + result = result.mul(operand); + } else if (sum_dims.size() == result.sizes().size()) { + result = result.flatten().dot(operand.flatten()); + } else { + result = sumproduct_pair(result, operand, sum_dims, false); + } } - result = result.view(sizes); return result; } diff --git a/test/test_linalg.py b/test/test_linalg.py index 5e7e0c273dcf..123e75e85de3 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2620,6 +2620,202 @@ def test_old_matrix_rank(self, device, dtype): self.assertEqual(torch.matrix_rank(aaT, True), np.linalg.matrix_rank(aaT.cpu().numpy(), True)) self.assertEqual(torch.matrix_rank(aaT, 0.01, True), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01, True)) + @dtypes(torch.double, torch.cdouble) + def test_einsum(self, device, dtype): + def check(equation, *operands): + ref = np.einsum(equation, *[operand.cpu().numpy() for operand in operands]) + res = torch.einsum(equation, operands) + self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) + + # Check autograd + ops = [op.detach().requires_grad_() for op in operands] + 
self.assertTrue(torch.autograd.gradcheck(lambda *ops: torch.einsum(equation, ops), ops)) + for op in ops: + self.assertTrue(op._version == 0) + + # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f + x = torch.rand(5, device=device, dtype=dtype) + y = torch.rand(7, device=device, dtype=dtype) + A = torch.randn(3, 5, device=device, dtype=dtype) + B = torch.randn(2, 5, device=device, dtype=dtype) + C = torch.randn(2, 3, 5, device=device, dtype=dtype) + D = torch.randn(2, 5, 7, device=device, dtype=dtype) + E = torch.randn(7, 9, device=device, dtype=dtype) + F = torch.randn(2, 3, 3, 5, device=device, dtype=dtype) + G = torch.randn(5, 4, 6, device=device, dtype=dtype) + H = torch.randn(4, 4, device=device, dtype=dtype) + I = torch.rand(2, 3, 2, device=device, dtype=dtype) + + # Note: gradcheck fails if the same input is given multiple times which is why the + # calls to clone below. (see https://github.com/pytorch/pytorch/issues/9282) + + # Vector operations + check('i->', x) # sum + check('i,i->', x, x.clone()) # dot + check('i,i->i', x, x.clone()) # vector element-wisem mul + check('i,j->ij', x, y) # outer + + # Matrix operations + check("ij->ji", A) # transpose + check("ij->j", A) # row sum + check("ij->i", A) # col sum + check("ij,ij->ij", A, A.clone()) # matrix element-wise mul + check("ij,j->i", A, x) # matrix vector multiplication + check("ij,kj->ik", A, B) # matmul + check("ij,ab->ijab", A, E) # matrix outer product + + # Tensor operations + check("aij,ajk->aik", C, D) # batch matmul + check("ijk,jk->i", C, A) # tensor matrix contraction + check("aij,jk->aik", D, E) # tensor matrix contraction + check("abcd,dfg->abcfg", F, G) # tensor tensor contraction + check("ijk,jk->ik", C, A) # tensor matrix contraction with double indices + check("ijk,jk->ij", C, A) # tensor matrix contraction with double indices + check("ijk,ik->j", C, B) # non contiguous + check("ijk,ik->jk", C, B) # non contiguous with double indices + + # Test diagonals + check("ii", H) # trace + check("ii->i", H) # diagonal + check('iji->j', I) # non-contiguous trace + check('ngrg...->nrg...', torch.rand((2, 1, 3, 1, 4), device=device, dtype=dtype)) + + # Test ellipsis + check("i...->...", H) + check("ki,...k->i...", A.t(), B) + check("k...,jk->...", A.t(), B) + check('...ik, ...j -> ...ij', C, x) + check('bik,k...j->i...j', C, torch.rand(5, 3, device=device, dtype=dtype)) + check('i...j, ij... -> ...ij', C, torch.rand(2, 5, 2, 3, device=device, dtype=dtype)) + + # torch.bilinear with discontiguous tensors + l = torch.randn(10, 5, device=device, dtype=dtype).transpose(0, 1) + r = torch.randn(20, 5, device=device, dtype=dtype).transpose(0, 1) + w = torch.randn(15, 10, 20, device=device, dtype=dtype) + check("bn,anm,bm->ba", l, w, r) + + # with strided tensors + check("bn,anm,bm->ba", l[:, ::2], w[:, ::2, ::2], r[:, ::2]) + + @dtypes(torch.double, torch.cdouble) + def test_einsum_random(self, device, dtype): + def check(equation, *operands): + ref = np.einsum(equation, *[op.cpu().numpy() for op in operands]) + res = torch.einsum(equation, operands) + self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) + + for _ in range(20): + # Create a random number of input operands, each with a random + # number of dimensions randomly labeled. 
+ op_labels = [] + valid_labels = set() + for _ in range(random.randint(1, 3)): + labels = np.random.randint(0, 10, random.randint(1, 5)) + op_labels.append(labels) + valid_labels.update(labels) + label_size = np.random.randint(1, 5, 10) + ell_sizes = np.random.randint(1, 5, 3) + + # Build equation and tensors from input operand labels. + ops = [] + equation = '' + for labels in op_labels: + sizes = [label_size[label] for label in labels] + labels = [chr(ord('a') + label) for label in labels] + + # Add ellipsis dimensions at random + ell_num_dim = random.randint(0, 3) + if ell_num_dim > 0: + ell_index = random.randint(0, len(labels)) + sizes[ell_index:ell_index] = ell_sizes[-ell_num_dim:] + labels.insert(ell_index, "...") + + equation += ''.join(labels) + ',' + ops.append(torch.rand(sizes, device=device, dtype=dtype)) + equation = equation[:-1] + + # Test with implicit output + check(equation, *ops) + + # Randomly choose some labels to be part of the output + out_labels = np.unique(np.random.choice(list(valid_labels), random.randint(1, len(valid_labels)))) + out_labels = [chr(ord('a') + label) for label in out_labels] + ell_index = random.randint(0, len(out_labels)) + out_labels.insert(ell_index, '...') + equation += '->' + ''.join(out_labels) + + # Randomly test the output + check(equation, *ops) + + def test_einsum_corner_cases(self, device): + def check(equation, *operands, expected_output): + tensors = [torch.tensor(operand, dtype=torch.float32, device=device) if not isinstance(operand, tuple) + else torch.rand(operand, dtype=torch.float32, device=device) for operand in operands] + output = torch.einsum(equation, tensors) + self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) + + # Test equation variantions + check(' ', 1, expected_output=1) + check(' -> ', 1, expected_output=1) + check(' , ', 2, 2, expected_output=4) + check(' , , ', 2, 2, 2, expected_output=8) + check(' , -> ', 2, 2, expected_output=4) + check(' i ', [1], expected_output=[1]) + check(' i -> ', [1], expected_output=1) + check(' i -> i ', [1], expected_output=[1]) + check(' i , i ', [2], [2], expected_output=4) + check(' i , i -> i ', [2], [2], expected_output=[4]) + + # Test tensors with 0 size dimensions + check('i', [], expected_output=[]) + check(' i j -> j', [[], []], expected_output=[]) + check('ij->i', [[], []], expected_output=[0., 0.]) + check(' i j k , k -> i j ', (3, 0, 6), (6,), expected_output=[[], [], []]) + + # Test broadcasting + check('i,j', [2], [1, 2], expected_output=[[2, 4]]) + check('i,ij->ij', [1, 2], [[1, 2, 3], [2, 3, 4]], expected_output=[[1, 2, 3], [4, 6, 8]]) + + # Test ellipsis broadcasting + check('...', 1, expected_output=1) + check('...->', 1, expected_output=1) + check('...->...', 1, expected_output=1) + check('...', [1], expected_output=[1]) + check('...->', [1], expected_output=1) + check('i...->i', [1], expected_output=[1]) + check('i...->...i', [1], expected_output=[1]) + check('...a->', [[2], [4]], expected_output=6) + check('a...b->ab', [[[1], [2]], [[3], [4]]], expected_output=[[3], [7]]) + + def test_einsum_error_cases(self, device): + def check(equation, operands, regex, exception=RuntimeError): + with self.assertRaisesRegex(exception, r'einsum\(\) ' + regex): + torch.einsum(equation, operands) + + x = torch.rand(2) + y = torch.rand(2, 3) + + check('', [], r'must provide at least one operand') + check('. ..', [x], r'found \'.\' for operand 0 that is not part of any ellipsis') + check('... 
...', [x], r'found \'.\' for operand 0 for which an ellipsis was already found') + check('A', [x], r'operand subscript must be in range \[a, z\] but found A for operand 0') + check(',', [x], r'fewer operands were provided than specified in the equation') + check('', [x, x], r'more operands were provided than specified in the equation') + check('', [x], r'the number of subscripts in the equation \(0\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai', [x], r'the number of subscripts in the equation \(2\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai...', [x], r'the number of subscripts in the equation \(2\) is more than the number ' + r'of dimensions \(1\) for operand 0') + check('a->... .', [x], r'found \'.\' for output but an ellipsis \(...\) was already found') + check('a->..', [x], r'found \'.\' for output that is not part of any ellipsis \(...\)') + check('a->A', [x], r'subscripts must be in range \[a, z\] but found A for the output') + check('a->aa', [x], r'output subscript a appears more than once in the output') + check('a->i', [x], r'output subscript i does not appear in the equation for any input operand') + check('aa', [y], r'subscript a is repeated for operand 0 but the sizes don\'t match, 3 != 2') + check('a, ba', [x, y], r'operands do not broadcast with remapped shapes \[original->remapped\]: ' + r'\[2\]->\[1, 2\] \[2, 3\]->\[2, 3\]') + def triangular_solve_test_helper(self, A_dims, b_dims, upper, unitriangular, device, dtype): triangle_function = torch.triu if upper else torch.tril @@ -3272,80 +3468,6 @@ def run_test(pivot): if self.device_type == 'cuda': run_test(False) - @onlyCPU - @slowTest - @dtypes(torch.double) - def test_einsum(self, device: torch.device, dtype: torch.dtype) -> None: - # test cases taken from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = torch.randn(5, dtype=dtype, device=device) - y = torch.randn(7, dtype=dtype, device=device) - A = torch.randn(3, 5, dtype=dtype, device=device) - B = torch.randn(2, 5, dtype=dtype, device=device) - C = torch.randn(2, 3, 5, dtype=dtype, device=device) - D = torch.randn(2, 5, 7, dtype=dtype, device=device) - E = torch.randn(7, 9, dtype=dtype, device=device) - F = torch.randn(2, 3, 5, 7, dtype=dtype, device=device) - G = torch.randn(7, 11, 13, dtype=dtype, device=device) - H = torch.randn(4, 4, dtype=dtype, device=device) - I = torch.randn(3, 4, 4, dtype=dtype, device=device) - l = torch.randn(5, 10, dtype=dtype, device=device) - r = torch.randn(5, 20, dtype=dtype, device=device) - w = torch.randn(30, 10, 20, dtype=dtype, device=device) - test_list: List[Union[Tuple[str, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor, torch.Tensor]]] = [ - # -- Vector - ("i->", x), # sum - ("i,i->", x, x), # dot - ("i,i->i", x, x), # vector element-wise mul - ("i,j->ij", x, y), # outer - # -- Matrix - ("ij->ji", A), # transpose - ("ij->j", A), # row sum - ("ij->i", A), # col sum - ("ij,ij->ij", A, A), # matrix element-wise mul - ("ij,j->i", A, x), # matrix vector multiplication - ("ij,kj->ik", A, B), # matmul - ("ij,ab->ijab", A, E), # matrix outer product - # -- Tensor - ("aij,ajk->aik", C, D), # batch matmul - ("ijk,jk->i", C, A), # tensor matrix contraction - ("aij,jk->aik", D, E), # tensor matrix contraction - ("abcd,dfg->abcfg", F, G), # tensor tensor contraction - ("ijk,jk->ik", C, A), # tensor matrix contraction with double indices - 
("ijk,jk->ij", C, A), # tensor matrix contraction with double indices - ("ijk,ik->j", C, B), # non contiguous - ("ijk,ik->jk", C, B), # non contiguous with double indices - # -- Diagonal - ("ii", H), # trace - ("ii->i", H), # diagonal - # -- Ellipsis - ("i...->...", H), - ("ki,...k->i...", A.t(), B), - ("k...,jk", A.t(), B), - ("...ii->...i", I), # batch diagonal - # -- Other - ("bn,anm,bm->ba", l, w, r), # as torch.bilinear - ("... ii->...i ", I), # batch diagonal with spaces - ] - for test in test_list: - actual = torch.einsum(test[0], test[1:]) - expected = np.einsum(test[0], *[t.numpy() for t in test[1:]]) - self.assertEqual(expected.shape, actual.shape, msg=test[0]) - self.assertEqual(expected, actual, msg=test[0]) - # test vararg - actual2 = torch.einsum(test[0], *test[1:]) - self.assertEqual(expected.shape, actual2.shape, msg=test[0]) - self.assertEqual(expected, actual2, msg=test[0]) - - def do_einsum(*args): - return torch.einsum(test[0], args) - # FIXME: following test cases fail gradcheck - if test[0] not in {"i,i->", "i,i->i", "ij,ij->ij"}: - gradcheck_inps = tuple(t.detach().requires_grad_() for t in test[1:]) - self.assertTrue(torch.autograd.gradcheck(do_einsum, gradcheck_inps)) - self.assertTrue(A._version == 0) # check that we do not use inplace ops - @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) diff --git a/torch/functional.py b/torch/functional.py index f21fcda4566b..25b0c1fb3b19 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -296,76 +296,107 @@ def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor -This function provides a way of computing multilinear expressions (i.e. sums of products) using the -Einstein summation convention. - -Args: - equation (string): The equation is given in terms of lower case letters (indices) to be associated - with each dimension of the operands and result. The left hand side lists the operands - dimensions, separated by commas. There should be one index letter per tensor dimension. - The right hand side follows after `->` and gives the indices for the output. - If the `->` and right hand side are omitted, it implicitly defined as the alphabetically - sorted list of all indices appearing exactly once in the left hand side. - The indices not apprearing in the output are summed over after multiplying the operands - entries. - If an index appears several times for the same operand, a diagonal is taken. - Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, - the ellipsis dimensions are at the beginning of the output. - operands (Tensor): The operands to compute the Einstein sum of. - -.. note:: - - This function does not optimize the given expression, so a different formula for the same computation may - run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) - can optimize the formula for you. 
- -Examples:: - - >>> x = torch.randn(5) - >>> y = torch.randn(4) - >>> torch.einsum('i,j->ij', x, y) # outer product - tensor([[-0.0570, -0.0286, -0.0231, 0.0197], - [ 1.2616, 0.6335, 0.5113, -0.4351], - [ 1.4452, 0.7257, 0.5857, -0.4984], - [-0.4647, -0.2333, -0.1883, 0.1603], - [-1.1130, -0.5588, -0.4510, 0.3838]]) - - - >>> A = torch.randn(3,5,4) - >>> l = torch.randn(2,5) - >>> r = torch.randn(2,4) - >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear - tensor([[-0.3430, -5.2405, 0.4494], - [ 0.3311, 5.5201, -3.0356]]) - - - >>> As = torch.randn(3,2,5) - >>> Bs = torch.randn(3,5,4) - >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication - tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], - [-1.6706, -0.8097, -0.8025, -2.1183]], - - [[ 4.2239, 0.3107, -0.5756, -0.2354], - [-1.4558, -0.3460, 1.5087, -0.8530]], - - [[ 2.8153, 1.8787, -4.3839, -1.2112], - [ 0.3728, -2.1131, 0.0921, 0.8305]]]) - - >>> A = torch.randn(3, 3) - >>> torch.einsum('ii->i', A) # diagonal - tensor([-0.7825, 0.8291, -0.1936]) - - >>> A = torch.randn(4, 3, 3) - >>> torch.einsum('...ii->...i', A) # batch diagonal - tensor([[-1.0864, 0.7292, 0.0569], - [-0.9725, -1.0270, 0.6493], - [ 0.5832, -1.1716, -1.5084], - [ 0.4041, -1.1690, 0.8570]]) - - >>> A = torch.randn(2, 3, 4, 5) - >>> torch.einsum('...ij->...ji', A).shape # batch permute - torch.Size([2, 3, 5, 4]) -""" + Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation + based on the Einstein summation convention. + + Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them + in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of + this format are described below, but the general idea is to label every dimension of the input :attr:`operands` + with some subscript and define which subscripts are part of the output. The output is then computed by summing + the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the + output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`. + Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why). + + Equation: + + The :attr:`equation` string specifies the subscripts (lower case letters `['a', 'z']`) for each dimension of + the input :attr:`operands` in the same order as the dimensions, separating subcripts for each operand by a + comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript + must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is + repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand + must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that + appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order. + The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based + on the subscripts, and then summing out the dimensions whose subscripts are not part of the output. + + Optionally, the output subscripts can be explicitly defined by adding an arrow ('->') at the end of the equation + followed by the subscripts for the output. 
For instance, the following equation computes the transpose of a
+ matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and
+ at most once for the output.
+
+ Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis.
+ Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts,
+ e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` covers the third and fourth
+ dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the
+ 'shape' of the ellipsis (the size of the dimensions covered by them) must broadcast together. If the output is not
+ explicitly defined with the arrow ('->') notation, the ellipsis will come first in the output (left-most dimensions),
+ before the subscript labels that appear exactly once for the input operands. e.g. the following equation implements
+ batch matrix multiplication `'...ij,...jk'`.
+
+ A few final notes: the equation may contain whitespace between the different elements (subscripts, ellipsis,
+ arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands.
+
+ .. note::
+
+ ``torch.einsum`` handles ellipsis ('...') differently from NumPy in that it allows dimensions
+ covered by the ellipsis to be summed over, that is, the ellipsis is not required to be part of the output.
+
+ .. note::
+
+ This function does not optimize the given expression, so a different formula for the same computation may
+ run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/)
+ can optimize the formula for you.
+
+ Args:
+ equation (string): The subscripts for the Einstein summation.
+ operands (Tensor): The operands to compute the Einstein sum of. 
+ + Examples:: + + # trace + >>> torch.einsum('ii', torch.randn(4, 4)) + tensor(-1.2104) + + # diagonal + >>> torch.einsum('ii->i', torch.randn(4, 4)) + tensor([-0.1034, 0.7952, -0.2433, 0.4545]) + + # outer product + >>> x = torch.randn(5) + >>> y = torch.randn(4) + >>> torch.einsum('i,j->ij', x, y) + tensor([[ 0.1156, -0.2897, -0.3918, 0.4963], + [-0.3744, 0.9381, 1.2685, -1.6070], + [ 0.7208, -1.8058, -2.4419, 3.0936], + [ 0.1713, -0.4291, -0.5802, 0.7350], + [ 0.5704, -1.4290, -1.9323, 2.4480]]) + + # batch matrix multiplication + >>> As = torch.randn(3,2,5) + >>> Bs = torch.randn(3,5,4) + >>> torch.einsum('bij,bjk->bik', As, Bs) + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + # batch permute + >>> A = torch.randn(2, 3, 4, 5) + >>> torch.einsum('...ij->...ji', A).shape + torch.Size([2, 3, 5, 4]) + + # equivalent to torch.nn.functional.bilinear + >>> A = torch.randn(3,5,4) + >>> l = torch.randn(2,5) + >>> r = torch.randn(2,4) + >>> torch.einsum('bn,anm,bm->ba', l, A, r) + tensor([[-0.3430, -5.2405, 0.4494], + [ 0.3311, 5.5201, -3.0356]]) + """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) From d7659be58d4aa0215b2f1189ae56cfbf3d8915d2 Mon Sep 17 00:00:00 2001 From: James Donald Date: Wed, 16 Dec 2020 10:28:33 -0800 Subject: [PATCH 22/34] [caffe2][autograd] Avoid extensive -Wunused-variable warnings on _any_requires_grad (#49167) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49167 Building with clang and a fair warning level can result in hundreds of lines of compiler output of the form: ``` caffe2\gen_aten_libtorch\autograd\generated\VariableType_1.cpp(2279,8): warning: unused variable '_any_requires_grad' [-Wunused-variable] auto _any_requires_grad = compute_requires_grad( self ); ^ caffe2\gen_aten_libtorch\autograd\generated\VariableType_1.cpp(2461,8): warning: unused variable '_any_requires_grad' [-Wunused-variable] auto _any_requires_grad = compute_requires_grad( grad_output, self ); ^ caffe2\gen_aten_libtorch\autograd\generated\VariableType_1.cpp(2677,8): warning: unused variable '_any_requires_grad' [-Wunused-variable] auto _any_requires_grad = compute_requires_grad( self ); ^ ... ``` This happens when requires_derivative == False. Let's mark `_any_requires_grad` as potentially unused. If this were C++17 we would use `[[maybe_unused]]` but to retain compatibility with C++11 we just mark it with `(void)`. 
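For illustration only, a minimal standalone sketch of the `(void)` idiom (hypothetical names, not the generated VariableType code):

```
#include <cstdio>

// stand-in for the real helper; name and signature are illustrative only
static bool compute_requires_grad(int arg) { return arg > 0; }

static void generated_op(int arg) {
  auto _any_requires_grad = compute_requires_grad(arg);
  (void)_any_requires_grad;  // marks the variable as used, silencing -Wunused-variable
#ifdef REQUIRES_DERIVATIVE
  // only in this configuration is the value actually read
  if (_any_requires_grad) {
    std::printf("set up the autograd node here\n");
  }
#endif
}

int main() {
  generated_op(1);
  return 0;
}
```

Without the cast, building the non-derivative configuration with clang's -Wunused-variable reproduces the warning shown above; the cast is a no-op at runtime and works in C++11, unlike `[[maybe_unused]]`.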
Test Plan: CI + locally built Reviewed By: ezyang Differential Revision: D25421548 fbshipit-source-id: c56279a184b1c616e8717a19ee8fad60f36f37d1 --- tools/autograd/gen_variable_type.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 10d67c668a32..ab18db90c166 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -188,6 +188,7 @@ SETUP_ANY_REQUIRES_GRAD = CodeTemplate("""\ auto _any_requires_grad = compute_requires_grad( ${args_with_derivatives} ); +(void)_any_requires_grad; """) SETUP_DERIVATIVE = CodeTemplate("""\ From afce5890ff3bdeb6f1fcb9164c1d03fd0e765352 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Wed, 16 Dec 2020 11:09:17 -0800 Subject: [PATCH 23/34] Revert D25421263: [pytorch][PR] [numpy] torch.{all/any} : output dtype is always bool Test Plan: revert-hammer Differential Revision: D25421263 (https://github.com/pytorch/pytorch/commit/c508e5b1bfad7f244cd679f27e9258ad303618b6) Original commit changeset: c6c681ef9400 fbshipit-source-id: 4c0c9acf42b06a3ed0af8f757ea4512ca35b6c59 --- aten/src/ATen/native/ReduceOps.cpp | 64 ++----- aten/src/ATen/native/SharedReduceOps.h | 50 ++++++ aten/src/ATen/native/cpu/ReduceOpsKernel.cpp | 80 +++++---- .../src/ATen/native/cuda/ReduceLogicKernel.cu | 35 ++-- docs/source/tensors.rst | 9 +- docs/source/torch.rst | 2 - test/test_reductions.py | 170 ++++++++++-------- torch/_tensor_docs.py | 80 ++++++++- torch/_torch_docs.py | 99 ---------- 9 files changed, 301 insertions(+), 288 deletions(-) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 865b84ae3d11..e4b0a1cb19b7 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -756,23 +756,14 @@ Tensor all(const Tensor& self) { TORCH_CHECK(self.layout() == Layout::Strided, "all only supports strided layout, got: ", self.layout()); - Tensor result = at::empty({0}, self.options().dtype(kBool)); - if (self.is_cuda()) { - // As CUDA supports dynamic type casting, we use this overload of - // `make_reduction`, which doesn't cast input to the result type i.e. kBool., - // otherwise we use the overload below which casts the input to kBool (which is - // an extra operation). 
- auto iter = make_reduction( - "all", result, self, {}, false, self.scalar_type(), kBool); - return _all(result, iter); - } - auto iter = - make_reduction("all", result, self, {}, false, /*out_dtype=*/kBool); + Tensor result = at::empty({0}, self.options()); + auto iter = make_reduction( + "all", result, self, {}, false, self.scalar_type()); return _all(result, iter); } Tensor all(const Tensor& self, int64_t dim, bool keepdim) { - Tensor result = at::empty({0}, self.options().dtype(kBool)); + Tensor result = at::empty({0}, self.options()); return at::native::all_out(result, self, dim, keepdim); } @@ -781,24 +772,13 @@ Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { "all only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "all only supports strided layout, got: ", self.layout()); - TORCH_CHECK(result.scalar_type() == ScalarType::Bool, - "all only supports bool tensor for result, got: ", result.scalar_type()); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { return result; } else { - if (self.is_cuda()) { - // As CUDA supports dynamic type casting, we use this overload of - // `make_reduction`, which doesn't cast input to the result type i.e. kBool., - // otherwise we use the overload below which casts the input to kBool (which is - // an extra operation). - auto iter = make_reduction( - "all", result, self, dim, keepdim, self.scalar_type(), kBool); - return _all(result, iter); - } - auto iter = - make_reduction("all", result, self, dim, keepdim, /*out_dtype=*/kBool); + auto iter = make_reduction( + "all", result, self, dim, keepdim, self.scalar_type()); return _all(result, iter); } } @@ -819,23 +799,14 @@ Tensor any(const Tensor& self) { TORCH_CHECK(self.layout() == Layout::Strided || self.layout() == Layout::Sparse, "any only supports strided AND sparse layout, got: ", self.layout()); - Tensor result = at::empty({0}, self.options().dtype(kBool)); - if (self.is_cuda()) { - // As CUDA supports dynamic type casting, we use this overload of - // `make_reduction`, which doesn't cast input to the result type i.e. kBool., - // otherwise we use the overload below which casts the input to kBool (which is - // an extra operation). 
- auto iter = make_reduction( - "any", result, self, {}, false, self.scalar_type(), kBool); - return _any(result, iter); - } - auto iter = - make_reduction("any", result, self, {}, false, /*out_dtype=*/kBool); + Tensor result = at::empty({0}, self.options()); + auto iter = make_reduction( + "any", result, self, {}, false, self.scalar_type()); return _any(result, iter); } Tensor any(const Tensor& self, int64_t dim, bool keepdim) { - Tensor result = at::empty({0}, self.options().dtype(kBool)); + Tensor result = at::empty({0}, self.options()); return at::native::any_out(result, self, dim, keepdim); } @@ -844,24 +815,13 @@ Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { "any only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "any only supports strided layout, got: ", self.layout()); - TORCH_CHECK(result.scalar_type() == ScalarType::Bool, - "any only supports bool tensor for result, got: ", result.scalar_type()); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { return result; } else { - if (self.is_cuda()) { - // As CUDA supports dynamic type casting, we use this overload of - // `make_reduction`, which doesn't cast input to the result type i.e. kBool., - // otherwise we use the overload below which casts the input to kBool (which is - // an extra operation). - auto iter = make_reduction( - "any", result, self, dim, keepdim, self.scalar_type(), kBool); - return _any(result, iter); - } - auto iter = - make_reduction("any", result, self, dim, keepdim, /*out_dtype=*/kBool); + auto iter = make_reduction( + "any", result, self, dim, keepdim, self.scalar_type()); return _any(result, iter); } } diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index e25b943d13a8..4106a90c0729 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -386,6 +386,56 @@ struct NanSumOps { #endif }; +template +struct AndOps { + inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const { + return static_cast(a) && static_cast(b); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return static_cast(a) && static_cast(b); + } + + inline C10_DEVICE acc_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + +template +struct OrOps { + inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const { + return static_cast(a) || static_cast(b); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return static_cast(a) || static_cast(b); + } + + inline C10_DEVICE acc_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + namespace detail { template diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index f311a4da4550..10437f51d4b4 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -246,43 +246,55 @@ static void 
norm_kernel_tensor_iterator_impl( } static void and_kernel_impl(TensorIterator& iter) { - binary_kernel_reduce_vec( - iter, - [=](bool a, bool b) -> bool { return a && b; }, - [=](Vec256 a, Vec256 b) { - // Adding the implementation here instead of in vec256_base to avoid - // return value inconsistency. Other comparison operators in - // vec256_base return -1/0 (all bit 1 / all bit 0) as true/false to - // follow the AVX2 convention. This would be convenient when combined - // with other vectorized operations. For example, one can use the - // logical operation results as a mask for a bit operation to - // retrieve/reset multiple elements in a vector. - // - // In this method, users would expect, e.g., all(), to return 1/0 as - // true/false. - Vec256 c = Vec256(); - - for (decltype(c.size()) i = 0; i != Vec256::size(); i++) { - c[i] = a[i] && b[i]; - } - return c; - }, - /*ident=*/true); + if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) { + binary_kernel_reduce_vec( + iter, + [=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 1 : 0; }, + [=](Vec256 a, Vec256 b) { + // Adding the implementation here instead of in vec256_base to avoid + // return value inconsistency. Other comparison operators in + // vec256_base return -1/0 (all bit 1 / all bit 0) as true/false to + // follow the AVX2 convention. This would be convenient when combined + // with other vectorized operations. For example, one can use the + // logical operation results as a mask for a bit operation to + // retrieve/reset multiple elements in a vector. + // + // In this method, users would expect, e.g., all(), to return 1/0 as + // true/false. + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size(); i++) { + c[i] = (a[i] && b[i]) ? 1 : 0; + } + return c; + }, + /*ident=*/true); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "and_kernel", [&]() { + binary_kernel_reduce( + iter, AndOps(), static_cast(true)); + }); + } } static void or_kernel_impl(TensorIterator& iter) { - binary_kernel_reduce_vec( - iter, - [=](bool a, bool b) -> bool { return a || b; }, - [=](Vec256 a, Vec256 b) { - Vec256 c = Vec256(); - - for (decltype(c.size()) i = 0; i != Vec256::size(); i++) { - c[i] = a[i] || b[i]; - } - return c; - }, - /*ident=*/false); + if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) { + binary_kernel_reduce_vec( + iter, + [=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; }, + [=](Vec256 a, Vec256 b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size(); i++) { + c[i] = (a[i] || b[i]) ? 
1 : 0; + } + return c; + }, + /*ident=*/false); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "or_kernel", [&]() { + binary_kernel_reduce( + iter, OrOps(), static_cast(false)); + }); + } } template diff --git a/aten/src/ATen/native/cuda/ReduceLogicKernel.cu b/aten/src/ATen/native/cuda/ReduceLogicKernel.cu index fcf60678929e..a29a926ef257 100644 --- a/aten/src/ATen/native/cuda/ReduceLogicKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceLogicKernel.cu @@ -3,33 +3,30 @@ #include #include #include -#include namespace at { namespace native { void and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBFloat16, kBool, iter.common_dtype(), "and_cuda", [&]() { - gpu_reduce_kernel( - iter, - func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return (static_cast(a) && static_cast(b)); - }), - true); - }); + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "and_kernel", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast(static_cast(a) && static_cast(b)); + }), + static_cast(true)); + }); } void or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBFloat16, kBool, iter.common_dtype(), "or_cuda", [&]() { - gpu_reduce_kernel( - iter, - func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return (static_cast(a) || static_cast(b)); - }), - false); - }); + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "or_kernel", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast(static_cast(a) || static_cast(b)); + }), + static_cast(false)); + }); } REGISTER_DISPATCH(and_stub, &and_kernel_cuda); diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 5f3ffe43b8d0..2e1225b882e3 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -214,8 +214,6 @@ view of a storage and defines numeric operations on it. .. automethod:: arctan_ .. automethod:: atan2 .. automethod:: atan2_ - .. automethod:: all - .. automethod:: any .. automethod:: backward :noindex: .. automethod:: baddbmm @@ -647,3 +645,10 @@ view of a storage and defines numeric operations on it. .. automethod:: view_as .. automethod:: where .. automethod:: zero_ + +.. class:: BoolTensor() + + The following methods are unique to :class:`torch.BoolTensor`. + + .. automethod:: all + .. 
automethod:: any diff --git a/docs/source/torch.rst b/docs/source/torch.rst index ed5c59a26c02..5dd74f62a531 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -361,8 +361,6 @@ Reduction Ops argmin amax amin - all - any max min dist diff --git a/test/test_reductions.py b/test/test_reductions.py index 325f7eed4fad..7c877d822142 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1122,7 +1122,7 @@ def verify_against_numpy(t): verify_against_numpy(t) @dtypes(*(torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True))) + include_bool=True, include_complex=False))) def test_all_any_vs_numpy(self, device, dtype): def _test_all_any(x): self.compare_with_numpy(torch.all, np.all, x) @@ -1131,92 +1131,38 @@ def _test_all_any(x): def _test_all_any_with_dim(x, dim): torch_fn = partial(torch.all, dim=dim) np_fn = partial(np.all, axis=dim) - self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=True) + self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=False) torch_fn = partial(torch.any, dim=dim) np_fn = partial(np.any, axis=dim) - self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=True) - - def _test_out_variant(x, dim): - out = torch.empty_like(x) - if dtype == torch.bool: - expected = torch.all(x, dim) - torch.all(x, dim, out=out) - self.assertEqual(expected, out) - - expected = torch.any(x, dim) - torch.any(x, dim, out=out) - self.assertEqual(expected, out) - else: - with self.assertRaisesRegex(RuntimeError, "all only supports bool tensor for result, got"): - torch.all(x, dim, out=out) - - with self.assertRaisesRegex(RuntimeError, "any only supports bool tensor for result, got"): - torch.any(x, dim, out=out) - - def _test_all_any_with_dim_keepdim(x, dim, keepdim): - torch_fn = partial(torch.all, dim=dim, keepdim=keepdim) - np_fn = partial(np.all, axis=dim, keepdims=keepdim) - self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=True) - - torch_fn = partial(torch.any, dim=dim, keepdim=keepdim) - np_fn = partial(np.any, axis=dim, keepdims=keepdim) - self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=True) + self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=False) for ndim in range(5): shape = _rand_shape(ndim, 1, 5) x = _generate_input(shape, dtype, device, with_extremal=False) _test_all_any(x) - _test_all_any(x.T) - _test_all_any(x[..., ::2]) x = _generate_input(shape, dtype, device, with_extremal=True) _test_all_any(x) - _test_all_any(x.T) - _test_all_any(x[..., ::2]) x = torch.zeros_like(x) _test_all_any(x) - _test_all_any(x.T) - _test_all_any(x[..., ::2]) x = torch.ones_like(x) _test_all_any(x) - _test_all_any(x.T) - _test_all_any(x[..., ::2]) for dim in range(ndim): x = _generate_input(shape, dtype, device, with_extremal=False) _test_all_any_with_dim(x, dim) - _test_all_any_with_dim(x.T, dim) - _test_all_any_with_dim(x[..., ::2], dim) - _test_out_variant(x, dim) - _test_all_any_with_dim_keepdim(x, dim, keepdim=True) - _test_all_any_with_dim_keepdim(x, dim, keepdim=False) x = _generate_input(shape, dtype, device, with_extremal=True) _test_all_any_with_dim(x, dim) - _test_all_any_with_dim(x.T, dim) - _test_all_any_with_dim(x[..., ::2], dim) - _test_out_variant(x, dim) - _test_all_any_with_dim_keepdim(x, dim, keepdim=True) - _test_all_any_with_dim_keepdim(x, dim, keepdim=False) x = torch.zeros_like(x) _test_all_any_with_dim(x, dim) - _test_all_any_with_dim(x.T, dim) - _test_all_any_with_dim(x[..., ::2], dim) - _test_out_variant(x, dim) - _test_all_any_with_dim_keepdim(x, dim, 
keepdim=True) - _test_all_any_with_dim_keepdim(x, dim, keepdim=False) x = torch.ones_like(x) _test_all_any_with_dim(x, dim) - _test_all_any_with_dim(x.T, dim) - _test_all_any_with_dim(x[..., ::2], dim) - _test_out_variant(x, dim) - _test_all_any_with_dim_keepdim(x, dim, keepdim=True) - _test_all_any_with_dim_keepdim(x, dim, keepdim=False) # TODO: part of this test covers torch.norm, with should be covered by test_linalg @onlyOnCPUAndCUDA @@ -1894,6 +1840,82 @@ def check(a, q, args, kwargs, message): RuntimeError, r'quantile\(\) out tensor must be on the same device as the input tensor'): torch.quantile(torch.randn(1, device=device), 0.5, out=torch.scalar_tensor(1)) + def test_logical_any(self, device): + x = torch.zeros([2, 3, 400], dtype=torch.uint8, device=device) + + self.assertEqual( + torch.tensor(0, dtype=torch.uint8, device=device), + x.any()) + + self.assertEqual( + torch.zeros([1, 3, 400], dtype=torch.uint8, device=device), + x.any(0, keepdim=True)) + + self.assertEqual( + torch.zeros([2, 1, 400], dtype=torch.uint8, device=device), + x.any(1, keepdim=True)) + + self.assertEqual( + torch.zeros([2, 3, 1], dtype=torch.uint8, device=device), + x.any(2, keepdim=True)) + + # set the last element to 0 + x[-1][-1][-1] = 1 + + self.assertEqual( + torch.tensor(1, dtype=torch.uint8, device=device), + x.any()) + + y = torch.zeros([1, 3, 400], dtype=torch.uint8, device=device) + y[-1][-1][-1] = 1 + self.assertEqual(y, x.any(0, keepdim=True)) + + y = torch.zeros([2, 1, 400], dtype=torch.uint8, device=device) + y[-1][-1][-1] = 1 + self.assertEqual(y, x.any(1, keepdim=True)) + + y = torch.zeros([2, 3, 1], dtype=torch.uint8, device=device) + y[-1][-1][-1] = 1 + self.assertEqual(y, x.any(2, keepdim=True)) + + def test_logical_all(self, device): + x = torch.ones([2, 3, 400], dtype=torch.uint8, device=device) + + self.assertEqual( + torch.tensor(1, dtype=torch.uint8, device=device), + x.all()) + + self.assertEqual( + torch.ones([1, 3, 400], dtype=torch.uint8, device=device), + x.all(0, keepdim=True)) + + self.assertEqual( + torch.ones([2, 1, 400], dtype=torch.uint8, device=device), + x.all(1, keepdim=True)) + + self.assertEqual( + torch.ones([2, 3, 1], dtype=torch.uint8, device=device), + x.all(2, keepdim=True)) + + # set the last element to 0 + x[-1][-1][-1] = 0 + + self.assertEqual( + torch.tensor(0, dtype=torch.uint8, device=device), + x.all()) + + y = torch.ones([1, 3, 400], dtype=torch.uint8, device=device) + y[-1][-1][-1] = 0 + self.assertEqual(y, x.all(0, keepdim=True)) + + y = torch.ones([2, 1, 400], dtype=torch.uint8, device=device) + y[-1][-1][-1] = 0 + self.assertEqual(y, x.all(1, keepdim=True)) + + y = torch.ones([2, 3, 1], dtype=torch.uint8, device=device) + y[-1][-1][-1] = 0 + self.assertEqual(y, x.all(2, keepdim=True)) + def test_std_mean(self, device): x = torch.rand(100, 50, 20, device=device) for dim in range(x.dim()): @@ -2218,25 +2240,21 @@ def test_reduction_empty(self, device): # ignore if there is no allreduce. 
self.assertTrue('dim' in str(err)) - for dtype in torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True): - out_dtype = torch.bool # output of all/any is bool irrespective of input dtype - - # any - xb = x.to(dtype) - yb = x.to(dtype) - self.assertEqual((2, 0), xb.any(2).shape) - self.assertEqual((2, 0, 1), xb.any(2, keepdim=True).shape) - self.assertEqual(torch.zeros((2, 4), device=device, dtype=out_dtype), xb.any(1)) - self.assertEqual(torch.zeros((2, 1, 4), device=device, dtype=out_dtype), xb.any(1, keepdim=True)) - self.assertEqual(torch.zeros((), device=device, dtype=out_dtype), xb.any()) - - # all - self.assertEqual((2, 0), xb.all(2).shape) - self.assertEqual((2, 0, 1), xb.all(2, keepdim=True).shape) - self.assertEqual(torch.ones((2, 4), device=device, dtype=out_dtype), xb.all(1)) - self.assertEqual(torch.ones((2, 1, 4), device=device, dtype=out_dtype), xb.all(1, keepdim=True)) - self.assertEqual(torch.ones((), device=device, dtype=out_dtype), xb.all()) + # any + xb = x.to(torch.uint8) + yb = x.to(torch.uint8) + self.assertEqual((2, 0), xb.any(2).shape) + self.assertEqual((2, 0, 1), xb.any(2, keepdim=True).shape) + self.assertEqual(torch.zeros((2, 4), device=device, dtype=torch.uint8), xb.any(1)) + self.assertEqual(torch.zeros((2, 1, 4), device=device, dtype=torch.uint8), xb.any(1, keepdim=True)) + self.assertEqual(torch.zeros((), device=device, dtype=torch.uint8), xb.any()) + + # all + self.assertEqual((2, 0), xb.all(2).shape) + self.assertEqual((2, 0, 1), xb.all(2, keepdim=True).shape) + self.assertEqual(torch.ones((2, 4), device=device, dtype=torch.uint8), xb.all(1)) + self.assertEqual(torch.ones((2, 1, 4), device=device, dtype=torch.uint8), xb.all(1, keepdim=True)) + self.assertEqual(torch.ones((), device=device, dtype=torch.uint8), xb.all()) instantiate_device_type_tests(TestReductions, globals()) diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 16284aeffb15..e9a3731cac12 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -432,9 +432,45 @@ def add_docstr_all(method, docstr): add_docstr_all('all', r""" -all(dim=None, keepdim=False) -> Tensor +.. function:: all() -> bool -See :func:`torch.all` +Returns True if all elements in the tensor are True, False otherwise. + +Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> a.all() + tensor(False, dtype=torch.bool) + +.. function:: all(dim, keepdim=False, out=None) -> Tensor + +Returns True if all elements in each row of the tensor in the given +dimension :attr:`dim` are True, False otherwise. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size as +:attr:`input` except in the dimension :attr:`dim` where it is of size 1. +Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting +in the output tensor having 1 fewer dimension than :attr:`input`. + +Args: + dim (int): the dimension to reduce + keepdim (bool): whether the output tensor has :attr:`dim` retained or not + out (Tensor, optional): the output tensor + +Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> a.all(dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> a.all(dim=0) + tensor([ True, False], dtype=torch.bool) """) add_docstr_all('allclose', @@ -453,9 +489,45 @@ def add_docstr_all(method, docstr): add_docstr_all('any', r""" -any(dim=None, keepdim=False) -> Tensor +.. 
function:: any() -> bool + +Returns True if any elements in the tensor are True, False otherwise. + +Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> a.any() + tensor(True, dtype=torch.bool) + +.. function:: any(dim, keepdim=False, out=None) -> Tensor + +Returns True if any elements in each row of the tensor in the given +dimension :attr:`dim` are True, False otherwise. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size as +:attr:`input` except in the dimension :attr:`dim` where it is of size 1. +Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting +in the output tensor having 1 fewer dimension than :attr:`input`. + +Args: + dim (int): the dimension to reduce + keepdim (bool): whether the output tensor has :attr:`dim` retained or not + out (Tensor, optional): the output tensor + +Example:: -See :func:`torch.any` + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> a.any(1) + tensor([ True, True, True, False]) + >>> a.any(0) + tensor([True, True]) """) add_docstr_all('apply_', diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 61b06fd42c64..4f56ef928918 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -610,105 +610,6 @@ def merge_dicts(*dicts): True """) -add_docstr(torch.all, - r""" -all(input) -> Tensor - -Tests if all elements in :attr:`input` evaluate to `True`. - -Example:: - - >>> a = torch.rand(1, 2).bool() - >>> a - tensor([[False, True]], dtype=torch.bool) - >>> torch.all(a) - tensor(False, dtype=torch.bool) - >>> a = torch.arange(0, 3) - >>> a - tensor([0, 1, 2]) - >>> torch.all(a) - tensor(False) - -.. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor - -For each row of :attr:`input` in the given dimension :attr:`dim`, -returns `True` if all elements in the row evaluate to `True` and `False` otherwise. - -{keepdim_details} - -Args: - {input} - {dim} - {keepdim} - -Keyword args: - {out} - -Example:: - - >>> a = torch.rand(4, 2).bool() - >>> a - tensor([[True, True], - [True, False], - [True, True], - [True, True]], dtype=torch.bool) - >>> torch.all(a, dim=1) - tensor([ True, False, True, True], dtype=torch.bool) - >>> torch.all(a, dim=0) - tensor([ True, False], dtype=torch.bool) -""".format(**single_dim_common)) - -add_docstr(torch.any, - r""" -any(input) -> Tensor - -Args: - {input} - -Tests if any element in :attr:`input` evaluates to `True`. - -Example:: - - >>> a = torch.rand(1, 2).bool() - >>> a - tensor([[False, True]], dtype=torch.bool) - >>> torch.any(a) - tensor(True, dtype=torch.bool) - >>> a = torch.arange(0, 3) - >>> a - tensor([0, 1, 2]) - >>> torch.any(a) - tensor(True) - -.. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor - -For each row of :attr:`input` in the given dimension :attr:`dim`, -returns `True` if any element in the row evaluate to `True` and `False` otherwise. 
- -{keepdim_details} - -Args: - {input} - {dim} - {keepdim} - -Keyword args: - {out} - -Example:: - - >>> a = torch.randn(4, 2) < 0 - >>> a - tensor([[ True, True], - [False, True], - [ True, True], - [False, False]]) - >>> torch.any(a, 1) - tensor([ True, True, True, False]) - >>> torch.any(a, 0) - tensor([True, True]) -""".format(**single_dim_common)) - add_docstr(torch.angle, r""" angle(input, *, out=None) -> Tensor From 48d1ad1adad8bc6286993c03a943b46c3120ae11 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Wed, 16 Dec 2020 11:11:40 -0800 Subject: [PATCH 24/34] Reland "Add test for empty tensors for batch matmuls" (#48797) Summary: This reverts commit c7746adbc6e6ace9d4c2b54e32c8d36a7b7b0e31. Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48797 Reviewed By: mruberry Differential Revision: D25575264 Pulled By: ngimel fbshipit-source-id: c7f3b384db833d727bb5bd8a51f1493a13016d09 --- aten/src/ATen/native/LinearAlgebra.cpp | 27 +++++--- aten/src/ATen/native/cuda/LinearAlgebra.cu | 76 ++++------------------ aten/src/ATen/native/native_functions.yaml | 9 +-- test/test_linalg.py | 32 ++++++++- 4 files changed, 63 insertions(+), 81 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 1c3b9ca60c1c..5f757173e7fa 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -484,7 +484,7 @@ static void addmm_impl_cpu_( } } -static void addbmm_impl_cpu_( +static void addbmm_impl_( Tensor &result, const Tensor &self, const Tensor &batch1, const Tensor &batch2, Scalar beta, Scalar alpha) { TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -509,29 +509,38 @@ static void addbmm_impl_cpu_( const int64_t num_batches = batch1.size(0); + if (num_batches == 0) { + if (beta.to>() != 0.0) { + result.mul_(beta); + } else { + result.zero_(); + } + return; + } + for (int64_t batch = 0; batch < num_batches; ++batch) { - addmm_impl_cpu_(result, result, batch1[batch], batch2[batch], beta, alpha); + result.addmm_(batch1[batch], batch2[batch], beta, alpha); beta = 1; // accumulate output once } } -Tensor& addbmm_cpu_out(Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { +Tensor& addbmm_out(Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { Tensor b_self = std::get<0>(expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm_out")); { at::NoNamesGuard guard; - addbmm_impl_cpu_(result, b_self, batch1, batch2, beta, alpha); + addbmm_impl_(result, b_self, batch1, batch2, beta, alpha); } at::namedinference::propagate_names_for_addmm(result, batch1, batch2, self); return result; } -Tensor &addbmm_cpu_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { - return addbmm_cpu_out(self, self, batch1, batch2, beta, alpha); +Tensor &addbmm_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return native::addbmm_out(self, self, batch1, batch2, beta, alpha); } -Tensor addbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { +Tensor addbmm(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { Tensor result = at::empty({0}, self.options()); - return addbmm_cpu_out(result, self, batch1, batch2, beta, alpha); + return native::addbmm_out(result, self, 
batch1, batch2, beta, alpha); } Tensor& addmm_cpu_out(Tensor &result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { @@ -650,7 +659,7 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& if (self_or_result.numel() == 0) { return self_or_result; } else if (contraction_size == 0) { - if (is_bmm_out) { + if (is_bmm_out || (beta.to>() == 0.0)) { return self_or_result.zero_(); } else { return self_or_result.mul_(beta); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index e155f9d367bc..88e4d2f9a8e3 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -175,6 +175,17 @@ Tensor& baddbmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& } } + // handle pathological cases that blas may not like + if (result.numel() == 0) { + return result; + } else if (batch1_sizes[2] == 0) { + if (beta.to>() == 0.0) { + return result.zero_(); + } else { + return result.mul_(beta); + } + } + bool transpose_result = false; Tensor result_; IntArrayRef result_strides = result.strides(); @@ -313,71 +324,6 @@ Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) { return native::bmm_out_cuda(result, self, mat2); } -Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self, - const Tensor& batch1, const Tensor& batch2, - Scalar beta, Scalar alpha) { - TORCH_CHECK(batch1.dim() == 3 && batch2.dim() == 3, - "Batch tensors should be 3D, got dimensions ", batch1.dim(), - " and ", batch2.dim()); - - Tensor self_; - if (&out != &self) { - std::tie(self_) = expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm"); - } else { - self_ = self; - } - - TORCH_CHECK(out.device() == self_.device() && - out.device() == batch1.device() && - out.device() == batch2.device(), - "Expected all tensors to be on the same device. Found: ", - out.device(), ", ", self_.device(), ", ", - batch1.device(), " and ", batch2.device()); - TORCH_CHECK(self_.dim() == 2, - "2D tensor expected, got ", self_.dim(), "D tensor for input"); - int64_t batchnum = batch1.size(0); - int64_t m1d1 = batch1.size(1); - int64_t innerdim = batch1.size(2); - int64_t m2d2 = batch2.size(2); - TORCH_CHECK(batchnum == batch2.size(0), - "equal number of batches expected"); - TORCH_CHECK(m1d1 == self_.size(0), - "first dimension of batch1 must match first dimension of input"); - TORCH_CHECK(m2d2 == self_.size(1), - "second dimension of batch2 must match second dimension of input"); - TORCH_CHECK(innerdim == batch2.size(1), - "second dimension of batch1 must match first dimension of batch2"); - - if (&out != &self) { - at::native::resize_as_(out, self_); - if (beta.to>() != 0.0) { - at::native::copy_(out, self_); - } - } - - for (int64_t i=0; i Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures dispatch: - CPU: addbmm_cpu_out - CUDA: addbmm_out_cuda + CPU, CUDA: addbmm_out - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: addbmm_cpu - CUDA: addbmm_cuda + CPU, CUDA: addbmm - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) 
use_c10_dispatcher: full diff --git a/test/test_linalg.py b/test/test_linalg.py index 123e75e85de3..4a043094c5f8 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -4043,7 +4043,7 @@ def test_strided_mm_bmm(self, device, dtype): torch_fn = lambda x: torch.mm(x, x) # noqa: E731 self.compare_with_numpy(torch_fn, np_fn, sx[0]) - @precisionOverride({torch.half: 0.005, torch.bfloat16: 0.05}) + @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) @skipCUDAIf(torch.version.cuda == "10.1", "flaky on CUDA 10.1") @onlyOnCPUAndCUDA @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes()) @@ -4069,18 +4069,27 @@ def invert_perm(p): return (d[0], d[1], d[2]) def generate_inputs(): + # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 + # broadcasting tensors for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) yield b1, b2 + # zero-sized tensors + for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): + shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) + shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) + b1 = torch.randn(shape1, dtype=dtype, device=device) + b2 = torch.randn(shape2, dtype=dtype, device=device) + yield b1, b2 for (b1, b2), perm3 in itertools.product(generate_inputs(), itertools.permutations((0, 1, 2))): res1 = torch.bmm(b1, b2) @@ -4261,6 +4270,17 @@ def generate_tensor(): ).to(device=device, dtype=dtype).sum(0) out_tensor = torch.zeros_like(ref) yield b1, b2, ref, out_tensor + # zero-sized tensors + for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): + shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) + shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) + b1 = make_tensor(shape1, device, dtype, low=-1, high=1) + b2 = make_tensor(shape2, device, dtype, low=-1, high=1) + ref = torch.from_numpy( + b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() + ).to(device=device, dtype=dtype).sum(0) + out_tensor = torch.zeros_like(ref) + yield b1, b2, ref, out_tensor for b1, b2, ref, out_tensor in generate_tensor(): self._test_addbmm_baddbmm("addbmm", b1, b2, ref, out_tensor) @@ -4312,6 +4332,16 @@ def generate_tensor(): b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) out_tensor = torch.zeros_like(ref) yield b1, b2, ref, out_tensor + # zero-sized tensors + for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): + shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) + shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) + b1 = make_tensor(shape1, device, dtype, low=-2, high=2) + b2 = make_tensor(shape2, device, dtype, low=-2, high=2) + ref = torch.from_numpy( + b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) + out_tensor = torch.zeros_like(ref) + yield b1, b2, 
ref, out_tensor

         for b1, b2, ref, out_tensor in generate_tensor():
             self._test_addbmm_baddbmm("baddbmm", b1, b2, ref, out_tensor)

From 1b6d18aa7c15428bb282183eb8d9f9889e738198 Mon Sep 17 00:00:00 2001
From: Igor Gitman
Date: Wed, 16 Dec 2020 11:19:30 -0800
Subject: [PATCH 25/34] Adding support for CuDNN-based LSTM with projections (#47725)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/46213

I haven't updated the documentation yet; I will add those changes soon. A few other things I didn't do, but want to clarify whether I should:
1. I didn't expose projections in the C++ API (torch/csrc/api/src/nn/modules/rnn.cpp). Let me know if this is desirable and I will add those changes.
2. I didn't expose projections in the "lstm_cell" and "_thnn_differentiable_lstm_cell_backward" functions from aten/src/ATen/native/RNN.cpp. As far as I understand, they are not needed for nn.LSTM CPU execution. For lstm_cell, projections don't bring any real benefit, since if the cell is used separately, they can easily be added in Python. For "_thnn_differentiable_lstm_cell_backward", I'm actually not sure where exactly that function is used, so I also disabled projections there for now. Please let me know if I should change that.
3. I added a check that projections are not supported for quantized LSTMs to the quantized_lstm_ functions, but I didn't add any checks to the LSTMCell code. It seems that, since I disabled projections in the "lstm_cell" function, they should also not be available for quantized models through any API other than quantized_lstm_. Please let me know if I'm not correct and I will add checks in other places.
4. Projections are not supported for CuDNN versions < 7.1.2. Should I add a check for the CuDNN version and disable projections in that case? If so, what would be the best way to do that?
5. Currently I added the projection weight as the last weight, so the layout is "w_ih, w_hh, b_ih, b_hh, w_hr". This breaks the assumption that biases come after weights, so I had to add extra if-branches in various places. An alternative would be the "w_ih, w_hh, w_hr, b_ih, b_hh" layout, in which case the assumption would hold. But then I would need to split the loop in the get_parameters function from aten/src/ATen/native/cudnn/RNN.cpp, and in some cases I would still need to add an "undefined" tensor in the 3rd position, because we get all 5 weights from CuDNN most of the time. So I'm not sure which way is better. Let me know if you think I should change to the weights-then-biases layout.
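A minimal usage sketch, assuming the `proj_size` argument this change exposes on `torch.nn.LSTM` (torch/nn/modules/rnn.py is updated below); the shapes mirror the new C++ `check_lstm_sizes_proj` test:

```
import torch
import torch.nn as nn

# LSTM with projections: the hidden state is projected from hidden_size down
# to proj_size by the extra w_hr weight; the cell state keeps hidden_size.
lstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=3, proj_size=32)

x = torch.randn(10, 16, 128)              # (seq_len, batch, input_size)
output, (h_n, c_n) = lstm(x)

print(output.shape)  # torch.Size([10, 16, 32]) -- outputs have proj_size features
print(h_n.shape)     # torch.Size([3, 16, 32])  -- hidden state uses proj_size
print(c_n.shape)     # torch.Size([3, 16, 64])  -- cell state keeps hidden_size
```

Only h_t and the outputs shrink to proj_size because w_hr is applied after the cell update (matmul_hr in RNN.cpp), so the cell state keeps hidden_size.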
Pull Request resolved: https://github.com/pytorch/pytorch/pull/47725 Reviewed By: zou3519 Differential Revision: D25449794 Pulled By: ngimel fbshipit-source-id: fe6ce59e481d1f5fd861a8ff7fa13d1affcedb0c --- aten/src/ATen/cudnn/AutocastRNN.cpp | 26 +- aten/src/ATen/cudnn/Descriptors.h | 14 +- aten/src/ATen/native/RNN.cpp | 85 ++++- aten/src/ATen/native/cudnn/RNN.cpp | 330 +++++++++++----- aten/src/ATen/native/cudnn/RNNUtils.h | 1 + aten/src/ATen/native/native_functions.yaml | 6 +- .../check_backward_compatibility.py | 3 + test/cpp/api/rnn.cpp | 157 +++++++- test/onnx/test_pytorch_onnx_caffe2.py | 8 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 17 +- test/test_nn.py | 353 +++++++++++++++--- tools/autograd/derivatives.yaml | 6 +- torch/csrc/api/include/torch/nn/modules/rnn.h | 2 + torch/csrc/api/include/torch/nn/options/rnn.h | 6 +- torch/csrc/api/src/nn/modules/rnn.cpp | 79 +++- torch/nn/modules/rnn.py | 122 ++++-- torch/onnx/symbolic_opset9.py | 3 + 17 files changed, 984 insertions(+), 234 deletions(-) diff --git a/aten/src/ATen/cudnn/AutocastRNN.cpp b/aten/src/ATen/cudnn/AutocastRNN.cpp index 31e1a26e8fb7..083d435975c7 100644 --- a/aten/src/ATen/cudnn/AutocastRNN.cpp +++ b/aten/src/ATen/cudnn/AutocastRNN.cpp @@ -27,6 +27,7 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, const c10::optional& cx, int64_t mode, int64_t hidden_size, + int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, @@ -43,10 +44,18 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, // weight_stride0 is the number of weight tensors per layer and direction, as seen by model.parameters(). // If bias is enabled, there are 4 such tensors (ih and hh weights, ih and hh biases). // If bias is not enabled, there are 2 (ih and hh weights). - // This organization holds for all rnn types (RNN, GRU, and LSTM). - TORCH_INTERNAL_ASSERT((weight_stride0 == 2) || (weight_stride0 == 4), - "weight_stride0 must be 2 (if no bias) or 4 (if bias). Received ", - weight_stride0); + // This organization holds for all rnn types (RNN, GRU, and LSTM). If LSTM with projections is + // used, additional hr weight is added. + if (proj_size > 0) { + TORCH_INTERNAL_ASSERT((weight_stride0 == 3) || (weight_stride0 == 5), + "weight_stride0 must be 3 (if no bias) or 5 (if bias) for LSTM with projections. Received ", + weight_stride0); + } else { + TORCH_INTERNAL_ASSERT((weight_stride0 == 2) || (weight_stride0 == 4), + "weight_stride0 must be 2 (if no bias) or 4 (if bias). Received ", + weight_stride0); + } + Tensor weight_buf, redispatch_weight_buf; std::vector redispatch_weight; @@ -65,6 +74,10 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, // Casts weight tensors to FP16 and ensures all weights for all layers are views into a large flat buffer, // with the right locations and layouts expected by cudnn. // This is (and should be) autograd-exposed. 
+ bool include_bias = true; + if (weight_stride0 == 2 || (weight_stride0 == 3 && proj_size > 0)) { + include_bias = false; + } std::tie(redispatch_weight_buf, redispatch_weight) = at::native::cudnn_rnn::copy_weights_to_flat_buf_views( weight, @@ -72,6 +85,7 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, input.size(-1), mode, hidden_size, + proj_size, num_layers, batch_first, bidirectional, @@ -79,9 +93,8 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, /*flat_buf_options=*/weight[0].options().dtype(at::kHalf), /*set_orig_weights_to_flat_buf=*/false, /*allow_type_change=*/true, - /*include_bias=*/weight_stride0 == 4); + /*include_bias=*/include_bias); } - return at::_cudnn_rnn( cached_cast(at::kHalf, input), needs_cast_and_flatten ? TensorList(redispatch_weight) : weight, @@ -91,6 +104,7 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, cached_cast(at::kHalf, cx), mode, hidden_size, + proj_size, num_layers, batch_first, dropout, diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 2aed3f66632f..64306d115e16 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -41,7 +41,7 @@ static inline void fixSizeOneDimStride(int dim, const int *size, int *stride, bo int64_t z = 1; int index = 0; std::vector permutation(dim); - + if (nhwc) { permutation[index++] = 1; } @@ -244,10 +244,11 @@ struct TORCH_CUDA_API RNNDescriptor &cudnnDestroyRNNDescriptor> { DropoutDescriptor dropout_desc_; - void set(cudnnHandle_t handle, int hidden_size, int num_layers, DropoutDescriptor&& dropout_desc, + void set(cudnnHandle_t handle, int hidden_size, int proj_size, int num_layers, DropoutDescriptor&& dropout_desc, cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t bidirectional, cudnnRNNMode_t mode, cudnnDataType_t datatype, cudnnDataType_t input_type, cudnnRNNAlgo_t algo, bool allow_tf32) { dropout_desc_ = std::move(dropout_desc); + AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v6( handle, mut_desc(), @@ -259,12 +260,19 @@ struct TORCH_CUDA_API RNNDescriptor mode, algo, datatype)); + if (proj_size != 0) { + AT_CUDNN_CHECK(cudnnSetRNNProjectionLayers( + handle, + /*rnnDesc=*/mut_desc(), + /*recProjSize=*/proj_size, + /*outProjSize=*/0)); + } cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 7) { if (input_type == CUDNN_DATA_HALF) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); } -#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000 +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000 else if (input_type == CUDNN_DATA_FLOAT && !allow_tf32) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_FMA_MATH); } diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 9a48edef05d7..36e6416bf2b9 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -68,6 +68,14 @@ using CellParamsSerializationType = std::tuple< struct CellParamsBase : torch::CustomClassHolder { virtual Tensor matmul_ih(const Tensor& input) const = 0; virtual Tensor matmul_hh(const Tensor& h) const = 0; + // by default doing nothing. CellParams will override this + // to define correct behavior for LSTMs with projections. + // This function is not pure virtual, because it's useful to + // provide this default implementation, so that all cell params + // that don't support projections work correctly (e.g. 
QuantizedCellParams variations) + virtual Tensor matmul_hr(const Tensor& h) const { + return h; + } virtual Tensor linear_ih(const Tensor& input_ih) const = 0; virtual Tensor linear_hh(const Tensor& input_hh) const = 0; @@ -79,19 +87,22 @@ struct CellParamsBase : torch::CustomClassHolder { // Pretty much all cells we support take the same set of arguments, but threading those // 4 arguments manually is really annoying. Their lifetime is externally managed, so we only -// pass this struct of references around. +// pass this struct of references around. LSTMs with projections have 5th argument w_hr, for all +// other models it's always going to be undefined. struct CellParams : public CellParamsBase { CellParams( const Tensor& _w_ih, const Tensor& _w_hh, const Tensor& _b_ih, - const Tensor& _b_hh) - : w_ih(_w_ih), w_hh(_w_hh), b_ih_(_b_ih), b_hh_(_b_hh){}; + const Tensor& _b_hh, + const Tensor& _w_hr) + : w_ih(_w_ih), w_hh(_w_hh), b_ih_(_b_ih), b_hh_(_b_hh), w_hr(_w_hr) {}; const Tensor& w_ih; const Tensor& w_hh; const Tensor& b_ih_; /* optional */ const Tensor& b_hh_; /* optional */ + const Tensor& w_hr; /* only defined for LSTMs with projections */ Tensor matmul_ih(const Tensor& input) const override { return at::matmul(input, w_ih.t()); @@ -99,6 +110,12 @@ struct CellParams : public CellParamsBase { Tensor matmul_hh(const Tensor& h) const override { return at::matmul(h, w_hh.t()); } + Tensor matmul_hr(const Tensor& h) const override { + if (w_hr.defined()) { + return at::matmul(h, w_hr.t()); + } + return h; + } Tensor linear_ih(const Tensor& input) const override { return at::linear(input, w_ih, b_ih_); } @@ -468,6 +485,9 @@ struct QRNNCellParamsWrapper { Tensor matmul_hh(const Tensor& h) const { return param_->matmul_hh(h); } + Tensor matmul_hr(const Tensor& h) const { + return param_->matmul_hr(h); + } Tensor linear_ih(const Tensor& input) const { return param_->linear_ih(input); } @@ -509,18 +529,32 @@ static std::vector unpair_vec(std::vector>&& vals) { } // Parses a flat list of parameter tensors into a list of CellParams -static std::vector gather_params(TensorList params, bool has_biases) { +static std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { static at::Tensor undefined; std::vector result; if (has_biases) { - TORCH_CHECK(params.size() % 4 == 0, "got an incorrect number of RNN parameters"); - for (size_t i = 0; i < params.size(); i += 4) { - result.emplace_back(params[i], params[i + 1], params[i + 2], params[i + 3]); + if (has_projections) { + TORCH_CHECK(params.size() % 5 == 0, "got an incorrect number of RNN parameters"); + for (size_t i = 0; i < params.size(); i += 5) { + result.emplace_back(params[i], params[i + 1], params[i + 2], params[i + 3], params[i + 4]); + } + } else { + TORCH_CHECK(params.size() % 4 == 0, "got an incorrect number of RNN parameters"); + for (size_t i = 0; i < params.size(); i += 4) { + result.emplace_back(params[i], params[i + 1], params[i + 2], params[i + 3], undefined); + } } } else { - TORCH_CHECK(params.size() % 2 == 0, "got an incorrect number of RNN parameters"); - for (size_t i = 0; i < params.size(); i += 2) { - result.emplace_back(params[i], params[i + 1], undefined, undefined); + if (has_projections) { + TORCH_CHECK(params.size() % 3 == 0, "got an incorrect number of RNN parameters"); + for (size_t i = 0; i < params.size(); i += 3) { + result.emplace_back(params[i], params[i + 1], undefined, undefined, params[i + 2]); + } + } else { + TORCH_CHECK(params.size() % 2 == 0, "got an incorrect number 
of RNN parameters"); + for (size_t i = 0; i < params.size(); i += 2) { + result.emplace_back(params[i], params[i + 1], undefined, undefined, undefined); + } } } return result; @@ -702,8 +736,10 @@ struct LSTMCell : Cell, cell_params> { auto hgates = params.matmul_hh(hx); auto result = at::_thnn_fused_lstm_cell( igates, hgates, cx, params.b_ih(), params.b_hh()); + // applying projections if w_hr is defined + auto hy = params.matmul_hr(std::get<0>(result)); // Slice off the workspace argument (it's needed only for AD). - return std::make_tuple(std::move(std::get<0>(result)), std::move(std::get<1>(result))); + return std::make_tuple(std::move(hy), std::move(std::get<1>(result))); } const auto gates = params.linear_hh(hx).add_( @@ -715,6 +751,7 @@ struct LSTMCell : Cell, cell_params> { auto outgate = chunked_gates[3].sigmoid_(); auto cy = (forgetgate * cx).add_(ingate * cellgate); auto hy = outgate * cy.tanh(); + hy = params.matmul_hr(hy); return std::make_tuple(std::move(hy), std::move(cy)); } @@ -1404,8 +1441,10 @@ std::tuple lstm( num_layers, dropout_p, train, bidirectional, batch_first); return std::make_tuple(std::move(output), std::move(hy), std::move(cy)); } - + // if cells are of different size, that means projections are used + bool has_projections = (hx[0].size(2) != hx[1].size(2)); if (use_miopen(_input, dropout_p)) { + TORCH_CHECK(!has_projections, "LSTM with projections is not supported with MIOpen"); Tensor output, hy, cy; lstm_miopen_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional, batch_first); @@ -1413,7 +1452,7 @@ std::tuple lstm( } check_attributes(_input, _params, hx); auto input = batch_first ? _input.transpose(0, 1) : _input; - auto params = gather_params(_params, has_biases); + auto params = gather_params(_params, has_biases, has_projections); auto results = _lstm_impl( input, params, hx[0], hx[1], num_layers, dropout_p, train, bidirectional); if (batch_first) { @@ -1433,8 +1472,10 @@ std::tuple lstm( _params, has_biases, num_layers, dropout_p, train, bidirectional); return std::make_tuple(std::move(output), std::move(hy), std::move(cy)); } - + // if cells are of different size, that means projections are used + bool has_projections = (hx[0].size(2) != hx[1].size(2)); if (use_miopen(data, dropout_p)) { + TORCH_CHECK(!has_projections, "LSTM with projections is not supported with MIOpen"); Tensor output, hy, cy; lstm_packed_miopen_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional); @@ -1442,7 +1483,7 @@ std::tuple lstm( } PackedSequence input { data, batch_sizes }; - auto params = gather_params(_params, has_biases); + auto params = gather_params(_params, has_biases, has_projections); auto result = _lstm_impl( input, params, hx[0], hx[1], num_layers, dropout_p, train, bidirectional); auto & packed_output = std::get<0>(result); @@ -1455,7 +1496,8 @@ std::tuple lstm_cell( const Tensor& input, TensorList hx, const Tensor& w_ih, const Tensor& w_hh, const Tensor& b_ih, const Tensor& b_hh) { TORCH_CHECK(hx.size() == 2, "lstm_cell expects two hidden states"); - return LSTMCell{}(input, std::make_tuple(hx[0], hx[1]), CellParams{w_ih, w_hh, b_ih, b_hh}); + static at::Tensor undefined; + return LSTMCell{}(input, std::make_tuple(hx[0], hx[1]), CellParams{w_ih, w_hh, b_ih, b_hh, undefined}); } std::tuple @@ -1552,19 +1594,22 @@ std::tuple _thnn_differentiable_gru_cell Tensor gru_cell( const Tensor& input, const Tensor& hx, const 
Tensor& w_ih, const Tensor& w_hh, const Tensor& b_ih, const Tensor& b_hh) { - return GRUCell{}(input, hx, CellParams{w_ih, w_hh, b_ih, b_hh}); + static at::Tensor undefined; + return GRUCell{}(input, hx, CellParams{w_ih, w_hh, b_ih, b_hh, undefined}); } Tensor rnn_tanh_cell( const Tensor& input, const Tensor& hx, const Tensor& w_ih, const Tensor& w_hh, const Tensor& b_ih, const Tensor& b_hh) { - return SimpleCell{}(input, hx, CellParams{w_ih, w_hh, b_ih, b_hh}); + static at::Tensor undefined; + return SimpleCell{}(input, hx, CellParams{w_ih, w_hh, b_ih, b_hh, undefined}); } Tensor rnn_relu_cell( const Tensor& input, const Tensor& hx, const Tensor& w_ih, const Tensor& w_hh, const Tensor& b_ih, const Tensor& b_hh) { - return SimpleCell{}(input, hx, CellParams{w_ih, w_hh, b_ih, b_hh}); + static at::Tensor undefined; + return SimpleCell{}(input, hx, CellParams{w_ih, w_hh, b_ih, b_hh, undefined}); } // Quantized implementations @@ -1592,6 +1637,7 @@ std::tuple quantized_lstm_input( params.emplace_back(static_cast>(param)); } TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx[0].size(2) == hx[1].size(2), "quantized LSTM with projections is not supported"); auto result_dtype = dtype.has_value() ? dtype.value() : at::kChar; auto input = batch_first ? _input.transpose(0, 1) : _input; TORCH_CHECK(has_biases, "quantized LSTM requires biases"); @@ -1685,6 +1731,7 @@ std::tuple quantized_lstm_data( params.emplace_back(static_cast>(param)); } TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx[0].size(2) == hx[1].size(2), "quantized LSTM with projections is not supported"); auto result_dtype = dtype.has_value() ? dtype.value() : at::kChar; diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 4e53b1835c6d..1dc9d5cba945 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -19,7 +19,7 @@ namespace at { namespace native { Tensor _cudnn_rnn_flatten_weight( TensorList weight_arr, int64_t weight_stride0, int64_t input_size, - int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, int64_t fn_num_layers, bool batch_first, bool fn_bidirectional ) { @@ -30,7 +30,7 @@ std::tuple _cudnn_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx, - int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const Tensor& fn_dropout_state @@ -42,7 +42,7 @@ std::tuple> _cudnn_rnn_backward( const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r, const Tensor& grad_cy_r, - int64_t mode, int64_t hidden_size, + int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const Tensor& dropout_state, const Tensor& reserve, @@ -92,6 +92,7 @@ namespace { struct RNNDescriptorParams { int64_t hidden_size; + int64_t proj_size; int64_t num_layers; cudnnDirectionMode_t bidirectional; cudnnRNNMode_t mode; @@ -135,19 +136,19 @@ namespace { this->algo = algo; } - void set(int64_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional, cudnnDataType_t 
datatype, cudnnDataType_t input_datatype) { + void set(int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype, cudnnDataType_t input_datatype) { this->set_mode(mode); this->hidden_size = hidden_size; + this->proj_size = proj_size; this->num_layers = num_layers; this->set_bidirectional(bidirectional); this->datatype = datatype; this->input_datatype = input_datatype; } - RNNDescriptor descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { RNNDescriptor rnn_desc; - rnn_desc.set(handle, hidden_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype, input_datatype, algo, at::globalContext().allowTF32CuDNN()); + rnn_desc.set(handle, hidden_size, proj_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype, input_datatype, algo, at::globalContext().allowTF32CuDNN()); return rnn_desc; } @@ -359,7 +360,7 @@ namespace { size_t weight_size; AT_CUDNN_CHECK(cudnnGetRNNParamsSize(handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype)); auto elem_size = dataSize(datatype); - AT_ASSERTM(weight_size % elem_size == 0, "cudnnGetRNNParamsSize returned nonsensical weight_size"); + TORCH_INTERNAL_ASSERT(weight_size % elem_size == 0, "cudnnGetRNNParamsSize returned nonsensical weight_size"); return weight_size / elem_size; } @@ -378,6 +379,58 @@ namespace { } } + void add_projection_weights( + cudnnHandle_t handle, + const RNNDescriptor& rnn_desc, + const TensorDescriptor& x_desc, + const FilterDescriptor& w_desc, + const Tensor& weight_buf, + int64_t layer, + std::vector& params + ) { + void* matrix_pointer = nullptr; + // assuming it's LSTM which has 8 "linear layers" (i.e. 4 weights and 4 biases) + int64_t linear_id = 8; + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + /*handle=*/handle, + /*rnnDesc=*/rnn_desc.desc(), + /*layer=*/layer, + /*xDesc=*/x_desc.desc(), + /*wDesc=*/w_desc.desc(), + /*w=*/weight_buf.data_ptr(), + /*linLayerID=*/linear_id, + /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), + /*linLayerMat=*/&matrix_pointer)); + + cudnnDataType_t data_type; + cudnnTensorFormat_t format; + int nb_dims; + constexpr int min_dim = 3; + int filter_dim_a[min_dim]; + AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &format, + &nb_dims, + filter_dim_a + )); + + TORCH_INTERNAL_ASSERT(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); + auto elem_size = dataSize(getCudnnDataType(weight_buf)); + auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); + TORCH_INTERNAL_ASSERT(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); + size_t offset = offset_bytes / elem_size; + + int mat_numel = prod_intlist(filter_dim_a, filter_dim_a + nb_dims); + // Generate a new parameter tensor which is a view into the weight_buf. + std::initializer_list size = {mat_numel, 1}; + Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); + params.emplace_back(std::move(param)); + } + + /* Returns weight and bias tensors for each layer of the RNN. These tensors are views on the underlying weight buffer allocated by CuDNN. 
@@ -433,24 +486,20 @@ namespace { cudnnTensorFormat_t format; int nb_dims; constexpr int min_dim = 3; - // TODO: The use of CPU tensor here is a bit goofy in C++, - // some sort of alloca would be good enough except that it is - // kind of convenient to be able to prod() on it. - Tensor filter_dim_a = at::empty(min_dim, at::initialTensorOptions().dtype(kInt)); + int filter_dim_a[min_dim]; AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( lin_layer_mat_desc.desc(), min_dim, &data_type, &format, &nb_dims, - filter_dim_a.data_ptr() + filter_dim_a )); - AT_ASSERTM(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); - filter_dim_a = filter_dim_a.slice(0, 0, nb_dims); + TORCH_INTERNAL_ASSERT(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); auto elem_size = dataSize(getCudnnDataType(weight_buf)); auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); - AT_ASSERTM(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); + TORCH_INTERNAL_ASSERT(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); size_t offset = offset_bytes / elem_size; // for all the RNN types provided by CUDNN, all the ih weights @@ -458,7 +507,7 @@ namespace { // (same for the hh weights, and the ih and hh biases). // Since we're storing all the weights in a single tensor anyway, // might as well merge the CUDNN ones into a single tensor as well - int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data_ptr(); + int mat_numel = prod_intlist(filter_dim_a, filter_dim_a + nb_dims); if (linear_id == 0 || linear_id == num_linear_layers / 2) { // We could also exclude bias params by restricting cudnn_methods to just { cudnnGetRNNLinLayerMatrixParams } // at the very top. However, to do so would throw off the cur_offset account, which is currently a strict @@ -474,15 +523,20 @@ namespace { layer_params_count++; } } else { - AT_ASSERTM(cur_offset == offset, "cur_offset = ", cur_offset, "; offset = ", offset); + TORCH_INTERNAL_ASSERT(cur_offset == offset, "cur_offset = ", cur_offset, "; offset = ", offset); } cur_offset = offset + mat_numel; } } // for cudnn_method + if (rnn.proj_size != 0) { + add_projection_weights(handle, rnn_desc, x_desc, w_desc, weight_buf, layer, params); + layer_params_count++; + } + if (layer == 0) { global_layer_params_count = layer_params_count; } else { - AT_ASSERTM(global_layer_params_count == layer_params_count, + TORCH_INTERNAL_ASSERT(global_layer_params_count == layer_params_count, "global_layer_params_count = ", global_layer_params_count, "; layer_params_count = ", layer_params_count); } @@ -502,7 +556,11 @@ namespace { int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers; const auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; std::vector data_ptrs; - data_ptrs.reserve(num_dir_layers * 2 * 2); + if (rnn.proj_size != 0) { + data_ptrs.reserve(num_dir_layers * (2 * 2 + 1)); + } else { + data_ptrs.reserve(num_dir_layers * 2 * 2); + } for (int64_t layer = 0; layer < num_dir_layers; layer++) { for (auto cudnn_method : cudnn_methods) { // This API returns a separate pointer for weight of every gate, @@ -526,34 +584,73 @@ namespace { data_ptrs.push_back(matrix_pointer); } } + if (rnn.proj_size != 0) { + // assuming it's LSTM which has 8 "linear layers" (i.e. 
4 weights and 4 biases) + int64_t linear_id = 8; + FilterDescriptor lin_layer_mat_desc; + void* matrix_pointer; + AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer + )); + data_ptrs.push_back(matrix_pointer); + } } return data_ptrs; } + void _viewOrCopyOneParam(const Tensor& param_from, const Tensor& param_to, + bool copy, bool allow_type_change=false) { + // if copying, allow_type_change may be true or false. + // if viewing, allow_type_change must be false. + TORCH_INTERNAL_ASSERT(copy || !allow_type_change, + "if viewing, type change is not allowed."); + TORCH_INTERNAL_ASSERT(allow_type_change || (param_from.scalar_type() == param_to.scalar_type()), + "parameter types mismatch"); + if (copy) { + param_to.copy_(param_from.view_as(param_to)); + } else { + param_from.resize_as_(param_to); + } + } + void _viewOrCopyParams(MatrixRef params_from, MatrixRef params_to, bool copy, bool allow_type_change=false) { - AT_ASSERTM(params_from.size(0) == params_to.size(0), "number of layers mismatch"); + TORCH_INTERNAL_ASSERT(params_from.size(0) == params_to.size(0), "number of layers mismatch"); for (size_t i = 0; i < params_from.size(0); i++) { auto layer_params_from = params_from[i]; auto layer_params_to = params_to[i]; // NOTE: these lists have all weights before all biases, so if the layer // doesn't use biases, iteration will terminate once layer_params_from ends // and ignore them. + + // NOTE: there is an exception from the above statement. If LSTMs with projections + // are used, weights layout will be w_ih, w_hh, b_ih, b_hh, w_hr. So need to handle no-bias + // case specially, because will need to copy 0->0, 1->1, 2->4. This case can be uniquely + // identified by checking if number of defined parameters for each layer is 3. + if (layer_params_from.size() == 3 && layer_params_to.size() != 3) { + _viewOrCopyOneParam(layer_params_from[0], layer_params_to[0], copy, allow_type_change); + _viewOrCopyOneParam(layer_params_from[1], layer_params_to[1], copy, allow_type_change); + _viewOrCopyOneParam(layer_params_from[2], layer_params_to[4], copy, allow_type_change); + continue; + } + if (layer_params_to.size() == 3 && layer_params_from.size() != 3) { + _viewOrCopyOneParam(layer_params_from[0], layer_params_to[0], copy, allow_type_change); + _viewOrCopyOneParam(layer_params_from[1], layer_params_to[1], copy, allow_type_change); + _viewOrCopyOneParam(layer_params_from[4], layer_params_to[2], copy, allow_type_change); + continue; + } for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); - a != layer_params_from.end() && b != layer_params_to.end(); - ++a, ++b) { - auto param_from = *a, param_to = *b; - // if copying, allow_type_change may be true or false. - // if viewing, allow_type_change must be false. 
- TORCH_INTERNAL_ASSERT(copy || !allow_type_change, - "if viewing, type change is not allowed."); - TORCH_INTERNAL_ASSERT(allow_type_change || (param_from.scalar_type() == param_to.scalar_type()), - "parameter types mismatch"); - if (copy) { - param_to.copy_(param_from.view_as(param_to)); - } else { - param_from.resize_as_(param_to); - } + a != layer_params_from.end() && b != layer_params_to.end(); + ++a, ++b) { + _viewOrCopyOneParam(*a, *b, copy, allow_type_change); } } } @@ -576,14 +673,26 @@ namespace { } std::vector _hidden_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { + if (rnn.proj_size != 0) { + return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.proj_size}; + } else { + return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; + } + } + + std::vector _cell_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; } std::vector _output_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { + auto out_size = rnn.hidden_size; + if (rnn.proj_size != 0) { + out_size = rnn.proj_size; + } if (tensors.is_input_packed()) { - return {tensors.batch_sizes_sum, rnn.hidden_size * rnn.num_directions()}; + return {tensors.batch_sizes_sum, out_size * rnn.num_directions()}; } else { - return {tensors.seq_length, tensors.mini_batch, rnn.hidden_size * rnn.num_directions()}; + return {tensors.seq_length, tensors.mini_batch, out_size * rnn.num_directions()}; } } @@ -633,6 +742,11 @@ namespace { } cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input) { + // LSTM with projections only works with standard algorithm + if (rnn.proj_size != 0) { + return CUDNN_RNN_ALGO_STANDARD; + } + if (getCudnnDataType(input) == CUDNN_DATA_HALF && !tensors.is_input_packed()) { if (use_persist_common_heuristics(rnn, tensors) && @@ -661,6 +775,7 @@ namespace cudnn_rnn { int64_t input_size, int64_t mode, int64_t hidden_size, + int64_t proj_size, int64_t num_layers, bool batch_first, bool bidirectional, @@ -673,12 +788,11 @@ namespace cudnn_rnn { // because to extract flat_buf_datatype from flat_buf_options, we'd need to say // auto flat_buf_datatype = getCudnnDataTypeFromScalarType(typeMetaToScalarType(options.dtype())); // typeMetaToScalarType is a surprisingly nontrivial function. We should avoid it if we can. 
- TORCH_CHECK(weight_arr.size() > 0, "copy_weights_to_flat_buf_views: cannot flatten empty weight list"); RNNDescriptorParams rnn; - rnn.set(mode, hidden_size, num_layers, bidirectional, promote_rnn_math_type(flat_buf_datatype), flat_buf_datatype); + rnn.set(mode, hidden_size, proj_size, num_layers, bidirectional, promote_rnn_math_type(flat_buf_datatype), flat_buf_datatype); auto handle = getCudnnHandle(); RNNDescriptor rnn_desc = rnn.descriptor(handle); @@ -700,21 +814,27 @@ namespace cudnn_rnn { std::vector params_arr; size_t params_stride0; std::tie(params_arr, params_stride0) = get_parameters(handle, rnn, rnn_desc, x_desc, w_desc, weight_buf, include_bias); - MatrixRef weight{weight_arr, static_cast(weight_stride0)}, params{params_arr, params_stride0}; // Copy weights _viewOrCopyParams(weight, params, /*copy=*/true, allow_type_change); - if (set_orig_weights_to_flat_buf) { // Update the storage for (size_t i = 0; i < weight.size(0); i++) { - for (auto orig_param_it = weight[i].begin(), new_param_it = params[i].begin(); - orig_param_it != weight[i].end() && new_param_it != params[i].end(); - orig_param_it++, new_param_it++) { - auto orig_param = *orig_param_it, new_param = *new_param_it; - orig_param.set_(new_param.view_as(orig_param)); + // There is a special case for LSTM with projections and no bias, + // where weight copy is done in 0->0, 1->1, 2->4 layout + if (weight[i].size() == 3 && params[i].size() == 5) { + weight[i][0].set_(params[i][0].view_as(weight[i][0])); + weight[i][1].set_(params[i][1].view_as(weight[i][1])); + weight[i][2].set_(params[i][4].view_as(weight[i][2])); + } else { + for (auto orig_param_it = weight[i].begin(), new_param_it = params[i].begin(); + orig_param_it != weight[i].end() && new_param_it != params[i].end(); + orig_param_it++, new_param_it++) { + auto orig_param = *orig_param_it, new_param = *new_param_it; + orig_param.set_(new_param.view_as(orig_param)); + } } } } @@ -733,7 +853,7 @@ using namespace cudnn_rnn; Tensor _cudnn_rnn_flatten_weight( TensorList weight_arr, int64_t weight_stride0, int64_t input_size, - int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, int64_t fn_num_layers, bool batch_first, bool fn_bidirectional ) { @@ -744,6 +864,7 @@ Tensor _cudnn_rnn_flatten_weight( input_size, fn_mode, fn_hidden_size, + fn_proj_size, fn_num_layers, batch_first, fn_bidirectional, @@ -762,12 +883,11 @@ std::tuple _cudnn_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx, - int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const Tensor& fn_dropout_state ) { - check_attributes(input_r, weight, {hx, cx}, /*check_dtype=*/true); auto input = input_r; auto weight_buf = weight_buf_r; @@ -781,7 +901,7 @@ std::tuple _cudnn_rnn( } RNNParams fn; auto datatype = getCudnnDataType(input); - fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -799,6 +919,7 @@ std::tuple _cudnn_rnn( } auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + auto cell_size 
= _cell_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); TORCH_CHECK(hx.is_contiguous(), @@ -811,7 +932,7 @@ std::tuple _cudnn_rnn( auto hy = at::empty(hidden_size, hx.options()); Tensor cy; if (cx.defined()) { - cy = at::empty(hidden_size, cx.options()); + cy = at::empty(cell_size, cx.options()); } else { cy = at::empty({0}, hx.options()); // NB: Not allowed to return undefined tensors } @@ -837,9 +958,8 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); } - TORCH_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), - "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes()); - + TORCH_CHECK(!cx.defined() || cx.sizes().equals(cell_size), + "Expected cell size ", IntArrayRef{cell_size}, ", got ", cx.sizes()); size_t workspace_size; auto x_descs_arr = descs.get_x_descs(); auto y_descs_arr = descs.get_y_descs(); @@ -851,7 +971,6 @@ std::tuple _cudnn_rnn( &workspace_size )); Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); - Tensor reserve; // NB: Previously, the test was for fn.requires_grad, but we don't have // this information. Use 'train' as a proxy. @@ -908,7 +1027,7 @@ std::tuple _cudnn_rnn_backward_input( const Tensor& input_r, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, const Tensor& output_r, const Tensor& grad_output_r, const Tensor& grad_hy, const Tensor& grad_cy, - int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const Tensor& fn_dropout_state, const Tensor& fn_reserve, @@ -921,7 +1040,7 @@ std::tuple _cudnn_rnn_backward_input( RNNParams fn; auto datatype = getCudnnDataType(input); - fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -942,6 +1061,7 @@ std::tuple _cudnn_rnn_backward_input( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); + auto cell_size = _cell_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); TORCH_CHECK(hx.is_contiguous(), @@ -955,10 +1075,10 @@ std::tuple _cudnn_rnn_backward_input( auto w = weight_buf; auto dx = at::empty(input.sizes(), input.options()); // TODO: more compact way of saying this auto dhy = grad_hy.contiguous().view(hidden_size); - auto dcy = grad_cy.defined() ? grad_cy.contiguous().view(hidden_size) : Tensor(); + auto dcy = grad_cy.defined() ? grad_cy.contiguous().view(cell_size) : Tensor(); auto dhx = at::empty(hidden_size, hx.options()); - AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); - auto dcx = cx.defined() ? at::empty(hidden_size, cx.options()) : Tensor(); + TORCH_INTERNAL_ASSERT(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); + auto dcx = cx.defined() ? 
at::empty(cell_size, cx.options()) : Tensor(); TORCH_CHECK(fn_train, "cudnn RNN backward can only be called in training mode"); @@ -970,12 +1090,12 @@ std::tuple _cudnn_rnn_backward_input( TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); - TORCH_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), - "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes()); + TORCH_CHECK(!cx.defined() || cx.sizes().equals(cell_size), + "Expected cell size ", IntArrayRef{cell_size}, ", got ", cx.sizes()); TORCH_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), "Expected d_hidden size ", IntArrayRef{hidden_size}, ", got ", dhy.sizes()); - TORCH_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), - "Expected d_cell size ", IntArrayRef{hidden_size}, ", got ", dcy.sizes()); + TORCH_CHECK(!dcy.defined() || dcy.sizes().equals(cell_size), + "Expected d_cell size ", IntArrayRef{cell_size}, ", got ", dcy.sizes()); TORCH_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), "Gradients aren't CUDA tensors"); @@ -1031,20 +1151,19 @@ std::vector _cudnn_rnn_backward_weight( const Tensor& input_r, TensorList weight_arr, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, const Tensor& output_r, - int64_t fn_mode, int64_t fn_hidden_size, + int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const Tensor& fn_dropout_state, const Tensor& fn_reserve ) { MatrixRef weight{ weight_arr, static_cast(weight_stride0) }; - auto input = input_r; auto output = output_r; RNNParams fn; auto datatype = getCudnnDataType(input); - fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -1140,7 +1259,7 @@ std::tuple> _cudnn_rnn_backward( const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r, const Tensor& grad_cy_r, - int64_t mode, int64_t hidden_size, + int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const Tensor& dropout_state, const Tensor& reserve, @@ -1156,10 +1275,10 @@ std::tuple> _cudnn_rnn_backward( Tensor dx, dhx, dcx; // NB: unconditionally compute this gradient, because it mutates reserve - std::tie(dx, dhx, dcx) = at::native::_cudnn_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); + std::tie(dx, dhx, dcx) = at::native::_cudnn_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); std::vector dw; if (output_mask[3]) { - dw = at::native::_cudnn_rnn_backward_weight(input, weight, 
weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); + dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); } return std::tuple>{dx, dhx, dcx, dw}; } @@ -1266,7 +1385,7 @@ DropoutState& get_dropout_state(double dropout_p, bool train, TensorOptions opti Tensor try_get_weight_buf( const Tensor& input, TensorList parameters, bool has_biases, - cudnnRNNMode_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional) { + cudnnRNNMode_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool bidirectional) { // Prepare all relevant descriptors auto handle = getCudnnHandle(); @@ -1274,7 +1393,7 @@ Tensor try_get_weight_buf( auto datatype = getCudnnDataType(any_param); RNNDescriptorParams rnn; - rnn.set(mode, hidden_size, num_layers, bidirectional, promote_rnn_math_type(datatype), datatype); + rnn.set(mode, hidden_size, proj_size, num_layers, bidirectional, promote_rnn_math_type(datatype), datatype); RNNDescriptor rnn_desc = rnn.descriptor(handle); TensorGeometry x_geom ({1, input.size(-1)}); @@ -1301,13 +1420,34 @@ Tensor try_get_weight_buf( int64_t num_parameters = parameters.size(); int64_t num_ptrs = expected_data_ptrs.size(); - AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2))); - AT_ASSERT(num_ptrs % (has_biases ? 4 : 2) == 0); - for (int64_t param_i = 0, ptr_i = 0; - ptr_i < num_ptrs; - ptr_i += (has_biases ? 2 : 4), param_i += 2) { - if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; - if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; + if (proj_size != 0) { + AT_ASSERT(num_parameters % (has_biases ? 5 : 3) == 0); + AT_ASSERT(num_ptrs % 5 == 0); + if (has_biases) { + AT_ASSERT(num_ptrs == num_parameters); + for (int64_t i = 0; i < num_parameters; i++) { + if (expected_data_ptrs[i] != parameters[i].data_ptr()) return {}; + } + } else { + AT_ASSERT(num_parameters % 3 == 0); + AT_ASSERT(num_ptrs == num_parameters * 5 / 3); + for (int64_t param_i = 0, ptr_i = 0; + ptr_i < num_ptrs; + ptr_i += 5, param_i += 3) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; + if (expected_data_ptrs[ptr_i + 4] != parameters[param_i + 2].data_ptr()) return {}; + } + } + } else { + AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2))); + AT_ASSERT(num_parameters % (has_biases ? 4 : 2) == 0); + for (int64_t param_i = 0, ptr_i = 0; + ptr_i < num_ptrs; + ptr_i += (has_biases ? 2 : 4), param_i += 2) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; + } } if (!parameters[num_parameters - 1].is_contiguous()) return {}; return weight_buf; @@ -1321,22 +1461,32 @@ std::pair _cudnn_impl( Tensor hx, cx; std::tie(hx, cx) = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); + int64_t proj_size = 0; + // For LSTM models with projections hidden size could be different + if (cx.defined() && cx.size(2) != hx.size(2)) { + hidden_size = cx.size(2); + proj_size = hx.size(2); + } // TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a c10::optional // in weight_buf's slot. 
Do we want try_get_weight_buf to return a c10::optional // instead of a defined or undefined Tensor? auto weight_buf = try_get_weight_buf( - input, params, has_biases, mode, hidden_size, num_layers, bidirectional); + input, params, has_biases, mode, hidden_size, proj_size, num_layers, bidirectional); TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); IntArrayRef batch_sizes { _batch_sizes.data_ptr(), static_cast(_batch_sizes.size(0)) }; auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); std::unique_lock lock { dropout_state }; + int64_t num_params = has_biases ? 4 : 2; + if (proj_size != 0) { + ++num_params; + } // cudnn_output = std::tuple auto cudnn_output = at::_cudnn_rnn( - input, params, has_biases ? 4 : 2, weight_buf, - hx, cx, static_cast(mode), hidden_size, num_layers, /*batch_first=*/false, + input, params, num_params, weight_buf, + hx, cx, static_cast(mode), hidden_size, proj_size, num_layers, /*batch_first=*/false, dropout_p, train, bidirectional, batch_sizes, dropout_state.buffer); return {std::get<0>(cudnn_output), @@ -1351,16 +1501,24 @@ std::pair _cudnn_impl( Tensor hx, cx; std::tie(hx, cx) = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); - + int64_t proj_size = 0; + // For LSTM models with projections hidden size could be different + if (cx.defined() && cx.size(2) != hx.size(2)) { + hidden_size = cx.size(2); + proj_size = hx.size(2); + } auto weight_buf = try_get_weight_buf( - input, params, has_biases, mode, hidden_size, num_layers, bidirectional); - + input, params, has_biases, mode, hidden_size, proj_size, num_layers, bidirectional); auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); std::unique_lock lock { dropout_state }; + int64_t num_params = has_biases ? 4 : 2; + if (proj_size != 0) { + ++num_params; + } // cudnn_output = std::tuple auto cudnn_output = at::_cudnn_rnn( - input, params, has_biases ? 4 : 2, weight_buf, - hx, cx, static_cast(mode), hidden_size, num_layers, batch_first, dropout_p, + input, params, num_params, weight_buf, + hx, cx, static_cast(mode), hidden_size, proj_size, num_layers, batch_first, dropout_p, train, bidirectional, /*batch_sizes=*/{}, dropout_state.buffer); return {std::get<0>(cudnn_output), diff --git a/aten/src/ATen/native/cudnn/RNNUtils.h b/aten/src/ATen/native/cudnn/RNNUtils.h index 89b58ebef1d8..e1b79bb3c81f 100644 --- a/aten/src/ATen/native/cudnn/RNNUtils.h +++ b/aten/src/ATen/native/cudnn/RNNUtils.h @@ -14,6 +14,7 @@ TORCH_CUDA_API std::tuple> copy_weights_to_flat_buf_ int64_t input_size, int64_t mode, int64_t hidden_size, + int64_t proj_size, int64_t num_layers, bool batch_first, bool bidirectional, diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 61e5e42b38d5..1c0eb48f3bc3 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -145,17 +145,17 @@ - func: _use_cudnn_rnn_flatten_weight() -> bool use_c10_dispatcher: full -- func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, int input_size, int mode, int hidden_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor +- func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, int input_size, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn_flatten_weight -- func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? 
weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) +- func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures dispatch: CUDA: _cudnn_rnn -- func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) +- func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures dispatch: CUDA: _cudnn_rnn_backward diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index fa2e54844935..b259d3532851 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -187,6 +187,9 @@ ("aten::ifft", datetime.date(2021, 1, 31)), ("aten::irfft", datetime.date(2021, 1, 31)), ("aten::rfft", datetime.date(2021, 1, 31)), + ("aten::_cudnn_rnn_flatten_weight", datetime.date(2020, 12, 31)), + ("aten::_cudnn_rnn", datetime.date(2020, 12, 31)), + ("aten::_cudnn_rnn_backward", datetime.date(2020, 12, 31)), ("aten::quantile", datetime.date(2021, 1, 31)), ("aten::nanquantile", datetime.date(2021, 1, 31)), ("aten::_fft_with_size", datetime.date(2021, 1, 31)), diff --git a/test/cpp/api/rnn.cpp b/test/cpp/api/rnn.cpp index c51c38660c2d..e5be819b4804 100644 --- a/test/cpp/api/rnn.cpp +++ b/test/cpp/api/rnn.cpp @@ -14,8 +14,13 @@ bool test_RNN_xor(Func&& model_maker, bool cuda = false) { auto nhid = 32; auto model = std::make_shared(); auto l1 = model->add(Linear(1, nhid), "l1"); - auto rnn = model->add(model_maker(nhid), "rnn"); - auto lo = model->add(Linear(nhid, 1), "lo"); + auto rnn_model = model_maker(nhid); + auto rnn = model->add(rnn_model, "rnn"); + auto nout = nhid; + if (rnn_model.get()->options_base.proj_size() > 0) { + nout = rnn_model.get()->options_base.proj_size(); + } + auto lo = model->add(Linear(nout, 1), "lo"); torch::optim::Adam optimizer(model->parameters(), 1e-2); auto forward_op = [&](torch::Tensor x) { @@ -44,7 +49,6 @@ bool test_RNN_xor(Func&& model_maker, bool cuda = false) { torch::rand({nlen, bs, 1}, backend).round().to(torch::kFloat32); auto labels = inputs.sum(0).detach(); inputs.set_requires_grad(true); - auto outputs = forward_op(inputs); torch::Tensor loss = torch::mse_loss(outputs, labels); @@ -90,6 +94,35 @@ void check_lstm_sizes(std::tuple(), 0); } +void check_lstm_sizes_proj(std::tuple> 
lstm_output) { + // Expect the LSTM to have 32 outputs and 3 layers, with an input of batch + // 10 and 16 time steps (10 x 16 x n) + + torch::Tensor output = std::get<0>(lstm_output); + std::tuple state = std::get<1>(lstm_output); + torch::Tensor hx = std::get<0>(state); + torch::Tensor cx = std::get<1>(state); + + ASSERT_EQ(output.ndimension(), 3); + ASSERT_EQ(output.size(0), 10); + ASSERT_EQ(output.size(1), 16); + ASSERT_EQ(output.size(2), 32); + + ASSERT_EQ(hx.ndimension(), 3); + ASSERT_EQ(hx.size(0), 3); // layers + ASSERT_EQ(hx.size(1), 16); // Batchsize + ASSERT_EQ(hx.size(2), 32); // 32 hidden dims + + ASSERT_EQ(cx.ndimension(), 3); + ASSERT_EQ(cx.size(0), 3); // layers + ASSERT_EQ(cx.size(1), 16); // Batchsize + ASSERT_EQ(cx.size(2), 64); // 64 cell dims + + // Something is in the hiddens + ASSERT_GT(hx.norm().item(), 0); + ASSERT_GT(cx.norm().item(), 0); +} + struct RNNTest : torch::test::SeedingFixture {}; TEST_F(RNNTest, CheckOutputSizes) { @@ -118,6 +151,33 @@ TEST_F(RNNTest, CheckOutputSizes) { ASSERT_GT(diff.abs().sum().item(), 1e-3); } +TEST_F(RNNTest, CheckOutputSizesProj) { + LSTM model(LSTMOptions(128, 64).num_layers(3).dropout(0.2).proj_size(32)); + // Input size is: sequence length, batch size, input size + auto x = torch::randn({10, 16, 128}, torch::requires_grad()); + auto output = model->forward(x); + auto y = x.mean(); + + y.backward(); + check_lstm_sizes_proj(output); + + auto next = model->forward(x, std::get<1>(output)); + + check_lstm_sizes_proj(next); + + auto output_hx = std::get<0>(std::get<1>(output)); + auto output_cx = std::get<1>(std::get<1>(output)); + + auto next_hx = std::get<0>(std::get<1>(next)); + auto next_cx = std::get<1>(std::get<1>(next)); + + torch::Tensor diff = next_hx - output_hx; + // Hiddens changed + ASSERT_GT(diff.abs().sum().item(), 1e-3); + diff = next_cx - output_cx; + ASSERT_GT(diff.abs().sum().item(), 1e-3); +} + TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { torch::manual_seed(0); // Make sure the outputs match pytorch outputs @@ -192,6 +252,11 @@ TEST_F(RNNTest, EndToEndLSTM) { [](int s) { return LSTM(LSTMOptions(s, s).num_layers(2)); })); } +TEST_F(RNNTest, EndToEndLSTMProj) { + ASSERT_TRUE(test_RNN_xor( + [](int s) { return LSTM(LSTMOptions(s, s).num_layers(2).proj_size(s / 2)); })); +} + TEST_F(RNNTest, EndToEndGRU) { ASSERT_TRUE( test_RNN_xor([](int s) { return GRU(GRUOptions(s, s).num_layers(2)); })); @@ -235,11 +300,45 @@ TEST_F(RNNTest, Sizes_CUDA) { ASSERT_GT(diff.abs().sum().item(), 1e-3); } +TEST_F(RNNTest, SizesProj_CUDA) { + torch::manual_seed(0); + LSTM model(LSTMOptions(128, 64).num_layers(3).dropout(0.2).proj_size(32)); + model->to(torch::kCUDA); + auto x = + torch::randn({10, 16, 128}, torch::requires_grad().device(torch::kCUDA)); + auto output = model->forward(x); + auto y = x.mean(); + + y.backward(); + check_lstm_sizes_proj(output); + + auto next = model->forward(x, std::get<1>(output)); + + check_lstm_sizes_proj(next); + + auto output_hx = std::get<0>(std::get<1>(output)); + auto output_cx = std::get<1>(std::get<1>(output)); + + auto next_hx = std::get<0>(std::get<1>(next)); + auto next_cx = std::get<1>(std::get<1>(next)); + + torch::Tensor diff = next_hx - output_hx; + // Hiddens changed + ASSERT_GT(diff.abs().sum().item(), 1e-3); + diff = next_cx - output_cx; + ASSERT_GT(diff.abs().sum().item(), 1e-3); +} + TEST_F(RNNTest, EndToEndLSTM_CUDA) { ASSERT_TRUE(test_RNN_xor( [](int s) { return LSTM(LSTMOptions(s, s).num_layers(2)); }, true)); } +TEST_F(RNNTest, EndToEndLSTMProj_CUDA) { + 
ASSERT_TRUE(test_RNN_xor( + [](int s) { return LSTM(LSTMOptions(s, s).num_layers(2).proj_size(s / 2)); }, true)); +} + TEST_F(RNNTest, EndToEndGRU_CUDA) { ASSERT_TRUE(test_RNN_xor( [](int s) { return GRU(GRUOptions(s, s).num_layers(2)); }, true)); @@ -258,6 +357,9 @@ TEST_F(RNNTest, PrettyPrintRNNs) { ASSERT_EQ( c10::str(LSTM(LSTMOptions(128, 64).num_layers(3).dropout(0.2))), "torch::nn::LSTM(input_size=128, hidden_size=64, num_layers=3, bias=true, batch_first=false, dropout=0.2, bidirectional=false)"); + ASSERT_EQ( + c10::str(LSTM(LSTMOptions(128, 64).num_layers(3).dropout(0.2).proj_size(32))), + "torch::nn::LSTM(input_size=128, hidden_size=64, num_layers=3, bias=true, batch_first=false, dropout=0.2, bidirectional=false, proj_size=32)"); ASSERT_EQ( c10::str(GRU(GRUOptions(128, 64).num_layers(3).dropout(0.5))), "torch::nn::GRU(input_size=128, hidden_size=64, num_layers=3, bias=true, batch_first=false, dropout=0.5, bidirectional=false)"); @@ -503,6 +605,55 @@ TEST_F(RNNTest, BidirectionalMultilayerLSTM_CPU_vs_CUDA) { } } +TEST_F(RNNTest, BidirectionalMultilayerLSTMProj_CPU_vs_CUDA) { + // Create two LSTMs with the same options + auto opt = LSTMOptions(2, 4).num_layers(3).batch_first(false).bidirectional(true).proj_size(2); + LSTM lstm_cpu {opt}; + LSTM lstm_cuda {opt}; + + // Copy weights and biases from CPU LSTM to CUDA LSTM + { + at::NoGradGuard guard; + for (const auto& param : lstm_cpu->named_parameters(/*recurse=*/false)) { + lstm_cuda->named_parameters()[param.key()].copy_(lstm_cpu->named_parameters()[param.key()]); + } + } + + lstm_cpu->flatten_parameters(); + lstm_cuda->flatten_parameters(); + + // Move LSTM to CUDA + lstm_cuda->to(torch::kCUDA); + + auto options = torch::TensorOptions() + .dtype(torch::kFloat32).requires_grad(false); + auto input_cpu = torch::tensor({1, 2, 3, 4, 5, 6}, options) + .reshape({3, 1, 2}); + auto input_cuda = torch::tensor({1, 2, 3, 4, 5, 6}, options) + .reshape({3, 1, 2}).to(torch::kCUDA); + + // Call forward on both LSTMs + auto output_cpu = lstm_cpu->forward(input_cpu); + auto output_cuda = lstm_cuda->forward(input_cuda); + + output_cpu = lstm_output_to_device(output_cpu, torch::kCPU); + + // Assert that the output and state are equal on CPU and CUDA + ASSERT_EQ(std::get<0>(output_cpu).dim(), std::get<0>(output_cuda).dim()); + for (int i = 0; i < std::get<0>(output_cpu).dim(); i++) { + ASSERT_EQ(std::get<0>(output_cpu).size(i), std::get<0>(output_cuda).size(i)); + } + for (int i = 0; i < std::get<0>(output_cpu).size(0); i++) { + for (int j = 0; j < std::get<0>(output_cpu).size(1); j++) { + for (int k = 0; k < std::get<0>(output_cpu).size(2); k++) { + ASSERT_NEAR( + std::get<0>(output_cpu)[i][j][k].item(), + std::get<0>(output_cuda)[i][j][k].item(), 1e-5); + } + } + } +} + TEST_F(RNNTest, UsePackedSequenceAsInput) { { torch::manual_seed(0); diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 9437197e986b..114b28912a3f 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -464,11 +464,11 @@ def test_rnn_init_predict_split(self): do_constant_folding=False)[0]) prepared = c2.prepare(mp, device='CPU') if self.embed_params: - assert len(prepared.init_net.op) == 875 - assert len(prepared.predict_net.op) == 130 + assert len(prepared.init_net.op) == 879 + assert len(prepared.predict_net.op) == 133 else: - assert len(prepared.init_net.op) == 8 - assert len(prepared.predict_net.op) == 997 + assert len(prepared.init_net.op) == 12 + assert len(prepared.predict_net.op) == 1000 
def test_alexnet(self): state_dict = model_zoo.load_url(model_urls['alexnet'], progress=False) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index c481d58e4bb5..b0c7143d0129 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2357,7 +2357,7 @@ def test_batchnorm1d_norunningstats(self): self.run_test(model, x) x = torch.randn(10, 10, 128) - self.run_test(model, x) + self.run_test(model, x) def test_batchnorm2d(self): x = torch.randn(10, 3, 128, 128) @@ -2642,6 +2642,21 @@ def forward(self, x): input = torch.randn((10, 16, 16)) self.run_test(LSTMModel(), (input,)) + @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # scripting prim_dtype + def test_lstm_proj_no_hidden(self): + class LSTMModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.rnn = torch.nn.LSTM(input_size=16, hidden_size=16, proj_size=8) + + def forward(self, x): + return self.rnn(x) + + input = torch.randn((10, 16, 16)) + with self.assertRaises(RuntimeError): + self.run_test(LSTMModel(), (input,)) + @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_lstm(self): diff --git a/test/test_nn.py b/test/test_nn.py index a3d18bc3e49c..a5b66cf3d8c7 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -2591,39 +2591,42 @@ def test_rnn_pruning(self): assert dict(l.named_parameters())['weight_ih_l0'] is not None assert 'weight_ih_l0_orig' not in dict(l.named_parameters()) - def test_rnn_weight_norm(self): - l = torch.nn.LSTM(32, 32) - # This Module has 4 parameters called: - # 'weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0' + def check_weight_norm(l, name, num_params): + # This Module has 4 or 5 parameters called: + # 'weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0', weight_hr_l0 - # Applying weight norm on one of them causes it to become a tensor - l = torch.nn.utils.weight_norm(l, name='weight_ih_l0') - assert ( - sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights]) - == 3 - ) + # Applying weight norm on one of them causes it to become a tensor + l = torch.nn.utils.weight_norm(l, name=name) + self.assertEqual( + sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights]), + num_params - 1, + ) - # Removing the weight norm reparametrization restores the Parameter - l = torch.nn.utils.remove_weight_norm(l, name='weight_ih_l0') - assert ( - sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights]) - == 4 - ) + # Removing the weight norm reparametrization restores the Parameter + l = torch.nn.utils.remove_weight_norm(l, name=name) + self.assertEqual( + sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights]), + num_params, + ) + + # Make sure that, upon removal of the reparametrization, the + # `._parameters` and `.named_parameters` contain the right params. + # Specifically, the original weight ('weight_ih_l0') should be placed + # back in the parameters, while the reparametrization components + # ('weight_ih_l0_v' and 'weight_ih_l0_g') should be removed. 
+ self.assertTrue(name in l._parameters) + self.assertIsNotNone(l._parameters[name]) + self.assertTrue(name + '_v' not in l._parameters) + self.assertTrue(name + '_g' not in l._parameters) + self.assertTrue(name in dict(l.named_parameters())) + self.assertIsNotNone(dict(l.named_parameters())[name]) + self.assertTrue(name + '_v' not in dict(l.named_parameters())) + self.assertTrue(name + '_g' not in dict(l.named_parameters())) + + check_weight_norm(torch.nn.LSTM(32, 32), 'weight_ih_l0', 4) + check_weight_norm(torch.nn.LSTM(32, 32, proj_size=16), 'weight_hr_l0', 5) - # Make sure that, upon removal of the reparametrization, the - # `._parameters` and `.named_parameters` contain the right params. - # Specifically, the original weight ('weight_ih_l0') should be placed - # back in the parameters, while the reparametrization components - # ('weight_ih_l0_v' and 'weight_ih_l0_g') should be removed. - assert 'weight_ih_l0' in l._parameters - assert l._parameters['weight_ih_l0'] is not None - assert 'weight_ih_l0_v' not in l._parameters - assert 'weight_ih_l0_g' not in l._parameters - assert 'weight_ih_l0' in dict(l.named_parameters()) - assert dict(l.named_parameters())['weight_ih_l0'] is not None - assert 'weight_ih_l0_v' not in dict(l.named_parameters()) - assert 'weight_ih_l0_g' not in dict(l.named_parameters()) def test_weight_norm(self): input = torch.randn(3, 5) @@ -5790,6 +5793,7 @@ def test_cudnn_rnn_dropout_states_device(self): def test_cudnn_weight_format(self): rnns = [ nn.LSTM(10, 20, batch_first=True), + nn.LSTM(10, 20, batch_first=True, proj_size=10), nn.GRU(10, 20, batch_first=True), nn.RNN(10, 20, batch_first=True) ] @@ -5800,6 +5804,10 @@ def test_cudnn_weight_format(self): hx = torch.randn(1, 5, 20, requires_grad=True, device="cuda") all_vars = [input, hx] + list(rnn.parameters()) if isinstance(rnn, nn.LSTM): + # LSTM with projections has different hx size + if rnn.proj_size > 0: + hx = torch.randn(1, 5, 10, requires_grad=True, device="cuda") + all_vars[1] = hx cx = torch.randn(1, 5, 20, requires_grad=True, device="cuda") all_vars[2:2] = [cx] hx = (hx, cx) @@ -5839,6 +5847,7 @@ def test_cudnn_weight_format(self): def test_cudnn_weight_tying(self): rnns = [ nn.LSTM(10, 20, batch_first=True, bidirectional=True), + nn.LSTM(10, 20, batch_first=True, bidirectional=True, proj_size=10), nn.GRU(10, 20, batch_first=True, bidirectional=True), nn.RNN(10, 20, batch_first=True, bidirectional=True) ] @@ -5851,6 +5860,10 @@ def test_cudnn_weight_tying(self): opt = torch.optim.SGD(rnn.parameters(), lr=0.1) opt.zero_grad() if isinstance(rnn, nn.LSTM): + # LSTM with projections has different hx size + if rnn.proj_size > 0: + hx = torch.randn(2, 5, 10, requires_grad=True, device="cuda") + all_vars[1] = hx cx = torch.randn(2, 5, 20, requires_grad=True, device="cuda") all_vars[2:2] = [cx] hx = (hx, cx) @@ -6111,6 +6124,82 @@ def get_inputs(input_shape, hidden_shape, mode): hidden_shape = update_shape(correct_hidden_shape, 0, bad_size) test(input_shape, hidden_shape, mode) + def test_projections_lstm_args_check(self): + input_size = 3 + hidden_size = 5 + proj_size = 2 + num_layers = 2 + batch_size = 4 + seq_len = 6 + num_directions = 1 + bad_size = 7 # prime number so that no size can divide it. 
+ + def test(input_shape, hidden_h_shape, hidden_c_shape): + for input, hidden in get_inputs(input_shape, hidden_h_shape, hidden_c_shape): + model = nn.LSTM(input_size, hidden_size, num_layers, proj_size=proj_size) + self.assertRaises(RuntimeError, lambda: model(input, hidden)) + + correct_input_shape = (seq_len, batch_size, input_size) + correct_hidden_h_shape = (num_layers * num_directions, batch_size, proj_size) + correct_hidden_c_shape = (num_layers * num_directions, batch_size, hidden_size) + + def update_shape(shape, dim, new_dim_size): + new_shape = list(shape) + new_shape[dim] = new_dim_size + return tuple(new_shape) + + def get_inputs(input_shape, hidden_h_shape, hidden_c_shape): + '''returns list( tuple(input, hidden) ) + where input, hidden are inputs to a model''' + input = torch.randn(input_shape) + hidden_h = torch.randn(hidden_h_shape) + hidden_c = torch.randn(hidden_c_shape) + return [(input, (hidden_h, hidden_c))] + + # Incorrect input batch size + input_shape = update_shape(correct_input_shape, 1, bad_size) + test(input_shape, correct_hidden_h_shape, correct_hidden_c_shape) + + # Incorrect hidden batch size + input_shape = correct_input_shape + hidden_h_shape = update_shape(correct_hidden_h_shape, 1, bad_size) + hidden_c_shape = update_shape(correct_hidden_c_shape, 1, bad_size) + test(input_shape, hidden_h_shape, hidden_c_shape) + + # Incorrect input size + input_shape = update_shape(correct_input_shape, 2, bad_size) + test(input_shape, correct_hidden_h_shape, correct_hidden_c_shape) + + # Incorrect hidden size + input_shape = correct_input_shape + hidden_h_shape = update_shape(correct_hidden_h_shape, 2, bad_size) + hidden_c_shape = update_shape(correct_hidden_c_shape, 2, bad_size) + test(input_shape, hidden_h_shape, hidden_c_shape) + + # Incorrect hidden[0] + input_shape = correct_input_shape + hidden_h_shape = update_shape(correct_hidden_h_shape, 0, bad_size) + hidden_c_shape = update_shape(correct_hidden_c_shape, 0, bad_size) + test(input_shape, hidden_h_shape, hidden_c_shape) + + # Incorrect proj size = hidden size + input_shape = correct_input_shape + hidden_h_shape = update_shape(correct_hidden_h_shape, 0, hidden_size) + hidden_c_shape = correct_hidden_c_shape + test(input_shape, hidden_h_shape, hidden_c_shape) + + # Incorrect proj size != hidden size + input_shape = correct_input_shape + hidden_h_shape = update_shape(correct_hidden_h_shape, 0, bad_size) + hidden_c_shape = correct_hidden_c_shape + test(input_shape, hidden_h_shape, hidden_c_shape) + + # Incorrect cell size != hidden size + input_shape = correct_input_shape + hidden_h_shape = correct_hidden_h_shape + hidden_c_shape = update_shape(correct_hidden_c_shape, 0, bad_size) + test(input_shape, hidden_h_shape, hidden_c_shape) + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_rnn_check_device(self): input_size = 3 @@ -6148,6 +6237,40 @@ def test_rnn_check_device(self): "Input and hidden tensors are not at the same device"): model(input.to('cuda:0'), (hidden.to('cuda:0'), hidden.to('cuda:1'))) + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + def test_projections_lstm_check_device(self): + input_size = 3 + hidden_size = 5 + proj_size = 2 + num_layers = 2 + batch_size = 4 + seq_len = 6 + num_directions = 1 + + correct_input_shape = (seq_len, batch_size, input_size) + correct_hidden_h_shape = (num_layers * num_directions, batch_size, proj_size) + correct_hidden_c_shape = (num_layers * num_directions, batch_size, hidden_size) + + model = nn.LSTM(input_size, hidden_size, 
num_layers, proj_size=proj_size) + input = torch.randn(correct_input_shape) + hidden_h = torch.randn(correct_hidden_h_shape) + hidden_c = torch.randn(correct_hidden_c_shape) + + # input and weights are not at the same device + with self.assertRaisesRegex(RuntimeError, + "Input and parameter tensors are not at the same device"): + model(input.to('cuda:0')) + + # input and hiddens are not at the same device + with self.assertRaisesRegex(RuntimeError, + r"Input and hidden tensors are not at the same device"): + model(input, (hidden_h.to('cuda:0'), hidden_c.to('cuda:0'))) + + # hidden tensors are not at the same CUDA device + with self.assertRaisesRegex(RuntimeError, + "Input and hidden tensors are not at the same device"): + model(input.to('cuda:0'), (hidden_h.to('cuda:0'), hidden_c.to('cuda:1'))) + def test_rnn_initial_hidden_state(self): rnn_modes = ['RNN', 'GRU', 'LSTM'] for mode in rnn_modes: @@ -6162,9 +6285,29 @@ def test_rnn_initial_hidden_state(self): self.assertEqual(output1, output2) self.assertEqual(hidden1, hidden2) + def test_projections_lstm_initial_hidden_state(self): + for bidir in [False, True]: + rnn = nn.LSTM(30, 20, 2, bidirectional=bidir, proj_size=10) + num_dirs = 2 if bidir else 1 + input = torch.randn(10, 32, 30) + hidden_h = torch.zeros(2 * num_dirs, 32, 10) + hidden_c = torch.zeros(2 * num_dirs, 32, 20) + hidden = (hidden_h, hidden_c) + output1, hidden1 = rnn(input, hidden) + output2, hidden2 = rnn(input) + self.assertEqual(output1, output2) + self.assertEqual(hidden1, hidden2) + + def test_projections_errors_on_gru_and_rnn(self): + error_msg = "proj_size argument is only supported for LSTM, not RNN or GRU" + for mode in ['RNN', 'GRU']: + with self.assertRaisesRegex(ValueError, error_msg): + rnn = getattr(nn, mode)(30, 20, 2, proj_size=10) + def _test_RNN_cpu_vs_cudnn(self, dropout, dtype=torch.double): - def forward_backward(cuda, rnn, input_val, hx_val, grad_output, grad_hy, weights_val): + def forward_backward(cuda, rnn, input_val, grad_output, weights_val, hx_val, grad_hy, + cx_val=None, grad_cy=None): is_lstm = isinstance(rnn, nn.LSTM) for x_layer, y_layer in zip(rnn.all_weights, weights_val): @@ -6179,8 +6322,12 @@ def forward_backward(cuda, rnn, input_val, hx_val, grad_output, grad_hy, weights input = input_val.clone().requires_grad_(True) input_var = input if is_lstm: - hx = (hx_val.clone().requires_grad_(True), - hx_val.add(1).requires_grad_(True)) + if cx_val is None: + hx = (hx_val.clone().requires_grad_(True), + hx_val.add(1).requires_grad_(True)) + else: + hx = (hx_val.clone().requires_grad_(True), + cx_val.add(1).requires_grad_(True)) else: hx = hx_val.clone().requires_grad_(True) @@ -6193,6 +6340,8 @@ def forward_backward(cuda, rnn, input_val, hx_val, grad_output, grad_hy, weights else: hx.data = hx.data.cuda() grad_hy = grad_hy.cuda() + if grad_cy is not None: + grad_cy = grad_cy.cuda() grad_output = grad_output.cuda() output, hy = rnn(input, hx) @@ -6201,7 +6350,10 @@ def forward_backward(cuda, rnn, input_val, hx_val, grad_output, grad_hy, weights output = output.data if is_lstm: - torch.autograd.backward([output, hy[0], hy[1]], [grad_output, grad_hy, grad_hy + 1]) + if grad_cy is None: + torch.autograd.backward([output, hy[0], hy[1]], [grad_output, grad_hy, grad_hy + 1]) + else: + torch.autograd.backward([output, hy[0], hy[1]], [grad_output, grad_hy, grad_cy + 1]) else: torch.autograd.backward([output, hy], [grad_output, grad_hy]) @@ -6215,6 +6367,7 @@ def forward_backward(cuda, rnn, input_val, hx_val, grad_output, grad_hy, weights input_size = 10 
hidden_size = 6 + proj_size = 3 num_layers = 2 seq_length = 7 batch = 6 @@ -6246,15 +6399,15 @@ def compare_cpu_gpu(outputs_cpu, outputs_gpu): input_val = torch.randn(seq_length, batch, input_size, dtype=dtype) grad_output = torch.randn(seq_length, batch, hidden_size * num_directions, dtype=dtype) + hx_val = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype) + grad_hy = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype) + if not contig: grad_output = make_noncontig(grad_output) grad_hy = make_noncontig(grad_hy) input_var = make_noncontig(input_val) hx_val = make_noncontig(hx_val) - hx_val = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype) - grad_hy = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype) - if variable_len: lengths = [7, 5, 5, 2, 1, 1] if lens_as_tensor: @@ -6271,7 +6424,7 @@ def compare_cpu_gpu(outputs_cpu, outputs_gpu): batch_first=batch_first).to(dtype) outputs_cpu = forward_backward( - False, rnn, input_val, hx_val, grad_output, grad_hy, rnn.all_weights) + False, rnn, input_val, grad_output, rnn.all_weights, hx_val, grad_hy) rnn_gpu = module(input_size, hidden_size, @@ -6282,7 +6435,7 @@ def compare_cpu_gpu(outputs_cpu, outputs_gpu): batch_first=batch_first).to(dtype) outputs_gpu = forward_backward( - True, rnn_gpu, input_val, hx_val, grad_output, grad_hy, rnn.all_weights) + True, rnn_gpu, input_val, grad_output, rnn.all_weights, hx_val, grad_hy) compare_cpu_gpu(outputs_cpu, outputs_gpu) @@ -6295,13 +6448,78 @@ def compare_cpu_gpu(outputs_cpu, outputs_gpu): num_layers * num_directions, batch, hidden_size, dtype=dtype) rnn = nn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity).to(dtype) - outputs_cpu = forward_backward(False, rnn, input_val, hx_val, grad_output, grad_hy, rnn.all_weights) + outputs_cpu = forward_backward(False, rnn, input_val, grad_output, rnn.all_weights, hx_val, grad_hy) rnn_gpu = nn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity).to(dtype) - outputs_gpu = forward_backward(True, rnn_gpu, input_val, hx_val, grad_output, grad_hy, rnn.all_weights) + outputs_gpu = forward_backward(True, rnn_gpu, input_val, grad_output, rnn.all_weights, hx_val, grad_hy) compare_cpu_gpu(outputs_cpu, outputs_gpu) + # checking LSTM with projections + for bias, bidirectional, batch_first, contig, variable_len, lens_as_tensor \ + in product((True, False), repeat=6): + num_directions = 2 if bidirectional else 1 + if batch_first: + input_val = torch.randn(batch, seq_length, input_size, dtype=dtype) + grad_output = torch.randn(batch, seq_length, proj_size * num_directions, dtype=dtype) + else: + input_val = torch.randn(seq_length, batch, input_size, dtype=dtype) + grad_output = torch.randn(seq_length, batch, proj_size * num_directions, dtype=dtype) + + hx_val = torch.randn(num_layers * num_directions, batch, proj_size, dtype=dtype) + cx_val = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype) + grad_hy = torch.randn(num_layers * num_directions, batch, proj_size, dtype=dtype) + grad_cy = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype) + + if not contig: + grad_output = make_noncontig(grad_output) + grad_hy = make_noncontig(grad_hy) + grad_cy = make_noncontig(grad_cy) + input_var = make_noncontig(input_val) + hx_val = make_noncontig(hx_val) + cx_val = make_noncontig(cx_val) + + if variable_len: + lengths = [7, 5, 5, 2, 1, 1] + if lens_as_tensor: + lengths = torch.tensor(lengths, 
dtype=torch.long) + input_val = rnn_utils.pack_padded_sequence(input_val, lengths, batch_first=batch_first) + grad_output = rnn_utils.pack_padded_sequence(grad_output, lengths, batch_first=batch_first).data + + rnn = nn.LSTM(input_size, + hidden_size, + num_layers, + bias=bias, + dropout=dropout, + bidirectional=bidirectional, + batch_first=batch_first, + proj_size=proj_size).to(dtype) + + outputs_cpu = forward_backward( + False, rnn, input_val, grad_output, rnn.all_weights, + hx_val, grad_hy, cx_val, grad_cy) + + rnn_gpu = nn.LSTM(input_size, + hidden_size, + num_layers, + bias=bias, + dropout=dropout, + bidirectional=bidirectional, + batch_first=batch_first, + proj_size=proj_size).to(dtype) + # LSTM with projections is not supported with MIOpen + if TEST_WITH_ROCM and dtype == torch.float: + with self.assertRaisesRegex(RuntimeError, + "LSTM with projections is not supported with MIOpen"): + outputs_gpu = forward_backward( + True, rnn_gpu, input_val, grad_output, rnn.all_weights, + hx_val, grad_hy, cx_val, grad_cy) + else: + outputs_gpu = forward_backward( + True, rnn_gpu, input_val, grad_output, rnn.all_weights, + hx_val, grad_hy, cx_val, grad_cy) + compare_cpu_gpu(outputs_cpu, outputs_gpu) + @unittest.skipIf(not TEST_CUDNN, "needs cudnn") def test_RNN_cpu_vs_cudnn_no_dropout(self): if TEST_WITH_ROCM: @@ -6324,25 +6542,27 @@ def test_RNN_cudnn_weight_norm(self): batch = 6 # runs on CPU to acquire expected output - m = nn.LSTM(input_size, hidden_size, num_layers) - input = torch.randn(seq_length, batch, input_size) - expected_output = m(input) + def check_weight_norm(m, name): + input = torch.randn(seq_length, batch, input_size) + expected_output = m(input) - # adds weight normalization - name = 'weight_hh_l0' - m = torch.nn.utils.weight_norm(m, name=name) + # adds weight normalization + m = torch.nn.utils.weight_norm(m, name=name) - # moves to CUDA - m = m.cuda() - input = input.cuda() + # moves to CUDA + m = m.cuda() + input = input.cuda() - # otherwise, subsequent warnings will be hidden, and further tests rely on them - warnings.simplefilter("always") - self.assertEqual(m(input), expected_output) + # otherwise, subsequent warnings will be hidden, and further tests rely on them + warnings.simplefilter("always") + self.assertEqual(m(input), expected_output) - # remove weight norm - m = torch.nn.utils.remove_weight_norm(m, name=name) - self.assertEqual(m(input), expected_output) + # remove weight norm + m = torch.nn.utils.remove_weight_norm(m, name=name) + self.assertEqual(m(input), expected_output) + + check_weight_norm(nn.LSTM(input_size, hidden_size, num_layers), 'weight_hh_l0') + check_weight_norm(nn.LSTM(input_size, hidden_size, num_layers, proj_size=3), 'weight_hr_l0') @unittest.skipIf(not TEST_CUDA, 'CUDA not available') def test_partial_flat_weights(self): @@ -12691,7 +12911,7 @@ def maybe_index_tuple(maybe_tuple_of_tensors, index): return tuple(maybe_tuple_of_tensors[j][:, index:index + 1, :].contiguous() for j in range(2)) - def check_lengths(lengths, enforce_sorted, use_default_hiddens): + def check_lengths(lengths, enforce_sorted, use_default_hiddens, proj_size): input_size = 3 hidden_size = 4 num_layers = 2 @@ -12702,15 +12922,17 @@ def check_lengths(lengths, enforce_sorted, use_default_hiddens): dtype=dtype, requires_grad=True) num_directions = 2 if bidirectional else 1 lstm = nn.LSTM(input_size, hidden_size, bidirectional=bidirectional, - num_layers=num_layers).to(device, dtype) + num_layers=num_layers, proj_size=proj_size).to(device, dtype) lstm2 = 
deepcopy(lstm).to(device, dtype) x = x_leaf hidden0 = None if not use_default_hiddens: - hidden0 = tuple(torch.randn(num_directions * num_layers, len(lengths), hidden_size, - device=device, dtype=dtype) - for _ in range(2)) + real_hidden_size = hidden_size if proj_size == 0 else proj_size + hidden0 = (torch.randn(num_directions * num_layers, len(lengths), real_hidden_size, + device=device, dtype=dtype), + torch.randn(num_directions * num_layers, len(lengths), hidden_size, + device=device, dtype=dtype)) # Compute sequences separately seq_outs = [] @@ -12745,7 +12967,7 @@ def check_lengths(lengths, enforce_sorted, use_default_hiddens): for p1, p2 in zip(lstm.parameters(), lstm2.parameters()): prec = dtype2prec_DONTUSE[dtype] if dtype == torch.float16: - prec = 2e-2 + prec = 4e-2 self.assertEqual(p1.grad, p2.grad, atol=prec, rtol=0) tests = [ @@ -12757,9 +12979,16 @@ def check_lengths(lengths, enforce_sorted, use_default_hiddens): [False, [2, 1, 3, 2, 10, 5, 3]], ] + rocm_error_msg = "LSTM with projections is not supported with MIOpen" for enforce_sorted, seq_lens, in tests: for use_default_hiddens in (True, False): - check_lengths(seq_lens, enforce_sorted, use_default_hiddens) + for proj_size in [0, 2]: + # LSTM with projections is not supported with MIOpen + if device != 'cpu' and dtype == torch.float32 and TEST_WITH_ROCM and proj_size > 0: + with self.assertRaisesRegex(RuntimeError, rocm_error_msg): + check_lengths(seq_lens, enforce_sorted, use_default_hiddens, proj_size) + else: + check_lengths(seq_lens, enforce_sorted, use_default_hiddens, proj_size) def _test_batchnorm_update_stats(self, device, dtype=torch.float): module = nn.BatchNorm1d(3).to(device, dtype) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index ffd1ad347751..1ea5a141cf36 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1780,12 +1780,12 @@ # Only frst three of _cudnn_rnn outputs can have gradients. # _cudnn_rnn outputs: (output, hy, cy, reserve, weight_buf) -- name: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) +- name: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) dropout_state: non_differentiable output_differentiability: [True, True, True, False, False] - input, hx, cx, weight: "_cudnn_rnn_backward(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)" + input, hx, cx, weight: "_cudnn_rnn_backward(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)" -- name: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? 
grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) +- name: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) dropout_state: non_differentiable # miopen diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h index d3244532a7b6..ac6c58441f39 100644 --- a/torch/csrc/api/include/torch/nn/modules/rnn.h +++ b/torch/csrc/api/include/torch/nn/modules/rnn.h @@ -160,6 +160,8 @@ class TORCH_API LSTMImpl : public detail::RNNImplBase { protected: void check_forward_args(const Tensor& input, std::tuple hidden, const Tensor& batch_sizes) const; + std::tuple get_expected_cell_size(const Tensor& input, const Tensor& batch_sizes) const; + std::tuple permute_hidden(std::tuple hx, const Tensor& permutation) const; std::tuple> forward_helper( diff --git a/torch/csrc/api/include/torch/nn/options/rnn.h b/torch/csrc/api/include/torch/nn/options/rnn.h index ae37693399ef..09bbfe0fa2f4 100644 --- a/torch/csrc/api/include/torch/nn/options/rnn.h +++ b/torch/csrc/api/include/torch/nn/options/rnn.h @@ -38,6 +38,8 @@ struct TORCH_API RNNOptionsBase { TORCH_ARG(double, dropout) = 0.0; /// Whether to make the RNN bidirectional. TORCH_ARG(bool, bidirectional) = false; + /// Cell projection dimension. If 0, projections are not added. Can only be used for LSTMs. + TORCH_ARG(int64_t, proj_size) = 0; }; } // namespace detail @@ -108,6 +110,8 @@ struct TORCH_API LSTMOptions { TORCH_ARG(double, dropout) = 0.0; /// If ``true``, becomes a bidirectional LSTM. Default: ``false`` TORCH_ARG(bool, bidirectional) = false; + /// Cell projection dimension. If 0, projections are not added + TORCH_ARG(int64_t, proj_size) = 0; }; /// Options for the `GRU` module. 
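The `proj_size` option added above mirrors the Python-side argument of the same name. As a rough sketch of what it does to the per-layer parameter set that both front ends register (illustrative sizes, not taken from this patch; assumes a build that includes these changes):

```python
import torch.nn as nn

# Illustrative sizes: input_size=10, hidden_size=20, proj_size=5.
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=1, proj_size=5)

for name, p in lstm.named_parameters():
    print(name, tuple(p.shape))
# weight_ih_l0 (80, 10)  -> (4*hidden_size, input_size)
# weight_hh_l0 (80, 5)   -> (4*hidden_size, proj_size); the recurrent input is the projected state
# bias_ih_l0   (80,)
# bias_hh_l0   (80,)
# weight_hr_l0 (5, 20)   -> (proj_size, hidden_size), the new projection matrix
```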
@@ -148,7 +152,7 @@ namespace detail { struct TORCH_API RNNCellOptionsBase { RNNCellOptionsBase(int64_t input_size, int64_t hidden_size, bool bias, int64_t num_chunks); virtual ~RNNCellOptionsBase() = default; - + TORCH_ARG(int64_t, input_size); TORCH_ARG(int64_t, hidden_size); TORCH_ARG(bool, bias); diff --git a/torch/csrc/api/src/nn/modules/rnn.cpp b/torch/csrc/api/src/nn/modules/rnn.cpp index 634dcf03d9d1..b645a8490778 100644 --- a/torch/csrc/api/src/nn/modules/rnn.cpp +++ b/torch/csrc/api/src/nn/modules/rnn.cpp @@ -71,6 +71,17 @@ void RNNImplBase::reset() { "num_layers=", options_base.num_layers()); } + TORCH_CHECK( + 0 <= options_base.proj_size() && options_base.proj_size() < options_base.hidden_size(), + "proj_size has to be a positive integer, smaller than ", + "hidden_size or zero to disable projections"); + + if (options_base.proj_size() > 0) { + TORCH_CHECK( + c10::get_if(&options_base.mode()), + "proj_size argument is only supported for LSTM, not RNN or GRU"); + } + int64_t gate_size = 0; if (c10::get_if(&options_base.mode())) { gate_size = 4 * options_base.hidden_size(); @@ -89,21 +100,29 @@ void RNNImplBase::reset() { for (int64_t layer = 0; layer < options_base.num_layers(); layer++) { for (int64_t direction = 0; direction < num_directions; direction++) { - int64_t layer_input_size = layer == 0 ? options_base.input_size() : options_base.hidden_size() * num_directions; + int64_t real_hidden_size = options_base.proj_size() > 0 ? options_base.proj_size() : options_base.hidden_size(); + int64_t layer_input_size = layer == 0 ? options_base.input_size() : real_hidden_size * num_directions; auto w_ih = torch::empty({gate_size, layer_input_size}); - auto w_hh = torch::empty({gate_size, options_base.hidden_size()}); + auto w_hh = torch::empty({gate_size, real_hidden_size}); auto b_ih = torch::empty({gate_size}); // Second bias vector included for CuDNN compatibility. Only one // bias vector is needed in standard definition. auto b_hh = torch::empty({gate_size}); - std::vector layer_params = {w_ih, w_hh, b_ih, b_hh}; + std::vector layer_params = {w_ih, w_hh}; std::string suffix = direction == 1 ? "_reverse" : ""; std::vector param_names = {"weight_ih_l{layer}{suffix}", "weight_hh_l{layer}{suffix}"}; if (options_base.bias()) { param_names.emplace_back("bias_ih_l{layer}{suffix}"); param_names.emplace_back("bias_hh_l{layer}{suffix}"); + layer_params.emplace_back(b_ih); + layer_params.emplace_back(b_hh); + } + if (options_base.proj_size() > 0) { + auto w_hr = torch::empty({options_base.proj_size(), options_base.hidden_size()}); + layer_params.emplace_back(w_hr); + param_names.emplace_back("weight_hr_l{layer}{suffix}"); } for (size_t i = 0; i < param_names.size(); i++) { // NOLINT(modernize-loop-convert) std::string x = std::regex_replace(param_names[i], std::regex("\\{layer\\}"), c10::str(layer)); @@ -180,12 +199,17 @@ void RNNImplBase::flatten_parameters() { { torch::NoGradGuard no_grad; if (torch::_use_cudnn_rnn_flatten_weight()) { + int64_t num_weights = options_base.bias() ? 4 : 2; + if (options_base.proj_size() > 0) { + ++num_weights; + } torch::_cudnn_rnn_flatten_weight( flat_weights_, - options_base.bias() ? 4 : 2, + num_weights, options_base.input_size(), static_cast(get_cudnn_mode_for_rnn(options_base.mode())), options_base.hidden_size(), + options_base.proj_size(), options_base.num_layers(), options_base.batch_first(), options_base.bidirectional()); @@ -260,7 +284,8 @@ std::tuple RNNImplBase::get_expected_hidden_ mini_batch = options_base.batch_first() ? 
input.size(0) : input.size(1); } int64_t num_directions = options_base.bidirectional() ? 2 : 1; - return std::make_tuple(options_base.num_layers() * num_directions, mini_batch, options_base.hidden_size()); + int64_t real_hidden_size = options_base.proj_size() > 0 ? options_base.proj_size() : options_base.hidden_size(); + return std::make_tuple(options_base.num_layers() * num_directions, mini_batch, real_hidden_size); } template @@ -306,8 +331,11 @@ void RNNImplBase::pretty_print(std::ostream& stream) const { << ", bias=" << options_base.bias() << ", batch_first=" << options_base.batch_first() << ", dropout=" << options_base.dropout() - << ", bidirectional=" << options_base.bidirectional() - << ")"; + << ", bidirectional=" << options_base.bidirectional(); + if (options_base.proj_size() > 0) { + stream << ", proj_size=" << options_base.proj_size(); + } + stream << ")"; } template @@ -438,16 +466,27 @@ LSTMImpl::LSTMImpl(const LSTMOptions& options_) .bias(options_.bias()) .batch_first(options_.batch_first()) .dropout(options_.dropout()) - .bidirectional(options_.bidirectional())), + .bidirectional(options_.bidirectional()) + .proj_size(options_.proj_size())), options(options_) {} +std::tuple LSTMImpl::get_expected_cell_size( + const Tensor& input, const Tensor& batch_sizes) const { + int64_t mini_batch = 0; + if (batch_sizes.defined()) { + mini_batch = batch_sizes[0].item(); + } else { + mini_batch = options_base.batch_first() ? input.size(0) : input.size(1); + } + int64_t num_directions = options_base.bidirectional() ? 2 : 1; + return std::make_tuple(options_base.num_layers() * num_directions, mini_batch, options_base.hidden_size()); +} + void LSTMImpl::check_forward_args(const Tensor& input, std::tuple hidden, const Tensor& batch_sizes) const { this->check_input(input, batch_sizes); - auto expected_hidden_size = this->get_expected_hidden_size(input, batch_sizes); - - this->check_hidden_size(std::get<0>(hidden), expected_hidden_size, + this->check_hidden_size(std::get<0>(hidden), this->get_expected_hidden_size(input, batch_sizes), "Expected hidden[0] size {1}, got {2}"); - this->check_hidden_size(std::get<1>(hidden), expected_hidden_size, + this->check_hidden_size(std::get<1>(hidden), this->get_expected_cell_size(input, batch_sizes), "Expected hidden[1] size {1}, got {2}"); } @@ -471,10 +510,14 @@ std::tuple> LSTMImpl::forward_helper( std::tuple hx; if (!hx_opt.has_value()) { int64_t num_directions = options.bidirectional() ? 2 : 1; - auto zeros = torch::zeros({options.num_layers() * num_directions, - max_batch_size, options.hidden_size()}, - torch::dtype(input.dtype()).device(input.device())); - hx = std::make_tuple(zeros, zeros); + int64_t real_hidden_size = options.proj_size() > 0 ? 
options.proj_size() : options.hidden_size(); + auto h_zeros = torch::zeros({options.num_layers() * num_directions, + max_batch_size, real_hidden_size}, + torch::dtype(input.dtype()).device(input.device())); + auto c_zeros = torch::zeros({options.num_layers() * num_directions, + max_batch_size, options.hidden_size()}, + torch::dtype(input.dtype()).device(input.device())); + hx = std::make_tuple(h_zeros, c_zeros); } else { hx = hx_opt.value(); // Each batch of the hidden state should match the input sequence that @@ -650,13 +693,13 @@ void RNNCellImplBase::pretty_print(std::ostream& stream) const { if (!nonlinearity_str.empty() && nonlinearity_str != "kTanh") { stream << ", nonlinearity=" << nonlinearity_str; } - stream << ")"; + stream << ")"; } template void RNNCellImplBase::check_forward_input(const Tensor& input) const { TORCH_CHECK( - input.size(1) == options_base.input_size(), + input.size(1) == options_base.input_size(), "input has inconsistent input_size: got ", input.size(1), " expected ", options_base.input_size()); } diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py index 65b3c4f908df..be13f7002994 100644 --- a/torch/nn/modules/rnn.py +++ b/torch/nn/modules/rnn.py @@ -23,7 +23,7 @@ def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tens class RNNBase(Module): __constants__ = ['mode', 'input_size', 'hidden_size', 'num_layers', 'bias', - 'batch_first', 'dropout', 'bidirectional'] + 'batch_first', 'dropout', 'bidirectional', 'proj_size'] __jit_unused_properties__ = ['all_weights'] mode: str @@ -34,10 +34,11 @@ class RNNBase(Module): batch_first: bool dropout: float bidirectional: bool + proj_size: int def __init__(self, mode: str, input_size: int, hidden_size: int, num_layers: int = 1, bias: bool = True, batch_first: bool = False, - dropout: float = 0., bidirectional: bool = False) -> None: + dropout: float = 0., bidirectional: bool = False, proj_size: int = 0) -> None: super(RNNBase, self).__init__() self.mode = mode self.input_size = input_size @@ -47,6 +48,7 @@ def __init__(self, mode: str, input_size: int, hidden_size: int, self.batch_first = batch_first self.dropout = float(dropout) self.bidirectional = bidirectional + self.proj_size = proj_size num_directions = 2 if bidirectional else 1 if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \ @@ -59,6 +61,10 @@ def __init__(self, mode: str, input_size: int, hidden_size: int, "recurrent layer, so non-zero dropout expects " "num_layers greater than 1, but got dropout={} and " "num_layers={}".format(dropout, num_layers)) + if proj_size < 0: + raise ValueError("proj_size should be a positive integer or zero to disable projections") + if proj_size >= hidden_size: + raise ValueError("proj_size has to be smaller than hidden_size") if mode == 'LSTM': gate_size = 4 * hidden_size @@ -75,20 +81,34 @@ def __init__(self, mode: str, input_size: int, hidden_size: int, self._all_weights = [] for layer in range(num_layers): for direction in range(num_directions): - layer_input_size = input_size if layer == 0 else hidden_size * num_directions + real_hidden_size = proj_size if proj_size > 0 else hidden_size + layer_input_size = input_size if layer == 0 else real_hidden_size * num_directions w_ih = Parameter(torch.Tensor(gate_size, layer_input_size)) - w_hh = Parameter(torch.Tensor(gate_size, hidden_size)) + w_hh = Parameter(torch.Tensor(gate_size, real_hidden_size)) b_ih = Parameter(torch.Tensor(gate_size)) # Second bias vector included for CuDNN compatibility. 
Only one # bias vector is needed in standard definition. b_hh = Parameter(torch.Tensor(gate_size)) - layer_params = (w_ih, w_hh, b_ih, b_hh) + layer_params: Tuple[Tensor, ...] = () + if self.proj_size == 0: + if bias: + layer_params = (w_ih, w_hh, b_ih, b_hh) + else: + layer_params = (w_ih, w_hh) + else: + w_hr = Parameter(torch.Tensor(proj_size, hidden_size)) + if bias: + layer_params = (w_ih, w_hh, b_ih, b_hh, w_hr) + else: + layer_params = (w_ih, w_hh, w_hr) suffix = '_reverse' if direction == 1 else '' param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}'] if bias: param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}'] + if self.proj_size > 0: + param_names += ['weight_hr_l{}{}'] param_names = [x.format(layer, suffix) for x in param_names] for name, param in zip(param_names, layer_params): @@ -146,9 +166,13 @@ def flatten_parameters(self) -> None: # an inplace operation on self._flat_weights with torch.no_grad(): if torch._use_cudnn_rnn_flatten_weight(): + num_weights = 4 if self.bias else 2 + if self.proj_size > 0: + num_weights += 1 torch._cudnn_rnn_flatten_weight( - self._flat_weights, (4 if self.bias else 2), - self.input_size, rnn.get_cudnn_mode(self.mode), self.hidden_size, self.num_layers, + self._flat_weights, num_weights, + self.input_size, rnn.get_cudnn_mode(self.mode), + self.hidden_size, self.proj_size, self.num_layers, self.batch_first, bool(self.bidirectional)) def _apply(self, fn): @@ -185,8 +209,12 @@ def get_expected_hidden_size(self, input: Tensor, batch_sizes: Optional[Tensor]) else: mini_batch = input.size(0) if self.batch_first else input.size(1) num_directions = 2 if self.bidirectional else 1 - expected_hidden_size = (self.num_layers * num_directions, - mini_batch, self.hidden_size) + if self.proj_size > 0: + expected_hidden_size = (self.num_layers * num_directions, + mini_batch, self.proj_size) + else: + expected_hidden_size = (self.num_layers * num_directions, + mini_batch, self.hidden_size) return expected_hidden_size def check_hidden_size(self, hx: Tensor, expected_hidden_size: Tuple[int, int, int], @@ -250,6 +278,8 @@ def forward(self, def extra_repr(self) -> str: s = '{input_size}, {hidden_size}' + if self.proj_size != 0: + s += ', proj_size={proj_size}' if self.num_layers != 1: s += ', num_layers={num_layers}' if self.bias is not True: @@ -276,14 +306,23 @@ def __setstate__(self, d): for layer in range(num_layers): for direction in range(num_directions): suffix = '_reverse' if direction == 1 else '' - weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}'] + weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', + 'bias_hh_l{}{}', 'weight_hr_l{}{}'] weights = [x.format(layer, suffix) for x in weights] if self.bias: - self._all_weights += [weights] - self._flat_weights_names.extend(weights) + if self.proj_size > 0: + self._all_weights += [weights] + self._flat_weights_names.extend(weights) + else: + self._all_weights += [weights[:4]] + self._flat_weights_names.extend(weights[:4]) else: - self._all_weights += [weights[:2]] - self._flat_weights_names.extend(weights[:2]) + if self.proj_size > 0: + self._all_weights += [weights[:2]] + [weights[-1:]] + self._flat_weights_names.extend(weights[:2] + [weights[-1:]]) + else: + self._all_weights += [weights[:2]] + self._flat_weights_names.extend(weights[:2]) self._flat_weights = [(lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn) for wn in self._flat_weights_names] @property @@ -399,6 +438,8 @@ class RNN(RNNBase): """ def __init__(self, *args, **kwargs): + if 'proj_size' 
in kwargs: + raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU") self.nonlinearity = kwargs.pop('nonlinearity', 'tanh') if self.nonlinearity == 'tanh': mode = 'RNN_TANH' @@ -451,6 +492,14 @@ class LSTM(RNNBase): dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random variable which is :math:`0` with probability :attr:`dropout`. + If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes + the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from + ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly). + Second, the output hidden state of each layer will be multiplied by a learnable projection + matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output + of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact + dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128. + Args: input_size: The number of expected features in the input `x` hidden_size: The number of features in the hidden state `h` @@ -466,6 +515,7 @@ class LSTM(RNNBase): LSTM layer except the last layer, with dropout probability equal to :attr:`dropout`. Default: 0 bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False`` + proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0 Inputs: input, (h_0, c_0) - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features @@ -476,6 +526,8 @@ class LSTM(RNNBase): - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor containing the initial hidden state for each element in the batch. If the LSTM is bidirectional, num_directions should be 2, else it should be 1. + If ``proj_size > 0`` was specified, the shape has to be + `(num_layers * num_directions, batch, proj_size)`. - **c_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor containing the initial cell state for each element in the batch. @@ -486,14 +538,16 @@ class LSTM(RNNBase): - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor containing the output features `(h_t)` from the last layer of the LSTM, for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been - given as the input, the output will also be a packed sequence. + given as the input, the output will also be a packed sequence. If ``proj_size > 0`` + was specified, output shape will be `(seq_len, batch, num_directions * proj_size)`. For the unpacked case, the directions can be separated using ``output.view(seq_len, batch, num_directions, hidden_size)``, with forward and backward being direction `0` and `1` respectively. Similarly, the directions can be separated in the packed case. - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor - containing the hidden state for `t = seq_len`. + containing the hidden state for `t = seq_len`. If ``proj_size > 0`` + was specified, ``h_n`` shape will be `(num_layers * num_directions, batch, proj_size)`. Like *output*, the layers can be separated using ``h_n.view(num_layers, num_directions, batch, hidden_size)`` and similarly for *c_n*. @@ -505,11 +559,15 @@ class LSTM(RNNBase): `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`. 
Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)` weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer - `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)` + `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`. If ``proj_size > 0`` + was specified, the shape will be `(4*hidden_size, proj_size)`. bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)` bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)` + weight_hr_l[k] : the learnable projection weights of the :math:`\text{k}^{th}` layer + of shape `(proj_size, hidden_size)`. Only present when ``proj_size > 0`` was + specified. .. note:: All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` @@ -531,15 +589,23 @@ class LSTM(RNNBase): def __init__(self, *args, **kwargs): super(LSTM, self).__init__('LSTM', *args, **kwargs) + def get_expected_cell_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]: + if batch_sizes is not None: + mini_batch = int(batch_sizes[0]) + else: + mini_batch = input.size(0) if self.batch_first else input.size(1) + num_directions = 2 if self.bidirectional else 1 + expected_hidden_size = (self.num_layers * num_directions, + mini_batch, self.hidden_size) + return expected_hidden_size + # In the future, we should prevent mypy from applying contravariance rules here. # See torch/nn/modules/module.py::_forward_unimplemented def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor], batch_sizes: Optional[Tensor]): # type: ignore self.check_input(input, batch_sizes) - expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) - - self.check_hidden_size(hidden[0], expected_hidden_size, + self.check_hidden_size(hidden[0], self.get_expected_hidden_size(input, batch_sizes), 'Expected hidden[0] size {}, got {}') - self.check_hidden_size(hidden[1], expected_hidden_size, + self.check_hidden_size(hidden[1], self.get_expected_cell_size(input, batch_sizes), 'Expected hidden[1] size {}, got {}') # Same as above, see torch/nn/modules/module.py::_forward_unimplemented @@ -577,10 +643,14 @@ def forward(self, input, hx=None): # noqa: F811 if hx is None: num_directions = 2 if self.bidirectional else 1 - zeros = torch.zeros(self.num_layers * num_directions, - max_batch_size, self.hidden_size, - dtype=input.dtype, device=input.device) - hx = (zeros, zeros) + real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size + h_zeros = torch.zeros(self.num_layers * num_directions, + max_batch_size, real_hidden_size, + dtype=input.dtype, device=input.device) + c_zeros = torch.zeros(self.num_layers * num_directions, + max_batch_size, self.hidden_size, + dtype=input.dtype, device=input.device) + hx = (h_zeros, c_zeros) else: # Each batch of the hidden state should match the input sequence that # the user believes he/she is passing in. 
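Putting the docstring changes and the default-state initialization above together, a minimal usage sketch of the projected LSTM (illustrative sizes, not taken from this patch; assumes a build that includes these changes):

```python
import torch
import torch.nn as nn

# Illustrative sizes: input_size=10, hidden_size=20, num_layers=2, proj_size=5.
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, proj_size=5)
x = torch.randn(7, 3, 10)        # (seq_len, batch, input_size)
output, (h_n, c_n) = lstm(x)     # hx defaults to the zero states built in the hunk above

print(output.shape)  # torch.Size([7, 3, 5])  -> (seq_len, batch, num_directions * proj_size)
print(h_n.shape)     # torch.Size([2, 3, 5])  -> h_n carries the projected (proj_size) state
print(c_n.shape)     # torch.Size([2, 3, 20]) -> c_n keeps hidden_size
```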
@@ -709,6 +779,8 @@ class GRU(RNNBase): """ def __init__(self, *args, **kwargs): + if 'proj_size' in kwargs: + raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU") super(GRU, self).__init__('GRU', *args, **kwargs) @overload # type: ignore diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index bda62b638d22..a65cea494529 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1916,6 +1916,9 @@ def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases, 'ScaledTanh', 'HardSigmoid', 'Elu', 'Softsign', 'Softplus'] variantToOnnxActivationMap = dict(zip([act_fun.lower() for act_fun in onnxActivations], onnxActivations)) weights_per_layer = 4 if has_biases else 2 + # this means that projections are used inside LSTM, so need to tell user that it's not supported + if variant == 'LSTM' and len(all_weights) != num_layers * weights_per_layer * (1 + bidirectional): + return _unimplemented("LSTM", "LSTMs with projections") assert len(all_weights) == num_layers * weights_per_layer * (1 + bidirectional) layer_weights = [all_weights[i:i + weights_per_layer] for i in range(0, len(all_weights), weights_per_layer)] if batch_first: From f98d8c6237ad6a2873fefc763d50c42e6d3c135c Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 16 Dec 2020 11:24:35 -0800 Subject: [PATCH 26/34] Move inplace_is_vmap_compatible to BatchedTensorImpl.h (#49118) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49118 I need this in the next stack up. It seems useful to have as a helper function. Test Plan: - run tests Reviewed By: izdeby Differential Revision: D25563546 Pulled By: zou3519 fbshipit-source-id: a4031fdc4b2373cc230ba3c66738d91dcade96e2 --- aten/src/ATen/BatchedTensorImpl.cpp | 15 ++++++++++++++ aten/src/ATen/BatchedTensorImpl.h | 4 ++++ torch/csrc/autograd/FunctionsManual.cpp | 26 +++++++------------------ 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/BatchedTensorImpl.cpp b/aten/src/ATen/BatchedTensorImpl.cpp index 8f373b1ea29b..7b06fa3ee521 100644 --- a/aten/src/ATen/BatchedTensorImpl.cpp +++ b/aten/src/ATen/BatchedTensorImpl.cpp @@ -139,4 +139,19 @@ Tensor addBatchDim(const Tensor& tensor, int64_t level, int64_t dim) { return makeBatched(batched->value(), std::move(new_bdims)); } +bool inplaceIsVmapCompatible(const Tensor& self, const Tensor& other) { + const auto* other_batched = maybeGetBatchedImpl(other); + if (!other_batched) { + return true; + } + const auto* self_batched = maybeGetBatchedImpl(self); + if (!self_batched) { + // self is not batched but other is batched + return false; + } + auto self_levels = createVmapLevelsBitset(self_batched->bdims()); + auto other_levels = createVmapLevelsBitset(other_batched->bdims()); + return self_levels == (self_levels | other_levels); +} + } // namespace at diff --git a/aten/src/ATen/BatchedTensorImpl.h b/aten/src/ATen/BatchedTensorImpl.h index 51d37257e51e..634740de4d08 100644 --- a/aten/src/ATen/BatchedTensorImpl.h +++ b/aten/src/ATen/BatchedTensorImpl.h @@ -143,5 +143,9 @@ TORCH_API Tensor makeBatched(const Tensor& tensor, BatchDims bdims); // Adds a batch dim to `tensor`, returning a BatchedTensor TORCH_API Tensor addBatchDim(const Tensor& tensor, int64_t level, int64_t dim); +// Checks if an inplace operation on self and other is "vmap compatible". +// See NOTE: [vmap-incompatible in-place operations] for the definition of this. 
+TORCH_API bool inplaceIsVmapCompatible(const Tensor& self, const Tensor& other); + } diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index ed08e541661b..82d93d6948ba 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1022,6 +1022,8 @@ Tensor log_softmax_double_backward(const Tensor & grad, const Tensor & grad_outp return z * grad_output.sum(dim, true) * ((grad * z).sum(dim, true) - grad); } +// NOTE: [How to write vmap-compatible backward formulas] +// // See NOTE: [vmap-incompatible in-place operations] for what it means for an // in-place operation to be incompatible with vmap. // @@ -1039,29 +1041,15 @@ Tensor log_softmax_double_backward(const Tensor & grad, const Tensor & grad_outp // - If the in-place operation followed some sequence of operations, if the // we want to be able to vmap over the backward formula as-is (this is // usually the case for simple (<15loc) backward formulas), then use -// inplace_is_vmap_compatible to guard the operation. For example: +// inplaceIsVmapCompatible to guard the operation. For example: // c = a * b // Before: c.mul_(grad) -// After: c = inplace_is_vmap_compatible(c, grad) ? c.mul_(grad) : c * grad +// After: c = at::inplaceIsVmapCompatible(c, grad) ? c.mul_(grad) : c * grad // // - If we don't want to vmap directly over the backward formula (e.g., if the // backward formula is too complicated or has a lot of vmap-incompatible // operations, then register the backward formula as an operator and eventually // write a batching rule for it. -static bool inplace_is_vmap_compatible(const Tensor& self, const Tensor& other) { - const auto* other_batched = at::maybeGetBatchedImpl(other); - if (!other_batched) { - return true; - } - const auto* self_batched = at::maybeGetBatchedImpl(self); - if (!self_batched) { - // self is not batched but other is batched - return false; - } - auto self_levels = at::createVmapLevelsBitset(self_batched->bdims()); - auto other_levels = at::createVmapLevelsBitset(other_batched->bdims()); - return self_levels == (self_levels | other_levels); -} Tensor binary_cross_entropy_double_backward(const Tensor & grad_output, const Tensor & grad, const Tensor & input, const Tensor & target, const c10::optional& weight, int64_t reduction) { auto eps = 1e-12; @@ -1069,7 +1057,7 @@ Tensor binary_cross_entropy_double_backward(const Tensor & grad_output, const Te auto one_m_inp_pl_eps = 1 - input + eps; // gradient wrt input auto gI = (input * input - 2 * input * target + target) / (inp_pl_eps.pow(2) * one_m_inp_pl_eps.pow(2)); - if (inplace_is_vmap_compatible(gI, grad)) { + if (at::inplaceIsVmapCompatible(gI, grad)) { gI *= (grad * grad_output); } else { gI = gI * (grad * grad_output); @@ -1090,7 +1078,7 @@ Tensor binary_cross_entropy_double_backward_grad_output(const Tensor & grad, con auto eps = 1e-12; // gradient wrt grad_output auto ggO = (input - target) / ((input + eps) * (1 - input + eps)); - if (inplace_is_vmap_compatible(ggO, grad)) { + if (at::inplaceIsVmapCompatible(ggO, grad)) { ggO *= grad; } else { ggO = ggO * grad; @@ -1990,7 +1978,7 @@ Tensor symeig_backward(const std::vector &grads, cons glambda = glambda.to(self.dtype()); // computes v @ diag(glambda) @ vh Tensor glambda_term = at::matmul(v * glambda.unsqueeze(-2), vh); - if (inplace_is_vmap_compatible(result, glambda_term)) { + if (at::inplaceIsVmapCompatible(result, glambda_term)) { result.add_(glambda_term); } else { result = result + glambda_term; From 
2ec3e803eb5bd490cf93f3e725cc9f99df930bf5 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 16 Dec 2020 11:24:35 -0800 Subject: [PATCH 27/34] Update accumulate_grad to support vmap (#49119) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49119 I don't know how the accumulate_grad code gets hit via calling autograd.grad, so I went through all places in accumulate_grad that are definitely impossible to vmap through and changed them. To support this: - I added vmap support for Tensor::strides(). It returns the strides that correspond to the public dimensions of the tensor (not the ones being vmapped over). - Changed an instance of empty_strided to new_empty_strided. - Replaced an in-place operation in accumulate_grad.h Test Plan: - added a test for calling strides() inside of vmap - added tests that exercise all of the accumulate_grad code path. NB: I don't know why these tests exercise the code paths, but I've verified that they do via gdb. Suggestions for some saner test cases are very welcome. Reviewed By: izdeby Differential Revision: D25563543 Pulled By: zou3519 fbshipit-source-id: 05ac6c549ebd447416e6a07c263a16c90b2ef510 --- aten/src/ATen/BatchedTensorImpl.cpp | 7 --- aten/src/ATen/BatchedTensorImpl.h | 2 - aten/src/ATen/test/vmap_test.cpp | 1 - test/test_vmap.py | 48 +++++++++++++++++++ .../csrc/autograd/functions/accumulate_grad.h | 9 ++++ .../autograd/utils/grad_layout_contract.h | 2 +- 6 files changed, 58 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/BatchedTensorImpl.cpp b/aten/src/ATen/BatchedTensorImpl.cpp index 7b06fa3ee521..f295d70c31fd 100644 --- a/aten/src/ATen/BatchedTensorImpl.cpp +++ b/aten/src/ATen/BatchedTensorImpl.cpp @@ -76,13 +76,6 @@ void BatchedTensorImpl::checkInvariants() const { } // The following are publically exposed as methods of Tensor -IntArrayRef BatchedTensorImpl::strides() const { - TORCH_CHECK(false, "NYI: Getting tensor strides inside of vmap"); -} -int64_t BatchedTensorImpl::stride(int64_t d) const { - TORCH_CHECK(false, "NYI: Getting tensor strides inside of vmap"); -} - bool BatchedTensorImpl::is_contiguous(at::MemoryFormat memory_format) const { TORCH_CHECK(memory_format == MemoryFormat::Contiguous, "NYI: querying is_contiguous inside of vmap for memory_format ", diff --git a/aten/src/ATen/BatchedTensorImpl.h b/aten/src/ATen/BatchedTensorImpl.h index 634740de4d08..7fdef64146fd 100644 --- a/aten/src/ATen/BatchedTensorImpl.h +++ b/aten/src/ATen/BatchedTensorImpl.h @@ -74,8 +74,6 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { // Override a bunch of methods inherited from TensorImpl to return error messages. 
bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const override; - IntArrayRef strides() const override; - int64_t stride(int64_t d) const override; void set_size(int64_t dim, int64_t new_size) override; void set_stride(int64_t dim, int64_t new_stride) override; void set_storage_offset(int64_t storage_offset) override; diff --git a/aten/src/ATen/test/vmap_test.cpp b/aten/src/ATen/test/vmap_test.cpp index 99845b5df0ae..c35f2ea912a9 100644 --- a/aten/src/ATen/test/vmap_test.cpp +++ b/aten/src/ATen/test/vmap_test.cpp @@ -16,7 +16,6 @@ TEST(VmapTest, TestBatchedTensor) { ASSERT_EQ(x.dim(), 2); ASSERT_EQ(x.numel(), 8); ASSERT_EQ(x.is_contiguous(), false); - ASSERT_THROW(x.strides(), c10::Error); ASSERT_THROW(x.storage(), c10::Error); ASSERT_THROW(x.storage_offset(), c10::Error); } diff --git a/test/test_vmap.py b/test/test_vmap.py index 5fa8426fd4ab..cc25dff3b306 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -1287,6 +1287,25 @@ def test_contiguous(self): with self.assertRaisesRegex(RuntimeError, msg): vmap(functools.partial(op, memory_format=torch.channels_last_3d))(tensor) + def test_stride(self): + B0 = 3 + + x = torch.randn(B0, 2, 5, 7) + + def foo(x): + assert x.stride() == (7 * 5, 7, 1) + return x + + vmap(foo)(x) + + x = torch.randn(2, B0, 5, 7).movedim(1, 0) + + def bar(x): + assert x.stride() == (7 * 5 * B0, 7, 1) + return x + + vmap(bar)(x) + def test_chunk(self): test = self._vmap_view_test op = torch.chunk @@ -2341,6 +2360,35 @@ def test_diagonal(self, device): x = torch.randn(3, 4, 5, device=device, requires_grad=True) self._batched_grad_test(lambda x: x.diagonal(0, -1, -2), (x,)) + @allowVmapFallbackUsage + def test_unrelated_output(self, device): + B0 = 3 + x = torch.randn([], requires_grad=True) + y = torch.randn([], requires_grad=True) + gy = torch.randn(B0, requires_grad=True) + + def vjp(v): + res, = torch.autograd.grad(y, x, v, allow_unused=True) + return torch.zeros_like(x) if res is None else res + + result = vmap(vjp)(gy) + self.assertEqual(result, torch.zeros(B0, *x.shape, device=device)) + + @allowVmapFallbackUsage + def test_unrelated_output_multiple_grad(self, device): + B0 = 3 + x = torch.randn([], requires_grad=True) + y = torch.randn([], requires_grad=True) + gy = torch.randn(B0, requires_grad=True) + + def vjp(v): + res, = torch.autograd.grad(y, x, v, allow_unused=True) + return torch.zeros_like(x) if res is None else res + + _ = vjp(gy[0]) + result = vmap(vjp)(gy) + self.assertEqual(result, torch.zeros(B0, *x.shape, device=device)) + instantiate_device_type_tests( TestVmapBatchedGradient, globals(), diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index dafd07f64b84..fdc66e9cd422 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -149,6 +150,14 @@ struct TORCH_API AccumulateGrad : public Node { auto result = new_grad + variable_grad; CHECK_RESULT(result, variable); update_grad(std::move(result)); + } else if (!at::inplaceIsVmapCompatible(variable_grad, new_grad)) { + // Ideally we'd perform an in-place operation to avoid changing + // the grad tensor. However, if that's impossible because the grads + // are vmap-incompatible (See NOTE: [vmap-incompatible in-place operations]), + // then we just add them out-of-place. 
+ auto result = variable_grad + new_grad; + CHECK_RESULT(result, variable); + update_grad(std::move(result)); } else { // In this case we can avoid changing the grad tensor. There are three // scenarios when we'll hit this case: diff --git a/torch/csrc/autograd/utils/grad_layout_contract.h b/torch/csrc/autograd/utils/grad_layout_contract.h index 9e60dc3397a4..4d1787d55c79 100644 --- a/torch/csrc/autograd/utils/grad_layout_contract.h +++ b/torch/csrc/autograd/utils/grad_layout_contract.h @@ -25,7 +25,7 @@ inline at::Tensor clone_obey_contract(const at::Tensor& new_grad, const at::Tens // (1) // Does this dicey-looking sequence attach the result to new_grad's // history if GradMode::is_enabled()? Yes, and @alband says it should. - return std::move(at::empty_strided(variable.sizes(), variable.strides(), + return std::move(new_grad.new_empty_strided(variable.sizes(), variable.strides(), variable.options().memory_format(c10::nullopt)) .copy_(new_grad)); } else { From 6f814d45aa79e732874f5b44e899fcccde933a5e Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Wed, 16 Dec 2020 11:29:14 -0800 Subject: [PATCH 28/34] Update TensorPipe submodule (#49467) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49467 Credit to beauby for the Bazel fixes. Test Plan: Export and run on CI Reviewed By: beauby Differential Revision: D25588027 fbshipit-source-id: efe1c543eb7438ca05254de67cf8b5cee625119a --- third_party/tensorpipe | 2 +- third_party/tensorpipe.BUILD | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/third_party/tensorpipe b/third_party/tensorpipe index 82a114882e21..9a5f0d797b37 160000 --- a/third_party/tensorpipe +++ b/third_party/tensorpipe @@ -1 +1 @@ -Subproject commit 82a114882e21b176916e2f12a7b566af3d63df71 +Subproject commit 9a5f0d797b3741d9614e93bfd262acf522eb00b8 diff --git a/third_party/tensorpipe.BUILD b/third_party/tensorpipe.BUILD index 66c7b1c7a1ab..45b99e64ec9a 100644 --- a/third_party/tensorpipe.BUILD +++ b/third_party/tensorpipe.BUILD @@ -93,7 +93,13 @@ TENSORPIPE_HEADERS = glob([ TENSORPIPE_BASE_SRCS = glob([ "tensorpipe/*.cc", "tensorpipe/channel/*.cc", - "tensorpipe/common/*.cc", + "tensorpipe/common/address.cc", + "tensorpipe/common/epoll_loop.cc", + "tensorpipe/common/error.cc", + "tensorpipe/common/fd.cc", + "tensorpipe/common/ibv.cc", + "tensorpipe/common/socket.cc", + "tensorpipe/common/system.cc", "tensorpipe/core/*.cc", "tensorpipe/transport/*.cc", "tensorpipe/util/*/*.cc", @@ -107,7 +113,10 @@ TENSORPIPE_SRCS = TENSORPIPE_BASE_SRCS + glob([ ]) TENSORPIPE_SRCS_CUDA = TENSORPIPE_SRCS + glob([ + "tensorpipe/common/cuda_loop.cc", + "tensorpipe/channel/cuda_basic/*.cc", "tensorpipe/channel/cuda_ipc/*.cc", + "tensorpipe/channel/cuda_xth/*.cc", ]) cc_library( From 39a23c797b3bdf4cf5b9a3f6d4f1dffb9a331402 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 16 Dec 2020 11:53:57 -0800 Subject: [PATCH 29/34] Add docs/README.md to make existing doc build info more discoverable (#49286) Summary: Closes gh-42003 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49286 Reviewed By: glaringlee Differential Revision: D25535250 Pulled By: ezyang fbshipit-source-id: a7790bfe4528fa6a31698126cc687793fdf7ac3f --- docs/README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/README.md diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000000..471f0aa9f888 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,2 @@ +Please see the [Writing documentation section of 
CONTRIBUTING.md](../CONTRIBUTING.md#writing-documentation) +for details on both writing and building the docs. From 9955355853a1c189a4a79209f82d39393b4be010 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Wed, 16 Dec 2020 12:30:27 -0800 Subject: [PATCH 30/34] Updated derivative rules for complex svd and pinverse (#47761) Summary: Updated `svd_backward` to work correctly for complex-valued inputs. Updated `common_methods_invocations.py` to take dtype, device arguments for input construction. Removed `test_pinverse` from `test_autograd.py`, it is replaced by entries to `common_methods_invocations.py`. Added `svd` and `pinverse` to list of complex tests. References for complex-valued SVD differentiation: - https://giggleliu.github.io/2019/04/02/einsumbp.html - https://arxiv.org/abs/1909.02659 The derived rules assume gauge invariance of loss functions, so the result would not be correct for loss functions that are not gauge invariant. https://re-ra.xyz/Gauge-Problem-in-Automatic-Differentiation/ The same rule is implemented in Tensorflow and [BackwardsLinalg.jl](https://github.com/GiggleLiu/BackwardsLinalg.jl). Ref. https://github.com/pytorch/pytorch/issues/33152 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47761 Reviewed By: izdeby Differential Revision: D25574962 Pulled By: mruberry fbshipit-source-id: 832b61303e883ad3a451b84850ccf0f36763a6f6 --- test/test_autograd.py | 24 ---- test/test_jit.py | 6 +- test/test_linalg.py | 10 +- test/test_ops.py | 23 +++- tools/autograd/gen_variable_type.py | 2 +- torch/_torch_docs.py | 9 +- torch/csrc/autograd/FunctionsManual.cpp | 47 +++++-- .../_internal/common_methods_invocations.py | 130 ++++++++++++++---- 8 files changed, 170 insertions(+), 81 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index be276e334df6..d823732c613e 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3000,30 +3000,6 @@ def test_igammac(self): gradcheck(torch.igamma, (s, x)) gradgradcheck(torch.igamma, (s, x)) - @skipIfNoLapack - def test_pinverse(self): - # Why is pinverse tested this way, and not ordinarily as other linear algebra methods? - # 1. Pseudo-inverses are not generally continuous, which means that they are not differentiable - # 2. Derivatives for pseudo-inverses exist typically for constant rank (Golub et al, 1973) - # 3. This method creates two orthogonal matrices, and a constructs a test case with large - # singular values (given by x to the function). - # 4. This will ensure that small perturbations don't affect the rank of matrix, in which case - # a derivative exists. - # 5. 
This test exists since pinverse is implemented using SVD, and is hence a backpropable method - m, n = 5, 10 - U = torch.randn(n, m).qr()[0].t() # Orthogonal with dimensions m x n - V = torch.randn(n, m).qr()[0].t() # Orthogonal with dimensions m x n - - def func(x): - S = torch.cat([x, torch.zeros(n - m)], 0) - M = U.mm(torch.diag(S)).mm(V.t()) - return M.pinverse() - - gradcheck(func, [torch.rand(m).add_(1).requires_grad_()]) - gradcheck(func, [torch.rand(m).add_(10).requires_grad_()]) - gradgradcheck(func, [torch.rand(m).add_(1).requires_grad_()]) - gradgradcheck(func, [torch.rand(m).add_(10).requires_grad_()]) - def test_chain_matmul(self): def gen_matrices(p): matrices = [] diff --git a/test/test_jit.py b/test/test_jit.py index 3a3e87d49e82..cd66f84cdc83 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -15469,7 +15469,11 @@ def forward(self, x): 'test_slogdet_batched_pos_det', 'test_slogdet_batched_symmetric', 'test_slogdet_batched_symmetric_pd', - 'test_slogdet_batched_distinct_singular_values' + 'test_slogdet_batched_distinct_singular_values', + 'test_svd_check_grad_s', + 'test_svd_check_grad_u', + 'test_svd_check_grad_uv', + 'test_svd_check_grad_v' } # chunk returns a list in scripting and we don't unpack the list, diff --git a/test/test_linalg.py b/test/test_linalg.py index 4a043094c5f8..d947292fabc5 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1331,6 +1331,9 @@ def gen_error_message(input_size, ord, keepdim, dim=None): # TODO: Fix autograd for matrix orders 'nuc', 2, and -2 by adding complex # support to svd's backward method. Once this is done, these ords # should be added to `matrix_ords` above + # Update: svd's backward now works with https://github.com/pytorch/pytorch/pull/47761 + # However run_test_case doesn't work for 'matrix_ords_unsupported' cases + # because singular values of 'x' and 'x_real' can be different and so is their norms based on singular values matrix_ords_unsupported = ['nuc', 2, -2] def run_test_case(x, ord, keepdim): @@ -1357,13 +1360,6 @@ def run_test_case(x, ord, keepdim): x = torch.randn(25, 25, dtype=dtype, device=device, requires_grad=True) run_test_case(x, ord, keepdim) - for ord in matrix_ords_unsupported: - x = torch.randn(25, 25, dtype=dtype, device=device, requires_grad=True) - with self.assertRaisesRegex( - RuntimeError, - r'svd does not support automatic differentiation for outputs with complex dtype'): - res = torch.linalg.norm(x, ord, keepdim=keepdim) - # Test that linal.norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!") diff --git a/test/test_ops.py b/test/test_ops.py index 090232360309..dc475e00caf3 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -29,13 +29,15 @@ class TestOpInfo(TestCase): @onlyOnCPUAndCUDA @ops(op_db, dtypes=OpDTypes.unsupported) def test_unsupported_dtypes(self, device, dtype, op): - samples = op.sample_inputs(device, dtype) - if len(samples) == 0: - self.skipTest("Skipped! No sample inputs!") - - # NOTE: only tests on first sample - sample = samples[0] + # sample_inputs can have a function for generating the input that doesn't work for specified dtype + # https://github.com/pytorch/pytorch/issues/49024 with self.assertRaises(RuntimeError): + samples = op.sample_inputs(device, dtype) + if len(samples) == 0: + self.skipTest("Skipped! 
No sample inputs!") + + # NOTE: only tests on first sample + sample = samples[0] op(*sample.input, *sample.args, **sample.kwargs) # Verifies that ops have their supported dtypes @@ -74,7 +76,14 @@ def _check_helper(self, device, dtype, op, variant, check): samples = op.sample_inputs(device, dtype, requires_grad=True) for sample in samples: - partial_fn = partial(variant, **sample.kwargs) + if sample.output_process_fn_grad is not None: + out_fn = sample.output_process_fn_grad + + def variant_out_fn(*args, **kwargs): + return out_fn(variant(*args, **kwargs)) + else: + variant_out_fn = variant + partial_fn = partial(variant_out_fn, **sample.kwargs) if check == 'gradcheck': self.assertTrue(gradcheck(partial_fn, (*sample.input,) + sample.args, check_grad_dtypes=True)) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index ab18db90c166..9c8800786fed 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,7 +78,7 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', + 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', 'svd', '_fft_c2c', '_fft_r2c', } diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 4f56ef928918..02286f53fde2 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -8028,7 +8028,7 @@ def merge_dicts(*dicts): svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor) This function returns a namedtuple ``(U, S, V)`` which is the singular value -decomposition of a input real matrix or batches of real matrices :attr:`input` such that +decomposition of a input matrix or batches of matrices :attr:`input` such that :math:`input = U \times diag(S) \times V^T`. If :attr:`some` is ``True`` (default), the method returns the reduced @@ -8040,6 +8040,8 @@ def merge_dicts(*dicts): If :attr:`compute_uv` is ``False``, the returned `U` and `V` matrices will be zero matrices of shape :math:`(m \times m)` and :math:`(n \times n)` respectively. :attr:`some` will be ignored here. +Supports real-valued and complex-valued input. + .. note:: The singular values are returned in descending order. If :attr:`input` is a batch of matrices, then the singular values of each matrix in the batch is returned in descending order. @@ -8064,6 +8066,9 @@ def merge_dicts(*dicts): .. note:: When :attr:`compute_uv` = ``False``, backward cannot be performed since `U` and `V` from the forward pass is required for the backward operation. +.. note:: With the complex-valued input the backward operation works correctly only + for gauge invariant loss functions. Please look at `Gauge problem in AD`_ for more details. + Args: input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more batch dimensions consisting of :math:`m \times n` matrices. @@ -8101,6 +8106,8 @@ def merge_dicts(*dicts): >>> u, s, v = torch.svd(a_big) >>> torch.dist(a_big, torch.matmul(torch.matmul(u, torch.diag_embed(s)), v.transpose(-2, -1))) tensor(2.6503e-06) + +.. 
_Gauge problem in AD: https://re-ra.xyz/Gauge-Problem-in-Automatic-Differentiation/ """) add_docstr(torch.symeig, diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 82d93d6948ba..cfb4c17cddf4 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1824,16 +1824,21 @@ Tensor svd_backward(const std::vector &grads, const T auto gsigma = grads[1]; auto u = raw_u; - auto v = raw_v; + // Currently torch.svd for complex dtypes returns the conjugate of V, + // while the backward formula is derived with just V (without the conjugation) + // therefore here we need to conjugate the V output of SVD and grads[2]. + // Once https://github.com/pytorch/pytorch/issues/45821 is resolved + // extra .conj(), that are marked below in the code, shall be removed. + auto v = raw_v.conj(); // TODO: remove .conj() auto gu = grads[0]; - auto gv = grads[2]; + auto gv = grads[2].conj(); // TODO: remove .conj() if (!some) { // We ignore the free subspace here because possible base vectors cancel // each other, e.g., both -v and +v are valid base for a dimension. // Don't assume behavior of any particular implementation of svd. u = raw_u.narrow(-1, 0, k); - v = raw_v.narrow(-1, 0, k); + v = raw_v.narrow(-1, 0, k).conj(); // TODO: remove .conj() if (gu.defined()) { gu = gu.narrow(-1, 0, k); } @@ -1841,11 +1846,13 @@ Tensor svd_backward(const std::vector &grads, const T gv = gv.narrow(-1, 0, k); } } - auto vt = v.transpose(-2, -1); + auto vh = v.conj().transpose(-2, -1); Tensor sigma_term; if (gsigma.defined()) { - sigma_term = at::matmul(u, at::matmul(gsigma.diag_embed(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1), vt)); + gsigma = gsigma.to(self.dtype()); + // computes u @ diag(gsigma) @ vh + sigma_term = at::matmul(u * gsigma.unsqueeze(-2), vh); } else { sigma_term = at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } @@ -1855,11 +1862,11 @@ Tensor svd_backward(const std::vector &grads, const T return sigma_term; } - auto ut = u.transpose(-2, -1); + auto uh = u.conj().transpose(-2, -1); auto im = at::eye(m, self.options()); auto in = at::eye(n, self.options()); - auto sigma_mat = sigma.diag_embed(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1); - auto sigma_mat_inv = sigma.pow(-1).diag_embed(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1); + auto sigma_mat = sigma.diag_embed(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).to(self.dtype()); + auto sigma_mat_inv = sigma.pow(-1).diag_embed(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).to(self.dtype()); auto sigma_sq = sigma.pow(2); auto F = sigma_sq.unsqueeze(-2) - sigma_sq.unsqueeze(-1); // The following two lines invert values of F, and fills the diagonal with 0s. 
@@ -1871,26 +1878,38 @@ Tensor svd_backward(const std::vector &grads, const T Tensor u_term, v_term; if (gu.defined()) { - u_term = at::matmul(u, at::matmul(F.mul(at::matmul(ut, gu) - at::matmul(gu.transpose(-2, -1), u)), sigma_mat)); + auto guh = gu.conj().transpose(-2, -1); + u_term = at::matmul(u, at::matmul(F.mul(at::matmul(uh, gu) - at::matmul(guh, u)), sigma_mat)); if (m > k) { - u_term = u_term + at::matmul(im - at::matmul(u, ut), at::matmul(gu, sigma_mat_inv)); + u_term = u_term + at::matmul(im - at::matmul(u, uh), at::matmul(gu, sigma_mat_inv)); } - u_term = at::matmul(u_term, vt); + u_term = at::matmul(u_term, vh); } else { u_term = at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } if (gv.defined()) { - auto gvt = gv.transpose(-2, -1); - v_term = at::matmul(sigma_mat, at::matmul(F.mul(at::matmul(vt, gv) - at::matmul(gvt, v)), vt)); + auto gvh = gv.conj().transpose(-2, -1); + v_term = at::matmul(sigma_mat, at::matmul(F.mul(at::matmul(vh, gv) - at::matmul(gvh, v)), vh)); if (n > k) { - v_term = v_term + at::matmul(sigma_mat_inv, at::matmul(gvt, in - at::matmul(v, vt))); + v_term = v_term + at::matmul(sigma_mat_inv, at::matmul(gvh, in - at::matmul(v, vh))); } v_term = at::matmul(u, v_term); } else { v_term = at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } + // for complex-valued input there is an additional term + // https://giggleliu.github.io/2019/04/02/einsumbp.html + // https://arxiv.org/abs/1909.02659 + if (self.is_complex() && gu.defined()) { + // computes L = Identity.mul(uh @ gu) + Tensor L = at::matmul(uh, gu).diagonal(0, -2, -1).diag_embed(0, -2, -1); + L = L - L.conj().transpose(-2, -1); + Tensor imag_term = 0.5 * at::matmul(at::matmul(at::matmul(u, L), sigma_mat_inv), vh); + return u_term + sigma_term + v_term + imag_term; + } + return u_term + sigma_term + v_term; } diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 6c18c0cbaa6d..3b1153c6607b 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -45,13 +45,15 @@ def __init__(self, cls_name=None, test_name=None, *, class SampleInput(object): """Represents sample inputs to a function.""" - __slots__ = ['input', 'args', 'kwargs'] + # output_process_fn_grad is a function that modifies the output of op compatible with input + __slots__ = ['input', 'args', 'kwargs', 'output_process_fn_grad'] - def __init__(self, input, *, args=tuple(), kwargs=None): + def __init__(self, input, *, args=tuple(), kwargs=None, output_process_fn_grad=None): # test_ops.py expects input to be a tuple self.input = input if isinstance(input, tuple) else (input,) self.args = args self.kwargs = kwargs if kwargs is not None else {} + self.output_process_fn_grad = output_process_fn_grad _NOTHING = object() # Unique value to distinguish default from anything else @@ -379,6 +381,92 @@ def sample_inputs(self, device, dtype, requires_grad=False): ] +def sample_inputs_svd(op_info, device, dtype, requires_grad=False): + """ + This function generates input for torch.svd with distinct singular values so that autograd is always stable. + Matrices of different size: + square matrix - S x S size + tall marix - S x (S-2) + wide matrix - (S-2) x S + and batched variants of above are generated. 
+ Each SampleInput has a function 'output_process_fn_grad' attached to it that is applied on the output of torch.svd + It is needed for autograd checks, because backward of svd doesn't work for an arbitrary loss function. + """ + from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value + + test_cases1 = ( # some=True (default) + # loss functions for complex-valued svd have to be "gauge invariant", + # i.e. loss functions shouldn't change when sigh of the singular vectors change. + # the simplest choice to satisfy this requirement is to apply 'abs'. + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device), + lambda usv: usv[1]), # 'check_grad_s' + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device), + lambda usv: abs(usv[0])), # 'check_grad_u' + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device), + lambda usv: abs(usv[2])), # 'check_grad_v' + # TODO: replace lambda usv: usv[0][0, 0] * usv[2][0, 0] with lambda usv: usv[0][0, 0] * usv[2][0, 0].conj() + # once https://github.com/pytorch/pytorch/issues/45821 is resolved + # this test is important as it checks the additional term that is non-zero only for complex-valued inputs + # and when the loss function depends both on 'u' and 'v' + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device), + lambda usv: usv[0][0, 0] * usv[2][0, 0]), # 'check_grad_uv' + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device)[:(S - 2)], + lambda usv: (abs(usv[0]), usv[1], abs(usv[2][..., :, :(S - 2)]))), # 'wide' + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device)[:, :(S - 2)], + lambda usv: (abs(usv[0]), usv[1], abs(usv[2]))), # 'tall' + (random_fullrank_matrix_distinct_singular_value(M, dtype=dtype).to(device), + lambda usv: (abs(usv[0]), usv[1], abs(usv[2]))), # 'large' + (random_fullrank_matrix_distinct_singular_value(S, 3, dtype=dtype).to(device), + lambda usv: (abs(usv[0]), usv[1], abs(usv[2]))), # 'batched' + (random_fullrank_matrix_distinct_singular_value(S, 3, dtype=dtype).to(device)[..., :(S - 2), :], + lambda usv: (abs(usv[0]), usv[1], abs(usv[2]))), # 'wide_batched' + (random_fullrank_matrix_distinct_singular_value(S, 3, dtype=dtype).to(device)[..., :, :(S - 2)], + lambda usv: (abs(usv[0]), usv[1], abs(usv[2]))), # 'tall_batched' + ) + test_cases2 = ( # some=False + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device)[:(S - 2)], + lambda usv: (abs(usv[0]), usv[1], abs(usv[2][:, :(S - 2)]))), # 'wide_all' + (random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device)[:, :(S - 2)], + lambda usv: (abs(usv[0][:, :(S - 2)]), usv[1], abs(usv[2]))), # 'tall_all' + (random_fullrank_matrix_distinct_singular_value(S, 3, 3, dtype=dtype).to(device)[..., :(S - 2), :], + lambda usv: (abs(usv[0]), usv[1], abs(usv[2][..., :, :(S - 2)]))), # 'wide_all_batched' + (random_fullrank_matrix_distinct_singular_value(S, 3, 3, dtype=dtype).to(device)[..., :, :(S - 2)], + lambda usv: (abs(usv[0][..., :, :(S - 2)]), usv[1], abs(usv[2]))), # 'tall_all_batched' + ) + + out = [] + for a, out_fn in test_cases1: + a.requires_grad = requires_grad + out.append(SampleInput(a, output_process_fn_grad=out_fn)) + + for a, out_fn in test_cases2: + a.requires_grad = requires_grad + kwargs = {'some': False} + out.append(SampleInput(a, kwargs=kwargs, output_process_fn_grad=out_fn)) + + return out + + +def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): + """ + This 
function generates input for torch.pinverse with distinct singular values so that autograd is always stable. + Implementation of torch.pinverse depends on torch.svd, therefore it's sufficient to check only square S x S matrix + and the batched (3 x S x S) input. + """ + from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value + + test_cases = ( + random_fullrank_matrix_distinct_singular_value(S, dtype=dtype).to(device), # pinverse + random_fullrank_matrix_distinct_singular_value(S, 3, dtype=dtype).to(device), # pinverse 'batched' + ) + + out = [] + for a in test_cases: + a.requires_grad = requires_grad + out.append(SampleInput(a)) + return out + + # Operator database (sorted alphabetically) op_db: List[OpInfo] = [ # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) @@ -837,6 +925,20 @@ def sample_inputs(self, device, dtype, requires_grad=False): promotes_integers_to_float=True, handles_complex_extremals=False, test_complex_grad=False), + OpInfo('svd', + op=torch.svd, + dtypes=floating_and_complex_types(), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_inputs_svd, + decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), + OpInfo('pinverse', + op=torch.pinverse, + dtypes=floating_and_complex_types(), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_inputs_pinverse, + decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), ] if TEST_SCIPY: @@ -1726,30 +1828,6 @@ def method_tests(): 'batched_symmetric_pd', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma], itemgetter(1)), ('slogdet', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S, 3), NO_ARGS, 'batched_distinct_singular_values', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma], itemgetter(1)), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S), - NO_ARGS, '', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S)[:(S - 2)], NO_ARGS, - 'wide', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S)[:, :(S - 2)], NO_ARGS, - 'tall', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S)[:(S - 2)], (False,), - 'wide_all', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma], lambda usv: (usv[0], usv[1], usv[2][:, :(S - 2)])), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S)[:, :(S - 2)], (False,), - 'tall_all', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma], lambda usv: (usv[0][:, :(S - 2)], usv[1], usv[2])), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(M), NO_ARGS, - 'large', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S, 3), NO_ARGS, - 'batched', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S, 3)[..., :(S - 2), :], NO_ARGS, - 'wide_batched', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S, 3)[..., :, :(S - 2)], NO_ARGS, - 'tall_batched', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), - ('svd', lambda dtype, device: 
random_fullrank_matrix_distinct_singular_value(S, 3, 3)[..., :(S - 2), :], (False,), - 'wide_all_batched', (), NO_ARGS, - [skipCPUIfNoLapack, skipCUDAIfNoMagma], lambda usv: (usv[0], usv[1], usv[2][..., :, :(S - 2)])), - ('svd', lambda dtype, device: random_fullrank_matrix_distinct_singular_value(S, 3, 3)[..., :, :(S - 2)], (False,), - 'tall_all_batched', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma], - lambda usv: (usv[0][..., :, :(S - 2)], usv[1], usv[2])), ('qr', (S, S), (False,), 'square_single', (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), ('qr', (S, S - 2), (True,), 'tall_single' , (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), ('qr', (S - 2, S), (False,), 'wide_single' , (), NO_ARGS, [skipCPUIfNoLapack, skipCUDAIfNoMagma]), From 7729581414962ac0a23ebd269f165f6a877490ae Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 16 Dec 2020 12:31:06 -0800 Subject: [PATCH 31/34] [quant][docs] Add fx graph mode quantization to quantization docs (#49211) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49211 Test Plan: Imported from OSS Reviewed By: raghuramank100 Differential Revision: D25507480 fbshipit-source-id: 9e9e4b5fef979f5621c1bbd1b49e9cc6830da617 --- caffe2/contrib/aten/gen_op.py | 4 +- docs/source/quantization-support.rst | 5 +- docs/source/quantization.rst | 103 +++++++++++++++++++++++++-- tools/codegen/gen.py | 3 +- torch/utils/_pytree.py | 11 +-- 5 files changed, 109 insertions(+), 17 deletions(-) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 703bf3ec167f..2a822058bfdf 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -20,7 +20,7 @@ import argparse import os from copy import deepcopy -from typing import Dict, List +from typing import Dict, List, Set parser = argparse.ArgumentParser() parser.add_argument("--template_dir", default=".", help="where template.h is") @@ -241,7 +241,7 @@ def emit_assignments(o, env): 'implementations': [], 'cases': [], } # type: Dict[str, List] - seen = set() + seen: Set[str] = set() key = 0 for o in filtered: # [DESCRIPTORS] diff --git a/docs/source/quantization-support.rst b/docs/source/quantization-support.rst index 60be24120d43..f782d51b5027 100644 --- a/docs/source/quantization-support.rst +++ b/docs/source/quantization-support.rst @@ -105,7 +105,7 @@ Fused modules are provided for common patterns in CNNs. 
Combining several operations together (like convolution and relu) allows for better quantization accuracy - + * `torch.nn.intrinsic` — float versions of the modules, can be swapped with quantized version 1 to 1: @@ -172,7 +172,6 @@ Layers for the quantization-aware training * :func:`~torch.quantization.fuse_modules` * Functions for graph mode quantization: - * :func:`~torch.quantization.quantize_jit` - Function for graph mode post training static quantization * :func:`~torch.quantization.quantize_dynamic_jit` - Function for graph mode post training dynamic quantization @@ -322,5 +321,3 @@ Quantized dtypes and quantization schemes * :attr:`torch.quint8` — 8-bit unsigned integer * :attr:`torch.qint8` — 8-bit signed integer * :attr:`torch.qint32` — 32-bit signed integer - - diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index 6cfbe186544f..a389de60416a 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -80,7 +80,16 @@ The corresponding implementation is chosen automatically based on the PyTorch bu Quantization API Summary --------------------------------------- -There are three types of quantization supported in PyTorch: +PyTorch provides two different modes of quantization: Eager Mode Quantization and FX Graph Mode Quantization. + +Eager Mode Quantization is a beta feature. User needs to do fusion and specify where quantization and dequantization happens manually, also it only supports modules and not functionals. + +FX Graph Mode Quantization is a new automated quantization framework in PyTorch, and currently it's a prototype feature. It improves upon Eager Mode Quantization by adding support for functionals and automating the quantization process. Although people might need to refactor the model a bit to make the model compatible with FX Graph Mode Quantization (symbolically traceable with torch.fx). + +Eager Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^ + +There are three types of quantization supported in Eager Mode Quantization: 1. dynamic quantization (weights quantized with activations read/stored in floating point and quantized for compute.) @@ -95,7 +104,7 @@ for a more comprehensive overview of the tradeoffs between these quantization types. Dynamic Quantization -^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~ This is the simplest to apply form of quantization where the weights are quantized ahead of time but the activations are dynamically quantized @@ -148,7 +157,7 @@ To learn more about dynamic quantization please see our `dynamic quantization tu `_. Static Quantization -^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~ Static quantization quantizes the weights and activations of the model. It fuses activations into preceding layers where possible. It requires @@ -238,7 +247,7 @@ To learn more about static quantization, please see the `static quantization tut `_. Quantization Aware Training -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~ Quantization Aware Training models the effects of quantization during training allowing for higher accuracy compared to other quantization methods. During @@ -332,6 +341,92 @@ To learn more about quantization aware training, please see the `QAT tutorial `_. +(Prototype) FX Graph Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Quantization types supported by FX Graph Mode can be classified in two ways: + +1. 
+- Post Training Quantization (apply quantization after training, quantization parameters are calculated based on sample calibration data) +- Quantization Aware Training (simulate quantization during training so that the quantization parameters can be learned together with the model using training data) + +2. +- Weight Only Quantization (only weight is statically quantized) +- Dynamic Quantization (weight is statically quantized, activation is dynamically quantized) +- Static Quantization (both weight and activations are statically quantized) + +These two ways of classification are independent, so theoretically we can have 6 different types of quantization. + +The supported quantization types in FX Graph Mode Quantization are: +- Post Training Quantization + + - Weight Only Quantization + - Dynamic Quantization + - Static Quantization + +- Quantization Aware Training + + - Static Quantization + + +There are multiple quantization types in post training quantization (weight only, dynamic and static) and the configuration is done through `qconfig_dict` (an argument of the `prepare_fx` function). + +API Example:: + + import torch.quantization.quantize_fx as quantize_fx + import copy + + model_fp = UserModel(...) + + # + # post training dynamic/weight_only quantization + # + + # we need to deepcopy if we still want to keep model_fp unchanged after quantization since quantization apis change the input model + model_to_quantize = copy.deepcopy(model_fp) + model_to_quantize.eval() + qconfig_dict = {"": torch.quantization.default_dynamic_qconfig} + # prepare + model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict) + # no calibration needed when we only have dynamici/weight_only quantization + # quantize + model_quantized = quantize_fx.convert_fx(model_prepared) + + # + # post training static quantization + # + + model_to_quantize = copy.deepcopy(model_fp) + qconfig_dict = {"": torch.quantization.get_default_qconfig('qnnpack')} + model_to_quantize.eval() + # prepare + model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict) + # calibrate (not shown) + # quantize + model_quantized = quantize_fx.convert_fx(model_prepared) + + # + # quantization aware training for static quantization + # + + model_to_quantize = copy.deepcopy(model_fp) + qconfig_dict = {"": torch.quantization.get_default_qat_qconfig('qnnpack')} + model_to_quantize.train() + # prepare + model_prepared = quantize_fx.prepare_qat_fx(model_to_qunatize, qconfig_dict) + # training loop (not shown) + # quantize + model_quantized = quantize_fx.convert_fx(model_prepared) + + # + # fusion + # + model_to_quantize = copy.deepcopy(model_fp) + model_fused = quantize_fx.fuse_fx(model_to_quantize) + +Please see the following tutorials for more information about FX Graph Mode Quantization: +- FX Graph Mode Post Training Static Quantization (TODO: link) +- FX Graph Mode Post Training Dynamic Quantization (TODO: link) + Quantized Tensors --------------------------------------- diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 578db26fb566..33c5f0995013 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -469,7 +469,6 @@ def gen_one(f: NativeFunction) -> Optional[str]: else: assert_never(self.target) # Silence mypy's "Missing return statement" error - return None return list(mapMaybe(gen_one, g.functions())) @@ -698,7 +697,7 @@ def compute_native_function_declaration(g: Union[StructuredNativeFunctions, Nati # only out has dispatch meta_name = meta.name(g) rs = [] - seen = set() + seen: Set[Any] = 
set() out_args = native.arguments(g.out.func) for k, n in g.out.dispatch.items(): if n in seen: diff --git a/torch/utils/_pytree.py b/torch/utils/_pytree.py index ecf7270fa03d..86a7c54c4a1e 100644 --- a/torch/utils/_pytree.py +++ b/torch/utils/_pytree.py @@ -112,8 +112,8 @@ def tree_flatten(pytree: PyTree) -> Tuple[List[Any], TreeSpec]: child_pytrees, context = flatten_fn(pytree) # Recursively flatten the children - result = [] - children_specs = [] + result : List[Any] = [] + children_specs : List['TreeSpec'] = [] for child in child_pytrees: flat, child_spec = tree_flatten(child) result += flat @@ -178,11 +178,12 @@ def _broadcast_to_and_flatten(pytree: PyTree, spec: TreeSpec) -> Optional[List[A return None # Recursively flatten the children - result = [] + result : List[Any] = [] for child, child_spec in zip(child_pytrees, spec.children_specs): flat = _broadcast_to_and_flatten(child, child_spec) - if flat is None: + if flat is not None: + result += flat + else: return None - result += flat return result From 5874925b461b67094ecf60ab1ae4339ae3d48cfe Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 16 Dec 2020 12:40:05 -0800 Subject: [PATCH 32/34] stft: Change require_complex warning to an error (#49022) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49022 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25569586 Pulled By: mruberry fbshipit-source-id: 09608088f540c2c3fc70465f6a23f2aec5f24f85 --- aten/src/ATen/native/SpectralOps.cpp | 18 +++++++++++++----- test/test_jit.py | 6 +++--- test/test_spectral_ops.py | 23 +++++++++++++++-------- torch/functional.py | 10 +++++++--- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 4ae2ee326b88..c8eb3cc99a01 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -468,11 +468,19 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop auto win_length = win_lengthOpt.value_or(n_fft); const bool return_complex = return_complexOpt.value_or( self.is_complex() || (window.defined() && window.is_complex())); - if (!return_complexOpt && !return_complex) { - TORCH_WARN_ONCE("stft will require the return_complex parameter be explicitly " - " specified in a future PyTorch release. Use return_complex=False " - " to preserve the current behavior or return_complex=True to return " - " a complex output."); + if (!return_complex) { + TORCH_CHECK(return_complexOpt.has_value(), + "stft requires the return_complex parameter be given for real inputs." + "You should pass return_complex=True to opt-in to complex dtype returns " + "(which will be required in a future pytorch release). " + ); + + TORCH_WARN_ONCE( + "stft with return_complex=False is deprecated. 
In a future pytorch " + "release, stft will return complex tensors for all inputs, and " + "return_complex=False will raise an error.\n" + "Note: you can still call torch.view_as_real on the complex output to " + "recover the old return format."); } if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) { diff --git a/test/test_jit.py b/test/test_jit.py index cd66f84cdc83..0a8c5af060dc 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -8780,7 +8780,7 @@ def test_pack_unpack_state(self): def test_torch_functional(self): def stft(input, n_fft): # type: (Tensor, int) -> Tensor - return torch.stft(input, n_fft) + return torch.stft(input, n_fft, return_complex=True) inps = (torch.randn(10), 7) self.assertEqual(stft(*inps), torch.jit.script(stft)(*inps)) @@ -8789,8 +8789,8 @@ def istft(input, n_fft): # type: (Tensor, int) -> Tensor return torch.istft(input, n_fft) - inps2 = (torch.stft(*inps), inps[1]) - self.assertEqual(torch.istft(*inps2), torch.jit.script(torch.istft)(*inps2)) + inps2 = (stft(*inps), inps[1]) + self.assertEqual(istft(*inps2), torch.jit.script(istft)(*inps2)) def lu(x): # type: (Tensor) -> Tuple[Tensor, Tensor] diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 04365a5828d4..6192d6c4d6b6 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -843,7 +843,9 @@ def _test(sizes, n_fft, hop_length=None, win_length=None, win_sizes=None, else: window = None if expected_error is None: - result = x.stft(n_fft, hop_length, win_length, window, center=center) + with self.maybeWarnsRegex(UserWarning, "stft with return_complex=False"): + result = x.stft(n_fft, hop_length, win_length, window, + center=center, return_complex=False) # NB: librosa defaults to np.complex64 output, no matter what # the input dtype ref_result = librosa_stft(x, n_fft, hop_length, win_length, window, center) @@ -1055,15 +1057,20 @@ def test_complex_stft_onesided(self, device): with self.assertRaisesRegex(RuntimeError, 'complex'): x.stft(10, window=window, pad_mode='constant', onesided=True) else: - y = x.stft(10, window=window, pad_mode='constant', onesided=True) - self.assertEqual(y.dtype, torch.double) - self.assertEqual(y.size(), (6, 51, 2)) + y = x.stft(10, window=window, pad_mode='constant', onesided=True, + return_complex=True) + self.assertEqual(y.dtype, torch.cdouble) + self.assertEqual(y.size(), (6, 51)) - y = torch.rand(100, device=device, dtype=torch.double) - window = torch.randn(10, device=device, dtype=torch.cdouble) + x = torch.rand(100, device=device, dtype=torch.cdouble) with self.assertRaisesRegex(RuntimeError, 'complex'): x.stft(10, pad_mode='constant', onesided=True) + def test_stft_requires_complex(self, device): + x = torch.rand(100) + with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): + y = x.stft(10, pad_mode='constant') + @skipCUDAIfRocm @skipCPUIfNoMkl def test_fft_input_modification(self, device): @@ -1091,7 +1098,7 @@ def test_fft_input_modification(self, device): def test_istft_round_trip_simple_cases(self, device, dtype): """stft -> istft should recover the original signale""" def _test(input, n_fft, length): - stft = torch.stft(input, n_fft=n_fft) + stft = torch.stft(input, n_fft=n_fft, return_complex=True) inverse = torch.istft(stft, n_fft=n_fft, length=length) self.assertEqual(input, inverse, exact_dtype=True) @@ -1113,7 +1120,7 @@ def _test_istft_is_inverse_of_stft(stft_kwargs): for sizes in data_sizes: for i in range(num_trials): original = torch.randn(*sizes, 
dtype=dtype, device=device) - stft = torch.stft(original, **stft_kwargs) + stft = torch.stft(original, return_complex=True, **stft_kwargs) inversed = torch.istft(stft, length=original.size(1), **istft_kwargs) # trim the original for case when constructed signal is shorter than original diff --git a/torch/functional.py b/torch/functional.py index 25b0c1fb3b19..10fb6b1e41b7 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -464,9 +464,13 @@ def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, r"""Short-time Fourier transform (STFT). .. warning:: - Setting :attr:`return_complex` explicitly will be required in a future - PyTorch release. Set it to False to preserve the current behavior or - True to return a complex output. + From version 1.8.0, :attr:`return_complex` must always be given + explicitly for real inputs and `return_complex=False` has been + deprecated. Strongly prefer `return_complex=True` as in a future + pytorch release, this function will only return complex tensors. + + Note that :func:`torch.view_as_real` can be used to recover a real + tensor with an extra last dimension for real and imaginary components. The STFT computes the Fourier transform of short overlapping windows of the input. This giving frequency components of the signal as they change over From 7767dcfc8dd89ed16b97b4915218af4c69985058 Mon Sep 17 00:00:00 2001 From: Jeffrey Wan Date: Wed, 16 Dec 2020 13:17:25 -0800 Subject: [PATCH 33/34] Revert D25564477: [pytorch][PR] Add sinc operator Test Plan: revert-hammer Differential Revision: D25564477 (https://github.com/pytorch/pytorch/commit/bbc71435b7bbaee310f488be766b1a37bb9a08ca) Original commit changeset: 13f36a2b84da fbshipit-source-id: 58cbe8109efaf499dd017531878b9fbbb27976bc --- aten/src/ATen/core/aten_interned_strings.h | 1 - aten/src/ATen/native/UnaryOps.cpp | 5 ---- aten/src/ATen/native/UnaryOps.h | 1 - aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 16 ---------- aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 14 --------- aten/src/ATen/native/native_functions.yaml | 16 ---------- docs/source/tensors.rst | 2 -- docs/source/torch.rst | 1 - tools/autograd/derivatives.yaml | 3 -- tools/autograd/gen_variable_type.py | 2 +- torch/_tensor_docs.py | 14 --------- torch/_torch_docs.py | 28 ------------------ torch/overrides.py | 1 - .../_internal/common_methods_invocations.py | 29 ------------------- 14 files changed, 1 insertion(+), 132 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 7f0b01d95049..92952799ec49 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -622,7 +622,6 @@ _(aten, signbit) \ _(aten, silu) \ _(aten, sgn) \ _(aten, sin) \ -_(aten, sinc) \ _(aten, sinh) \ _(aten, size) \ _(aten, sizes) \ diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index ab58a8f277a5..9c91821aed80 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -343,10 +343,6 @@ Tensor& cos_out(Tensor& result, const Tensor& self) { return unary_op_impl_float Tensor cos(const Tensor& self) { return unary_op_impl_float(self, cos_stub); } Tensor& cos_(Tensor& self) { return unary_op_impl_(self, at::cos_out); } -Tensor& sinc_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, sinc_stub); } -Tensor sinc(const Tensor& self) { return unary_op_impl_float(self, sinc_stub); } -Tensor& sinc_(Tensor& self) { return unary_op_impl_(self, 
at::sinc_out); } - Tensor& sinh_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, sinh_stub); } Tensor sinh(const Tensor& self) { return unary_op_impl_float(self, sinh_stub); } Tensor& sinh_(Tensor& self) { return unary_op_impl_(self, at::sinh_out); } @@ -721,7 +717,6 @@ DEFINE_DISPATCH(sign_stub); DEFINE_DISPATCH(signbit_stub); DEFINE_DISPATCH(sgn_stub); DEFINE_DISPATCH(sin_stub); -DEFINE_DISPATCH(sinc_stub); DEFINE_DISPATCH(sinh_stub); DEFINE_DISPATCH(sqrt_stub); DEFINE_DISPATCH(tan_stub); diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index f732cb9a0141..a6db47f17153 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -55,7 +55,6 @@ DECLARE_DISPATCH(unary_fn, sign_stub); DECLARE_DISPATCH(unary_fn, signbit_stub); DECLARE_DISPATCH(unary_fn, sgn_stub); DECLARE_DISPATCH(unary_fn, sin_stub); -DECLARE_DISPATCH(unary_fn, sinc_stub); DECLARE_DISPATCH(unary_fn, sinh_stub); DECLARE_DISPATCH(unary_fn, sqrt_stub); DECLARE_DISPATCH(unary_fn, tan_stub); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 8aa5957f4b7e..f7c4f9c34613 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -302,21 +302,6 @@ static void sgn_kernel(TensorIterator& iter){ }); } -static void sinc_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.common_dtype(), "sinc_cpu", [&]() { - cpu_kernel( - iter, - [=](scalar_t a) -> scalar_t { - if (a == scalar_t(0)) { - return scalar_t(1); - } else { - scalar_t product = scalar_t(M_PI) * a; - return std::sin(product) / product; - } - }); - }); -} - static void sinh_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() { cpu_kernel_vec( @@ -692,7 +677,6 @@ REGISTER_DISPATCH(neg_stub, &neg_kernel); REGISTER_DISPATCH(sign_stub, &sign_kernel); REGISTER_DISPATCH(signbit_stub, &signbit_kernel); REGISTER_DISPATCH(sgn_stub, &sgn_kernel); -REGISTER_DISPATCH(sinc_stub, &sinc_kernel); REGISTER_DISPATCH(sinh_stub, &sinh_kernel); REGISTER_DISPATCH(cosh_stub, &cosh_kernel); REGISTER_DISPATCH(acosh_stub, &acosh_kernel); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 059da1f49f75..4d676181be79 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -103,19 +103,6 @@ void sigmoid_kernel_cuda(TensorIterator& iter) { }); } -void sinc_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "sinc_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - if (a == scalar_t(0)) { - return scalar_t(1); - } else { - scalar_t product = scalar_t(M_PI) * a; - return std::sin(product) / product; - } - }); - }); -} - void logit_kernel_cuda(TensorIterator& iter, Scalar eps_scalar) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, @@ -258,7 +245,6 @@ REGISTER_DISPATCH(i0_stub, &i0_kernel_cuda); REGISTER_DISPATCH(rsqrt_stub, &rsqrt_kernel_cuda); REGISTER_DISPATCH(sqrt_stub, &sqrt_kernel_cuda); REGISTER_DISPATCH(sigmoid_stub, &sigmoid_kernel_cuda); -REGISTER_DISPATCH(sinc_stub, &sinc_kernel_cuda); REGISTER_DISPATCH(logit_stub, &logit_kernel_cuda); REGISTER_DISPATCH(erf_stub, &erf_kernel_cuda); REGISTER_DISPATCH(erfc_stub, &erfc_kernel_cuda); diff --git a/aten/src/ATen/native/native_functions.yaml 
b/aten/src/ATen/native/native_functions.yaml
index 1c0eb48f3bc3..c25d28b27518 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3720,22 +3720,6 @@
   dispatch:
     CPU, CUDA: sin_out
 
-- func: sinc(Tensor self) -> Tensor
-  use_c10_dispatcher: full
-  variants: function, method
-  dispatch:
-    DefaultBackend: sinc
-
-- func: sinc_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
-  variants: function, method
-  dispatch:
-    DefaultBackend: sinc_
-
-- func: sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  dispatch:
-    CPU, CUDA: sinc_out
-
 - func: sinh(Tensor self) -> Tensor
   use_c10_dispatcher: full
   variants: function, method
diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst
index 2e1225b882e3..b2e25189540b 100644
--- a/docs/source/tensors.rst
+++ b/docs/source/tensors.rst
@@ -558,8 +558,6 @@ view of a storage and defines numeric operations on it.
    .. automethod:: sgn_
    .. automethod:: sin
    .. automethod:: sin_
-   .. automethod:: sinc
-   .. automethod:: sinc_
    .. automethod:: sinh
    .. automethod:: sinh_
    .. automethod:: asinh
diff --git a/docs/source/torch.rst b/docs/source/torch.rst
index 5dd74f62a531..2bb6c0204395 100644
--- a/docs/source/torch.rst
+++ b/docs/source/torch.rst
@@ -340,7 +340,6 @@ Pointwise Ops
     sign
     signbit
     sin
-    sinc
     sinh
     sqrt
     square
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 1ea5a141cf36..1a03228c751b 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -961,9 +961,6 @@
 - name: sin(Tensor self) -> Tensor
   self: grad * self.cos().conj()
 
-- name: sinc(Tensor self) -> Tensor
-  self: grad * ((M_PI * self * (M_PI * self).cos() - (M_PI * self).sin()) / (M_PI * self * self)).conj()
-
 - name: sinh(Tensor self) -> Tensor
   self: grad * self.cosh().conj()
 
diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 9c8800786fed..82d8949b6259 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -70,7 +70,7 @@
     'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose',
     'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu',
     'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_',
-    'ne_', 'add', '__radd__', 'sum', '_conj', 'sin', 'cos', 'mul', 'sinc', 'sinh',
+    'ne_', 'add', '__radd__', 'sum', '_conj', 'sin', 'cos', 'mul', 'sinh',
     'cosh', '__rmul__', 'sgn', 'asin', 'acos', 'sub', 'div', 'cat', 'view_as_complex',
     'neg', 'complex', 'select', '_s_where', 'as_strided', 'slice', 'constant_pad_nd',
     'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward',
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index e9a3731cac12..79ce982da5e9 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -3340,20 +3340,6 @@ def callable(a, b) -> number
 In-place version of :meth:`~Tensor.sin`
 """)
 
-add_docstr_all('sinc',
-               r"""
-sinc() -> Tensor
-
-See :func:`torch.sinc`
-""")
-
-add_docstr_all('sinc_',
-               r"""
-sinc_() -> Tensor
-
-In-place version of :meth:`~Tensor.sinc`
-""")
-
 add_docstr_all('sinh',
                r"""
 sinh() -> Tensor
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 02286f53fde2..eb9a8ef81269 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -7453,34 +7453,6 @@ def merge_dicts(*dicts):
     tensor([-0.5194, 0.1343, -0.4032, -0.2711])
 """.format(**common_args))
 
-add_docstr(torch.sinc,
-           r"""
-sinc(input, *, out=None) -> Tensor
-
-Computes the normalized sinc of :attr:`input.`
-
-.. math::
-   \text{out}_{i} =
-   \begin{cases}
-     1, & \text{if}\ \text{out}_{i}=0 \\
-     \sin(\pi \text{input}_{i}) / (\pi \text{input}_{i}), & \text{otherwise}
-   \end{cases}
-""" + r"""
-Args:
-    {input}
-
-Keyword args:
-    {out}
-
-Example::
-
-    >>> a = torch.randn(4)
-    >>> a
-    tensor([ 0.2252, -0.2948, 1.0267, -1.1566])
-    >>> torch.sinc(a)
-    tensor([ 0.9186, 0.8631, -0.0259, -0.1300])
-""".format(**common_args))
-
 add_docstr(torch.sinh,
            r"""
 sinh(input, *, out=None) -> Tensor
diff --git a/torch/overrides.py b/torch/overrides.py
index 79016c9a0e9f..2af6e36ea914 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -771,7 +771,6 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.signbit: lambda input, out=None: -1,
         torch.sgn: lambda input, out=None: -1,
         torch.sin: lambda input, out=None: -1,
-        torch.sinc: lambda input, out=None: -1,
         torch.sinh: lambda input, out=None: -1,
         torch.slogdet: lambda input: -1,
         torch.smm: lambda input, mat2: -1,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 3b1153c6607b..990502cdd9f7 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -283,15 +283,6 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad):
                                low=None, high=None,
                                requires_grad=False))),)
 
-def np_sinc_with_fp16_as_fp32(x):
-    # Wraps numpy's sinc function so that fp16 values are promoted to fp32
-    # before sinc is invoked. Context: numpy's sinc returns NaN when evaluated
-    # at 0 for fp16.
-    if x.dtype == np.float16:
-        return np.sinc(x.astype(np.float32))
-    else:
-        return np.sinc(x)
-
 def np_unary_ufunc_integer_promotion_wrapper(fn):
     # Wrapper that passes PyTorch's default scalar
     # type as an argument to the wrapped NumPy
@@ -799,26 +790,6 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                        SkipInfo('TestUnaryUfuncs', 'test_reference_numerics',
                                 dtypes=[torch.float], active_if=TEST_WITH_ROCM),
                    )),
-    UnaryUfuncInfo('sinc',
-                   ref=np_sinc_with_fp16_as_fp32,
-                   dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
-                   dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16),
-                   dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half),
-                   skip_bfloat16_grad=True,
-                   handles_large_floats=False,
-                   handles_complex_extremals=False,
-                   promotes_integers_to_float=True,
-                   decorators=(precisionOverride({torch.bfloat16: 1e-2,
-                                                  torch.float16: 1e-2}),),
-                   skips=(
-                       # Reference: https://github.com/pytorch/pytorch/issues/49133
-                       SkipInfo('TestUnaryUfuncs', 'test_reference_numerics',
-                                dtypes=[torch.cfloat]),
-                       SkipInfo('TestUnaryUfuncs', 'test_reference_numerics',
-                                dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS),
-                       SkipInfo('TestUnaryUfuncs', 'test_reference_numerics',
-                                dtypes=[torch.float], active_if=TEST_WITH_ROCM),
-                   )),
     UnaryUfuncInfo('sinh',
                    ref=np_unary_ufunc_integer_promotion_wrapper(np.sinh),
                    dtypesIfCPU=all_types_and_complex_and(torch.bool),

From 4431731c682ee98cf83e7c409a51d05b953a72a3 Mon Sep 17 00:00:00 2001
From: Sebastian Messmer
Date: Wed, 16 Dec 2020 13:58:48 -0800
Subject: [PATCH 34/34] Making ops c10-full: Storage arguments (#49146)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49146

Add support for Storage arguments to IValue and the JIT typing system, and make ops that were blocked on that c10-full.

ghstack-source-id: 118710665

(Note: this ignores all push blocking failures!)
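
For orientation, here is a minimal usage sketch (illustration only, not part of this diff) of the Storage support added below. It assumes the `IValue(at::Storage)` constructor and the `isStorage()` / `toStorage()` accessors introduced in ivalue.h behave as described; the pointer comparison mirrors the `Tag::Storage` branch added to `IValue::equals`:
```
// Sketch: round-trip a Tensor's Storage through an IValue.
// Assumes the IValue(at::Storage), isStorage(), and toStorage() APIs
// introduced in this patch; not compiled as part of the diff.
#include <ATen/ATen.h>
#include <ATen/core/ivalue.h>

void storage_ivalue_roundtrip() {
  at::Tensor t = at::ones({2, 3});
  c10::IValue iv(t.storage());      // wraps the tensor's Storage, tagged Tag::Storage
  TORCH_CHECK(iv.isStorage());
  c10::Storage s = iv.toStorage();  // unwraps it again
  // Same underlying StorageImpl, which is also how equality is defined below.
  TORCH_CHECK(s.unsafeGetStorageImpl() == t.storage().unsafeGetStorageImpl());
}
```
With the schema parser change below, a `Storage` argument in native_functions.yaml now maps to the new `StorageType` instead of `IntType`, which is what lets the two `set_.source_Storage*` overloads become `use_c10_dispatcher: full`.
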
Test Plan: waitforsandcastle

Reviewed By: ezyang

Differential Revision: D25456799

fbshipit-source-id: da14f125af352de5fcf05a83a69ad5a69d5a3b45
---
 aten/src/ATen/core/ivalue.cpp                |  8 +++++
 aten/src/ATen/core/ivalue.h                  | 15 ++++++++++
 aten/src/ATen/core/ivalue_inl.h              | 10 +++++++
 aten/src/ATen/core/jit_type.h                | 30 +++++++++++++++++++
 aten/src/ATen/core/type.cpp                  |  4 +++
 aten/src/ATen/native/native_functions.yaml   |  2 ++
 .../include/nomnigraph/Graph/Graph.h         | 10 +++----
 .../check_backward_compatibility.py          |  1 +
 .../csrc/jit/frontend/schema_type_parser.cpp |  3 +-
 torch/csrc/jit/python/pybind_utils.h         |  1 +
 torch/csrc/jit/serialization/unpickler.cpp   |  1 +
 11 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp
index 60382e37b6ff..51b6d0828816 100644
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@@ -76,6 +76,8 @@ TypePtr IValue::type() const {
       return NoneType::get();
     case Tag::Tensor:
       return TensorType::create(toTensor());
+    case Tag::Storage:
+      return StorageType::get();
     case Tag::Double:
       return FloatType::get();
     case Tag::Int:
@@ -260,6 +262,8 @@ IValue IValue::equals(const IValue& rhs) const {
         return false;
       }
       return lhs.toTensor().eq(rhs.toTensor());
+    case Tag::Storage:
+      return rhs.isStorage() && lhs.toStorage().unsafeGetStorageImpl() == rhs.toStorage().unsafeGetStorageImpl();
     case Tag::Double:
       return rhs.isDouble() && lhs.toDouble() == rhs.toDouble();
     case Tag::Int:
@@ -310,6 +314,8 @@ size_t IValue::hash(const IValue& v) {
       // Tensor __hash__ is equivalent to `id()`, so take the pointer value of
       // the tensor to emulate it
       return c10::get_hash(v.payload.as_int);
+    case Tag::Storage:
+      return c10::get_hash(v.payload.as_int);
     case Tag::Int:
       return c10::get_hash(v.payload.as_int);
     case Tag::String:
@@ -647,6 +653,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
       return out << v.toNone();
     case IValue::Tag::Tensor:
       return out << v.toTensor();
+    case IValue::Tag::Storage:
+      return out << v.toStorage().unsafeGetStorageImpl();
     case IValue::Tag::Double: {
       double d = v.toDouble();
       int c = std::fpclassify(d);
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index 5ab5a9c0a501..152c28d23afb 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -105,6 +105,7 @@ struct Capsule {
 #define TORCH_FORALL_TAGS(_) \
   _(None)                    \
   _(Tensor)                  \
+  _(Storage)                 \
   _(Double)                  \
   _(Int)                     \
   _(Bool)                    \
@@ -314,6 +315,20 @@ struct CAFFE2_API IValue final {
     return static_cast(payload.as_intrusive_ptr);
   }
 
+  IValue(at::Storage s) : tag(Tag::Storage), is_intrusive_ptr(static_cast<bool>(s)) {
+    // Note: the undefined tensor is not refcounted, so while it
+    // is tagged as a tensor, is_intrusive_ptr is set to false.
+    // This is not an optional optimization: our incref call
+    // *will not* do the right thing when called on an
+    // undefined tensor.
+    payload.as_intrusive_ptr = s.unsafeReleaseStorageImpl();
+  }
+  bool isStorage() const {
+    return Tag::Storage == tag;
+  }
+  c10::Storage toStorage() &&;
+  c10::Storage toStorage() const&;
+
   const IValue& toIValue() const {
     return *this;
   }
diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h
index 46bde6103043..a1e0491da6f6 100644
--- a/aten/src/ATen/core/ivalue_inl.h
+++ b/aten/src/ATen/core/ivalue_inl.h
@@ -137,6 +137,15 @@ inline at::Tensor IValue::toTensor() const& {
   AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind());
   return at::Tensor(toIntrusivePtr());
 }
+inline c10::Storage IValue::toStorage() && {
+  AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind());
+  return c10::Storage(
+      moveToIntrusivePtr());
+}
+inline c10::Storage IValue::toStorage() const& {
+  AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind());
+  return c10::Storage(toIntrusivePtr());
+}
 inline c10::Stream IValue::toStream() && {
   return c10::Stream::unpack(payload.as_int);
 }
@@ -743,6 +752,7 @@ inline const ivalue::Object& IValue::toObjectRef() const {
     return this->method_name(); \
   }
 DEFINE_TO(at::Tensor, toTensor)
+DEFINE_TO(at::Storage, toStorage)
 DEFINE_TO(c10::Stream, toStream)
 DEFINE_TO(float, toDouble)
 DEFINE_TO(double, toDouble)
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index 993e10aa1b42..f0c93ca35256 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -31,6 +31,7 @@ using OptNameList = c10::optional<std::vector<std::string>>;
   _(EnumType)               \
   _(AnyEnumType)            \
   _(TensorType)             \
+  _(StorageType)            \
   _(TupleType)              \
   _(ListType)               \
   _(DictType)               \
@@ -1407,6 +1408,29 @@ struct CAFFE2_API StringType : public Type {
   StringType() : Type(TypeKind::StringType) {}
 };
 
+struct StorageType;
+using StorageTypePtr = std::shared_ptr<StorageType>;
+struct CAFFE2_API StorageType : public Type {
+  static StorageTypePtr create() {
+    return StorageTypePtr(new StorageType()); // NOLINT(modernize-make-shared)
+  }
+  bool operator==(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return annotation_str();
+  }
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    return "Storage";
+  }
+  static const TypeKind Kind = TypeKind::StorageType;
+  // global singleton
+  static StorageTypePtr get();
+
+ private:
+  StorageType() : Type(TypeKind::StorageType) {}
+};
+
 struct FunctionType;
 using FunctionTypePtr = std::shared_ptr<FunctionType>;
 struct CAFFE2_API FunctionType : public NamedType {
@@ -1757,6 +1781,12 @@ struct getTypePtr_ final {
   }
 };
 template <>
+struct getTypePtr_<at::Storage> final {
+  static TypePtr call() {
+    return StorageType::get();
+  }
+};
+template <>
 struct getTypePtr_<c10::Stream> final {
   static TypePtr call() {
     return StreamObjType::get();
diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp
index 429007e4242b..d84dc5e63a36 100644
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@@ -134,6 +134,10 @@ BoolTypePtr BoolType::get() {
   static auto value = BoolType::create();
   return value;
 }
+StorageTypePtr StorageType::get() {
+  static auto value = StorageType::create();
+  return value;
+}
 NoneTypePtr NoneType::get() {
   static auto value = NoneType::create();
   return value;
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index c25d28b27518..cddabf292a06 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -5423,12 +5423,14 @@
 # wrappers for legacy TH methods
 
 - func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)
+  use_c10_dispatcher: full
   variants: method
   device_guard: False
   dispatch:
     CPU, CUDA: set_
 
 - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!)
+  use_c10_dispatcher: full
   variants: method
   device_guard: False
   dispatch:
diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h
index 5033417d8d78..eb6f1d7c4d95 100644
--- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h
+++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h
@@ -40,11 +40,11 @@ class Node;
 
 // \brief Edge within a Graph.
 template <typename T, typename... U>
-class Edge : public StorageType<U...> {
+class Edge : public ::StorageType<U...> {
  public:
   using NodeRef = typename Graph<T, U...>::NodeRef;
   Edge(NodeRef tail, NodeRef head, U... args)
-      : StorageType<U...>(std::forward<U>(args)...),
+      : ::StorageType<U...>(std::forward<U>(args)...),
         tail_(tail),
         head_(head) {
     DEBUG_PRINT("Creating instance of Edge: %p\n", this);
@@ -74,17 +74,17 @@ class Edge : public StorageType<U...> {
 
 // \brief Node within a Graph.
 template <typename T, typename... U>
-class Node : public StorageType<T>, public Notifier<Node<T, U...>> {
+class Node : public ::StorageType<T>, public Notifier<Node<T, U...>> {
  public:
   using NodeRef = typename Graph<T, U...>::NodeRef;
   using EdgeRef = typename Graph<T, U...>::EdgeRef;
 
   /// \brief Create a node with data.
-  explicit Node(T&& data) : StorageType<T>(std::move(data)) {
+  explicit Node(T&& data) : ::StorageType<T>(std::move(data)) {
     DEBUG_PRINT("Creating instance of Node: %p\n", this);
   }
   /// \brief Create an empty node.
-  explicit Node() : StorageType<T>() {}
+  explicit Node() : ::StorageType<T>() {}
   Node(Node&&) = default;
   Node(const Node&) = delete;
   Node& operator=(const Node&) = delete;
diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py
index b259d3532851..f886bf9616c2 100644
--- a/test/backward_compatibility/check_backward_compatibility.py
+++ b/test/backward_compatibility/check_backward_compatibility.py
@@ -198,6 +198,7 @@
     ("aten::thnn_conv2d_backward", datetime.date(2021, 1, 31)),
     ("aten::slow_conv_transpose3d_backward", datetime.date(2021, 1, 31)),
     ("aten::slow_conv_transpose2d_backward", datetime.date(2021, 1, 31)),
+    ("aten::set_", datetime.date(2021, 1, 31)),
 ]
 
 def allow_listed(schema, allow_list):
diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp
index 6d2f5162a2d4..fb68cb3bf84e 100644
--- a/torch/csrc/jit/frontend/schema_type_parser.cpp
+++ b/torch/csrc/jit/frontend/schema_type_parser.cpp
@@ -28,6 +28,7 @@ using c10::StreamObjType;
 using c10::StringType;
 using c10::Symbol;
 using c10::TensorType;
+using c10::StorageType;
 using c10::TupleType;
 using c10::VarType;
 
@@ -41,7 +42,7 @@ TypePtr SchemaTypeParser::parseBaseType() {
       {"ScalarType", IntType::get()},
       {"Layout", IntType::get()},
       {"MemoryFormat", IntType::get()},
-      {"Storage", IntType::get()},
+      {"Storage", StorageType::get()},
       {"QScheme", QSchemeType::get()},
       {"Quantizer", QuantizerType::get()},
       {"ConstQuantizerPtr",
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h
index 34ca7585be67..95b47d142122 100644
--- a/torch/csrc/jit/python/pybind_utils.h
+++ b/torch/csrc/jit/python/pybind_utils.h
@@ -809,6 +809,7 @@ inline IValue toIValue(
       return toTypeInferredIValue(obj);
     case TypeKind::FunctionType:
     case TypeKind::GeneratorType:
+    case TypeKind::StorageType:
    case TypeKind::QuantizerType:
    case TypeKind::VarType:
    case TypeKind::QSchemeType:
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index 1979b4417f10..15c4a89c2f1e 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -54,6 +54,7 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) {
   }
   switch (w.static_type->kind()) {
     case TensorType::Kind:
+    case StorageType::Kind:
    case NumberType::Kind:
    case FloatType::Kind:
    case IntType::Kind: