From 0b5303e833ed7e46e5aad4e1770f9bf4ef7d6c6b Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Wed, 27 Jan 2021 08:58:21 -0800 Subject: [PATCH 01/41] Propagate CreationMeta when chaining views (#51061) Summary: Fixes https://github.com/pytorch/pytorch/issues/49824 ## Background When creating a view of a view, there was a possibility that the new view would be less restrictive than the previous view, incorrectly sidestepping the error that should be thrown when using in-place operations on the new view. The fix addresses this by propagating `CreationMeta` from the previous view to the new view. Currently, the old view's `creation_meta` is only propagated when the new view's `creation_meta == CreationMeta::DEFAULT`. This ensures that the new view is not less restrictive than the previous view wrt. allowing in-place operations. Pull Request resolved: https://github.com/pytorch/pytorch/pull/51061 Test Plan: ``` python test/test_autograd.py TestAutogradDeviceTypeCPU.test_inplace_view_of_multiple_output_view_cpu python test/test_autograd.py TestAutogradDeviceTypeCUDA.test_inplace_view_of_multiple_output_view_cuda python test/test_autograd.py TestAutogradDeviceTypeCPU.test_inplace_multiple_output_view_of_view_cpu python test/test_autograd.py TestAutogradDeviceTypeCUDA.test_inplace_multiple_output_view_of_view_cuda ``` Reviewed By: heitorschueroff Differential Revision: D26076434 Pulled By: jbschlosser fbshipit-source-id: c47f0ddcef9b8449427b671aff9ad08edca70fcd --- test/test_autograd.py | 14 ++++++++++++++ torch/csrc/autograd/VariableTypeUtils.h | 10 ++++++++++ torch/csrc/autograd/variable.h | 9 +++++++++ 3 files changed, 33 insertions(+) diff --git a/test/test_autograd.py b/test/test_autograd.py index 28ef8d41b346..df588f97701a 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -7377,6 +7377,20 @@ def test_inplace_view_multiple_outputs(self, device): with self.assertRaises(RuntimeError): v1[0].mul_(2) + def test_inplace_view_of_multiple_output_view(self, device): + a = torch.rand(10, device=device, requires_grad=True).clone() + b = a.unbind(0) + c = b[0].view_as(b[0]) + with self.assertRaises(RuntimeError): + c.mul_(2) + + def test_inplace_multiple_output_view_of_view(self, device): + a = torch.rand(10, device=device, requires_grad=True).clone() + b = a.view_as(a) + c = b.unbind(0) + with self.assertRaises(RuntimeError): + c[0].mul_(2) + def test_inplace_view_makes_base_require_grad(self, device): # in-place modification to view makes base require grad a = torch.randn(4, 4, device=device, requires_grad=False) diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index 2894a75fed69..85b83f2aa6ee 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -145,6 +145,7 @@ inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_dif if (base.is_view()) { auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base)); const auto& base_bw_info = diff_view_meta->get_backward_view(); + creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta); return make_variable_differentiable_view(tensor, base_bw_info.chain(base, tensor, view_func), c10::nullopt, creation_meta, allow_tensor_metadata_change); } else { @@ -188,6 +189,10 @@ inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_dif } if (is_fw_differentiable || is_bw_differentiable) { + if (base.is_view()) { + auto diff_view_meta = 
static_cast(torch::autograd::impl::get_autograd_meta(base)); + creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta); + } return make_variable_differentiable_view(tensor, std::move(new_bw_info), std::move(new_fw_info), creation_meta, allow_tensor_metadata_change); } else { @@ -234,6 +239,11 @@ inline std::vector as_view(const Tensor & base, std::vector& ten } } + if ((is_fw_differentiable || is_bw_differentiable) && base.is_view()) { + auto diff_view_meta = static_cast(torch::autograd::impl::get_autograd_meta(base)); + creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta); + } + for(Tensor &tensor : tensors) { if (is_fw_differentiable || is_bw_differentiable) { tensor = make_variable_differentiable_view(tensor, new_bw_info, new_fw_info, creation_meta); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 9cdf40fe2c63..83bbc9406081 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -502,6 +502,15 @@ struct TORCH_API ViewInfo { enum class CreationMeta: uint8_t { DEFAULT, IN_CUSTOM_FUNCTION, MULTI_OUTPUT_NODE, NO_GRAD_MODE, MULTI_OUTPUT_SAFE }; +/// Handles correctly propagating CreationMeta when a new view is created from a previous view. +/// In general, we don't want the new view to be _less_ restrictive than the previous view +/// (it's okay to be _more_ restrictive). A CreationMeta value of DEFAULT is currently the least +/// restrictive, as the behavior for all other CreationMeta values is to error out for in-place ops. +/// If this changes, the logic here will need to be updated to properly handle the new semantics. +inline CreationMeta propagate_creation_meta(CreationMeta prev_view_creation_meta, CreationMeta new_view_creation_meta) { + return (new_view_creation_meta == CreationMeta::DEFAULT) ? prev_view_creation_meta : new_view_creation_meta; +} + /// Unified function to handle error checking when rebase happens /// indirect=true means that the caller is not doing the inplace, but the inplace happened /// somewhere else. 
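To make the new behavior concrete, here is a minimal Python sketch that mirrors the `test_inplace_view_of_multiple_output_view` test added above; it is illustrative only and not part of the diff:

```python
import torch

# With CreationMeta propagation, a view chained on top of a multi-output view
# inherits the stricter creation metadata, so the in-place op is rejected
# instead of silently sidestepping the check.
a = torch.rand(10, requires_grad=True).clone()
b = a.unbind(0)            # multi-output view (MULTI_OUTPUT_NODE)
c = b[0].view_as(b[0])     # chained view; previously fell back to DEFAULT
try:
    c.mul_(2)
except RuntimeError as err:
    print("in-place on the chained view is rejected:", err)
```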
From 4a2aa0f5f1572d8a9caca9496669f8f52cfa1522 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Wed, 27 Jan 2021 09:09:09 -0800 Subject: [PATCH 02/41] index_put_ for complex tensors on CUDA (#51148) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51148 Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D26102025 Pulled By: anjali411 fbshipit-source-id: b1b6fd12fda03c4520a3c3200226edf352496188 --- aten/src/ATen/native/cuda/Indexing.cu | 2 +- test/test_indexing.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 035dc188c81c..6b3304cff421 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -230,7 +230,7 @@ void index_put_accum_kernel(Tensor & self, const c10::List std::min(std::max(1,nElemBefore), at::cuda::getCurrentDeviceProperties()->maxGridSize[2])); dim3 block(C10_WARP_SIZE, indices_per_block); - AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, value_.scalar_type(), "indexing_backward", [&] { indexing_backward_kernel<<>>( sorted_indices.data_ptr(), diff --git a/test/test_indexing.py b/test/test_indexing.py index b92fd94e8cbd..10e4a9bafe95 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -762,9 +762,9 @@ def test_int_indices(self, device): self.assertEqual(v[:, [0, 4, 2]].shape, (5, 3, 3)) self.assertEqual(v[:, [[0, 1], [4, 3]]].shape, (5, 2, 2, 3)) - @dtypes(torch.float, torch.bfloat16, torch.long, torch.bool) - @dtypesIfCPU(torch.float, torch.long, torch.bool, torch.bfloat16) - @dtypesIfCUDA(torch.half, torch.long, torch.bool, torch.bfloat16) + @dtypes(torch.cfloat, torch.cdouble, torch.float, torch.bfloat16, torch.long, torch.bool) + @dtypesIfCPU(torch.cfloat, torch.cdouble, torch.float, torch.long, torch.bool, torch.bfloat16) + @dtypesIfCUDA(torch.cfloat, torch.cdouble, torch.half, torch.long, torch.bool, torch.bfloat16) def test_index_put_src_datatype(self, device, dtype): src = torch.ones(3, 2, 4, device=device, dtype=dtype) vals = torch.ones(3, 2, 4, device=device, dtype=dtype) From 16dd5ca8abd926bed2f49365171821c33a749799 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Wed, 27 Jan 2021 10:29:41 -0800 Subject: [PATCH 03/41] Followup of kron PR (#51045) Summary: Followup of https://github.com/pytorch/pytorch/pull/50927 Pull Request resolved: https://github.com/pytorch/pytorch/pull/51045 Reviewed By: mruberry Differential Revision: D26089204 Pulled By: ngimel fbshipit-source-id: 77291dd83fba32d6f80a8540910b112a1d85a892 --- aten/src/ATen/native/LinearAlgebra.cpp | 22 ++++++++++++++-------- test/test_linalg.py | 12 ++++++++++++ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index a4b6e4df43f8..47ff70b93231 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -2223,17 +2223,17 @@ Tensor chain_matmul(TensorList matrices) { Calculates the Kronecker product between two Tensors. 
*/ Tensor& kron_out(Tensor& result, const Tensor& self, const Tensor& other) { - auto maxdim = std::max(self.dim(), other.dim()); - auto pad_self = maxdim - self.dim(); - auto pad_other = maxdim - other.dim(); + int64_t maxdim = std::max(self.dim(), other.dim()); + int64_t pad_self = maxdim - self.dim(); + int64_t pad_other = maxdim - other.dim(); c10::SmallVector a_reshape(2 * maxdim); c10::SmallVector b_reshape(2 * maxdim); c10::SmallVector result_reshape(maxdim); - for (int i = 0; i < maxdim; i++) { - a_reshape[2 * i] = i >= pad_self ? self.sizes()[i - pad_self] : 1; + for (int64_t i = 0; i < maxdim; i++) { + a_reshape[2 * i] = (i >= pad_self ? self.sizes()[i - pad_self] : 1); a_reshape[2 * i + 1] = 1; b_reshape[2 * i] = 1; - b_reshape[2 * i + 1] = i >= pad_other ? other.sizes()[i - pad_other] : 1; + b_reshape[2 * i + 1] = (i >= pad_other ? other.sizes()[i - pad_other] : 1); result_reshape[i] = a_reshape[2 * i] * b_reshape[2 * i + 1]; } auto self_view = at::_unsafe_view(self, a_reshape); @@ -2241,8 +2241,14 @@ Tensor& kron_out(Tensor& result, const Tensor& self, const Tensor& other) { if (!result.defined()) { result = at::_unsafe_view(at::mul(self_view, other_view), result_reshape); } else { - at::mul_out(result, self_view, other_view); - result.resize_(result_reshape); + c10::SmallVector mul_shape(2 * maxdim); + for (int64_t i = 0; i < maxdim; i++) { + mul_shape[2 * i] = a_reshape[2 * i]; + mul_shape[2 * i + 1] = b_reshape[2 * i + 1]; + } + resize_output(result, result_reshape); + auto result_mul = at::_unsafe_view(result, mul_shape); + at::mul_out(result_mul, self_view, other_view); } return result; } diff --git a/test/test_linalg.py b/test/test_linalg.py index f2ee0dcaaef9..fd70ebaad04f 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -828,6 +828,18 @@ def run_test_skipped_elements(a_shape, b_shape): # run_test_transposed(a_shape, b_shape) run_test_skipped_elements(a_shape, b_shape) + # Test that kron perserve memory format + a = torch.randn(1, 2, 3, 4, dtype=dtype, device=device).contiguous(memory_format=torch.channels_last) + b = torch.randn(1, 2, 3, 4, dtype=dtype, device=device).contiguous(memory_format=torch.channels_last) + c = torch.kron(a, b) + self.assertTrue(c.is_contiguous(memory_format=torch.channels_last)) + torch.kron(a, b, out=c) + self.assertTrue(c.is_contiguous(memory_format=torch.channels_last)) + c = c.contiguous(memory_format=torch.contiguous_format) + torch.kron(a, b, out=c) + self.assertTrue(c.is_contiguous(memory_format=torch.contiguous_format)) + + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_kron_empty(self, device, dtype): From e9ffad088f720b70dc6836cc8672b812560ededa Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 27 Jan 2021 10:32:35 -0800 Subject: [PATCH 04/41] numeric suite: add types to eager (#51168) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51168 Adds types to function I/O for numeric suite. This is for readability and static type checking with mypy. 
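As a usage sketch of the module these annotations cover: the snippet below builds a tiny float model, quantizes it dynamically, and calls the now-typed `compare_weights` entry point. The model and quantization flow are illustrative assumptions, not part of this diff:

```python
import torch
import torch.nn as nn
import torch.quantization as tq
import torch.quantization._numeric_suite as ns

# Illustrative float model and a dynamically quantized copy of it.
float_model = nn.Sequential(nn.Linear(4, 4), nn.ReLU()).eval()
qmodel = tq.quantize_dynamic(float_model, {nn.Linear}, dtype=torch.qint8)

# compare_weights pairs each float weight with its quantized counterpart,
# keyed by module name, with 'float' and 'quantized' entries per key.
wt_compare_dict = ns.compare_weights(float_model.state_dict(), qmodel.state_dict())
for name, entry in wt_compare_dict.items():
    print(name, entry["float"].dtype, entry["quantized"].dtype)
```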
Test Plan: ``` mypy torch/quantization/ ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D26092454 fbshipit-source-id: d37cf61e4d9604f4bc550b392f55fb59165f7624 --- torch/quantization/_numeric_suite.py | 68 +++++++++++++++++----------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/torch/quantization/_numeric_suite.py b/torch/quantization/_numeric_suite.py index 100ff54d4436..de0b4083b390 100644 --- a/torch/quantization/_numeric_suite.py +++ b/torch/quantization/_numeric_suite.py @@ -1,10 +1,9 @@ - import torch import torch.nn as nn import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd from torch.quantization import prepare -from typing import Dict +from typing import Dict, List, Optional, Any, Union, Callable, Set from .quantization_mappings import ( get_default_compare_output_module_list, @@ -18,7 +17,10 @@ } -def _find_match(str_list, key_str, postfix): +def _find_match( + str_list: Union[Dict[str, Any], List[str]], key_str: str, + postfix: str, +) -> Optional[str]: split_str = key_str.split(".") if split_str[-1] == postfix: match_string = "".join(key_str.split(".")[0:-1]) @@ -42,11 +44,14 @@ def _find_match(str_list, key_str, postfix): return s2 if match_string == pattern2: return s2 + return None else: return None -def compare_weights(float_dict, quantized_dict): +def compare_weights( + float_dict: Dict[str, Any], quantized_dict: Dict[str, Any] +) -> Dict[str, Dict[str, torch.Tensor]]: r"""Compare the weights of the float module with its corresponding quantized module. Return a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the float and @@ -105,7 +110,10 @@ def compare_weights(float_dict, quantized_dict): return weight_dict -def _get_logger_dict_helper(mod, target_dict, prefix=""): +def _get_logger_dict_helper( + mod: nn.Module, target_dict: Dict[str, Any], + prefix: str = "", +) -> None: r"""This is the helper function for get_logger_dict Args: @@ -127,7 +135,7 @@ def get_prefix(prefix): _get_logger_dict_helper(child, target_dict, module_prefix) -def get_logger_dict(mod, prefix=""): +def get_logger_dict(mod: nn.Module, prefix: str = "") -> Dict[str, Dict]: r"""Traverse the modules and save all logger stats into target dict. This is mainly used for quantization accuracy debug. 
@@ -195,11 +203,11 @@ def forward(self, x): return x -def _convert_tuple_to_list(t): +def _convert_tuple_to_list(t: Any) -> Any: return list(_convert_tuple_to_list(x) for x in t) if type(t) is tuple else t -def _dequantize_tensor_list(t): +def _dequantize_tensor_list(t: Any) -> Any: return ( list(_dequantize_tensor_list(x) for x in t) if type(t) is list @@ -228,7 +236,7 @@ def __init__(self, q_module, float_module, Logger): self.dequant = nnq.DeQuantize() self.logger = Logger() - def forward(self, *x): + def forward(self, *x) -> torch.Tensor: xl = _convert_tuple_to_list(x) output = self.orig_module(*xl) xl_float = _dequantize_tensor_list(xl) @@ -236,7 +244,7 @@ def forward(self, *x): self.logger(output, shadow_output) return output - def add(self, x, y): + def add(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: output = self.orig_module.add(x, y) x = x.dequantize() y = y.dequantize() @@ -244,14 +252,14 @@ def add(self, x, y): self.logger(output, shadow_output) return output - def add_scalar(self, x, y): + def add_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor: output = self.orig_module.add_scalar(x, y) x = x.dequantize() shadow_output = self.shadow_module.add_scalar(x, y) self.logger(output, shadow_output) return output - def mul(self, x, y): + def mul(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: output = self.orig_module.mul(x, y) x = x.dequantize() y = y.dequantize() @@ -259,21 +267,21 @@ def mul(self, x, y): self.logger(output, shadow_output) return output - def mul_scalar(self, x, y): + def mul_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor: output = self.orig_module.mul_scalar(x, y) x = x.dequantize() shadow_output = self.shadow_module.mul_scalar(x, y) self.logger(output, shadow_output) return output - def cat(self, x, dim=0): + def cat(self, x: List[torch.Tensor], dim: int = 0) -> torch.Tensor: output = self.orig_module.cat(x, dim) x = [y.dequantize() for y in x] shadow_output = self.shadow_module.cat(x, dim) self.logger(output, shadow_output) return output - def add_relu(self, x, y): + def add_relu(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: output = self.orig_module.add_relu(x, y) x = x.dequantize() y = y.dequantize() @@ -282,7 +290,10 @@ def add_relu(self, x, y): return output -def prepare_model_with_stubs(float_module, q_module, module_swap_list, Logger): +def prepare_model_with_stubs( + float_module: nn.Module, q_module: nn.Module, + module_swap_list: Set[type], Logger: Callable, +) -> None: r"""Prepare the model by attaching the float module to its matching quantized module as the shadow if the float module type is in module_swap_list. @@ -322,8 +333,9 @@ def prepare_model_with_stubs(float_module, q_module, module_swap_list, Logger): def compare_model_stub( - float_model, q_model, module_swap_list, *data, Logger=ShadowLogger -): + float_model: nn.Module, q_model: nn.Module, module_swap_list: Set[type], + *data, Logger=ShadowLogger +) -> Dict[str, Dict]: r"""Compare quantized module in a model with its floating point counterpart, feeding both of them the same input. Return a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and @@ -361,7 +373,9 @@ def compare_model_stub( return ob_dict -def get_matching_activations(float_module, q_module): +def get_matching_activations( + float_module: nn.Module, q_module: nn.Module, +) -> Dict[str, Dict[str, torch.Tensor]]: r"""Find the matching activation between float and quantized modules. 
Args: @@ -387,11 +401,11 @@ def get_matching_activations(float_module, q_module): def prepare_model_outputs( - float_module, - q_module, + float_module: nn.Module, + q_module: nn.Module, Logger=OutputLogger, allow_list=None -): +) -> None: r"""Prepare the model by attaching the logger to both float module and quantized module if they are in the allow_list. @@ -406,9 +420,9 @@ def prepare_model_outputs( allow_list = get_default_compare_output_module_list() qconfig_debug = torch.quantization.QConfig(activation=Logger, weight=None) - float_module.qconfig = qconfig_debug + float_module.qconfig = qconfig_debug # type: ignore prepare(float_module, inplace=True, allow_list=allow_list) - q_module.qconfig = qconfig_debug + q_module.qconfig = qconfig_debug # type: ignore prepare( q_module, inplace=True, @@ -418,12 +432,12 @@ def prepare_model_outputs( def compare_model_outputs( - float_model, - q_model, + float_model: nn.Module, + q_model: nn.Module, *data, Logger=OutputLogger, allow_list=None -): +) -> Dict[str, Dict[str, torch.Tensor]]: r"""Compare output activations between float and quantized models at corresponding locations for the same input. Return a dict with key corresponding to quantized module names and each entry being a dictionary with two keys From 9b6d463704d1dbac9948cde8d39e6790640a23eb Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 27 Jan 2021 10:34:56 -0800 Subject: [PATCH 05/41] Move std and var tests to OpInfos (#50901) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50901 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D26083289 Pulled By: mruberry fbshipit-source-id: 7e14ff37bba46dd456e0bc0aa9c4e0a632d0734c --- test/test_ops.py | 18 +++---- test/test_torch.py | 6 --- .../_internal/common_methods_invocations.py | 49 +++++++++++++++---- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index bd82aca3820a..40627cd4b264 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -2,7 +2,6 @@ import torch -from torch.testing import floating_and_complex_types_and from torch.testing._internal.common_utils import \ (TestCase, run_tests, IS_SANDCASTLE, clone_input_helper) from torch.testing._internal.common_methods_invocations import \ @@ -191,7 +190,8 @@ def check_variant_backward(self, input, forward_result, expected_grad, expected_ # against eager's gold standard op function variant @ops(op_db) def test_variant_consistency_eager(self, device, dtype, op): - samples = op.sample_inputs(device, dtype, requires_grad=True) + test_backward = op.test_complex_grad or not dtype.is_complex + samples = op.sample_inputs(device, dtype, requires_grad=test_backward) if len(samples) == 0: self.skipTest("Skipped! 
No sample inputs!") @@ -237,7 +237,7 @@ def test_variant_consistency_eager(self, device, dtype, op): self.assertEqual(variant_forward, expected_forward) # Compares variant's backward - if variant is not inplace or op.test_inplace_grad: + if test_backward and (variant is not inplace or op.test_inplace_grad): self.check_variant_backward(sample.input, variant_forward, expected_grad, exception_during_backwards) @@ -247,7 +247,10 @@ def test_variant_consistency_eager(self, device, dtype, op): # TODO WARNING: inplace x {traced, scripted} not currently tested @ops(op_db) def test_variant_consistency_jit(self, device, dtype, op): - samples = op.sample_inputs(device, dtype, requires_grad=True) + test_backward = ( + (dtype.is_complex and op.test_complex_grad) or + (dtype.is_floating_point and (not op.skip_bfloat16_grad or dtype != torch.bfloat16))) + samples = op.sample_inputs(device, dtype, requires_grad=test_backward) if len(samples) == 0: self.skipTest("Skipped! No sample inputs!") @@ -279,9 +282,6 @@ def fn(*inputs, **kwargs): output = func(*inputs, **kwargs) return op.output_func(output) - # bfloat16 grad doesn't work for some operators - dtypes_to_grad_check = floating_and_complex_types_and(torch.half) \ - if op.skip_bfloat16_grad else floating_and_complex_types_and(torch.half, torch.bfloat16) # Check scripted forward, grad, and grad grad script_fn = create_script_fn(self, name, func_type, op.output_func) @@ -291,7 +291,7 @@ def fn(*inputs, **kwargs): fn, (*sample.input,) + sample.args, sample.kwargs, - no_grad=(dtype not in dtypes_to_grad_check)) + no_grad=not test_backward) # Check traced forward, grad, and grad grad traced_fn = create_traced_fn(self, variant) @@ -300,7 +300,7 @@ def fn(*inputs, **kwargs): fn, (*sample.input,) + sample.args, sample.kwargs, - no_grad=(dtype not in dtypes_to_grad_check)) + no_grad=not test_backward) # Check alias annotation schema for correctness (make # sure inputs that aren't supposed to be modified aren't) diff --git a/test/test_torch.py b/test/test_torch.py index 4ace1012167d..55f5ee73c187 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6723,12 +6723,6 @@ def inner(self, device, dtype): ('remainder', 'negative_tensor', _small_3d, lambda t, d: [0 - _small_3d(t, d, has_zeros=False)], 1e-1, 1e-2, 1e-5, _signed_types), - ('std', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('std', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('std', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('var', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('var', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False), - ('var', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False), ('ndimension', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('nelement', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('numel', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 90aa1468bd4a..8be600914d97 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -845,6 +845,21 @@ def sample_inputs_linalg_solve(op_info, device, dtype, 
requires_grad=False): return out +def sample_inputs_std_var(op_info, device, dtype, requires_grad): + tensor_nd = make_tensor((S, S, S), device=device, dtype=dtype, + low=None, high=None, requires_grad=requires_grad) + tensor_1d = make_tensor((S,), device=device, dtype=dtype, + low=None, high=None, requires_grad=requires_grad) + + return [ + SampleInput(tensor_nd), + SampleInput(tensor_nd, kwargs=dict(dim=1)), + SampleInput(tensor_nd, kwargs=dict(dim=1, unbiased=True, keepdim=True)), + SampleInput(tensor_1d, kwargs=dict(dim=0, unbiased=True, keepdim=True)), + SampleInput(tensor_1d, kwargs=dict(dim=0, unbiased=False, keepdim=False)), + ] + + def _sample_inputs_svd(op_info, device, dtype, requires_grad=False, is_linalg_svd=False): """ This function generates input for torch.svd with distinct singular values so that autograd is always stable. @@ -1437,6 +1452,18 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): SkipInfo('TestCommon', 'test_variant_consistency_jit', device_type='cuda', dtypes=[torch.float16]), )), + OpInfo('std', + dtypes=floating_types_and(), + dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_std_var, + supports_tensor_out=False, + test_complex_grad=False, + test_inplace_grad=False, + # std has only partial support for complex and half (#51127) + skips=(SkipInfo('TestOpInfo', 'test_unsupported_dtypes', + dtypes=[torch.half, torch.complex64, torch.complex128]),), + assert_autodiffed=True, + ), UnaryUfuncInfo('tan', ref=np.tan, dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), @@ -1726,6 +1753,18 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): supports_tensor_out=False, test_inplace_grad=False, sample_inputs_func=sample_repeat_tile), + OpInfo('var', + dtypes=floating_types_and(), + dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_std_var, + supports_tensor_out=False, + test_complex_grad=False, + test_inplace_grad=False, + # var has only partial support for complex and half (#51127) + skips=(SkipInfo('TestOpInfo', 'test_unsupported_dtypes', + dtypes=[torch.half, torch.complex64, torch.complex128]),), + assert_autodiffed=True, + ), ] if TEST_SCIPY: @@ -2294,16 +2333,6 @@ def method_tests(): ('prod', (torch.tensor(0., requires_grad=True)), NO_ARGS, 'scalar_zero'), ('prod', (torch.tensor(0., requires_grad=True)), (0,), 'scalar_dim_zero', (), [0]), ('prod', (torch.tensor(0., requires_grad=True)), (0, True,), 'scalar_keepdim_dim_zero', (), [0]), - ('var', (S, S, S), NO_ARGS, '', (True,)), - ('var', (S, S, S), (1,), 'dim', (True,), [0]), - ('var', (S, S, S), (1, True, True), 'keepdim_dim', (True,), [0]), - ('var', (S,), (0,), 'dim_1d', (True,), [0]), - ('var', (S,), (0, True, True), 'keepdim_dim_1d', (True,), [0]), - ('std', (S, S, S), NO_ARGS, '', (True,)), - ('std', (S, S, S), (1,), 'dim', (True,), [0]), - ('std', (S, S, S), (1, True, True), 'keepdim_dim', (True,), [0]), - ('std', (S,), (0,), 'dim_1d', (True,), [0]), - ('std', (S,), (0, True, True), 'keepdim_dim_1d', (True,), [0]), ('var_mean', (S, S, S), NO_ARGS, ''), ('var_mean', (S, S, S), (1,), 'dim', [0]), ('var_mean', (S, S, S), (1, True, True), 'keepdim_dim', [0]), From 00adc7b07f5f89af1c9d3db36c656233566b92e7 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 27 Jan 2021 10:49:10 -0800 Subject: [PATCH 06/41] Fix more JIT tests under Python-3.9 (#51182) Summary: Mostly replace `global Foo` with `make_global(Foo)` The only real fix is 
generating Subscript annotation, which is a follow up from https://github.com/pytorch/pytorch/pull/48676 Fixes https://github.com/pytorch/pytorch/issues/49617 Pull Request resolved: https://github.com/pytorch/pytorch/pull/51182 Reviewed By: gmagogsfm Differential Revision: D26095244 Pulled By: malfet fbshipit-source-id: 0e043d9a2cf43fff71dfbb341f708cd7af87c39a --- test/jit/test_class_type.py | 44 +++++++++++++------------- test/jit/test_enum.py | 63 +++++++++++++++---------------------- test/jit/test_with.py | 14 ++++----- test/test_jit.py | 6 ++-- torch/_jit_internal.py | 4 ++- 5 files changed, 62 insertions(+), 69 deletions(-) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index 4d3d73e5f7c7..3a5881f365c1 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -11,7 +11,7 @@ # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) -from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.jit_utils import JitTestCase, make_global import torch.testing._internal.jit_utils from torch.testing._internal.common_utils import IS_SANDCASTLE from typing import List, Tuple, Iterable, Optional, Dict @@ -143,12 +143,12 @@ def __init__(self, x): self.attr = x def test_class_type_as_param(self): - global FooTest # see [local resolution in python] - class FooTest(object): # noqa: B903 def __init__(self, x): self.attr = x + make_global(FooTest) # see [local resolution in python] + @torch.jit.script def fn(foo: FooTest) -> torch.Tensor: return foo.attr @@ -279,13 +279,13 @@ def forward(self, a): self.assertEqual(2 * input, output) def test_python_interop(self): - global Foo # see [local resolution in python] - class Foo(object): # noqa: B903 def __init__(self, x, y): self.x = x self.y = y + make_global(Foo) # see [local resolution in python] + @torch.jit.script def use_foo(foo: Foo) -> Foo: return foo @@ -305,13 +305,13 @@ def use_foo(foo: Foo) -> Foo: self.assertEqual(y, f2.y) def test_class_specialization(self): - global Foo # see [local resolution in python] - class Foo(object): # noqa: B903 def __init__(self, x, y): self.x = x self.y = y + make_global(Foo) # see [local resolution in python] + def use_foo(foo: Foo, foo2: Foo, tup: Tuple[Foo, Foo]) -> torch.Tensor: a, b = tup return foo.x + foo2.y + a.x + b.y @@ -329,8 +329,6 @@ def use_foo(foo: Foo, foo2: Foo, tup: Tuple[Foo, Foo]) -> torch.Tensor: FileCheck().check_count("prim::GetAttr", 4).run(graphstr) def test_class_sorting(self): - global Foo # see [local resolution in python] - class Foo(object): # noqa: B903 def __init__(self, x: int) -> None: self.x = x @@ -342,6 +340,8 @@ def __lt__(self, other) -> bool: def getVal(self): return self.x + make_global(Foo) # see [local resolution in python] + def test(li: List[Foo], reverse: bool = False) -> Tuple[List[int], List[int]]: li_sorted = sorted(li) ret_sorted = torch.jit.annotate(List[int], []) @@ -500,8 +500,6 @@ def forward(self, a): self.assertEqual(3 * input, output) def test_interface(self): - global Foo, Bar, OneTwo, OneTwoThree, OneTwoWrong, NotMember, NotMember2 - @torch.jit.script class Foo(object): def __init__(self): @@ -571,6 +569,8 @@ def one(self, x, y): def two(self, x: int) -> int: return 3 + make_global(Foo, Bar, OneTwo, OneTwoThree, OneTwoWrong, NotMember, NotMember2) + def use_them(x): a = Foo() b = Bar() @@ -652,8 +652,6 @@ def __init__(self): # NamedTuple inheritance errors def test_overloaded_fn(self): - 
global Foo, MyClass # see [local resolution in python] - @torch.jit.script class Foo(object): def __init__(self, x): @@ -673,6 +671,8 @@ def test_overload(): a = Foo(torch.ones([3, 3])) return len(a), -a * torch.zeros([3, 3]) + make_global(Foo) # see [local resolution in python] + self.checkScript(test_overload, ()) # unary ops tested above @@ -737,6 +737,8 @@ def __call__(self, val: int) -> int: return self.x * val * 3 + make_global(Foo) # see [local resolution in python] + def add(): return MyClass(4) + 3 def sub(): # noqa: E306 @@ -787,8 +789,6 @@ def test(): return Foo(torch.tensor(1)) + Foo(torch.tensor(1)) def test_cast_overloads(self): - global Foo # see [local resolution in python] - @torch.jit.script class Foo(object): def __init__(self, val: float) -> None: @@ -806,6 +806,8 @@ def __bool__(self): def __str__(self): return str(self.val) + make_global(Foo) # see [local resolution in python] + def test(foo: Foo) -> Tuple[int, float, bool]: if foo: pass @@ -914,8 +916,6 @@ def forward(self, x): self.assertEqual(m.w, m_loaded.w) def test_py_class_to_ivalue_missing_attribute(self): - global Foo # see [local resolution in python] - class Foo(object): i : int f : float @@ -924,6 +924,8 @@ def __init__(self, i : int, f : float): self.i = i self.f = f + make_global(Foo) # see [local resolution in python] + @torch.jit.script def test_fn(x : Foo) -> float: return x.i + x.f @@ -1132,8 +1134,6 @@ def test_staticmethod(self): """ Test static methods on class types. """ - global ClassWithStaticMethod - @torch.jit.script class ClassWithStaticMethod: def __init__(self, a: int, b: int): @@ -1164,14 +1164,14 @@ def create_from(a: int, b: int) -> 'ClassWithStaticMethod': def test_function(a: int, b: int) -> 'ClassWithStaticMethod': return ClassWithStaticMethod.create_from(a, b) + make_global(ClassWithStaticMethod) + self.checkScript(test_function, (1, 2)) def test_classmethod(self): """ Test classmethods on class types. 
""" - global ClassWithClassMethod - @torch.jit.script class ClassWithClassMethod: def __init__(self, a: int): @@ -1184,6 +1184,8 @@ def __eq__(self, other: 'ClassWithClassMethod'): def create(cls, a: int) -> 'ClassWithClassMethod': return cls(a) + make_global(ClassWithClassMethod) + def test_function(a: int) -> 'ClassWithClassMethod': x = ClassWithClassMethod(a) # Support calling classmethod with an instance diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py index b39732d0e9bc..1a5f79d6e3ca 100644 --- a/test/jit/test_enum.py +++ b/test/jit/test_enum.py @@ -9,7 +9,7 @@ # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) -from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.jit_utils import JitTestCase, make_global if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -18,24 +18,20 @@ class TestEnum(JitTestCase): def test_enum_value_types(self): - global IntEnum - class IntEnum(Enum): FOO = 1 BAR = 2 - global FloatEnum - class FloatEnum(Enum): FOO = 1.2 BAR = 2.3 - global StringEnum - class StringEnum(Enum): FOO = "foo as in foo bar" BAR = "bar as in foo bar" + make_global(IntEnum, FloatEnum, StringEnum) + @torch.jit.script def supported_enum_types(a: IntEnum, b: FloatEnum, c: StringEnum): return (a.name, b.name, c.name) @@ -46,12 +42,12 @@ def supported_enum_types(a: IntEnum, b: FloatEnum, c: StringEnum): .check("StringEnum") \ .run(str(supported_enum_types.graph)) - global TensorEnum - class TensorEnum(Enum): FOO = torch.tensor(0) BAR = torch.tensor(1) + make_global(TensorEnum) + def unsupported_enum_types(a: TensorEnum): return a.name @@ -59,12 +55,12 @@ def unsupported_enum_types(a: TensorEnum): torch.jit.script(unsupported_enum_types) def test_enum_comp(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_comp(x: Color, y: Color) -> bool: return x == y @@ -75,8 +71,6 @@ def enum_comp(x: Color, y: Color) -> bool: self.assertEqual(enum_comp(Color.RED, Color.GREEN), False) def test_enum_comp_diff_classes(self): - global Foo, Bar - class Foo(Enum): ITEM1 = 1 ITEM2 = 2 @@ -85,6 +79,8 @@ class Bar(Enum): ITEM1 = 1 ITEM2 = 2 + make_global(Foo, Bar) + @torch.jit.script def enum_comp(x: Foo) -> bool: return x == Bar.ITEM1 @@ -98,12 +94,12 @@ def enum_comp(x: Foo) -> bool: self.assertEqual(enum_comp(Foo.ITEM1), False) def test_heterogenous_value_type_enum_error(self): - global Color - class Color(Enum): RED = 1 GREEN = "green" + make_global(Color) + def enum_comp(x: Color, y: Color) -> bool: return x == y @@ -111,12 +107,12 @@ def enum_comp(x: Color, y: Color) -> bool: torch.jit.script(enum_comp) def test_enum_name(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_name(x: Color) -> str: return x.name @@ -131,12 +127,12 @@ def enum_name(x: Color) -> str: self.assertEqual(enum_name(Color.GREEN), Color.GREEN.name) def test_enum_value(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_value(x: Color) -> int: return x.value @@ -151,12 +147,12 @@ def enum_value(x: Color) -> int: self.assertEqual(enum_value(Color.GREEN), Color.GREEN.value) def test_enum_as_const(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def enum_const(x: Color) -> bool: return x == Color.RED @@ -171,12 
+167,12 @@ def enum_const(x: Color) -> bool: self.assertEqual(enum_const(Color.GREEN), False) def test_non_existent_enum_value(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + def enum_const(x: Color) -> bool: if x == Color.PURPLE: return True @@ -187,12 +183,12 @@ def enum_const(x: Color) -> bool: torch.jit.script(enum_const) def test_enum_ivalue_type(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def is_color_enum(x: Any): return isinstance(x, Color) @@ -207,8 +203,6 @@ def is_color_enum(x: Any): self.assertEqual(is_color_enum(1), False) def test_closed_over_enum_constant(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -240,8 +234,6 @@ def closed_over_aliased_value(): self.assertEqual(closed_over_aliased_value(), Color.RED.value) def test_enum_as_module_attribute(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -268,8 +260,6 @@ def forward(self): self.assertEqual(scripted(), Color.RED.value) def test_string_enum_as_module_attribute(self): - global Color - class Color(Enum): RED = "red" GREEN = "green" @@ -282,18 +272,19 @@ def __init__(self, e: Color): def forward(self): return (self.e.name, self.e.value) + make_global(Color) m = TestModule(Color.RED) scripted = torch.jit.script(m) self.assertEqual(scripted(), (Color.RED.name, Color.RED.value)) def test_enum_return(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 + make_global(Color) + @torch.jit.script def return_enum(cond: bool): if cond: @@ -305,8 +296,6 @@ def return_enum(cond: bool): self.assertEqual(return_enum(False), Color.GREEN) def test_enum_module_return(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -319,6 +308,7 @@ def __init__(self, e: Color): def forward(self): return self.e + make_global(Color) m = TestModule(Color.RED) scripted = torch.jit.script(m) @@ -333,8 +323,6 @@ def forward(self): def test_enum_iterate(self): - global Color - class Color(Enum): RED = 1 GREEN = 2 @@ -347,6 +335,7 @@ def iterate_enum(x: Color): res.append(e.value) return res + make_global(Color) scripted = torch.jit.script(iterate_enum) FileCheck() \ diff --git a/test/jit/test_with.py b/test/jit/test_with.py index f958dc46c39a..35ff5b959737 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -4,7 +4,7 @@ from typing import Any, List import torch -from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.jit_utils import JitTestCase, make_global # Make the helper files in test/ importable @@ -29,8 +29,6 @@ def test_with_as(self): Check that with statements that use the 'as' keyword to bind expressions to targets work as expected. """ - global Context - @torch.jit.script class Context(object): """ @@ -50,6 +48,8 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + make_global(Context) + def test_basic(x: torch.Tensor) -> torch.Tensor: """Basic test with one with-statement.""" @@ -185,8 +185,6 @@ def test_with_no_as(self): Check that with statements that do not use the 'as' keyword to bind expressions to targets work as expected. 
""" - global Context - @torch.jit.script class Context(object): """ @@ -206,6 +204,8 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + make_global(Context) + def test_basic(x: torch.Tensor) -> torch.Tensor: """Basic test with one with-statement.""" @@ -341,8 +341,6 @@ def test_with_exceptions(self): Check that exceptions thrown in the bodies of with-statements are handled correctly. """ - global Context - @torch.jit.script class Context(object): """ @@ -362,6 +360,8 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + make_global(Context) + @torch.jit.script def method_that_raises() -> torch.Tensor: raise Exception("raised exception") diff --git a/test/test_jit.py b/test/test_jit.py index 4d37cd0a3ef9..e745f898824a 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -65,7 +65,7 @@ freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \ enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \ - _trace, enable_cpu_fuser_if, do_input_map, get_execution_plan, \ + _trace, enable_cpu_fuser_if, do_input_map, get_execution_plan, make_global, \ execWrapper, _inline_everything, _tmp_donotuse_dont_inline_everything, \ RUN_CUDA from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, nn_functional_tests, get_script_args, \ @@ -6609,8 +6609,6 @@ def bar(c, b): .check("in foo").check("in baz").run(str(cm.exception)) def test_error_stacktrace_interface(self): - global IFace - @torch.jit.script def baz(c, b): return c + b @@ -6634,6 +6632,8 @@ def one(self, x, y): # type: (Tensor, Tensor) -> Tensor pass + make_global(IFace) + @torch.jit.script def as_interface(x): # type: (IFace) -> IFace diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index be287d0a9a3b..f4dabcbf97de 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -241,7 +241,9 @@ def get_annotation_str(annotation): elif isinstance(annotation, ast.Attribute): return '.'.join([get_annotation_str(annotation.value), annotation.attr]) elif isinstance(annotation, ast.Subscript): - return f"{get_annotation_str(annotation.value)}[{get_annotation_str(annotation.slice.value)}]" # type: ignore + # In Python3.9+ subscript indicies are not wrapped in ast.Index + subscript_slice = annotation.slice if sys.version_info >= (3, 9) else annotation.slice.value # type: ignore + return f"{get_annotation_str(annotation.value)}[{get_annotation_str(subscript_slice)}]" elif isinstance(annotation, ast.Tuple): return ','.join([get_annotation_str(elt) for elt in annotation.elts]) elif isinstance(annotation, ast.Constant) or isinstance(annotation, ast.NameConstant): From d3ec204ef215fca6888d92e18ffa9db126d5b837 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 27 Jan 2021 11:17:53 -0800 Subject: [PATCH 07/41] [quant][graphmode][fx] Add functional conv2d + relu (#51079) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51079 Added support for functional conv2d + relu, will add conv1d and conv3d in future PR Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_functional_conv Imported from OSS Reviewed By: vkuzo Differential Revision: D26089964 fbshipit-source-id: 8703de17de1469f7076651c386c83fb5922a56eb --- test/quantization/test_quantize_fx.py | 80 ++++++++++++++--- .../quantization/fx/quantization_patterns.py | 
88 ++++++++++++++----- 2 files changed, 136 insertions(+), 32 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 8d7ba7dcb4e6..b2243eead1d0 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1497,7 +1497,7 @@ def forward(self, x): self.checkGraphModeFxOp(model, data, quant_type, quantized_node) @skipIfNoFBGEMM - def test_linear_functional(self): + def test_functional_linear(self): class FuncLinear(torch.nn.Module): def __init__(self, use_bias, has_relu, f_relu): super(FuncLinear, self).__init__() @@ -1595,22 +1595,80 @@ def forward(self, x): quantized_nodes[dim]) @skipIfNoFBGEMM - def test_conv2d_functional(self): - for bias in [True, False]: - conv = torch.nn.Conv2d(1, 1, 1, bias=bias) + def test_functional_conv(self): + """ Test for function conv and functional conv + relu + """ + class FuncConv(torch.nn.Module): + def __init__(self, use_bias, has_relu, f_relu): + super().__init__() + self.w = torch.randn(3, 3, 3, 3) + self.b = torch.randn(3) if use_bias else None + self.stride = (1, 1) + self.padding = (0, 0) + self.dilation = (1, 1) + self.groups = 1 + self.use_bias = use_bias + if has_relu: + if f_relu: + self.relu = F.relu + else: + self.relu = torch.nn.ReLU() + else: + self.relu = torch.nn.Identity() + + def forward(self, x): + x = F.conv2d(x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) + x = self.relu(x) + return x + + data = (torch.randn((2, 3, 4, 4), dtype=torch.float),) + + quant_type_to_prepare_expected_node_occurrence = { + QuantType.DYNAMIC: {}, # There should be 3 observers: after input, weight and activation. - # No observer after bias. - prepare_expected_node_occurrence = { + QuantType.STATIC: { ns.call_module(torch.quantization.HistogramObserver): 2, ns.call_module(torch.quantization.PerChannelMinMaxObserver): 1, + }, + # There should be 3 observers: after input, weight and activation. 
+ QuantType.QAT: { + ns.call_module(torch.quantization.FakeQuantize): 3, + }, + } + quant_type_to_qconv_fun = { + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d), + } + quant_type_to_qconv_relu_fun = { + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d_relu), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d_relu), + } + + options = itertools.product( + self.static_quant_types, + (True, False), # use_bias + (True, False), # has_relu + (True, False), # functional relu + ) + for quant_type, use_bias, has_relu, f_relu in options: + model = FuncConv(use_bias, has_relu, f_relu) + if has_relu: + qconv_fun = quant_type_to_qconv_relu_fun[quant_type] + else: + qconv_fun = quant_type_to_qconv_fun[quant_type] + + convert_node_occurrence = { + ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, + qconv_fun: 1, + ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 } - expected_node_occurrence = \ - {ns.call_function(torch.ops.quantized.conv2d): 1} + prepare_expected_node_occurrence = \ + quant_type_to_prepare_expected_node_occurrence[quant_type] self.checkGraphModeFxOp( - conv, (torch.randn(4, 1, 4, 4),), QuantType.STATIC, + model, data, quant_type, qconv_fun, prepare_expected_node_occurrence=prepare_expected_node_occurrence, - expected_node_occurrence=expected_node_occurrence, - ) + expected_node_occurrence=convert_node_occurrence) + @skipIfNoFBGEMM def test_quantized_conv_relu(self): diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index e6ac74dbf903..06f15240e761 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -218,12 +218,35 @@ def __init__(self, quantizer: QuantizerCls, node: Node): def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, debug: bool = False, convert_custom_config_dict: Dict[str, Any] = None) -> Node: + # Supported combinations are: + # quant_type | activation (compute_type) | weight + # static quint8 qint8 + + # tuple (activation_dtype, weight_dtype, compute_dtype) + supported_dtypes = [ + (torch.quint8, torch.qint8, None), + ] + # TODO: debug option for conv module qconfig = quantizer.qconfig_map[node.name] + dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported + if dtypes not in supported_dtypes: + warnings.warn( + "dtype combination: {} is not " + "supported by Conv " + "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) + if self.relu_node: + conv_out = quantizer.quantized_graph.node_copy(self.conv_node, load_arg(quantized=False)) + relu_args = [conv_out] + relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) + relu_kwargs = load_arg(quantized=False)(self.relu_node.kwargs) + return quantizer.quantized_graph.create_node( + "call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs) + else: + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) + activation_statically_quantized = activation_is_statically_quantized(qconfig) - # only static qunatization (for both ptq and qat) is supported for conv - if not activation_statically_quantized: - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) if self.conv_node.op == 'call_module': # note that relu should already be fused into conv module in the fusion step @@ -246,21 +269,32 @@ 
def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, (load_arg(quantized=True)(self.conv_node.args[0]),), {}) else: # call_function - assert self.conv_node.op == 'call_function' - if self.relu_node is not None: - raise Exception("functional conv + relu is not supported yet") + assert self.conv_node.op == "call_function" if debug: args = load_arg(quantized=[0, 1])(self.conv_node.args) args = load_arg(quantized=False)(self.conv_node.args) kwargs = load_arg(quantized=False)(self.conv_node.kwargs) - conv_out = quantizer.quantized_graph.create_node( - 'call_function', torch.nn.functional.conv2d, args, kwargs) - root_module = quantizer.modules[''] - return quantize_node( - root_module, quantizer.quantized_graph, conv_out, quantizer.activation_post_process_map[self.conv_node.name]) + op_out = quantizer.quantized_graph.create_node( + "call_function", torch.nn.functional.conv2d, args, kwargs) + if self.relu_node: + relu_args = [op_out] + relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) + relu_kwargs = load_arg(quantized=False)(self.relu_node.kwargs) + op_out = quantizer.quantized_graph.create_node( + "call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs) + + if activation_statically_quantized: + root_module = quantizer.modules[''] + act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name + return quantize_node( + root_module, quantizer.quantized_graph, op_out, + quantizer.activation_post_process_map[act_post_process_name]) + else: + # output for dynamically quantized conv op is not quantized + return op_out else: - assert len(self.conv_node.args) == 7, \ - 'only conv2d calls with all arguments specified is support right now in debug=False option' + assert len(self.conv_node.args) >= 7, \ + "only conv2d calls with all arguments specified is supported right now in debug=False option" args = load_arg(quantized=[0, 1])(self.conv_node.args) # pack weight weight = load_arg(quantized=True)(self.conv_node.args[1]) @@ -268,14 +302,23 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, prepack_args = tuple([weight] + list(other_args)) packed_weight = quantizer.quantized_graph.create_node( 'call_function', torch.ops.quantized.conv2d_prepack, prepack_args, {}) + assert activation_statically_quantized, \ + "currently only static quantization is supported for conv" # construct conv input - conv_input = load_arg(quantized=True)(self.conv_node.args[0]) - activation_post_process = quantizer.activation_post_process_map[self.conv_node.name] - scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) - qconv_args = (conv_input, packed_weight, scale, zero_point) - kwargs = load_arg(quantized=False)(self.conv_node.kwargs) - return quantizer.quantized_graph.create_node( - 'call_function', torch.ops.quantized.conv2d, qconv_args, kwargs) + if activation_statically_quantized: + qconv_op = torch.ops.quantized.conv2d_relu if self.relu_node else torch.ops.quantized.conv2d + conv_input = load_arg(quantized=True)(self.conv_node.args[0]) + act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name + activation_post_process = quantizer.activation_post_process_map[act_post_process_name] + scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) + qconv_args = (conv_input, packed_weight, scale, zero_point) + kwargs = load_arg(quantized=False)(self.conv_node.kwargs) + return quantizer.quantized_graph.create_node( + 'call_function', qconv_op, qconv_args, kwargs) + 
else: + # conv2d_dyanmic branch + raise Exception("Only static quant is supported for conv") + # handle linear, maybe followed by relu @register_quant_pattern(torch.nn.Linear) @@ -316,6 +359,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, ] qconfig = quantizer.qconfig_map[node.name] dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported if dtypes not in supported_dtypes: warnings.warn( "dtype combination: {} is not " @@ -412,9 +456,9 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, prepack_op = get_linear_prepack_op_for_dtype(weight_dtype(qconfig)) packed_weight = quantizer.quantized_graph.create_node( 'call_function', prepack_op, prepack_args, {}) - qlinear_op = torch.ops.quantized.linear_relu if self.relu_node else torch.ops.quantized.linear # construct linear input if activation_statically_quantized: + qlinear_op = torch.ops.quantized.linear_relu if self.relu_node else torch.ops.quantized.linear linear_input = load_arg(quantized=True)(self.linear_node.args[0]) act_post_process_name = self.relu_node.name if self.relu_node else self.linear_node.name activation_post_process = \ @@ -484,6 +528,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, emb_node = node qconfig = quantizer.qconfig_map[node.name] dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported if dtypes not in supported_dtypes: warnings.warn( "dtype combination: {} is not " @@ -527,6 +572,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, assert node.op == 'call_module' qconfig = quantizer.qconfig_map[node.name] dtypes = get_qconfig_dtypes(qconfig) + # leave the op unquantized if the dtype combination is not supported if dtypes not in supported_dtypes: warnings.warn( "dtype combination: {} is not " From b77f72b5a0d7ccf30454a8b79c5fddd9b069e6cf Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Wed, 27 Jan 2021 11:41:58 -0800 Subject: [PATCH 08/41] Enable TensorPipe's SHM transport (#50760) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50760 The SHM transport uses shared-memory-backed ringbuffers to transfer small payloads between processes on the same machine. It was disabled in v1.6 due to a CMake mishap but we've since realized that it also doesn't work that well in docker and other setups. Enabling it here to see whether CircleCI fails. 
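For context, the sketch below shows a same-host two-worker RPC setup of the kind this transport serves; with SHM enabled, TensorPipe can route small payloads over the shared-memory ringbuffers. The worker names, port, and payload are illustrative and not part of this patch:

```python
import os
import torch
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp

def run(rank: int, world_size: int = 2) -> None:
    # Both workers live on the same machine; transport selection is internal
    # to TensorPipe, so enabling SHM needs no change on the user side.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    if rank == 0:
        fut = rpc.rpc_async("worker1", torch.add, args=(torch.ones(2), 1))
        print(fut.wait())
    rpc.shutdown()

if __name__ == "__main__":
    mp.spawn(run, nprocs=2)
```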
ghstack-source-id: 120470890 Test Plan: Exported three times to CircleCI with tests consistently passing Reviewed By: mrshenli Differential Revision: D23814828 fbshipit-source-id: f355cb6515776debad536924de4f4d3fbb05a874 --- .circleci/config.yml | 2 +- .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml | 2 +- cmake/Dependencies.cmake | 2 -- torch/csrc/distributed/rpc/tensorpipe_agent.cpp | 6 ++++++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4cf4dc4e2c6a..3fceba2db8dc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -574,7 +574,7 @@ jobs: hostname export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) else - export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) fi echo "id=${id}" >> "${BASH_ENV}" diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 8cbb9a4e3f40..99b327c275a0 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -133,7 +133,7 @@ jobs: hostname export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) else - export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) fi echo "id=${id}" >> "${BASH_ENV}" diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index e138a86c61da..75afbb8b6cf4 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1347,7 +1347,6 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE) set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE) endif() set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE) - set(TP_ENABLE_SHM OFF CACHE BOOL "" FORCE) set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe) @@ -1851,4 +1850,3 @@ if(USE_KINETO) set(USE_KINETO OFF) endif() endif() - diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 4f56c916cb98..0417577d2499 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -14,6 +14,12 @@ #include #endif +#if TENSORPIPE_HAS_SHM_TRANSPORT +// Needed for ::getpid(), which is used to create a unique address. 
+#include +#include +#endif + namespace torch { namespace distributed { namespace rpc { From 1b7a4f9cde8a214c8d0b1d9753f42c0a7406f3eb Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Wed, 27 Jan 2021 11:44:18 -0800 Subject: [PATCH 09/41] .github: Add GitHub Actions workflow to build wheels (#50633) Summary: Signed-off-by: Eli Uriegas Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/50633 Reviewed By: samestep Differential Revision: D26083492 Pulled By: seemethere fbshipit-source-id: c133671b9cf5074539133ee79fca5c680793a85d --- .../scripts/generate_binary_build_matrix.py | 86 +++++++++++++ .github/scripts/generate_pytorch_version.py | 118 ++++++++++++++++++ .github/workflows/build_linux_binaries.yml | 86 +++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 .github/scripts/generate_binary_build_matrix.py create mode 100755 .github/scripts/generate_pytorch_version.py create mode 100644 .github/workflows/build_linux_binaries.yml diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py new file mode 100644 index 000000000000..d95518bc6dae --- /dev/null +++ b/.github/scripts/generate_binary_build_matrix.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 + +"""Generates a matrix to be utilized through github actions + +Will output a condensed version of the matrix if on a pull request that only +includes the latest version of python we support built on three different +architectures: + * CPU + * Latest CUDA + * Latest ROCM +""" + +import json +import os +import itertools + +CUDA_ARCHES = [ + "10.1", + "10.2", + "11.0" +] + +ROCM_ARCHES = [ + "3.10", + "4.0" +] + +FULL_ARCHES = [ + "cpu", + *CUDA_ARCHES, + *ROCM_ARCHES +] + +CONTAINER_IMAGES = { + **{ + # TODO: Re-do manylinux CUDA image tagging scheme to be similar to + # ROCM so we don't have to do this replacement + gpu_arch: f"pytorch/manylinux-cuda{gpu_arch.replace('.', '')}" + for gpu_arch in CUDA_ARCHES + }, + **{ + gpu_arch: f"pytorch/manylinux-rocm:{gpu_arch}" + for gpu_arch in ROCM_ARCHES + }, + "cpu": "pytorch/manylinux-cpu" +} + +FULL_PYTHON_VERSIONS = [ + "3.6", + "3.7", + "3.8", + "3.9", +] + + +def is_pull_request(): + return os.environ.get("GITHUB_HEAD_REF") + +def generate_matrix(): + python_versions = FULL_PYTHON_VERSIONS + arches = FULL_ARCHES + if is_pull_request(): + python_versions = [python_versions[-1]] + arches = ["cpu", CUDA_ARCHES[-1], ROCM_ARCHES[-1]] + matrix = [] + for item in itertools.product(python_versions, arches): + python_version, arch_version = item + # Not my favorite code here + gpu_arch_type = "cuda" + if "rocm" in CONTAINER_IMAGES[arch_version]: + gpu_arch_type = "rocm" + elif "cpu" in CONTAINER_IMAGES[arch_version]: + gpu_arch_type = "cpu" + matrix.append({ + "python_version": python_version, + "gpu_arch_type": gpu_arch_type, + "gpu_arch_version": arch_version, + "container_image": CONTAINER_IMAGES[arch_version] + }) + return json.dumps({"include": matrix}) + +def main(): + print(generate_matrix()) + +if __name__ == "__main__": + main() diff --git a/.github/scripts/generate_pytorch_version.py b/.github/scripts/generate_pytorch_version.py new file mode 100755 index 000000000000..93fc4ca6db3a --- /dev/null +++ b/.github/scripts/generate_pytorch_version.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 + +import argparse +import os +import subprocess +import re + +from datetime import datetime +from distutils.util import strtobool +from pathlib import Path + +LEADING_V_PATTERN = re.compile("^v") +TRAILING_RC_PATTERN 
= re.compile("-rc[0-9]*$") +LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$") + +class NoGitTagException(Exception): + pass + +def get_pytorch_root(): + return Path(subprocess.check_output( + ['git', 'rev-parse', '--show-toplevel'] + ).decode('ascii').strip()) + +def get_tag(): + root = get_pytorch_root() + # We're on a tag + am_on_tag = ( + subprocess.run( + ['git', 'describe', '--tags', '--exact'], + cwd=root, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ).returncode == 0 + ) + tag = "" + if am_on_tag: + dirty_tag = subprocess.check_output( + ['git', 'describe'], + cwd=root + ).decode('ascii').strip() + # Strip leading v that we typically do when we tag branches + # ie: v1.7.1 -> 1.7.1 + tag = re.sub(LEADING_V_PATTERN, "", dirty_tag) + # Strip trailing rc pattern + # ie: 1.7.1-rc1 -> 1.7.1 + tag = re.sub(TRAILING_RC_PATTERN, "", tag) + return tag + +def get_base_version(): + root = get_pytorch_root() + dirty_version = open(root / 'version.txt', 'r').read().strip() + # Strips trailing a0 from version.txt, not too sure why it's there in the + # first place + return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version) + +class PytorchVersion: + def __init__(self, gpu_arch_type, gpu_arch_version, no_build_suffix): + self.gpu_arch_type = gpu_arch_type + self.gpu_arch_version = gpu_arch_version + self.no_build_suffix = no_build_suffix + + def get_post_build_suffix(self): + # CUDA 10.2 is the version to be uploaded to PyPI so it doesn't have a + # version suffix + if ((self.gpu_arch_type == "cuda" and self.gpu_arch_version == "10.2") + or self.no_build_suffix): + return "" + if self.gpu_arch_type == "cuda": + return f"+cu{self.gpu_arch_version.replace('.', '')}" + return f"+{self.gpu_arch_type}{self.gpu_arch_version}" + + def get_release_version(self): + if not get_tag(): + raise NoGitTagException( + "Not on a git tag, are you sure you want a release version?" 
+ ) + return f"{get_tag()}{self.get_post_build_suffix()}" + + def get_nightly_version(self): + date_str = datetime.today().strftime('%Y%m%d') + build_suffix = self.get_post_build_suffix() + return f"{get_base_version()}.dev{date_str}{build_suffix}" + +def main(): + parser = argparse.ArgumentParser( + description="Generate pytorch version for binary builds" + ) + parser.add_argument( + "--no-build-suffix", + type=strtobool, + help="Whether or not to add a build suffix typically (+cpu)", + default=os.environ.get("NO_BUILD_SUFFIX", False) + ) + parser.add_argument( + "--gpu-arch-type", + type=str, + help="GPU arch you are building for, typically (cpu, cuda, rocm)", + default=os.environ.get("GPU_ARCH_TYPE", "cpu") + ) + parser.add_argument( + "--gpu-arch-version", + type=str, + help="GPU arch version, typically (10.2, 4.0), leave blank for CPU", + default=os.environ.get("GPU_ARCH_VERSION", "") + ) + args = parser.parse_args() + version_obj = PytorchVersion( + args.gpu_arch_type, + args.gpu_arch_version, + args.no_build_suffix + ) + try: + print(version_obj.get_release_version()) + except NoGitTagException: + print(version_obj.get_nightly_version()) + +if __name__ == "__main__": + main() diff --git a/.github/workflows/build_linux_binaries.yml b/.github/workflows/build_linux_binaries.yml new file mode 100644 index 000000000000..fc8917d74625 --- /dev/null +++ b/.github/workflows/build_linux_binaries.yml @@ -0,0 +1,86 @@ +name: Build Linux Wheels + +on: + # TODO: These are only runnable from workflow_dispatch, we need to eventually add + # a cron + # TODO: Add an on_release trigger to build on tags + workflow_dispatch: + +jobs: + generate-build-matrix: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + container: + image: python:3.9 + steps: + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - name: Generating build matrix + id: set-matrix + run: | + # outputting for debugging purposes + python .github/scripts/generate_binary_build_matrix.py + MATRIX=$(python .github/scripts/generate_binary_build_matrix.py) + echo "::set-output name=matrix::${MATRIX}" + build-wheel: + if: ${{ github.repository_owner == 'pytorch' }} + needs: generate-build-matrix + runs-on: linux.2xlarge + strategy: + matrix: + ${{ fromJson(needs.generate-build-matrix.outputs.matrix) }} + container: + image: ${{ matrix.container_image }} + env: + DESIRED_PYTHON: ${{ matrix.python_version }} + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: ${{ matrix.gpu_arch_version }} + GPU_ARCH_VERSION: ${{ matrix.GPU_ARCH_VERSION }} + GPU_ARCH_TYPE: ${{ matrix.gpu_arch_type }} + PYTORCH_BUILD_NUMBER: 1 + SKIP_ALL_TESTS: 1 + steps: + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + with: + path: pytorch + submodules: recursive + - name: Clone pytorch/builder + uses: actions/checkout@v2 + with: + repository: pytorch/builder + path: builder + - name: Generate version string + working-directory: pytorch/ + run: | + version=$(.github/scripts/generate_pytorch_version.py) + echo "Generated version: ${version}" + echo "PYTORCH_BUILD_VERSION=${version}" >> $GITHUB_ENV + # TODO: Remove this once we remove the need for the directories to be + # in specific locations + - name: Symlink repositories to root directory (for legacy scripts purposes) + run: | + ln -s $(pwd)/pytorch /pytorch + ln -s $(pwd)/builder /builder + # TODO: Bundle the correct build script in the base container 
image so + # that we don't have to do this type of specification + - name: Build PyTorch binary (CUDA specific) + if: ${{ matrix.gpu_arch_type == 'cuda' }} + run: | + /builder/manywheel/build.sh + - name: Build PyTorch binary (ROCM specific) + if: ${{ matrix.gpu_arch_type == 'rocm' }} + run: | + /builder/manywheel/build_rocm.sh + - name: Build PyTorch binary (CPU specific) + if: ${{ matrix.gpu_arch_type == 'cpu' }} + run: | + /builder/manywheel/build_cpu.sh + - uses: actions/upload-artifact@v2 + with: + name: pytorch-wheel-py${{ matrix.python_version }}-${{matrix.gpu_arch_type}}-${{ matrix.gpu_arch_version }} + path: /remote/**/*.whl + # TODO: Add a step here for uploading binaries From dd1a97b3ae8a14a5c167fd97273531d2b5ad566b Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 27 Jan 2021 11:54:21 -0800 Subject: [PATCH 10/41] [quant][graphmode][fx] Add support for functional conv1d and conv3d (#51155) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51155 This PR added support for quantizing functional conv1d, conv3d, conv1d_relu and conv3d_relu Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_functional_conv Imported from OSS Reviewed By: vkuzo Differential Revision: D26089965 fbshipit-source-id: 4aea507d05b744807e993f6d3711ab308fb7591b --- test/quantization/test_quantize_fx.py | 60 +++++++++++++------ .../quantization/fx/quantization_patterns.py | 20 +++++-- torch/quantization/fx/utils.py | 26 ++++++++ 3 files changed, 84 insertions(+), 22 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index b2243eead1d0..295144bddc3f 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1598,14 +1598,21 @@ def forward(self, x): def test_functional_conv(self): """ Test for function conv and functional conv + relu """ + convs = { + 1: torch.nn.functional.conv1d, + 2: torch.nn.functional.conv2d, + 3: torch.nn.functional.conv3d, + } + class FuncConv(torch.nn.Module): - def __init__(self, use_bias, has_relu, f_relu): + def __init__(self, dim, use_bias, has_relu, f_relu): super().__init__() - self.w = torch.randn(3, 3, 3, 3) + self.dim = dim + self.w = torch.randn(tuple([3] * (dim + 2))) self.b = torch.randn(3) if use_bias else None - self.stride = (1, 1) - self.padding = (0, 0) - self.dilation = (1, 1) + self.stride = tuple([1] * dim) + self.padding = tuple([0] * dim) + self.dilation = tuple([1] * dim) self.groups = 1 self.use_bias = use_bias if has_relu: @@ -1617,12 +1624,10 @@ def __init__(self, use_bias, has_relu, f_relu): self.relu = torch.nn.Identity() def forward(self, x): - x = F.conv2d(x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) + x = convs[self.dim](x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) x = self.relu(x) return x - data = (torch.randn((2, 3, 4, 4), dtype=torch.float),) - quant_type_to_prepare_expected_node_occurrence = { QuantType.DYNAMIC: {}, # There should be 3 observers: after input, weight and activation. 
@@ -1636,31 +1641,50 @@ def forward(self, x): }, } quant_type_to_qconv_fun = { - QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d), - QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d), + QuantType.STATIC: { + 1: ns.call_function(torch.ops.quantized.conv1d), + 2: ns.call_function(torch.ops.quantized.conv2d), + 3: ns.call_function(torch.ops.quantized.conv3d) + }, + QuantType.QAT: { + 1: ns.call_function(torch.ops.quantized.conv1d), + 2: ns.call_function(torch.ops.quantized.conv2d), + 3: ns.call_function(torch.ops.quantized.conv3d) + }, } quant_type_to_qconv_relu_fun = { - QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d_relu), - QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d_relu), + QuantType.STATIC: { + 1: ns.call_function(torch.ops.quantized.conv1d_relu), + 2: ns.call_function(torch.ops.quantized.conv2d_relu), + 3: ns.call_function(torch.ops.quantized.conv3d_relu) + }, + QuantType.QAT: { + 1: ns.call_function(torch.ops.quantized.conv1d_relu), + 2: ns.call_function(torch.ops.quantized.conv2d_relu), + 3: ns.call_function(torch.ops.quantized.conv3d_relu) + }, } options = itertools.product( + [1, 2, 3], # dims self.static_quant_types, (True, False), # use_bias (True, False), # has_relu (True, False), # functional relu ) - for quant_type, use_bias, has_relu, f_relu in options: - model = FuncConv(use_bias, has_relu, f_relu) + for dim, quant_type, use_bias, has_relu, f_relu in options: + data_dims = [2, 3] + [4] * dim + data = (torch.randn(tuple(data_dims), dtype=torch.float),) + model = FuncConv(dim, use_bias, has_relu, f_relu) if has_relu: - qconv_fun = quant_type_to_qconv_relu_fun[quant_type] + qconv_fun = quant_type_to_qconv_relu_fun[quant_type][dim] else: - qconv_fun = quant_type_to_qconv_fun[quant_type] + qconv_fun = quant_type_to_qconv_fun[quant_type][dim] convert_node_occurrence = { - ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, + ns.call_function(torch.quantize_per_tensor): 1, qconv_fun: 1, - ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 + ns.call_method("dequantize"): 1 } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 06f15240e761..7d0e3f7d0a78 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -32,6 +32,8 @@ quantize_node, get_per_tensor_qparams, get_linear_prepack_op_for_dtype, + get_qconv_prepack_op, + get_qconv_op, ) from .quantization_types import QuantizerCls @@ -188,7 +190,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, @register_quant_pattern(torch.nn.Conv1d) @register_quant_pattern(torch.nn.Conv2d) @register_quant_pattern(torch.nn.Conv3d) +@register_quant_pattern(torch.nn.functional.conv1d) @register_quant_pattern(torch.nn.functional.conv2d) +@register_quant_pattern(torch.nn.functional.conv3d) +# TODO: add qat.Conv1d and qat.Conv3d @register_quant_pattern(torch.nn.qat.Conv2d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU1d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU2d) @@ -198,8 +203,12 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU1d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU2d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU2d) 
+@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv2d)) +@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv3d)) +@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv2d)) +@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv3d)) # just for error checks @register_quant_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) @@ -212,8 +221,10 @@ def __init__(self, quantizer: QuantizerCls, node: Node): self.relu_node = node node = node.args[0] # type: ignore self.conv_node = node - if node.op == 'call_module': + if node.op == "call_module": self.conv = quantizer.modules[self.conv_node.target] + elif node.op == "call_function": + self.conv = node.target def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, debug: bool = False, @@ -275,7 +286,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, args = load_arg(quantized=False)(self.conv_node.args) kwargs = load_arg(quantized=False)(self.conv_node.kwargs) op_out = quantizer.quantized_graph.create_node( - "call_function", torch.nn.functional.conv2d, args, kwargs) + "call_function", self.conv, args, kwargs) if self.relu_node: relu_args = [op_out] relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) @@ -300,13 +311,14 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, weight = load_arg(quantized=True)(self.conv_node.args[1]) other_args = load_arg(quantized=False)(self.conv_node.args[2:]) prepack_args = tuple([weight] + list(other_args)) + prepack_op = get_qconv_prepack_op(self.conv) packed_weight = quantizer.quantized_graph.create_node( - 'call_function', torch.ops.quantized.conv2d_prepack, prepack_args, {}) + "call_function", prepack_op, prepack_args, {}) assert activation_statically_quantized, \ "currently only static quantization is supported for conv" # construct conv input if activation_statically_quantized: - qconv_op = torch.ops.quantized.conv2d_relu if self.relu_node else torch.ops.quantized.conv2d + qconv_op = get_qconv_op(self.conv, self.relu_node is not None) conv_input = load_arg(quantized=True)(self.conv_node.args[0]) act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name activation_post_process = quantizer.activation_post_process_map[act_post_process_name] diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 8285e204b1ed..8d3445e46dba 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -179,6 +179,32 @@ def get_linear_prepack_op_for_dtype(dtype): else: raise Exception("can't get linear prepack op for dtype:", dtype) +def get_qconv_prepack_op(conv_op: Callable) -> Callable: + prepack_ops = { + torch.nn.functional.conv1d: torch.ops.quantized.conv1d_prepack, + torch.nn.functional.conv2d: torch.ops.quantized.conv2d_prepack, + torch.nn.functional.conv3d: torch.ops.quantized.conv3d_prepack + } + prepack_op = prepack_ops.get(conv_op, None) + assert prepack_op, "Didn't find prepack op for {}".format(conv_op) + return prepack_op + +def get_qconv_op(conv_op: Callable, has_relu: bool) -> Callable: + qconv_op = { + # has relu + True: { + torch.nn.functional.conv1d: torch.ops.quantized.conv1d_relu, + torch.nn.functional.conv2d: torch.ops.quantized.conv2d_relu, + torch.nn.functional.conv3d: 
torch.ops.quantized.conv3d_relu + }, + False: { + torch.nn.functional.conv1d: torch.ops.quantized.conv1d, + torch.nn.functional.conv2d: torch.ops.quantized.conv2d, + torch.nn.functional.conv3d: torch.ops.quantized.conv3d + } + } + return qconv_op[has_relu].get(conv_op) + # Returns a function that can get a new attribute name for module with given # prefix, for example, # >> get_new_observer_name = get_new_attr_name_with_prefix('_observer') From 6d098095eb122685ae87b83a56b95771382a3ef6 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 27 Jan 2021 11:56:31 -0800 Subject: [PATCH 11/41] [numpy] torch.lgamma: promote integer inputs to float (#50140) Summary: Reference: https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50140 Reviewed By: mrshenli Differential Revision: D25951094 Pulled By: mruberry fbshipit-source-id: e53f1dbddff889710f05d43dbc9587382d3decb0 --- aten/src/ATen/native/UnaryOps.cpp | 35 ++------------- .../src/ATen/native/cuda/UnaryGammaKernels.cu | 2 +- aten/src/ATen/native/native_functions.yaml | 14 +++--- test/test_torch.py | 1 - test/test_unary_ufuncs.py | 2 - torch/csrc/jit/tensorexpr/kernel.cpp | 5 ++- .../_internal/common_methods_invocations.py | 44 +++++++++++++++++++ 7 files changed, 57 insertions(+), 46 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index ca311f86091e..9ff806bd5054 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -650,38 +650,9 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { return self.copy_(args.lgamma_().sum(-1).add_(p * (p - 1) * std::log(c10::pi) / 4.)); } -// NB: If you use this macro, you may also need to add a CUDA forwarding -// stub in CUDAUnaryOps - -#define IMPLEMENT_UNARY_OP_CORE(op) \ - Tensor op(const Tensor& self) { \ - Tensor result = at::empty({0}, self.options()); \ - at::op##_out(result, self); \ - return result; \ - } - -#define IMPLEMENT_UNARY_OP_OUT_INPLACE(op, prefix, device) \ - Tensor& _##op##__##prefix(Tensor& self) { \ - return at::op##_out(self, self); \ - } \ - Tensor& _##op##_out_##prefix(Tensor& result, const Tensor& self) { \ - checkDeviceType(#op, result, DeviceType::device); \ - checkLayout(#op, result, Layout::Strided); \ - auto iter = TensorIterator::unary_op(result, self); \ - op##_stub(iter.device_type(), iter); \ - return result; \ - } - -#define IMPLEMENT_UNARY_OP_VEC(op) \ - IMPLEMENT_UNARY_OP_CORE(op) \ - IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cpu, CPU) - -#define IMPLEMENT_UNARY_OP_VEC_CUDA(op) \ - IMPLEMENT_UNARY_OP_CORE(op) \ - IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cpu, CPU) \ - IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cuda, CUDA) - -IMPLEMENT_UNARY_OP_VEC_CUDA(lgamma) +Tensor& lgamma_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, lgamma_stub); } +Tensor lgamma(const Tensor& self) { return unary_op_impl_float(self, lgamma_stub); } +Tensor& lgamma_(Tensor& self) { return unary_op_impl_(self, at::lgamma_out); } DEFINE_DISPATCH(abs_stub); DEFINE_DISPATCH(angle_stub); diff --git a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu index 97dbeefccc77..cdcf92e719d8 100644 --- a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu @@ -41,7 +41,7 @@ void polygamma_kernel_cuda(TensorIterator& iter, int64_t n) { } void lgamma_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "lgamma_cuda", [&]() { + 
AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "lgamma_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::lgamma(a); }); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 906dc08eddd5..1856b9a9bf13 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -5148,12 +5148,6 @@ dispatch: CPU, CUDA: __irshift__ -- func: lgamma_(Tensor(a!) self) -> Tensor(a!) - variants: method - dispatch: - CPU: _lgamma__cpu - CUDA: _lgamma__cuda - - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method dispatch: @@ -5979,8 +5973,12 @@ - func: lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures dispatch: - CPU: _lgamma_out_cpu - CUDA: _lgamma_out_cuda + CPU, CUDA: lgamma_out + +- func: lgamma_(Tensor(a!) self) -> Tensor(a!) + variants: method + dispatch: + CPU, CUDA: lgamma_ - func: lgamma(Tensor self) -> Tensor variants: method, function diff --git a/test/test_torch.py b/test/test_torch.py index 55f5ee73c187..f027db3b9ef6 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6848,7 +6848,6 @@ def inner(self, device, dtype): ('round', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('trunc', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('ceil', '', _small_3d, lambda t, d: [], 1e-5, 1e-2, 1e-5, _float_types, [torch.bfloat16]), - ('lgamma', '', _small_3d, lambda t, d: [], 1e-2, 1e-1, 1e-5, _float_types_no_half, [torch.bfloat16]), ] # Creates and decorates a generic test and adds it to the class. diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 365c33179206..3497ccd04cc1 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -1684,8 +1684,6 @@ def _medium_2d(dtype, device): _TorchMathTestMeta('frac', reffn='fmod', refargs=lambda x: (x.numpy(), 1)), _TorchMathTestMeta('trunc'), _TorchMathTestMeta('round'), - # FIXME lgamma produces different result compared to scipy at -inf - _TorchMathTestMeta('lgamma', reffn='gammaln', ref_backend='scipy', replace_inf_with_nan=True), _TorchMathTestMeta('polygamma', args=[0], substr='_0', reffn='polygamma', refargs=lambda x: (0, x.numpy()), input_fn=_generate_gamma_input, inputargs=[False], ref_backend='scipy'), diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 910de06b2693..5a727cc5a92e 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1350,8 +1350,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::lgamma: { - return computeOneOperand( - "aten_lgamma", v, [](const ExprHandle& a) { return lgamma(a); }); + return computeOneOperand("aten_lgamma", v, [](const ExprHandle& a) { + return lgamma(promoteIntegerToDefaultType(a)); + }); } break; case prim::ConstantChunk: { diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 8be600914d97..2ad65a9631dc 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1774,6 +1774,29 @@ def reference_sigmoid(x): return (1 / (1 + np.exp(-x))) return scipy.special.expit(x) + def reference_lgamma(x): + # scipy.special.gammaln returns `-inf` when input is `-inf`. + # While Pytorch, C and C++, all return `inf` when input is `-inf`. 
+ # Reference: + # https://en.cppreference.com/w/cpp/numeric/math/lgamma + # https://en.cppreference.com/w/c/numeric/math/lgamma + + # To handle the above discrepancy, + # we replace -inf with inf so values + # that were originally -inf map to inf as expected + if x.dtype.kind == 'f': + x = np.where(x == float('-inf'), np.array(float('inf'), dtype=x.dtype), x) + + out = scipy.special.gammaln(x) + + if x.dtype == np.float16: + # `scipy.special.gammaln` returns output of float32 when input is float16, + # while `torch.lgamma` preserves `float16`. But due to smaller range of float16, + # Pytorch version outputs `inf` while SciPy returns finite values. + out = out.astype(np.float16) + + return out + op_db_scipy_reference: List[OpInfo] = [ UnaryUfuncInfo('sigmoid', ref=reference_sigmoid, @@ -1851,6 +1874,27 @@ def reference_sigmoid(x): dtypes=[torch.bfloat16]), ) ), + UnaryUfuncInfo('lgamma', + ref=reference_lgamma, + decorators=(precisionOverride({torch.float16: 7e-1}),), + dtypes=all_types_and(torch.bool), + dtypesIfCPU=all_types_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and(torch.bool, torch.half), + skips=( + # Reference: https://github.com/pytorch/pytorch/pull/50140#discussion_r552615345 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + dtypes=[torch.bfloat16]), + # Reference: https://github.com/pytorch/pytorch/pull/50140#issuecomment-756150214 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + dtypes=[torch.float32, torch.float64], active_if=IS_WINDOWS), + # Backward of `lgamma` uses `digamma` but `digamma` + # is not implemented for `BFloat16` + # Error Raised: + # RuntimeError: "digamma" not implemented for 'BFloat16' + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=[torch.bfloat16]), + ), + safe_casts_outputs=True), OpInfo('xlogy', dtypes=all_types_and(torch.bool), dtypesIfCPU=all_types_and(torch.bool, torch.half, torch.bfloat16), From 40eea6d9d1c46fcdf0eeb560cae808359a37229a Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Wed, 27 Jan 2021 12:58:30 -0800 Subject: [PATCH 12/41] Support device map for distributed autograd while using TensorPipe. (#44859) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44859 TensorPipe's `set_device_map` option was applied during the forward pass. However, if we ran the backward pass for the graph we would not automatically pick up the reverse device mapping. As a result, users had to specify both forward and backward device mapping which is very tedious to do. In this PR, I've added this functionality such that TensorPipe automatically picks up the reverse device mapping during the backward pass. This is done by storing the appropriate device mapping in the "recv" autograd function for distributed autograd. #Closes: https://github.com/pytorch/pytorch/issues/44170 ghstack-source-id: 119950842 Test Plan: 1) waitforbuildbot 2) Unit test added. 
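
For readers skimming the diff, a minimal sketch of the user-visible behavior this enables; the worker names, single-GPU-per-rank layout, and env:// rendezvous are illustrative assumptions, and the authoritative coverage is the new `test_device_maps_backward_pass` test added below. Only the forward device map needs to be configured; the backward pass now reuses its inverse automatically.

```
# Illustrative sketch only (assumes two processes, "worker0" and "worker1",
# one GPU each, MASTER_ADDR/MASTER_PORT set, and worker1 running a matching
# init_rpc). See test_device_maps_backward_pass in this patch for the real test.
import torch
import torch.distributed.rpc as rpc
import torch.distributed.autograd as dist_autograd

options = rpc.TensorPipeRpcBackendOptions()
# Forward mapping only: tensors sent from our cuda:0 land on worker1's cuda:1.
options.set_device_map("worker1", {0: 1})

rpc.init_rpc("worker0", rank=0, world_size=2, rpc_backend_options=options)

t1 = torch.rand(10, device=0, requires_grad=True)
t2 = torch.rand(10, device=0, requires_grad=True)
with dist_autograd.context() as context_id:
    res = rpc.rpc_sync("worker1", torch.add, args=(t1, t2))
    # Previously the reverse map ({1: 0}) had to be configured explicitly for
    # gradients to flow back; now the 'recv' autograd function records the
    # reverse mapping, so the gradients arrive back on cuda:0 automatically.
    dist_autograd.backward(context_id, [res.sum()])
    grads = dist_autograd.get_gradients(context_id)

rpc.shutdown()
```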
Reviewed By: mrshenli Differential Revision: D23751975 fbshipit-source-id: 2717d0ef5bde3db029a6172d98aad95734d52140 --- test/cpp/rpc/e2e_test_base.h | 8 ++ .../autograd/functions/recvrpc_backward.cpp | 10 +- .../autograd/functions/recvrpc_backward.h | 6 +- .../rpc_messages/rpc_with_autograd.cpp | 38 ++++++-- .../autograd/rpc_messages/rpc_with_autograd.h | 12 ++- torch/csrc/distributed/autograd/utils.cpp | 14 ++- torch/csrc/distributed/autograd/utils.h | 7 +- .../distributed/rpc/process_group_agent.cpp | 3 +- .../distributed/rpc/process_group_agent.h | 4 +- .../rpc/request_callback_no_python.cpp | 11 ++- torch/csrc/distributed/rpc/rpc_agent.cpp | 6 ++ torch/csrc/distributed/rpc/rpc_agent.h | 8 +- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 93 +++++++++++++------ torch/csrc/distributed/rpc/tensorpipe_agent.h | 11 ++- .../csrc/distributed/rpc/tensorpipe_utils.cpp | 3 +- .../testing/faulty_process_group_agent.cpp | 3 +- .../rpc/testing/faulty_process_group_agent.h | 5 +- torch/csrc/distributed/rpc/utils.cpp | 10 +- .../distributed/rpc/dist_autograd_test.py | 31 +++++++ .../_internal/distributed/rpc_utils.py | 4 +- 20 files changed, 224 insertions(+), 63 deletions(-) diff --git a/test/cpp/rpc/e2e_test_base.h b/test/cpp/rpc/e2e_test_base.h index cea5079b1a4e..6526f8795c19 100644 --- a/test/cpp/rpc/e2e_test_base.h +++ b/test/cpp/rpc/e2e_test_base.h @@ -40,6 +40,14 @@ class TestE2EBase : public ::testing::Test { RpcAgent::setCurrentRpcAgent(rpcAgent); std::shared_ptr typeResolver = std::make_shared([&](const c10::QualifiedName& qn) { + // For Dict that is used for device map. + auto pos = qn.name().find("Dict"); + if (pos != std::string::npos) { + return c10::StrongTypePtr( + nullptr, + c10::DictType::create( + c10::IntType::create(), c10::IntType::create())); + } return c10::StrongTypePtr( nullptr, c10::TensorType::create(at::Tensor())); }); diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp index 509c5c6cbd08..33df6b540000 100644 --- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp @@ -13,10 +13,12 @@ using torch::autograd::variable_list; RecvRpcBackward::RecvRpcBackward( const AutogradMetadata& autogradMetadata, ContextPtr autogradContext, - rpc::worker_id_t fromWorkerId) + rpc::worker_id_t fromWorkerId, + std::unordered_map deviceMap) : autogradMetadata_(autogradMetadata), autogradContext_(std::move(autogradContext)), - fromWorkerId_(fromWorkerId) {} + fromWorkerId_(fromWorkerId), + deviceMap_(std::move(deviceMap)) {} variable_list RecvRpcBackward::apply(variable_list&& grads) { std::vector outputGrads; @@ -49,7 +51,9 @@ variable_list RecvRpcBackward::apply(variable_list&& grads) { auto rpcAgent = rpc::RpcAgent::getCurrentRpcAgent(); auto jitFuture = rpcAgent->send( rpcAgent->getWorkerInfo(fromWorkerId_), - std::move(gradCall).toMessage()); + std::move(gradCall).toMessage(), + rpc::kUnsetRpcTimeout, + deviceMap_); // Record the future in the context. 
sharedContext->addOutstandingRpc(jitFuture); diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h index 982e0331c102..69be98c928ef 100644 --- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h @@ -22,7 +22,8 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node { explicit RecvRpcBackward( const AutogradMetadata& autogradMetadata, std::shared_ptr autogradContext, - rpc::worker_id_t fromWorkerId); + rpc::worker_id_t fromWorkerId, + std::unordered_map deviceMap); torch::autograd::variable_list apply( torch::autograd::variable_list&& grads) override; @@ -38,6 +39,9 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node { // The worker id from which the RPC was received. During the backward pass, // we need to propagate the gradients to this workerId. rpc::worker_id_t fromWorkerId_; + + // Device mapping for tensors sent over RPC. + const std::unordered_map deviceMap_; }; } // namespace autograd diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp index 7389868d90c2..5aea96fa0c8b 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp @@ -18,11 +18,13 @@ RpcWithAutograd::RpcWithAutograd( worker_id_t fromWorkerId, MessageType messageType, const AutogradMetadata& autogradMetadata, - rpc::Message&& wrappedMessage) + rpc::Message&& wrappedMessage, + std::unordered_map deviceMap) : fromWorkerId_(fromWorkerId), messageType_(messageType), autogradMetadata_(autogradMetadata), - wrappedMessage_(std::move(wrappedMessage)) { + wrappedMessage_(std::move(wrappedMessage)), + deviceMap_(std::move(deviceMap)) { TORCH_INTERNAL_ASSERT( messageType_ == MessageType::FORWARD_AUTOGRAD_REQ || messageType_ == MessageType::FORWARD_AUTOGRAD_RESP); @@ -36,13 +38,15 @@ RpcWithAutograd::RpcWithAutograd( const AutogradMetadata& autogradMetadata, std::unique_ptr wrappedRpc, MessageType wrappedMessageType, - std::vector tensors) + std::vector tensors, + std::unordered_map deviceMap) : fromWorkerId_(fromWorkerId), messageType_(messageType), autogradMetadata_(autogradMetadata), wrappedRpc_(std::move(wrappedRpc)), wrappedMessageType_(wrappedMessageType), - tensors_(std::move(tensors)) { + tensors_(std::move(tensors)), + deviceMap_(std::move(deviceMap)) { TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cannot be null!"); TORCH_INTERNAL_ASSERT( messageType_ == MessageType::FORWARD_AUTOGRAD_REQ || @@ -56,10 +60,17 @@ Message RpcWithAutograd::toMessageImpl() && { auto payload = std::move(wrappedMessage_).movePayload(); TORCH_INTERNAL_ASSERT(!payload.empty()); + // Convert deviceMap to c10::Dict for serialization. + c10::Dict deviceMap; + for (const auto& mapEntry : deviceMap_) { + deviceMap.insert(mapEntry.first, mapEntry.second); + } + std::vector ivalues{wrappedMessageType, autogradMetadata_.autogradContextId, autogradMetadata_.autogradMessageId, - fromWorkerId_}; + fromWorkerId_, + deviceMap}; // Now pickle using JIT pickler. std::vector tensorTable; @@ -92,12 +103,19 @@ std::unique_ptr RpcWithAutograd::fromMessage( auto tupleElements = rpc::readWrappedPayload(payload, message); // Gather all the fields. 
- TORCH_INTERNAL_ASSERT(tupleElements.size() == 4); + TORCH_INTERNAL_ASSERT(tupleElements.size() == 5); MessageType wrappedMessageType = static_cast(tupleElements[0].toInt()); AutogradMetadata autogradMetadata( tupleElements[1].toInt(), tupleElements[2].toInt()); worker_id_t workerId = tupleElements[3].toInt(); + auto c10DeviceMap = tupleElements[4].to>(); + + // Convert to regular map. + std::unordered_map deviceMap; + for (const auto& mapEntry : c10DeviceMap) { + deviceMap.insert({mapEntry.key(), mapEntry.value()}); + } // Create new message type and build wrapped RPC. Message wrappedMessage( @@ -116,7 +134,8 @@ std::unique_ptr RpcWithAutograd::fromMessage( autogradMetadata, std::move(wrappedRpc), wrappedMessageType, - wrappedMessage.tensors()); + wrappedMessage.tensors(), + deviceMap); } std::vector& RpcWithAutograd::tensors() { @@ -150,6 +169,11 @@ rpc::worker_id_t RpcWithAutograd::fromWorkerId() const { return fromWorkerId_; } +const std::unordered_map& RpcWithAutograd:: + deviceMap() { + return deviceMap_; +} + } // namespace autograd } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h index 657d2cf2641f..f4728ea37c63 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h @@ -18,7 +18,8 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { rpc::worker_id_t fromWorkerId, rpc::MessageType messageType, const AutogradMetadata& autogradMetadata, - rpc::Message&& wrappedMessage); + rpc::Message&& wrappedMessage, + std::unordered_map deviceMap = {}); // Used when receiving an RPC over the wire. RpcWithAutograd( @@ -27,7 +28,8 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { const AutogradMetadata& autogradMetadata, std::unique_ptr wrappedRpc, rpc::MessageType wrappedMessageType, - std::vector tensors); + std::vector tensors, + std::unordered_map deviceMap = {}); rpc::Message toMessageImpl() && override; @@ -52,6 +54,9 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { // Retrieve the worker id from which the RPC originated. rpc::worker_id_t fromWorkerId() const; + // Retrieve the device map. + const std::unordered_map& deviceMap(); + private: // WorkerId from which this RPC originated. This is necessary for knowing // which worker we need to contact during the backward pass. @@ -83,6 +88,9 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase { // Tensors part of the wrappedRpc that need to be considered for autograd. std::vector tensors_; + + // Device mapping for tensors that are sent across an RPC to another node. + std::unordered_map deviceMap_; }; } // namespace autograd diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index 08bb99471686..747a958948a4 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -52,7 +52,8 @@ void addSendRpcBackward( ContextPtr addRecvRpcBackward( const AutogradMetadata& autogradMetadata, std::vector& tensors, - rpc::worker_id_t fromWorkerId) { + rpc::worker_id_t fromWorkerId, + const std::unordered_map& deviceMap) { // Initialize autograd context if necessary. 
auto& autogradContainer = DistAutogradContainer::getInstance(); auto autogradContext = @@ -61,7 +62,7 @@ ContextPtr addRecvRpcBackward( if (!tensors.empty() && torch::autograd::compute_requires_grad(tensors)) { // Attach the tensors as inputs to the autograd function. auto grad_fn = std::make_shared( - autogradMetadata, autogradContext, fromWorkerId); + autogradMetadata, autogradContext, fromWorkerId, deviceMap); for (auto& tensor : tensors) { if (tensor.requires_grad()) { torch::autograd::set_history(tensor, grad_fn); @@ -102,7 +103,8 @@ Message getMessageWithAutograd( const rpc::worker_id_t dstId, torch::distributed::rpc::Message&& wrappedRpcMsg, MessageType msgType, - bool forceGradRecording) { + bool forceGradRecording, + const std::unordered_map& deviceMap) { auto& autogradContainer = DistAutogradContainer::getInstance(); // If there is no valid context and no tensor requires grads, send original @@ -125,7 +127,8 @@ Message getMessageWithAutograd( RpcAgent::getCurrentRpcAgent()->getWorkerInfo().id_, msgType, autogradMetadata, - std::move(wrappedRpcMsg)); + std::move(wrappedRpcMsg), + deviceMap); if (tensorsRequireGrad) { // Record autograd information for 'send'. @@ -149,7 +152,8 @@ std::shared_ptr sendMessageWithAutograd( dst.id_, std::move(wrappedRpcMsg), MessageType::FORWARD_AUTOGRAD_REQ, - forceGradRecording); + forceGradRecording, + agent.getDeviceMap(dst)); std::shared_ptr fut; // If profiler is enabled, wrap this message with profiling metadata that will diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index 07ba45ed60d7..013558252fc2 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -30,7 +30,8 @@ TORCH_API void addSendRpcBackward( TORCH_API ContextPtr addRecvRpcBackward( const AutogradMetadata& autogradMetadata, std::vector& tensors, - rpc::worker_id_t fromWorkerId); + rpc::worker_id_t fromWorkerId, + const std::unordered_map& deviceMap); // This method is a wrapper utility used internally to wrap autograd info // and attach autograd function for each type of rpc call if it has valid @@ -42,7 +43,9 @@ TORCH_API rpc::Message getMessageWithAutograd( const rpc::worker_id_t dstId, rpc::Message&& wrappedRpcMsg, rpc::MessageType msgType, - bool forceGradRecording = false); + bool forceGradRecording = false, + const std::unordered_map& deviceMap = + {}); // Send message after autograd checking TORCH_API std::shared_ptr diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index 9c1a703cfa6d..3cd940f3ee49 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -290,7 +290,8 @@ void ProcessGroupAgent::shutdownImpl() { std::shared_ptr ProcessGroupAgent::send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + const std::unordered_map& deviceMap) { // Throw if we previously encountered an exception in ::listenLoop. 
{ std::unique_lock guard(listenLoopExceptionMutex_); diff --git a/torch/csrc/distributed/rpc/process_group_agent.h b/torch/csrc/distributed/rpc/process_group_agent.h index 8d2471a7d113..d1d957a66562 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.h +++ b/torch/csrc/distributed/rpc/process_group_agent.h @@ -91,7 +91,9 @@ class TORCH_API ProcessGroupAgent : public RpcAgent { std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = kUnsetRpcTimeout) override; + const float rpcTimeoutSeconds = kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) override; // put SendWork into a queue and notify the worker thread virtual void enqueueSend(SendWork work); diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index 09c56dc960c9..46192d7eb317 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -345,11 +345,20 @@ void RequestCallbackNoPython::processForwardAutogradReq( const std::shared_ptr& responseFuture) const { auto& rpcWithAutograd = static_cast(rpc); + // Need to reverse the device map for the backward pass of distributed + // autograd. + std::unordered_map reverseDeviceMap; + for (const auto& mapEntry : rpcWithAutograd.deviceMap()) { + reverseDeviceMap.insert({mapEntry.second, mapEntry.first}); + } + + // Attach 'recv' autograd function. auto autogradContext = addRecvRpcBackward( rpcWithAutograd.autogradMetadata(), rpcWithAutograd.tensors(), - rpcWithAutograd.fromWorkerId()); + rpcWithAutograd.fromWorkerId(), + reverseDeviceMap); // For this recv thread on server side, before processRpc(), // set current_context_id_ to be context_id passed from client. // In this way, if there is nested rpc call in python rpc call, original diff --git a/torch/csrc/distributed/rpc/rpc_agent.cpp b/torch/csrc/distributed/rpc/rpc_agent.cpp index 2033b2b771e2..5c9570bcac1d 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.cpp +++ b/torch/csrc/distributed/rpc/rpc_agent.cpp @@ -286,6 +286,12 @@ bool RpcAgent::isGILProfilingEnabled() { return profilingEnabled_.load(); } +std::unordered_map RpcAgent::getDeviceMap( + const WorkerInfo& dest) { + // Default implementation has no device map. + return {}; +} + std::unordered_map RpcAgent::getDebugInfo() { /* This would later include more info other than metrics for eg: may include stack traces for the threads owned by the agent */ diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index bfc6c38c07a1..956af3da899b 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -160,7 +160,9 @@ class TORCH_API RpcAgent { virtual std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = kUnsetRpcTimeout) = 0; + const float rpcTimeoutSeconds = kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) = 0; // Retries sending the message up to maxRetries times until an ACK is // receieved. The duration between consecutive sends is increased over @@ -259,6 +261,10 @@ class TORCH_API RpcAgent { // Get the type resolver std::shared_ptr getTypeResolver(); + // Retrieves the device map for the provided destination worker. 
+ virtual std::unordered_map getDeviceMap( + const WorkerInfo& dest); + protected: const WorkerInfo workerInfo_; const std::unique_ptr cb_; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 0417577d2499..518fc72e8304 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -39,6 +39,42 @@ const std::string kClientActiveCalls = "agent.client_active_calls"; const std::string kServerActiveCalls = "agent.server_active_calls"; const std::string kServerActiveAsyncCalls = "agent.server_active_async_calls"; +std::vector getDevicesForTensors( + const std::vector& tensors, + const tensorpipe::DeviceMap& deviceMap, + const std::string& remoteName) { + // If the deviceMap is overridden, use that instead. + const auto errStr = c10::str( + "TensorPipe RPC backend only supports CPU tensors by default, please " + "move your tensors to CPU before sending them over RPC, or call " + "`set_device_map` on `TensorPipeRpcBackendOptions` to explicitly " + "configure device mapping. ", + "Request device mapping is not available for destination ", + remoteName); + std::vector deviceIndices; + deviceIndices.reserve(tensors.size()); + bool hasCudaTensor = false; + for (const auto& t : tensors) { + if (t.device().is_cpu()) { + deviceIndices.push_back(-1); + } else { + const auto deviceIter = deviceMap.find(t.device().index()); + TORCH_CHECK( + deviceIter != deviceMap.end(), + errStr, + " for device ", + t.device(), + " but received a tensor on that device."); + deviceIndices.push_back(deviceIter->second); + hasCudaTensor = true; + } + } + if (!hasCudaTensor) { + deviceIndices.clear(); + } + return deviceIndices; +} + } // namespace // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) @@ -547,7 +583,9 @@ void TensorPipeAgent::pipeWrite( Message&& rpcMessage, std::vector&& devices, std::shared_ptr ctx, - std::function fn) noexcept { + std::function fn, + const std::unordered_map& + deviceMap) noexcept { tensorpipe::Message tpMessage; TensorpipeWriteBuffers tpBuffers; @@ -587,7 +625,7 @@ void TensorPipeAgent::sendCompletedResponseMessage( responseMessage.setId(messageId); std::vector devices; try { - devices = getDevicesForTensors(pipe->getRemoteName(), responseMessage); + devices = getDevicesForRemote(pipe->getRemoteName(), responseMessage); } catch (const std::exception& e) { responseMessage = createExceptionResponse(e.what(), messageId); } @@ -721,7 +759,8 @@ void TensorPipeAgent::respond(std::shared_ptr& pipe) { std::shared_ptr TensorPipeAgent::send( const WorkerInfo& toWorkerInfo, Message&& requestMessage, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + const std::unordered_map& deviceMap) { TORCH_CHECK( requestMessage.isRequest(), "TensorPipeAgent::send(..) is only for sending requests."); @@ -761,8 +800,15 @@ std::shared_ptr TensorPipeAgent::send( // Get devices for tensors in the request message. This can throw if device // maps are not configured properly for this request. - auto devices = - getDevicesForTensors(clientPipe.pipe_->getRemoteName(), requestMessage); + std::vector devices; + if (deviceMap.empty()) { + devices = + getDevicesForRemote(clientPipe.pipe_->getRemoteName(), requestMessage); + } else { + // If deviceMap is specified, use that instead. 
+ devices = getDevicesForTensors( + requestMessage.tensors(), deviceMap, clientPipe.pipe_->getRemoteName()); + } futureResponseMessage->jitFuture->addCallback([this]() { TORCH_INTERNAL_ASSERT( @@ -906,7 +952,8 @@ std::shared_ptr TensorPipeAgent::send( std::move(ctx)); } }); - }); + }, + deviceMap); return futureResponseMessage->jitFuture; } @@ -1196,7 +1243,7 @@ void TensorPipeAgent::markFutureWithError( } } -std::vector TensorPipeAgent::getDevicesForTensors( +std::vector TensorPipeAgent::getDevicesForRemote( const std::string& remoteName, const Message& message) const { const auto& deviceMaps = @@ -1222,30 +1269,16 @@ std::vector TensorPipeAgent::getDevicesForTensors( } return {}; } else { - std::vector deviceIndices; - deviceIndices.reserve(message.tensors().size()); - const auto& deviceMap = iter->second; - bool hasCudaTensor = false; - for (const auto& t : message.tensors()) { - if (t.device().is_cpu()) { - deviceIndices.push_back(-1); - } else { - const auto deviceIter = deviceMap.find(t.device().index()); - TORCH_CHECK( - deviceIter != deviceMap.end(), - errStr, - " for device ", - t.device(), - " but received a tensor on that device."); - deviceIndices.push_back(deviceIter->second); - hasCudaTensor = true; - } - } - if (!hasCudaTensor) { - deviceIndices.clear(); - } - return deviceIndices; + return getDevicesForTensors(message.tensors(), iter->second, errStr); + } +} + +tensorpipe::DeviceMap TensorPipeAgent::getDeviceMap(const WorkerInfo& dest) { + auto it = opts_.deviceMaps.find(dest.name_); + if (it == opts_.deviceMaps.end()) { + return {}; } + return it->second; } size_t TensorPipeAgent::timeoutMapSize() { diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h index b4d8796aeede..078750385538 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h @@ -185,7 +185,9 @@ class TensorPipeAgent : public RpcAgent { std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = kUnsetRpcTimeout) override; + const float rpcTimeoutSeconds = kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) override; // join() and sync() would be deprecated - // https://github.com/pytorch/pytorch/issues/27647 @@ -209,6 +211,8 @@ class TensorPipeAgent : public RpcAgent { void addGilWaitTime(const std::chrono::microseconds gilWaitTime) override; + tensorpipe::DeviceMap getDeviceMap(const WorkerInfo& dest) override; + using NetworkDataDict = std::unordered_map; @@ -252,7 +256,8 @@ class TensorPipeAgent : public RpcAgent { Message&& message, std::vector&& devices, std::shared_ptr ctx, - std::function) noexcept; + std::function, + const tensorpipe::DeviceMap& deviceMap = {}) noexcept; // Callback of listener accept() void onListenerAccepted( @@ -279,7 +284,7 @@ class TensorPipeAgent : public RpcAgent { uint64_t requestSize, const std::string& destWorkerName); - inline std::vector getDevicesForTensors( + inline std::vector getDevicesForRemote( const std::string& remoteName, const Message& message) const; diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index 1d17e4451372..9757e0971d2b 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -91,7 +91,8 @@ std::tuple tensorpipeSerialize( // Enforce memory copy if tensor is created from torch::from_blob, means // that the tensor doesn't own the memory. 
std::string metadata = - deviceIndices.empty() ? "" : std::to_string(deviceIndices[i]); + deviceIndices.empty() || deviceIndices[i] == -1 + ? "" : std::to_string(deviceIndices[i]); if (!tensorData.storageHasDeleter()) { std::vector storageData( diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp index 57dbef3a549b..870f9702ee0e 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp @@ -59,7 +59,8 @@ std::unordered_map> FaultyProcessGroupAgent:: std::shared_ptr FaultyProcessGroupAgent::send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + const std::unordered_map& deviceMap) { // We only fail control messages that have been specified by the test case. // For all other messages, we just send them without any failures. if (!shouldFailMessage(message.type())) { diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h index 8cbe4c9a137d..ce8fee558274 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h @@ -46,8 +46,9 @@ class FaultyProcessGroupAgent : public ProcessGroupAgent { std::shared_ptr send( const WorkerInfo& to, Message&& message, - const float rpcTimeoutSeconds = - torch::distributed::rpc::kUnsetRpcTimeout) override; + const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, + const std::unordered_map& deviceMap = + {}) override; protected: // This function checks the messageTypesToFail_ to determine whether to use diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp index 0f137a72a252..d643ab87b8ea 100644 --- a/torch/csrc/distributed/rpc/utils.cpp +++ b/torch/csrc/distributed/rpc/utils.cpp @@ -174,11 +174,19 @@ std::unique_ptr deserializeResponse( RpcCommandBase& rpc = *rpcPtr; auto& rpcWithAutograd = static_cast(rpc); + // Need to reverse the device map for the backward pass of distributed + // autograd. + std::unordered_map reverseDeviceMap; + for (const auto& mapEntry : rpcWithAutograd.deviceMap()) { + reverseDeviceMap.insert({mapEntry.second, mapEntry.first}); + } + // Attach 'recv' autograd function. addRecvRpcBackward( rpcWithAutograd.autogradMetadata(), rpcWithAutograd.tensors(), - rpcWithAutograd.fromWorkerId()); + rpcWithAutograd.fromWorkerId(), + reverseDeviceMap); wrappedMsgType = rpcWithAutograd.wrappedMessageType(); diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index 15d5cfeca214..54e936bf0f0d 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -2235,3 +2235,34 @@ def test_verify_backend_options(self): self.assertEqual(self.rpc_backend_options.num_send_recv_threads, 8) self.assertEqual(self.rpc_backend_options.num_fail_sends, 3) self.assertEqual(len(self.rpc_backend_options.messages_to_fail), 4) + +class TensorPipeDistAutogradTest(RpcAgentTestFixture): + + @skip_if_lt_x_gpu(4) + def test_device_maps_backward_pass(self): + options = self.rpc_backend_options + dst = worker_name((self.rank + 1) % self.world_size) + + # The reverse of this device mapping should be used for the backward pass. 
+ options.set_device_map(dst, {self.rank: (self.rank + 1) % self.world_size}) + + rpc.init_rpc( + name=worker_name(self.rank), + backend=self.rpc_backend, + rank=self.rank, + world_size=self.world_size, + rpc_backend_options=options, + ) + + t1 = torch.rand(10, device=self.rank, requires_grad=True) + t2 = torch.rand(10, device=self.rank, requires_grad=True) + with dist_autograd.context() as context_id: + res = rpc.rpc_sync(dst, torch.add, args=(t1, t2)) + dist_autograd.backward(context_id, [res.sum()]) + grads = dist_autograd.get_gradients(context_id) + self.assertEqual(torch.ones(10), grads[t1]) + self.assertEqual(torch.ones(10), grads[t2]) + self.assertEqual(t1.device, grads[t1].device) + self.assertEqual(t2.device, grads[t2].device) + + rpc.shutdown() diff --git a/torch/testing/_internal/distributed/rpc_utils.py b/torch/testing/_internal/distributed/rpc_utils.py index d35f3da5d2c2..bdf4bbd6eb78 100644 --- a/torch/testing/_internal/distributed/rpc_utils.py +++ b/torch/testing/_internal/distributed/rpc_utils.py @@ -23,6 +23,7 @@ from torch.testing._internal.distributed.rpc.dist_autograd_test import ( DistAutogradTest, FaultyAgentDistAutogradTest, + TensorPipeDistAutogradTest ) from torch.testing._internal.distributed.rpc.dist_optimizer_test import ( DistOptimizerTest, @@ -139,7 +140,8 @@ class MultiProcess(Flag): # These suites should be standalone, and separate from the ones in the generic # list (not subclasses of those!). TENSORPIPE_TESTS = [ - TensorPipeAgentRpcTest + TensorPipeAgentRpcTest, + TensorPipeDistAutogradTest ] From f7e90cf31129163e31ee8bcbd37f54e7f9ddc39c Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 27 Jan 2021 13:23:46 -0800 Subject: [PATCH 13/41] Revert D26089965: [quant][graphmode][fx] Add support for functional conv1d and conv3d Test Plan: revert-hammer Differential Revision: D26089965 (https://github.com/pytorch/pytorch/commit/dd1a97b3ae8a14a5c167fd97273531d2b5ad566b) Original commit changeset: 4aea507d05b7 fbshipit-source-id: f54184cafb9dd07858683489d8bd147474e7e4b3 --- test/quantization/test_quantize_fx.py | 60 ++++++------------- .../quantization/fx/quantization_patterns.py | 20 ++----- torch/quantization/fx/utils.py | 26 -------- 3 files changed, 22 insertions(+), 84 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 295144bddc3f..b2243eead1d0 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1598,21 +1598,14 @@ def forward(self, x): def test_functional_conv(self): """ Test for function conv and functional conv + relu """ - convs = { - 1: torch.nn.functional.conv1d, - 2: torch.nn.functional.conv2d, - 3: torch.nn.functional.conv3d, - } - class FuncConv(torch.nn.Module): - def __init__(self, dim, use_bias, has_relu, f_relu): + def __init__(self, use_bias, has_relu, f_relu): super().__init__() - self.dim = dim - self.w = torch.randn(tuple([3] * (dim + 2))) + self.w = torch.randn(3, 3, 3, 3) self.b = torch.randn(3) if use_bias else None - self.stride = tuple([1] * dim) - self.padding = tuple([0] * dim) - self.dilation = tuple([1] * dim) + self.stride = (1, 1) + self.padding = (0, 0) + self.dilation = (1, 1) self.groups = 1 self.use_bias = use_bias if has_relu: @@ -1624,10 +1617,12 @@ def __init__(self, dim, use_bias, has_relu, f_relu): self.relu = torch.nn.Identity() def forward(self, x): - x = convs[self.dim](x, self.w, self.b, self.stride, self.padding, self.dilation, self.groups) + x = F.conv2d(x, self.w, self.b, self.stride, self.padding, 
self.dilation, self.groups) x = self.relu(x) return x + data = (torch.randn((2, 3, 4, 4), dtype=torch.float),) + quant_type_to_prepare_expected_node_occurrence = { QuantType.DYNAMIC: {}, # There should be 3 observers: after input, weight and activation. @@ -1641,50 +1636,31 @@ def forward(self, x): }, } quant_type_to_qconv_fun = { - QuantType.STATIC: { - 1: ns.call_function(torch.ops.quantized.conv1d), - 2: ns.call_function(torch.ops.quantized.conv2d), - 3: ns.call_function(torch.ops.quantized.conv3d) - }, - QuantType.QAT: { - 1: ns.call_function(torch.ops.quantized.conv1d), - 2: ns.call_function(torch.ops.quantized.conv2d), - 3: ns.call_function(torch.ops.quantized.conv3d) - }, + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d), } quant_type_to_qconv_relu_fun = { - QuantType.STATIC: { - 1: ns.call_function(torch.ops.quantized.conv1d_relu), - 2: ns.call_function(torch.ops.quantized.conv2d_relu), - 3: ns.call_function(torch.ops.quantized.conv3d_relu) - }, - QuantType.QAT: { - 1: ns.call_function(torch.ops.quantized.conv1d_relu), - 2: ns.call_function(torch.ops.quantized.conv2d_relu), - 3: ns.call_function(torch.ops.quantized.conv3d_relu) - }, + QuantType.STATIC: ns.call_function(torch.ops.quantized.conv2d_relu), + QuantType.QAT: ns.call_function(torch.ops.quantized.conv2d_relu), } options = itertools.product( - [1, 2, 3], # dims self.static_quant_types, (True, False), # use_bias (True, False), # has_relu (True, False), # functional relu ) - for dim, quant_type, use_bias, has_relu, f_relu in options: - data_dims = [2, 3] + [4] * dim - data = (torch.randn(tuple(data_dims), dtype=torch.float),) - model = FuncConv(dim, use_bias, has_relu, f_relu) + for quant_type, use_bias, has_relu, f_relu in options: + model = FuncConv(use_bias, has_relu, f_relu) if has_relu: - qconv_fun = quant_type_to_qconv_relu_fun[quant_type][dim] + qconv_fun = quant_type_to_qconv_relu_fun[quant_type] else: - qconv_fun = quant_type_to_qconv_fun[quant_type][dim] + qconv_fun = quant_type_to_qconv_fun[quant_type] convert_node_occurrence = { - ns.call_function(torch.quantize_per_tensor): 1, + ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, qconv_fun: 1, - ns.call_method("dequantize"): 1 + ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 7d0e3f7d0a78..06f15240e761 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -32,8 +32,6 @@ quantize_node, get_per_tensor_qparams, get_linear_prepack_op_for_dtype, - get_qconv_prepack_op, - get_qconv_op, ) from .quantization_types import QuantizerCls @@ -190,10 +188,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, @register_quant_pattern(torch.nn.Conv1d) @register_quant_pattern(torch.nn.Conv2d) @register_quant_pattern(torch.nn.Conv3d) -@register_quant_pattern(torch.nn.functional.conv1d) @register_quant_pattern(torch.nn.functional.conv2d) -@register_quant_pattern(torch.nn.functional.conv3d) -# TODO: add qat.Conv1d and qat.Conv3d @register_quant_pattern(torch.nn.qat.Conv2d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU1d) @register_quant_pattern(torch.nn.intrinsic.ConvReLU2d) @@ -203,12 +198,8 @@ def convert(self, quantizer: QuantizerCls, node: 
Node, load_arg: Callable, @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU1d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU2d) @register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU2d) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv3d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv1d)) @register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv3d)) # just for error checks @register_quant_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) @@ -221,10 +212,8 @@ def __init__(self, quantizer: QuantizerCls, node: Node): self.relu_node = node node = node.args[0] # type: ignore self.conv_node = node - if node.op == "call_module": + if node.op == 'call_module': self.conv = quantizer.modules[self.conv_node.target] - elif node.op == "call_function": - self.conv = node.target def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, debug: bool = False, @@ -286,7 +275,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, args = load_arg(quantized=False)(self.conv_node.args) kwargs = load_arg(quantized=False)(self.conv_node.kwargs) op_out = quantizer.quantized_graph.create_node( - "call_function", self.conv, args, kwargs) + "call_function", torch.nn.functional.conv2d, args, kwargs) if self.relu_node: relu_args = [op_out] relu_args.extend(load_arg(quantized=False)(self.relu_node.args[1:])) @@ -311,14 +300,13 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, weight = load_arg(quantized=True)(self.conv_node.args[1]) other_args = load_arg(quantized=False)(self.conv_node.args[2:]) prepack_args = tuple([weight] + list(other_args)) - prepack_op = get_qconv_prepack_op(self.conv) packed_weight = quantizer.quantized_graph.create_node( - "call_function", prepack_op, prepack_args, {}) + 'call_function', torch.ops.quantized.conv2d_prepack, prepack_args, {}) assert activation_statically_quantized, \ "currently only static quantization is supported for conv" # construct conv input if activation_statically_quantized: - qconv_op = get_qconv_op(self.conv, self.relu_node is not None) + qconv_op = torch.ops.quantized.conv2d_relu if self.relu_node else torch.ops.quantized.conv2d conv_input = load_arg(quantized=True)(self.conv_node.args[0]) act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name activation_post_process = quantizer.activation_post_process_map[act_post_process_name] diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 8d3445e46dba..8285e204b1ed 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -179,32 +179,6 @@ def get_linear_prepack_op_for_dtype(dtype): else: raise Exception("can't get linear prepack op for dtype:", dtype) -def get_qconv_prepack_op(conv_op: Callable) -> Callable: - prepack_ops = { - torch.nn.functional.conv1d: torch.ops.quantized.conv1d_prepack, - torch.nn.functional.conv2d: torch.ops.quantized.conv2d_prepack, - torch.nn.functional.conv3d: torch.ops.quantized.conv3d_prepack - } - prepack_op = prepack_ops.get(conv_op, None) - assert prepack_op, "Didn't find prepack op for {}".format(conv_op) - return prepack_op - -def get_qconv_op(conv_op: Callable, has_relu: bool) -> 
Callable: - qconv_op = { - # has relu - True: { - torch.nn.functional.conv1d: torch.ops.quantized.conv1d_relu, - torch.nn.functional.conv2d: torch.ops.quantized.conv2d_relu, - torch.nn.functional.conv3d: torch.ops.quantized.conv3d_relu - }, - False: { - torch.nn.functional.conv1d: torch.ops.quantized.conv1d, - torch.nn.functional.conv2d: torch.ops.quantized.conv2d, - torch.nn.functional.conv3d: torch.ops.quantized.conv3d - } - } - return qconv_op[has_relu].get(conv_op) - # Returns a function that can get a new attribute name for module with given # prefix, for example, # >> get_new_observer_name = get_new_attr_name_with_prefix('_observer') From 1c8d11c9e2ff64d92fb322c573d287588c26a718 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 27 Jan 2021 13:43:09 -0800 Subject: [PATCH 14/41] [PyTorch] Save a refcount bump in make_variable (#51180) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51180 This fast path still did a refcount bump because it copied the inner intrusive_ptr to the stack. Now it's moved. ghstack-source-id: 120460258 Test Plan: 1) profile empty benchmark & inspect assembly to verify move 2) run framework overhead benchmarks Reviewed By: bhosmer Differential Revision: D26094951 fbshipit-source-id: b2e09f9ad885cb633402885ca1e61a370723f6b8 --- aten/src/ATen/templates/TensorBody.h | 4 ++++ torch/csrc/autograd/variable.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 731cb79e031f..77b9ae978158 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -147,6 +147,10 @@ class TORCH_API Tensor { return impl_; } + c10::intrusive_ptr unsafeReleaseIntrusivePtr() { + return std::move(impl_); + } + bool defined() const { return impl_; } diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 83bbc9406081..ad8c1919dee6 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -666,7 +666,7 @@ inline Variable make_variable( bool allow_tensor_metadata_change = true) { if (data.defined()) { if (data.getIntrusivePtr().use_count() == 1 && data.getIntrusivePtr()->unique_version()) { - auto data_impl = data.getIntrusivePtr(); + auto data_impl = data.unsafeReleaseIntrusivePtr(); data_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); if (requires_grad) { data_impl->set_autograd_meta(std::make_unique(data_impl.get(), requires_grad)); From eaf5ca09dc9939f15230638cdba0af0ac46eb2d8 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 27 Jan 2021 13:58:43 -0800 Subject: [PATCH 15/41] Migrate masked_scatter_ CUDA to ATen (#50039) Summary: Fixes https://github.com/pytorch/pytorch/issues/49542 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50039 Reviewed By: heitorschueroff Differential Revision: D26096247 Pulled By: ngimel fbshipit-source-id: ec1810d3412e0d7ab6b950265a3123519ad886c1 --- aten/src/ATen/LegacyTHFunctionsCUDA.h | 2 - aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp | 160 ------------------ aten/src/ATen/native/cuda/IndexKernel.cu | 103 +++++++++++ .../ATen/native/cuda/LegacyDefinitions.cpp | 15 -- aten/src/THC/THCTensorMasked.cuh | 16 -- aten/src/THC/generic/THCTensorMasked.cu | 141 --------------- aten/src/THC/generic/THCTensorMasked.h | 19 --- test/test_torch.py | 93 +++++----- 8 files changed, 153 insertions(+), 396 deletions(-) diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index 
a9004a5b6d01..069a2c1152c4 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -20,8 +20,6 @@ namespace cuda { Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, Scalar value); Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value); -Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source); -Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source); Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source); Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index); Tensor _th_take(const Tensor & self, const Tensor & index); diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index ddae7965dded..b60968a4b041 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -200,166 +200,6 @@ Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value) } return self; } -Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBoolTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, 
dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_", false, DeviceType::CUDA, ScalarType::Byte); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBFloat16Tensor_maskedCopy(globalContext().getTHCState(), self_, mask_, source_); - break; - } - default: - AT_ERROR("_th_masked_scatter_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} -Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, 
"_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBoolTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - 
case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - auto mask_ = checked_dense_tensor_unwrap(mask, "mask", 2, "_th_masked_scatter_bool_", false, DeviceType::CUDA, ScalarType::Bool); - auto source_ = checked_dense_tensor_unwrap(source, "source", 3, "_th_masked_scatter_bool_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaBFloat16Tensor_maskedCopyBool(globalContext().getTHCState(), self_, mask_, source_); - break; - } - default: - AT_ERROR("_th_masked_scatter_bool_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index d88f202487af..091e1ec22a19 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -10,7 +10,13 @@ #include #include #include +#include #include +#include + +#include +#include +#include namespace at { namespace native { @@ -252,6 +258,103 @@ Tensor& take_out_cuda(Tensor& out, const Tensor& self, const Tensor& index) { return out; } +namespace { + +template +void masked_scatter_cuda_impl(Tensor& self, const Tensor& mask, const Tensor& source){ + auto srcSize = source.numel(); + + // Determine our output size + auto totalElements = mask.sum().item(); + + // The number of `1` elements present in the mask must be <= the + // number of elements available in `src` + TORCH_CHECK(totalElements <= srcSize, "source nElements must be == mask `1` elements"); + + auto mask_cont = mask.contiguous(); + + // Use a prefix sum to determine the output locations of the masked elements + auto maskPrefixSum = at::empty_like(mask, mask.options().dtype(kLong)); + + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + + thrust::device_ptr maskData(mask_cont.data_ptr()); + thrust::device_ptr maskPrefixSumData( + maskPrefixSum.data_ptr()); + + thrust::exclusive_scan( + thrust::cuda::par(allocator).on(c10::cuda::getCurrentCUDAStream()), + maskData, + maskData + mask_cont.numel(), + maskPrefixSumData); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so that should be made contiguous too + auto source_contig = 
source.contiguous(); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self) + .add_input(self) + .add_input(mask_cont) + .add_input(maskPrefixSum) + .build(); + + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Bool, + ScalarType::BFloat16, + ScalarType::Half, + self.scalar_type(), + "masked_scatter_", + [&]() { + auto source_ptr = source_contig.data_ptr(); + gpu_kernel( + iter, [=] GPU_LAMBDA(scalar_t a, mask_t mask, int64_t maskPrefixSum) -> scalar_t { + if (mask) { + return source_ptr[maskPrefixSum]; + } + return a; + }); + cudaGetLastError(); + }); +} + +} // anonymous namespace + +Tensor & masked_scatter__cuda(Tensor& self, const Tensor& mask, const Tensor& source) { + at::assert_no_internal_overlap(self); + TORCH_CHECK( + self.scalar_type() == source.scalar_type(), + "masked_scatter: expected self and source to have same dtypes but got", + self.scalar_type(), + " and ", + source.scalar_type()); + + TensorArg self_arg{self, "self", 1}; + TensorArg mask_arg{mask, "mask", 2}; + TensorArg source_arg{source, "source", 3}; + checkAllSameGPU("masked_scatter_", {self_arg, mask_arg, source_arg}); + + Tensor b_mask; + std::tie(b_mask) = expand_inplace(self, mask, "masked_scatter_"); + + if (b_mask.dtype() == ScalarType::Byte) { + TORCH_WARN("masked_scatter_ received a mask with dtype torch.uint8, this behavior is now deprecated," \ + "please use a mask with dtype torch.bool instead."); + } + + auto mask_dtype = b_mask.scalar_type(); + if (mask_dtype == ScalarType::Bool) { + masked_scatter_cuda_impl(self, b_mask, source); + } else { + masked_scatter_cuda_impl(self, b_mask, source); + } + + return self; +} + REGISTER_DISPATCH(index_stub, &index_kernel); REGISTER_DISPATCH(index_put_stub, &index_put_kernel); diff --git a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp index 1bbe47dbfb2e..735f2c8b2875 100644 --- a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp +++ b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp @@ -61,19 +61,4 @@ Tensor & masked_fill__cuda(Tensor& self, const Tensor & mask, const Tensor & val return self; } -Tensor & masked_scatter__cuda(Tensor& self, const Tensor & mask, const Tensor & source) { - at::assert_no_internal_overlap(self); - Tensor b_mask; - std::tie(b_mask) = expand_inplace(self, mask, "masked_scatter_"); - // As we dispatch on self and TH is type-checked, we need different definitions. - // This can be fixed by moving to ATen. 
- if (b_mask.dtype() == at::ScalarType::Byte) { - TORCH_WARN("masked_scatter_ received a mask with dtype torch.uint8, this behavior is now deprecated," \ - "please use a mask with dtype torch.bool instead."); - return legacy::cuda::_th_masked_scatter_(self, b_mask, source); - } else { - return legacy::cuda::_th_masked_scatter_bool_(self, b_mask, source); - } -} - }} // namespace at::native diff --git a/aten/src/THC/THCTensorMasked.cuh b/aten/src/THC/THCTensorMasked.cuh index 88f2d78b698d..4e696ba392ce 100644 --- a/aten/src/THC/THCTensorMasked.cuh +++ b/aten/src/THC/THCTensorMasked.cuh @@ -25,22 +25,6 @@ struct TensorMaskedFillOp { T value; }; -template -struct TensorMaskedCopyOp { - TensorMaskedCopyOp(T* s) : in(s) {} - - __device__ inline void operator()(T* out, - MaskT* mask, - MaskPrefixSumT* maskPrefixSum) { - if (*mask) { - *out = in[*maskPrefixSum]; - } - } - - // Where we are copying from - T* in; -}; - template struct TensorMaskedSelectOp { TensorMaskedSelectOp(T* t) : out(t) {} diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu index 4e93ac260e42..4a3c93241aec 100644 --- a/aten/src/THC/generic/THCTensorMasked.cu +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -47,145 +47,4 @@ void THCTensor_(maskedFillByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -void THCTensor_(maskedCopy)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); - ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); - ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor); - ptrdiff_t srcSize = THCTensor_(nElement)(state, src); - - // `mask` and `tensor` must have the same number of elements - THArgCheck(maskSize == tensorSize, 2, - "mask and tensor must have the same number of elements"); - - // Determine our output size - int64_t totalElements = THTensor_wrap(mask).sum().item(); - - // The number of `1` elements present in the mask must be <= the - // number of elements available in `src` - if (totalElements > srcSize) { - THArgCheck(false, 2, "source nElements must be == mask `1` elements"); - } - - // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed - // iterator prefix sums? 
Convert `mask` to the same datatype as what - // we're accumulating the prefix sum in (int64_t) to get around it - THCudaLongTensor* maskLong = THCudaLongTensor_new(state); - at::IntArrayRef maskSizes = mask->sizes(); - THCudaLongTensor_resize(state, maskLong, maskSizes, {}); - THCTensor_(copy)(state, maskLong, mask); - - // Use a prefix sum to determine the output locations of the masked elements - THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); - THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, {}); - - THCThrustAllocator thrustAlloc(state); - thrust::device_ptr - maskData(THCudaLongTensor_data(state, maskLong)); - thrust::device_ptr - maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); - - thrust::exclusive_scan( -#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ - thrust::cuda::par(thrustAlloc).on(c10::cuda::getCurrentCUDAStream()), -#endif - maskData, - maskData + THCudaLongTensor_nElement(state, maskLong), - maskPrefixSumData); - - // We are getting elements from `src` based on an offset from - // `maskPrefixSum`, so that should be made contiguous too - THCTensor* contigSrc = THCTensor_(newContiguous)(state, src); - - // update `tensor` where `mask` == 1 but pull from `src` at - // maskPrefixSum - bool status = THC_pointwiseApply3( - state, tensor, mask, maskPrefixSum, - TensorMaskedCopyOp( - THCTensor_(data)(state, contigSrc))); - - THCTensor_(free)(state, contigSrc); - THCudaLongTensor_free(state, maskLong); - THCudaLongTensor_free(state, maskPrefixSum); - - THArgCheck(status, 2, CUTORCH_DIM_WARNING); - THCudaCheck(cudaGetLastError()); -} - -void THCTensor_(maskedCopyBool)(THCState* state, - THCTensor *tensor, THCudaBoolTensor *mask, THCTensor *src) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); - ptrdiff_t maskSize = THCudaBoolTensor_nElement(state, mask); - ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor); - ptrdiff_t srcSize = THCTensor_(nElement)(state, src); - - // `mask` and `tensor` must have the same number of elements - THArgCheck(maskSize == tensorSize, 2, - "mask and tensor must have the same number of elements"); - - // Determine our output size - int64_t totalElements = THTensor_wrap(mask).sum().item(); - - // The number of `1` elements present in the mask must be <= the - // number of elements available in `src` - if (totalElements > srcSize) { - THArgCheck(false, 2, "source nElements must be == mask `1` elements"); - } - - // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed - // iterator prefix sums? 
Convert `mask` to the same datatype as what - // we're accumulating the prefix sum in (int64_t) to get around it - THCudaLongTensor* maskLong = THCudaLongTensor_new(state); - at::IntArrayRef maskSizes = mask->sizes(); - THCudaLongTensor_resize(state, maskLong, maskSizes, {}); - THCTensor_(copy)(state, maskLong, mask); - - // Use a prefix sum to determine the output locations of the masked elements - THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); - THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, {}); - - THCThrustAllocator thrustAlloc(state); - thrust::device_ptr - maskData(THCudaLongTensor_data(state, maskLong)); - thrust::device_ptr - maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); - - thrust::exclusive_scan( -#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__ - thrust::cuda::par(thrustAlloc).on(c10::cuda::getCurrentCUDAStream()), -#endif - maskData, - maskData + THCudaLongTensor_nElement(state, maskLong), - maskPrefixSumData); - - // We are getting elements from `src` based on an offset from - // `maskPrefixSum`, so that should be made contiguous too - THCTensor* contigSrc = THCTensor_(newContiguous)(state, src); - - // update `tensor` where `mask` == 1 but pull from `src` at - // maskPrefixSum - bool status = THC_pointwiseApply3( - state, tensor, mask, maskPrefixSum, - TensorMaskedCopyOp( - THCTensor_(data)(state, contigSrc))); - - THCTensor_(free)(state, contigSrc); - THCudaLongTensor_free(state, maskLong); - THCudaLongTensor_free(state, maskPrefixSum); - - THArgCheck(status, 2, CUTORCH_DIM_WARNING); - THCudaCheck(cudaGetLastError()); -} - -void THCTensor_(maskedCopyByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, THCTensor *src) { - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); - THCudaByteTensor* maskCuda = THTensor_wrap(mask).cuda().unsafeReleaseTensorImpl(); - THCTensor_(copy)(state, maskCuda, mask); - THCTensor_(maskedCopy)(state, tensor, maskCuda, src); - THCudaByteTensor_free(state, maskCuda); -} - #endif diff --git a/aten/src/THC/generic/THCTensorMasked.h b/aten/src/THC/generic/THCTensorMasked.h index 87e95a344973..16f4a262c8a0 100644 --- a/aten/src/THC/generic/THCTensorMasked.h +++ b/aten/src/THC/generic/THCTensorMasked.h @@ -21,23 +21,4 @@ TORCH_CUDA_CU_API void THCTensor_(maskedFillByte)( THByteTensor* mask, scalar_t value); -TORCH_CUDA_CU_API void THCTensor_(maskedCopy)( - THCState* state, - THCTensor* tensor, - THCudaByteTensor* mask, - THCTensor* src); - -TORCH_CUDA_CU_API void THCTensor_(maskedCopyBool)( - THCState* state, - THCTensor* tensor, - THCudaBoolTensor* mask, - THCTensor* src); - -// FIXME: remove now that we have THCudaByteTensor? 
-TORCH_CUDA_CU_API void THCTensor_(maskedCopyByte)( - THCState* state, - THCTensor* tensor, - THByteTensor* mask, - THCTensor* src); - #endif diff --git a/test/test_torch.py b/test/test_torch.py index f027db3b9ef6..424ab4f40c0f 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1191,49 +1191,6 @@ def test_scatterReduce(self): for method in ["add", "multiply"]: self._test_scatter_base(self, lambda t: t, 'scatter_', reduction=method) - def test_masked_scatter(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - for maskType in [torch.uint8, torch.bool]: - for dt in torch.testing.get_all_dtypes(): - num_copy, num_dest = 3, 10 - dest = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dt) - dest2 = dest.clone() - src = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=dt) - mask = torch.tensor((0, 0, 0, 0, 1, 0, 1, 0, 1, 0), dtype=maskType) - - if dt == torch.bool: - # torch.bool is a special case and is being tested - # in a separate test - continue - - # TODO: update test when masked scatter is supported for complex - if dt == torch.half or dt.is_complex: - self.assertRaises(RuntimeError, lambda: dest.masked_scatter_(mask, src)) - continue - - dest.masked_scatter_(mask, src) - j = 0 - for i in range(num_dest): - if mask[i]: - dest2[i] = src[j] - j += 1 - self.assertEqual(dest, dest2, atol=0, rtol=0) - - # make source bigger than number of 1s in mask - src = torch.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=dt) - dest.masked_scatter_(mask, src) - - # make src smaller. this should fail - src = torch.zeros(num_copy - 1, dtype=dt) - with self.assertRaises(RuntimeError): - dest.masked_scatter_(mask, src) - self.assertEqual(len(w), 27) - - warn = 'masked_scatter_ received a mask with dtype torch.uint8,' - for wi in w: - self.assertEqual(str(wi.message)[0:55], str(warn)) - def test_masked_fill(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -4521,6 +4478,56 @@ def test_scatter_add_bool(self, device): [False, True, False, True, False], [True, False, True, False, True]], device=device)) + @onlyOnCPUAndCUDA + @dtypes(*torch.testing.get_all_dtypes()) + def test_masked_scatter(self, device, dtype): + dt = dtype + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + for maskType in [torch.uint8, torch.bool]: + num_copy, num_dest = 3, 10 + dest = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dt, device=device) + dest2 = dest.clone() + dest_ones = dest.clone() + dest_ones_expected = dest.clone() + src = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=dt, device=device) + src_ones = torch.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=dt, device=device) + mask = torch.tensor((0, 0, 0, 0, 1, 0, 1, 0, 1, 0), dtype=maskType, device=device) + + if dt == torch.bool: + # torch.bool is a special case and is being tested + # in a separate test + return + + # TODO: update test when masked scatter is supported for complex + # and cpu supports half + if (dt == torch.half and self.device_type == 'cpu') or dt.is_complex: + self.assertRaises(RuntimeError, lambda: dest.masked_scatter_(mask, src)) + return + + dest.masked_scatter_(mask, src) + j = 0 + for i in range(num_dest): + if mask[i]: + dest2[i] = src[j] + dest_ones_expected[i] = src_ones[j] + j += 1 + self.assertEqual(dest, dest2, atol=0, rtol=0) + + dest_ones.masked_scatter_(mask, src_ones) + self.assertEqual(dest_ones, dest_ones_expected, atol=0, rtol=0) + + # make src smaller. 
this should fail + src = torch.zeros(num_copy - 1, dtype=dt, device=device) + with self.assertRaises(RuntimeError): + dest.masked_scatter_(mask, src) + + self.assertEqual(len(w), 3) + + warn = 'masked_scatter_ received a mask with dtype torch.uint8,' + for wi in w: + self.assertEqual(str(wi.message)[0:55], str(warn)) + def test_masked_scatter_bool_tensor(self, device): src = torch.tensor([True, True, True], device=device) dst = torch.tensor([False, False, False], device=device) From 3b6f30824cd131a63f6ac587b97106aa8df4f73b Mon Sep 17 00:00:00 2001 From: Lillian Johnson Date: Wed, 27 Jan 2021 15:01:46 -0800 Subject: [PATCH 16/41] OpInfo JIT op.output_func handling support (#50775) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50775 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D25964541 Pulled By: Lilyjjo fbshipit-source-id: 8cf1ee9191d526cc46ae283f38c2d64bd60afdb2 --- test/test_jit.py | 30 +++++++++---------- test/test_ops.py | 11 ++++--- torch/testing/_internal/common_jit.py | 10 +++---- .../_internal/common_methods_invocations.py | 6 +--- .../_internal/jit_metaprogramming_utils.py | 11 +++---- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index e745f898824a..17f7f66ac8a5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -15682,7 +15682,7 @@ def check(name): def fn(*inputs, **kwargs): attr = getattr(inputs[0], name) output = attr(*inputs[1:], **kwargs) - return output_process_fn(output) + return output check_types = test_name not in EXCLUDE_TYPE_CHECK # XXX: this test should always run with disable_autodiff_subgraph_inlining(True), @@ -15698,7 +15698,7 @@ def fn(*inputs, **kwargs): traced_fn = create_traced_fn(self, fn) check_against_reference(self, traced_fn, - fn, (self_variable,) + args_variable, kwargs_variable, + fn, output_process_fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) if IS_SANDCASTLE: autodiff_nodes = autodiff_nodes + fusible_nodes @@ -15708,9 +15708,9 @@ def fn(*inputs, **kwargs): self.assertAutodiffNode(traced_fn.last_graph, should_autodiff_node, autodiff_nodes, fusible_nodes) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: - script_fn = create_script_fn(self, name, 'method', output_process_fn) + script_fn = create_script_fn(self, name, 'method') check_against_reference(self, script_fn, - fn, (self_variable,) + args_variable, kwargs_variable, + fn, output_process_fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) if IS_SANDCASTLE: @@ -15725,21 +15725,20 @@ def fn(*inputs, **kwargs): # functional interface tests if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL: def fn(*inputs, **kwargs): - output = getattr(torch, name)(*inputs, **kwargs) - return output_process_fn(output) + return getattr(torch, name)(*inputs, **kwargs) f_args_variable = (self_variable,) + args_variable f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: check_against_reference(self, - create_traced_fn(self, fn), - fn, f_args_variable, kwargs_variable, check_types=check_types) + create_traced_fn(self, fn), fn, output_process_fn, + f_args_variable, kwargs_variable, check_types=check_types) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(self, name, 'functional', output_process_fn), - fn, f_args_variable, kwargs_variable, + create_script_fn(self, name, 'functional'), + fn, output_process_fn, f_args_variable, 
kwargs_variable, check_types=check_types) # alias annotation testing @@ -15781,8 +15780,7 @@ def do_test(self, name=name, args=args, test_name=test_name, check_ad=check_ad): output_variable = getattr(F, name)(self_variable, *args_variable, **kwargs_variable) def fn(*inputs, **kwargs): - output = getattr(F, name)(*inputs, **kwargs) - return output_process_fn(output) + return getattr(F, name)(*inputs, **kwargs) f_args_variable = (self_variable,) + args_variable f_args_tensor = (self_tensor,) + args_tensor @@ -15793,8 +15791,9 @@ def run_test(): # XXX: this test should always run with disable_autodiff_subgraph_inlining(True), # so that we don't regress on autodiff support. with disable_autodiff_subgraph_inlining(): - script_fn = create_script_fn(self, name, 'nn_functional', output_process_fn) - check_against_reference(self, script_fn, fn, f_args_variable, kwargs_variable, no_grad=no_grad) + script_fn = create_script_fn(self, name, 'nn_functional') + check_against_reference(self, script_fn, fn, output_process_fn, + f_args_variable, kwargs_variable, no_grad=no_grad) # For tests we disabled AD subgraph inlining, make sure it's not falling back to autograd if (doAutodiffCheck(test_name)): self.assertAutodiffNode(script_fn.last_graph, should_autodiff_node, autodiff_nodes, fusible_nodes) @@ -15914,7 +15913,8 @@ def create_nn_module(*args, **kwargs): f_args_variable = deepcopy(unpack_variables(args_variable)) # Check against Python module as reference - check_against_reference(self, create_script_module, create_nn_module, f_args_variable, no_grad=no_grad) + check_against_reference(self, create_script_module, create_nn_module, + lambda x: x, f_args_variable, no_grad=no_grad) if 'slowTest' in kwargs: do_test = slowTest(do_test) diff --git a/test/test_ops.py b/test/test_ops.py index 40627cd4b264..a01f96fe877b 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -278,17 +278,15 @@ def test_variant_consistency_jit(self, device, dtype, op): # autodiff support. 
Context manager forces the graph to contain # DifferentiableGraph nodes if they are present with disable_autodiff_subgraph_inlining(): - def fn(*inputs, **kwargs): - output = func(*inputs, **kwargs) - return op.output_func(output) # Check scripted forward, grad, and grad grad - script_fn = create_script_fn(self, name, func_type, op.output_func) + script_fn = create_script_fn(self, name, func_type) check_against_reference(self, script_fn, - fn, + func, + op.output_func, (*sample.input,) + sample.args, sample.kwargs, no_grad=not test_backward) @@ -297,7 +295,8 @@ def fn(*inputs, **kwargs): traced_fn = create_traced_fn(self, variant) check_against_reference(self, traced_fn, - fn, + func, + op.output_func, (*sample.input,) + sample.args, sample.kwargs, no_grad=not test_backward) diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py index 8c2b407beea1..a93e13b665be 100644 --- a/torch/testing/_internal/common_jit.py +++ b/torch/testing/_internal/common_jit.py @@ -36,7 +36,7 @@ def check_output_types(self, func, ref_outputs, args, kwargs): 'grid_sample', ]) -def check_against_reference(self, func, reference_func, args, kwargs=None, +def check_against_reference(self, func, reference_func, output_func, args, kwargs=None, allow_unused=True, check_types=True, no_grad=False): kwargs = kwargs if kwargs else {} @@ -72,10 +72,10 @@ def clone_inputs(requires_grad): with enable_profiling_mode_for_profiling_tests(): # test single grad case - outputs = self.runAndSaveRNG(reference_func, recording_inputs, kwargs) + outputs = output_func(self.runAndSaveRNG(reference_func, recording_inputs, kwargs)) grads = torch.autograd.grad(allSum(outputs), recording_tensors, allow_unused=allow_unused) - outputs_test = self.runAndSaveRNG(func, recording_inputs, kwargs) + outputs_test = output_func(self.runAndSaveRNG(func, recording_inputs, kwargs)) grads_test = torch.autograd.grad(allSum(outputs_test), recording_tensors, allow_unused=allow_unused) self.assertEqual(outputs, outputs_test) @@ -84,7 +84,7 @@ def clone_inputs(requires_grad): if self._testMethodName in nn_functional_single_grad: return - outputs = self.runAndSaveRNG(reference_func, recording_inputs, kwargs) + outputs = output_func(self.runAndSaveRNG(reference_func, recording_inputs, kwargs)) l1 = allSum(outputs) grads = torch.autograd.grad(l1, recording_tensors, create_graph=True, allow_unused=allow_unused) @@ -92,7 +92,7 @@ def clone_inputs(requires_grad): l2 = (allSum(grads) * l1) grads2 = torch.autograd.grad(l2, recording_tensors, allow_unused=allow_unused) recording_inputs, recording_tensors = clone_inputs(True) - outputs_test = self.runAndSaveRNG(func, recording_inputs, kwargs) + outputs_test = output_func(self.runAndSaveRNG(func, recording_inputs, kwargs)) l1_test = allSum(outputs_test) grads_test = torch.autograd.grad( l1_test, recording_tensors, create_graph=True, allow_unused=allow_unused) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 2ad65a9631dc..324a9346c45b 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1326,11 +1326,7 @@ def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): supports_tensor_out=False, sample_inputs_func=sample_inputs_slogdet, output_func=itemgetter(1), - decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack], - skips=( - # These tests do not work with output_func=itemgetter(1) - # TODO: remove this once 
https://github.com/pytorch/pytorch/issues/49326 is resolved - SkipInfo('TestCommon', 'test_variant_consistency_jit'),)), + decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index cd134b38aba9..25ab7d1fc3f5 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -290,14 +290,15 @@ def gen_script_fn_and_args(method_name, func_type, *args, **kwargs): CU = torch.jit.CompilationUnit(script) return CU.the_method, tensors -# create a script function from (name, func_type, output_process_fn), -# returns a function takes in (args, kwargs) and runs the compiled function and -# then applies the post process fn to the outputs -def create_script_fn(self, method_name, func_type, output_process_fn): +# create a script function from (name, func_type), +# returns a function takes in (args, kwargs) and runs the compiled function +def create_script_fn(self, method_name, func_type): + # function returns tuple containing original output and + # filtered output to be used in checking gradients def script_fn(*args, **kwargs): fn, tensors = gen_script_fn_and_args(method_name, func_type, *args, **kwargs) self.assertExportImport(fn.graph, tensors) - output = output_process_fn(fn(*tensors)) + output = fn(*tensors) # skip type annotate function attributes for now, see: https://github.com/python/mypy/issues/2087 script_fn.last_graph = fn.graph_for(*tensors) # type: ignore[attr-defined] return output From 2de4ecd4ebc99d509b8f13ff12ed241c7433a0ad Mon Sep 17 00:00:00 2001 From: anjali411 Date: Wed, 27 Jan 2021 15:17:09 -0800 Subject: [PATCH 17/41] Add serialization logic for complex numbers (#50885) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50885 Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D26094906 Pulled By: anjali411 fbshipit-source-id: 7b2614f3ee4a30c4b4cf04aaa3432988b38a0721 --- test/test_complex.py | 12 ++++++++++++ torch/csrc/jit/python/pybind_utils.h | 2 ++ torch/csrc/jit/serialization/pickler.cpp | 10 ++++++++++ torch/csrc/jit/serialization/pickler.h | 1 + torch/csrc/jit/serialization/unpickler.cpp | 13 ++++++++++--- 5 files changed, 35 insertions(+), 3 deletions(-) diff --git a/test/test_complex.py b/test/test_complex.py index e6a705032d74..cc8ed7f62398 100644 --- a/test/test_complex.py +++ b/test/test_complex.py @@ -12,6 +12,18 @@ def fn(a: complex): self.checkScript(fn, (3 + 5j,)) + def test_pickle(self): + class ComplexModule(torch.jit.ScriptModule): + def __init__(self): + super().__init__() + self.a = 3 + 5j + + def forward(self, b: int): + return b + + loaded = self.getExportImportCopy(ComplexModule()) + self.assertEqual(loaded.a, 3 + 5j) + class TestComplexTensor(TestCase): @dtypes(*torch.testing.get_all_complex_dtypes()) def test_to_list(self, device, dtype): diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 06c57f32f15c..eca56876999f 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -293,6 +293,8 @@ inline InferredType tryToInferType(py::handle input) { return InferredType(IntType::get()); } else if (py::isinstance(input)) { return InferredType(FloatType::get()); + } else if (PyComplex_CheckExact(input.ptr())) { + return InferredType(ComplexDoubleType::get()); } else if (py::isinstance(input)) { 
return InferredType(StringType::get()); } else if (THPLayout_Check(input.ptr())) { diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index e2118af2019d..a0dd826c4267 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -49,6 +49,8 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { pushTuple(ivalue); } else if (ivalue.isDouble()) { pushDouble(ivalue.toDouble()); + } else if (ivalue.isComplexDouble()) { + pushComplexDouble(ivalue); } else if (ivalue.isInt()) { pushInt(ivalue.toInt()); } else if (ivalue.isBool()) { @@ -464,6 +466,14 @@ void Pickler::pushDouble(double value) { // Python pickle format is big endian, swap. push(swapDouble(value)); } +void Pickler::pushComplexDouble(const IValue& value) { + c10::complex d = value.toComplexDouble(); + pushGlobal("builtins", "complex"); + pushIValue(d.real()); + pushIValue(d.imag()); + push(PickleOpCode::TUPLE2); + push(PickleOpCode::REDUCE); +} void Pickler::pushLong(const std::string& data) { uint64_t size = data.size(); diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 21d0f61a18eb..4dc216ec702b 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -160,6 +160,7 @@ class TORCH_API Pickler { void endTypeTag(const IValue& value); void pushBool(bool value); void pushDouble(double value); + void pushComplexDouble(const IValue& value); void pushGenericList(const IValue& ivalue); void pushIntList(const IValue& ivalue); void pushList(const IValue& ivalue); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index f363fe73f1e9..efeaac75c41c 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -57,6 +57,7 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case StorageType::Kind: case NumberType::Kind: case FloatType::Kind: + case ComplexDoubleType::Kind: case IntType::Kind: case NoneType::Kind: case GeneratorType::Kind: @@ -80,9 +81,6 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case AnyEnumType::Kind: // no op, there is nothing to tag break; - // TODO(@anjali411): Implement serialization/deserialization for complex - // numbers - case ComplexDoubleType::Kind: case EnumType::Kind: // TODO(gmagogsfm): Implement serialization/deserialization of Enum. 
AT_ASSERT(false); @@ -543,6 +541,15 @@ void Unpickler::readGlobal( // Unpickle a tensor bool quantized = class_name == "_rebuild_qtensor"; rebuildTensor(quantized); + } else if (module_name == "builtins" && class_name == "complex") { + globals_.emplace_back([this] { + auto elems = pop(stack_).toTuple()->elements(); + AT_ASSERT(elems.size() == 2); + auto complex = + c10::complex(elems.at(0).toDouble(), elems.at(1).toDouble()); + stack_.emplace_back(complex); + }); + } else if (module_name == "collections" && class_name == "OrderedDict") { // collections.OrderedDict is used in tensor serialization for a tensor's // backward hooks (but they are not actually saved with this Pickler) From 621198978a4073df213757cc29d305ee25405446 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 27 Jan 2021 15:41:41 -0800 Subject: [PATCH 18/41] Move USE_NUMPY to more appropriate targets (#51143) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51143 Test Plan: CI Reviewed By: wconstab Differential Revision: D26084123 fbshipit-source-id: af4abe4ef87c1ebe5434938320526a925f5c34c8 --- caffe2/CMakeLists.txt | 3 +++ caffe2/core/macros.h.in | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 62f9e8be3e4c..1b1656d1fffa 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1484,6 +1484,7 @@ if(BUILD_PYTHON) # ---[ Python. if(BUILD_CAFFE2) add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS}) + target_compile_options(caffe2_pybind11_state PRIVATE "-DUSE_NUMPY") if(NOT MSVC) set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") endif() @@ -1514,6 +1515,7 @@ if(BUILD_PYTHON) if(USE_CUDA) add_library(caffe2_pybind11_state_gpu MODULE ${Caffe2_GPU_PYTHON_SRCS}) + target_compile_options(caffe2_pybind11_state_gpu PRIVATE "-DUSE_NUMPY") if(NOT MSVC) set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden") endif() @@ -1542,6 +1544,7 @@ if(BUILD_PYTHON) if(USE_ROCM) add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + target_compile_options(caffe2_pybind11_state_hip PRIVATE "-DUSE_NUMPY") if(NOT MSVC) target_compile_options(caffe2_pybind11_state_hip PRIVATE ${HIP_CXX_FLAGS} -fvisibility=hidden) endif() diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in index dd9f9902be1f..bd9a447b879d 100644 --- a/caffe2/core/macros.h.in +++ b/caffe2/core/macros.h.in @@ -44,10 +44,6 @@ static_assert( #cmakedefine CAFFE2_USE_NVTX #cmakedefine CAFFE2_USE_TRT -#ifndef USE_NUMPY -#cmakedefine USE_NUMPY -#endif - #ifndef EIGEN_MPL2_ONLY #cmakedefine EIGEN_MPL2_ONLY #endif From 98d9a6317d0818d70d154fe1d8fa7efc3c38c4ae Mon Sep 17 00:00:00 2001 From: yangu Date: Wed, 27 Jan 2021 15:46:46 -0800 Subject: [PATCH 19/41] Rename profile.next_step() to profile.step() to consistent with optimizer.step() (#51032) Summary: Similar with Optimizer.step(), profile.next_step() occurs every iteration and calls at the end of each iteration. So it's better to make them same naming style. 
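As a usage illustration of the renamed API, here is a minimal sketch modeled on the docstring updated below; the handler name, the toy matmul payload, and the specific schedule numbers are placeholders rather than part of this patch:

```
import torch

def trace_handler(prof):
    # runs whenever the schedule marks a cycle of profiling steps as ready
    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
        on_trace_ready=trace_handler) as p:
    for _ in range(8):
        x = torch.randn(128, 128)
        (x @ x).sum()  # placeholder per-iteration workload
        p.step()       # formerly p.next_step()
```

The only change required of callers is the last line: the per-iteration signal is now `p.step()`.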
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51032 Reviewed By: heitorschueroff Differential Revision: D26097847 Pulled By: ilia-cher fbshipit-source-id: ea2e5c8e865d99f90b004ec7797271217efeeb68 --- test/test_profiler.py | 2 +- torch/profiler/profiler.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/test/test_profiler.py b/test/test_profiler.py index 826a9f5d0b57..9dfe099163a7 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -267,7 +267,7 @@ def trace_handler(p): ) as p: for idx in range(8): self.payload() - p.next_step() + p.step() self.assertEqual(called_num[0], 2) diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index 25bee1c2019f..8024bb727a43 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -86,7 +86,7 @@ class profile(object): print(p.key_averages().table( sort_by="self_cuda_time_total", row_limit=-1)) - Usimg the profiler's ``schedule``, ``on_trace_ready`` and ``next_step`` functions: + Usimg the profiler's ``schedule``, ``on_trace_ready`` and ``step`` functions: .. code-block:: python @@ -96,7 +96,7 @@ class profile(object): def trace_handler(prof): print(prof.key_averages().table( sort_by="self_cuda_time_total", row_limit=-1)) - # prof.export_chrome_trace("/tmp/test_trace_" + str(prof.step()) + ".json") + # prof.export_chrome_trace("/tmp/test_trace_" + str(prof.step_num) + ".json") with torch.profiler.profile( activities=[ @@ -120,7 +120,7 @@ def trace_handler(prof): for iter in range(N): code_iteration_to_profile(iter) # send a signal to the profiler that the next iteration has started - p.next_step() + p.step() """ def __init__( self, @@ -172,7 +172,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.step_rec_fn.__exit__(None, None, None) self._exit_actions() - def next_step(self): + def step(self): """ Signals the profiler that the next profiling step has started. """ @@ -232,12 +232,6 @@ def next_step(self): self.step_rec_fn = prof.record_function("ProfilerStep#" + str(self.step_num)) self.step_rec_fn.__enter__() - def step(self): - """ - Returns the current profiling step. - """ - return self.step_num - def export_chrome_trace(self, path: str): """ Exports the collected trace in Chrome JSON format. From 1321f2bfe6a100cb327e20a9d285bd51b650231a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 27 Jan 2021 15:50:51 -0800 Subject: [PATCH 20/41] [PyTorch] Port Caffe2 opti for BatchMatMul batch size 1 to baddbmm (#51057) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51057 Caffe2 has an [optimization](https://github.com/pytorch/pytorch/blob/f8eefbdf7a229abbb864e47e0b664c7628d80224/caffe2/operators/batch_matmul_op.h#L192) for the case where the batch size is 1 that uses the underlying `gemm` instead of `gemm_batched` BLAS function. This diff tries to port that optimization to `baddbmm_mkl`. Note that I have very little linear algebra background and am just going off existing code and cblas API documentation, so please review without assuming I know what I'm doing with the math itself. 
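To make the intent concrete, here is a small Python sanity check (not the MKL C++ path itself) of the identity the batch-size-1 fast path relies on: baddbmm over a single batch reduces to one scaled matrix multiply, so one gemm call suffices instead of gemm_batched. The shapes and the alpha/beta values below are arbitrary:

```
import torch

M, K, N = 23, 8, 12
inp = torch.randn(1, M, N, dtype=torch.float64)
b1 = torch.randn(1, M, K, dtype=torch.float64)
b2 = torch.randn(1, K, N, dtype=torch.float64)
alpha, beta = 0.5, 2.0

batched = torch.baddbmm(inp, b1, b2, beta=beta, alpha=alpha)  # batch size 1
single = beta * inp[0] + alpha * (b1[0] @ b2[0])              # one plain gemm
assert torch.allclose(batched[0], single)
```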
ghstack-source-id: 120342923 Reviewed By: hlu1 Differential Revision: D26056613 fbshipit-source-id: feef80344b96601fc2bd0a2e8c8f6b57510d7856 --- aten/src/ATen/native/mkl/LinearAlgebra.cpp | 59 ++++++++++++++++++++-- test/test_linalg.py | 42 +++++++-------- 2 files changed, 78 insertions(+), 23 deletions(-) diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp index 0fc22c2c637d..cb14f9ae3333 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.cpp +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -32,6 +32,36 @@ Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, namespace at { namespace native { +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const float alpha, const float* A, + const int lda, const float* B, const int ldb, const float beta, float* C, const int ldc) { + cblas_sgemm(CblasRowMajor, trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const double alpha, const double* A, + const int lda, const double* B, const int ldb, const double beta, double* C, const int ldc) { + cblas_dgemm(CblasRowMajor, trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const c10::complex alpha, + const c10::complex* A, const int lda, const c10::complex* B, const int ldb, + const c10::complex beta, c10::complex* C, const int ldc) { + cblas_cgemm(CblasRowMajor, trans_A, trans_B, M, N, K, reinterpret_cast(&alpha), + reinterpret_cast(A), lda, reinterpret_cast(B), ldb, + reinterpret_cast(&beta), reinterpret_cast(C), ldc); +} + +static inline void gemm(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int M, const int N, const int K, const c10::complex alpha, + const c10::complex* A, const int lda, const c10::complex* B, const int ldb, + const c10::complex beta, c10::complex* C, const int ldc) { + cblas_zgemm(CblasRowMajor, trans_A, trans_B, M, N, K, reinterpret_cast(&alpha), + reinterpret_cast(A), lda, reinterpret_cast(B), ldb, + reinterpret_cast(&beta), reinterpret_cast(C), ldc); +} + static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, const int M, const int N, const int K, const float alpha, const float** A, const int lda, const float** B, const int ldb, const float beta, @@ -101,6 +131,31 @@ static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, c const int ldb = trans_B == CblasTrans ? 
mat2_strides[2] : mat2_strides[1]; const int ldc = res.strides()[1]; + // avoid using tensor accessor in the case of mat1/mat2 not being transposed + // or only transposed in the last two axes + const bool canAvoidTensorAccessor = mat1_strides[0] == mat1_sizes[1] * mat1_sizes[2] && + mat2_strides[0] == mat2_sizes[1] * mat2_sizes[2]; + + scalar_t* const res_data = static_cast(res.data_ptr()); + + if (batch_size == 1) { + const scalar_t* A; + const scalar_t* B; + if (canAvoidTensorAccessor) { + scalar_t* mat1_data = static_cast(mat1.data_ptr()); + scalar_t* mat2_data = static_cast(mat2.data_ptr()); + A = mat1_data; + B = mat2_data; + } else { + auto mat1_acc = mat1.accessor(); + auto mat2_acc = mat2.accessor(); + A = mat1_acc[0].data(); + B = mat2_acc[0].data(); + } + gemm(trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, res_data, ldc); + return; + } + std::vector A; A.reserve(batch_size); std::vector B; @@ -110,10 +165,8 @@ static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, c // avoid using tensor accessor in the case of mat1/mat2 not being transposed // or only transposed in the last two axis - scalar_t* res_data = static_cast(res.data_ptr()); const auto res_sizes = res.sizes(); - if (mat1_strides[0] == mat1_sizes[1] * mat1_sizes[2] && - mat2_strides[0] == mat2_sizes[1] * mat2_sizes[2]) { + if (canAvoidTensorAccessor) { scalar_t* mat1_data = static_cast(mat1.data_ptr()); scalar_t* mat2_data = static_cast(mat2.data_ptr()); for (int64_t batch = 0; batch < batch_size; batch++) { diff --git a/test/test_linalg.py b/test/test_linalg.py index fd70ebaad04f..4f061dccc0de 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -4727,7 +4727,7 @@ def test_bmm(self, device, dtype): # undefined bahavior return - num_batches = 10 + batch_sizes = [1, 10] M, N, O = 23, 8, 12 numpy_dtype = dtype if dtype != torch.bfloat16 else torch.float32 @@ -4736,17 +4736,18 @@ def test_bmm(self, device, dtype): is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) if not is_supported: - b1 = torch.randn(num_batches, M, N, device=device).to(dtype) - b2 = torch.randn(num_batches, N, O, device=device).to(dtype) - self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", - lambda: torch.bmm(b1, b2)) + for num_batches in batch_sizes: + b1 = torch.randn(num_batches, M, N, device=device).to(dtype) + b2 = torch.randn(num_batches, N, O, device=device).to(dtype) + self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", + lambda: torch.bmm(b1, b2)) return def invert_perm(p): d = {x: i for i, x in enumerate(p)} return (d[0], d[1], d[2]) - def generate_inputs(): + def generate_inputs(num_batches): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) @@ -4769,21 +4770,22 @@ def generate_inputs(): b2 = torch.randn(shape2, dtype=dtype, device=device) yield b1, b2 - for (b1, b2), perm3 in itertools.product(generate_inputs(), itertools.permutations((0, 1, 2))): - res1 = torch.bmm(b1, b2) - res2 = torch.full((num_batches, M, O), math.nan, dtype=dtype, device=device) \ - .permute(perm3).contiguous().permute(invert_perm(perm3)) - torch.bmm(b1, b2, out=res2) - expect = torch.from_numpy( - b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) - self.assertEqual(expect, res1) - self.assertEqual(expect, res2) + for num_batches in batch_sizes: + for (b1, 
b2), perm3 in itertools.product(generate_inputs(num_batches), itertools.permutations((0, 1, 2))): + res1 = torch.bmm(b1, b2) + res2 = torch.full((num_batches, M, O), math.nan, dtype=dtype, device=device) \ + .permute(perm3).contiguous().permute(invert_perm(perm3)) + torch.bmm(b1, b2, out=res2) + expect = torch.from_numpy( + b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) + self.assertEqual(expect, res1) + self.assertEqual(expect, res2) - if self.device_type == 'cuda': - # check that mixed arguments are rejected - self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2.cpu())) - self.assertRaises(RuntimeError, lambda: torch.bmm(b1.cpu(), b2)) - self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2, out=res2.cpu())) + if self.device_type == 'cuda': + # check that mixed arguments are rejected + self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2.cpu())) + self.assertRaises(RuntimeError, lambda: torch.bmm(b1.cpu(), b2)) + self.assertRaises(RuntimeError, lambda: torch.bmm(b1, b2, out=res2.cpu())) @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error") @onlyCUDA From 3f23ad5bce75539a23e8d462b4339e75a6171095 Mon Sep 17 00:00:00 2001 From: "Luca(Wei) Chen" Date: Wed, 27 Jan 2021 15:55:58 -0800 Subject: [PATCH 21/41] [Bug] fix for module_has_exports (#50680) Summary: The attributes in `dir(mod)` may not be valid, this will throw error when calling `getattr`. Use `hasattr` to test if it is valid. Here is an example: ```python class A: def __init__(self, x): if x: self._attr = 1 property def val(self): return getattr(self, '_attr') a = A(False) print('val' in dir(a)) print(hasattr(a, 'val')) b = A(True) print('val' in dir(b)) print(hasattr(b, 'val')) ``` And the outputs: ``` True False True True ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/50680 Reviewed By: malfet Differential Revision: D26103975 Pulled By: eellison fbshipit-source-id: 67a799afe7d726153c91654d483937c5e198ba94 --- test/jit/test_tracer.py | 18 ++++++++++++++++++ torch/_jit_internal.py | 9 +++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 9dff4b0f4549..71f572dedb54 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -1877,6 +1877,24 @@ def forward(self, inputs): tm = torch.jit.trace(m, torch.tensor(1.)) self.assertFalse(hasattr(tm, "submod")) + def test_trace_with_conditional_property(self): + class Net(nn.Module): + def __init__(self, attr=None): + super(Net, self).__init__() + if attr is not None: + self._attr = attr + self.attr_name = '_attr' + + @property + def attr(self): + return getattr(self, self.attr_name) + + def forward(self, x): + return x + + x = torch.ones(1) + torch.jit.trace(Net(), x) + class TestMixTracingScripting(JitTestCase): def test_trace_script(self): diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index f4dabcbf97de..ad7a2cf4ba88 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -600,10 +600,11 @@ def _copy_to_script_wrapper(fn): def module_has_exports(mod): for name in dir(mod): - item = getattr(mod, name) - if callable(item): - if get_torchscript_modifier(item) is FunctionModifiers.EXPORT: - return True + if hasattr(mod, name): + item = getattr(mod, name) + if callable(item): + if get_torchscript_modifier(item) is FunctionModifiers.EXPORT: + return True return False def should_drop(fn): From 42aeb68128e914676df1b82f162eeda56343ba6e Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: 
Wed, 27 Jan 2021 15:58:35 -0800 Subject: [PATCH 22/41] [TensorExpr] Move 'initializer' field from 'Tensor' to 'Buf'. (#50993) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50993 This is the first step to make 'Tensor` a thin wrapper over 'Buf' and 'Stmt', which will be finished in subsequent PRs. This change also allows to remove 'buf_initializers_' from 'LoopNest', making it "less stateful". Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D26038224 Pulled By: ZolotukhinM fbshipit-source-id: f418816e54c62f291fa45812901487394e9b95b5 --- test/cpp/tensorexpr/test_loopnest.cpp | 8 +++--- torch/csrc/jit/tensorexpr/expr.h | 20 ++++++++++--- torch/csrc/jit/tensorexpr/loopnest.cpp | 39 +++++++++----------------- torch/csrc/jit/tensorexpr/loopnest.h | 11 ++------ torch/csrc/jit/tensorexpr/tensor.h | 12 ++------ 5 files changed, 39 insertions(+), 51 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 89ad7eb2aecb..74f69079e2db 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3566,7 +3566,7 @@ TEST(LoopNest, DeadStoreElimination) { Stmt* stmt = Block::make({stmt1}); // Will eliminate if not used by an output. - LoopNest loop(stmt, {f.node()}, {}, {}); + LoopNest loop(stmt, {f.node()}, {}); loop.eliminateDeadStores(); std::ostringstream oss; @@ -3580,7 +3580,7 @@ TEST(LoopNest, DeadStoreElimination) { torch::jit::testing::FileCheck().run(expected_ir, oss.str()); // But won't eliminate if used by different outputs. - LoopNest loop2(stmt, {f.node(), g.node()}, {}, {}); + LoopNest loop2(stmt, {f.node(), g.node()}, {}); loop2.eliminateDeadStores(); oss.clear(); @@ -3621,7 +3621,7 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { // Will eliminate the write to g, but not f since it used by the producer of // h. - LoopNest loop(stmt, {h.node()}, {}, {}); + LoopNest loop(stmt, {h.node()}, {}); loop.eliminateDeadStores(); std::ostringstream oss; @@ -3636,7 +3636,7 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { torch::jit::testing::FileCheck().run(expected_ir, oss.str()); // Sanity check won't eliminate if g is an output. 
- LoopNest loop2(stmt, {h.node(), g.node()}, {}, {}); + LoopNest loop2(stmt, {h.node(), g.node()}, {}); loop2.eliminateDeadStores(); oss.clear(); diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 92aad34d3b7d..9645051e6a9a 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -183,11 +183,18 @@ class TORCH_API Buf : public ExprNode { Buf(const std::string& name_hint, const std::vector& dims, - Dtype dtype) - : Buf(new Var(name_hint, kHandle), dims, dtype) {} + Dtype dtype, + const Expr* initializer = nullptr) + : Buf(new Var(name_hint, kHandle), dims, dtype, initializer) {} - Buf(const Var* var, const std::vector& dims, Dtype dtype) - : ExprNodeBase(dtype, kPrimitive), base_handle_(var), dims_(dims) { + Buf(const Var* var, + const std::vector& dims, + Dtype dtype, + const Expr* initializer = nullptr) + : ExprNodeBase(dtype, kPrimitive), + base_handle_(var), + dims_(dims), + initializer_(initializer) { TORCH_CHECK(var); } @@ -207,9 +214,14 @@ class TORCH_API Buf : public ExprNode { dims_ = dims; }; + const Expr* initializer() const { + return initializer_; + }; + private: const Var* base_handle_; std::vector dims_; + const Expr* initializer_; }; class TORCH_API BufHandle : public ExprHandle { diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 71869d1d33d7..b189f04d55b2 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -544,10 +544,7 @@ Stmt* LoopNest::lowerToStmt(Tensor* t) { return body; } - const Expr* initializer = t->initializer(); - if (initializer) { - buf_initializers_[t->buf()] = initializer; - } + const Expr* init_expr = t->buf()->initializer(); std::vector indices(t->args().begin(), t->args().end()); @@ -561,8 +558,8 @@ Stmt* LoopNest::lowerToStmt(Tensor* t) { t->reduce_dim(dim_index), body); } - if (initializer) { - Store* init = new Store(t->buf(), indices, initializer, new IntImm(1)); + if (init_expr) { + Store* init = new Store(t->buf(), indices, init_expr, new IntImm(1)); body = new Block({init, body}); } } @@ -2352,8 +2349,9 @@ void LoopNest::rfactor( } std::vector new_dims = {}; - Buf* tmp_buf = - new Buf(new Var("tmp_buf", kHandle), new_dims, reduce_op->dtype()); + const Expr* init = reduce_op->accumulator()->initializer(); + TORCH_INTERNAL_ASSERT(init); + Buf* tmp_buf = new Buf("tmp_buf", new_dims, reduce_op->dtype(), init); auto old_acc = reduce_op->accumulator(); auto new_inner = reduce_op->reduce_args(); @@ -2425,26 +2423,17 @@ void LoopNest::rfactor( throw std::runtime_error("TODO: enable non-root insertion points"); } - // From this point forward any errors cannot be handled silently. - auto init_it = buf_initializers_.find(reduce_op->accumulator()); - if (init_it != buf_initializers_.end()) { - buf_initializers_[tmp_buf] = init_it->second; - Stmt* init_stmt = - new Store(tmp_buf, new_outer, init_it->second, new IntImm(1)); + Stmt* init_stmt = new Store(tmp_buf, new_outer, init, new IntImm(1)); - // Wrap it in any loops lower than the insertion point of the new reduction. - for (auto* ol : output_loops) { - init_stmt = ol->cloneWithNewBody(init_stmt); - } + // Wrap it in any loops lower than the insertion point of the new reduction. 
+ for (auto* ol : output_loops) { + init_stmt = ol->cloneWithNewBody(init_stmt); + } - if (output_contains_target) { - parent_block->insert_stmt_before(init_stmt, new_root_for); - } else { - new_root_for->body()->prepend_stmt(init_stmt); - } + if (output_contains_target) { + parent_block->insert_stmt_before(init_stmt, new_root_for); } else { - // We may support this but not possible now. - throw std::runtime_error("can't rfactor reduction with no initializer\n"); + new_root_for->body()->prepend_stmt(init_stmt); } auto second_buf = dynamic_cast(second_reduce->accumulator()); diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 3f37468c0a80..6c016f9f55ed 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -28,17 +28,14 @@ class TORCH_API LoopNest { LoopNest(const std::vector& output_tensors); // A constructor for building a LoopNest from a pre-baked Stmt and meta-info - // TODO: Nuke intermediate_bufs_ and possibly buf_initializers from here if - // they can be deduced. + // TODO: Nuke intermediate_bufs_ from here if they can be deduced. LoopNest( Stmt* stmt, const std::unordered_set& output_bufs, - const std::unordered_set& intermediate_bufs, - const std::unordered_map& buf_initializers) + const std::unordered_set& intermediate_bufs) : root_stmt_(stmt), output_bufs_(output_bufs), - intermediate_bufs_(intermediate_bufs), - buf_initializers_(buf_initializers) {} + intermediate_bufs_(intermediate_bufs) {} Stmt* root_stmt() const { return root_stmt_; @@ -133,8 +130,6 @@ class TORCH_API LoopNest { std::unordered_set input_bufs_; std::unordered_set output_bufs_; std::unordered_set intermediate_bufs_; - // Holds the initializer Expr of buffers that have been initialized. - std::unordered_map buf_initializers_; }; TORCH_API Stmt* FlattenIndexes(Stmt* s); diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index e5e399db348b..53e0faa9eb16 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -90,12 +90,6 @@ class TORCH_API Tensor : KernelScopedObject { return reduce_args_[index]; } - void initializeTo(const Expr* initializer) { - initializer_ = initializer; - } - const Expr* initializer() const { - return initializer_; - } virtual Stmt* ElementStmt() const; template @@ -111,8 +105,6 @@ class TORCH_API Tensor : KernelScopedObject { const Expr* body_; std::vector reduce_dims_; std::vector reduce_args_; - - const Expr* initializer_{nullptr}; }; class TORCH_API CompoundTensor : public Tensor { @@ -268,12 +260,12 @@ Tensor* Reduce( ExprHandle body = Reducer::getReduceBody(body_func, VarVectorToVarHandleVector(all_vars)); std::vector output_args(vars.begin(), vars.end()); - Buf* func_result = new Buf(func_name, dims, body.dtype()); + const Expr* init_expr = new Cast(body.dtype(), reducer.initializer()); + Buf* func_result = new Buf(func_name, dims, body.dtype(), init_expr); const ReduceOp* reduce_op = reducer(func_result, body, output_args, reduce_vars); Tensor* t = new Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op); - t->initializeTo(new Cast(body.dtype(), reducer.initializer())); return t; } From b804084428506675867d7b20eeee0cdee490a409 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 27 Jan 2021 15:58:35 -0800 Subject: [PATCH 23/41] [TensorExpr] Move 'lowerToStmt' method from 'LoopNest' to 'Tensor'. 
(#50994) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50994 Eventually, 'Tensor' will be fully responsible for its 'Stmt' and moving this method to it is one step in that direction. Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D26038222 Pulled By: ZolotukhinM fbshipit-source-id: 0549f0ae6b46a93ff7608a22e79faa5115eef661 --- torch/csrc/jit/tensorexpr/loopnest.cpp | 42 +------------------------- torch/csrc/jit/tensorexpr/loopnest.h | 1 - torch/csrc/jit/tensorexpr/tensor.cpp | 37 +++++++++++++++++++++++ torch/csrc/jit/tensorexpr/tensor.h | 2 ++ 4 files changed, 40 insertions(+), 42 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index b189f04d55b2..6e63743496cf 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -508,7 +508,7 @@ LoopNest::LoopNest(const std::vector& output_tensors) { std::vector loops; for (Tensor* t : tensors_to_compute) { - Stmt* loop = lowerToStmt(t); + Stmt* loop = t->lowerToStmt(); // Flatten initializers. if (Block* block = dynamic_cast(loop)) { for (auto* s : block->stmts()) { @@ -532,46 +532,6 @@ LoopNest::LoopNest(const std::vector& output_tensors) { } } -Stmt* LoopNest::lowerToStmt(Tensor* t) { - Stmt* body = t->ElementStmt(); - - // If this Tensor has no functional body, it already has its axes expanded. - if (nullptr == t->body()) { - return body; - } - - if (t->ndim() == 0 && t->reduce_ndim() == 0) { - return body; - } - - const Expr* init_expr = t->buf()->initializer(); - - std::vector indices(t->args().begin(), t->args().end()); - - if (t->reduce_ndim() > 0) { - for (size_t i = 0; i < t->reduce_ndim(); i++) { - // Going in reverse order: from innermost loop to the outermost - size_t dim_index = t->reduce_ndim() - i - 1; - body = new For( - t->reduce_arg(dim_index), - new IntImm(0), - t->reduce_dim(dim_index), - body); - } - if (init_expr) { - Store* init = new Store(t->buf(), indices, init_expr, new IntImm(1)); - body = new Block({init, body}); - } - } - - for (size_t i = 0; i < t->ndim(); i++) { - // Going in reverse order: from innermost loop to the outermost - size_t dim_index = t->ndim() - i - 1; - body = new For(t->arg(dim_index), new IntImm(0), t->dim(dim_index), body); - } - return body; -} - class FunctionInliner : public IRMutator { public: FunctionInliner(Store* producer, std::unordered_set outputs) diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 6c016f9f55ed..1bff2e96d415 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -122,7 +122,6 @@ class TORCH_API LoopNest { private: std::vector findAllNeededTensors( const std::vector& tensors); - Stmt* lowerToStmt(Tensor* t); Stmt* insertAllocFree(Stmt* stmt); Stmt* root_stmt_; diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index d12f6999c8d5..e32f9f293940 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -8,6 +8,43 @@ namespace torch { namespace jit { namespace tensorexpr { +Stmt* Tensor::lowerToStmt() const { + Stmt* s = ElementStmt(); + + // If this Tensor has no functional body, it already has its axes expanded. 
+ if (nullptr == body()) { + return s; + } + + if (ndim() == 0 && reduce_ndim() == 0) { + return s; + } + + const Expr* init_expr = buf()->initializer(); + + std::vector indices(args().begin(), args().end()); + + if (reduce_ndim() > 0) { + for (size_t i = 0; i < reduce_ndim(); i++) { + // Going in reverse order: from innermost loop to the outermost + size_t dim_index = reduce_ndim() - i - 1; + s = new For( + reduce_arg(dim_index), new IntImm(0), reduce_dim(dim_index), s); + } + if (init_expr) { + Store* init = new Store(buf(), indices, init_expr, new IntImm(1)); + s = new Block({init, s}); + } + } + + for (size_t i = 0; i < ndim(); i++) { + // Going in reverse order: from innermost loop to the outermost + size_t dim_index = ndim() - i - 1; + s = new For(arg(dim_index), new IntImm(0), dim(dim_index), s); + } + return s; +} + Tensor* Compute( const std::string& func_name, const std::vector& dim_args, diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 53e0faa9eb16..31454f745944 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -99,6 +99,8 @@ class TORCH_API Tensor : KernelScopedObject { template inline ExprHandle call(const Ts&... ts); + Stmt* lowerToStmt() const; + private: const Buf* buf_; std::vector args_; From e9751694266130eb9dc7be64fc0b99e4ad6fcb6c Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Wed, 27 Jan 2021 15:58:35 -0800 Subject: [PATCH 24/41] [TensorExpr] Redesign `Tensor` class. (#50995) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50995 This change makes 'Tensor' a thin wrapper over 'Buf' and 'Stmt', and merges it with recently introduced 'CompoundTensor'. A statement for the tensor is either passed directly to the Tensor constructor (akin to 'CompoundTensor'), or is built immediately in constructor. LoopNest is no longer responsible for constructing statements from tensors - it simply stitches already constructed statements contained in Tensors. This has a side effect that now we cannot construct several loopnests from the same tensors - we need to explicitly clone statements if we want to do that. A special copy constructor was added to LoopNest to make it more convenient (note: this only affects tests, we don't usually create multiple loopnests in other places). 
Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D26038223 Pulled By: ZolotukhinM fbshipit-source-id: 27a2e5900437cfb0c151e8f89815edec53608e17 --- benchmarks/cpp/tensorexpr/bench_reduce.cpp | 7 +- test/cpp/tensorexpr/test_llvm.cpp | 28 +++--- test/cpp/tensorexpr/test_loopnest.cpp | 29 ++---- test/cpp/tensorexpr/test_reductions.cpp | 50 +++++----- test/cpp/tensorexpr/tutorial.cpp | 77 ++++++++++++---- torch/csrc/jit/tensorexpr/ir_printer.cpp | 14 +-- torch/csrc/jit/tensorexpr/kernel.cpp | 6 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 24 +++-- torch/csrc/jit/tensorexpr/loopnest.h | 2 + torch/csrc/jit/tensorexpr/tensor.cpp | 95 +++++++++---------- torch/csrc/jit/tensorexpr/tensor.h | 102 ++++----------------- 11 files changed, 209 insertions(+), 225 deletions(-) diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index cd467d74162e..06bc9b055176 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -365,7 +366,8 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { te::For* mi = loops[1]; // TODO: rfactor works on the untransformed var set. This is a problem since we need to // look for the loop after Split to rfactor. - loop.rfactor(BT->body(), mi->var()); + auto bt_body = te::NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(bt_body, mi->var()); } loop.prepareForCodegen(); @@ -411,7 +413,8 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV2)(benchmark::State& state) { TORCH_CHECK(loops.size() == 2); te::For* mo = loops[0]; te::For* mi = loops[1]; - loop.rfactor(BT->body(), mi->var()); + auto bt_body = te::NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(bt_body, mi->var()); } { diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 7afb839dc7e0..343c965c294c 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -1,17 +1,17 @@ #ifdef TORCH_ENABLE_LLVM #include -#include "test/cpp/tensorexpr/test_base.h" - -#include "test/cpp/tensorexpr/padded_buffer.h" -#include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/ir.h" -#include "torch/csrc/jit/tensorexpr/ir_printer.h" -#include "torch/csrc/jit/tensorexpr/ir_simplifier.h" -#include "torch/csrc/jit/tensorexpr/llvm_codegen.h" -#include "torch/csrc/jit/tensorexpr/loopnest.h" -#include "torch/csrc/jit/tensorexpr/tensor.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -1481,7 +1481,8 @@ TEST(LLVM, RFactorReduction) { loops = loop.getLoopStmtsFor(b); loop_m = loops.at(2); loop_n = loops.at(1); - loop.rfactor(b->body(), loop_n->var(), loop_n->body()); + auto b_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(b_body, loop_n->var(), loop_n->body()); loop.prepareForCodegen(); Stmt* s = loop.root_stmt(); @@ -1522,7 +1523,8 @@ TEST(LLVM, RFactorVectorizedReduction) { For* loop_k = loops.at(0); For* loop_m = loops.at(1); For* loop_n = loops.at(2); - loopnest.rfactor(b->body(), loop_n->var()); + auto b_body = NodeFinder::find(loopnest.root_stmt())[0]; + loopnest.rfactor(b_body, loop_n->var()); loops = NodeFinder::find(loopnest.root_stmt()); loop_k = loops.at(0); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 74f69079e2db..6f44b5e2b033 100644 --- 
a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -826,7 +826,7 @@ TEST(LoopNest, ScheduleInlineSimple) { }); LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1156,7 +1156,7 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { } LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1228,7 +1228,6 @@ TEST(LoopNest, ScheduleSplitAThenInline) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1247,7 +1246,6 @@ TEST(LoopNest, ScheduleSplitBThenInline) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1275,8 +1273,6 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) { Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { return a->call(j + ExprHandle(8)); }); - - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1296,7 +1292,6 @@ TEST(LoopNest, ScheduleInlineThenSplit) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1325,7 +1320,6 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { return a->call(j + ExprHandle(8)); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1357,7 +1351,6 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) { return a->call(j) - ExprHandle(1); }); - LoopNest loop({b}); For* i_outer; For* i_inner; @@ -1714,10 +1707,11 @@ TEST(LoopNest, LoopNestComputeAt_2) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } + LoopNest orig_loopnest({c}); { // First let's try to compute P at axis cy (the outer loop) - LoopNest l({c}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(c); l.computeAt(l.getLoopBodyFor(p), loops[0]); l.prepareForCodegen(); @@ -1748,7 +1742,7 @@ TEST(LoopNest, LoopNestComputeAt_2) { } { // Now let's try to compute P at axis cx (the inner loop) - LoopNest l({c}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(c); l.computeAt(l.getLoopBodyFor(p), loops[1]); l.prepareForCodegen(); @@ -1823,9 +1817,10 @@ TEST(LoopNest, LoopNestComputeAt_3) { } } + LoopNest orig_loopnest({D}); { // First let's try to compute A at axis dy (the outer loop) - LoopNest l({D}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(D); l.computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); @@ -1861,7 +1856,7 @@ TEST(LoopNest, LoopNestComputeAt_3) { } { // Now let's try to compute A at axis dx (the inner loop) - LoopNest l({D}); + LoopNest l(orig_loopnest); std::vector loops = l.getLoopStmtsFor(D); l.computeAt(l.getLoopBodyFor(A), loops[1]); l.prepareForCodegen(); @@ -1897,10 +1892,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { } } -TEST(LoopNest, LoopNestComputeAt_4) { - // TODO: Verify that computeAt works with reduction axis -} - class LoopOrderHelper : public IRVisitor { std::stringstream ordering; @@ -3668,7 +3659,7 @@ TEST(LoopNest, CompoundTensorSimple) { auto outer_for2 = For::make(x, 0, 10, inner_for2); Block* body = Block::make({outer_for1, outer_for2}); - Tensor* A = new CompoundTensor(a_buf.node(), {i.node(), j.node()}, body); + Tensor* A = new Tensor(a_buf.node(), body); LoopNest l({A}); l.prepareForCodegen(); @@ -3707,7 +3698,7 @@ TEST(LoopNest, CompoundTensorUsed) { auto outer_for2 = For::make(x, 0, 10, inner_for2); Block* body = Block::make({outer_for1, outer_for2}); - Tensor* A = new CompoundTensor(a_buf.node(), {i.node(), j.node()}, body); + Tensor* A = new Tensor(a_buf.node(), body); Tensor* 
B = Compute( "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return A->call(i, j + 1) + A->call(i, j + 2); diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index f69217df9bde..9c538741d9f4 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -259,9 +259,9 @@ TEST(Reductions, ReduceMax) { Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); - loop = LoopNest({m2d}); - loop.prepareForCodegen(); - s = loop.root_stmt(); + LoopNest loop2({m2d}); + loop2.prepareForCodegen(); + s = loop2.root_stmt(); s = IRSimplifier::simplify(s); SimpleIREvaluator cg2(s, {in2_, m2d}); @@ -372,9 +372,9 @@ TEST(Reductions, ReduceAnyAll) { }, {{10, "j"}}); - loop = LoopNest({allGreaterThan}); - loop.prepareForCodegen(); - s = loop.root_stmt(); + LoopNest loop2({allGreaterThan}); + loop2.prepareForCodegen(); + s = loop2.root_stmt(); s = IRSimplifier::simplify(s); SimpleIREvaluator cg2(s, {b, allGreaterThan, searchValue}); @@ -699,7 +699,8 @@ TEST(Reductions, ReduceRfactor) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(1)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -734,7 +735,8 @@ TEST(Reductions, Reduce3DRfactorInternal) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(1)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -769,7 +771,8 @@ TEST(Reductions, Reduce3DRfactorInner) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(2)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -804,7 +807,8 @@ TEST(Reductions, Reduce3DRfactorOuter) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(0)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -841,7 +845,8 @@ TEST(Reductions, Reduce3DRfactorWithOuter) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(3)->var(); - loop.rfactor(c->body(), v); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -870,12 +875,13 @@ TEST(Reductions, Reduce3DRfactorRepeated) { } Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + LoopNest orig_loopnest({c}); for (int rVar1 = 0; rVar1 < 3; ++rVar1) { for (int rVar2 = 0; rVar2 < 2; ++rVar2) { std::vector out(1, -1.f); - LoopNest loop({c}); + LoopNest loop(orig_loopnest); auto reduces = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(reduces.size(), 1); auto v1 = reduces[0]->reduce_args()[rVar1]; @@ -921,7 +927,8 @@ TEST(Reductions, ReduceRfactorInsertionPoint) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(0)->var(); - loop.rfactor(c->body(), v, loops.at(0)->body()); + auto 
c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v, loops.at(0)->body()); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -956,7 +963,8 @@ TEST(Reductions, Reduce3DRfactorInsertionPoint) { LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto v = loops.at(1)->var(); - loop.rfactor(c->body(), v, loops.at(1)->body()); + auto c_body = NodeFinder::find(loop.root_stmt())[0]; + loop.rfactor(c_body, v, loops.at(1)->body()); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); loop.prepareForCodegen(); @@ -985,13 +993,12 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { in_, {{2, "a"}, {3, "b"}, {4, "c"}, {5, "d"}, {6, "e"}}); LoopNest refloop({c}); + LoopNest loop(refloop); refloop.prepareForCodegen(); SimpleIREvaluator ref_cg( IRSimplifier::simplify(refloop.root_stmt()), {in_, c}); ref_cg.call({in, ref}); - LoopNest loop({c}); - // rfactor out "c". auto reduces = NodeFinder::find(loop.root_stmt()); loop.rfactor(reduces[0], reduces[0]->reduce_args()[3]); @@ -1373,7 +1380,7 @@ TEST(Reductions, ReduceInlineConsumer) { } LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1431,7 +1438,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { } LoopNest l1({y}); - LoopNest l2({y}); + LoopNest l2(l1); l2.computeInline(x->buf()); l1.prepareForCodegen(); @@ -1863,11 +1870,11 @@ TEST(Reductions, ReductionVectorize) { Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l_before({tensor}); + LoopNest l(l_before); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor}); cg_before.call({in_, out_before}); - LoopNest l({tensor}); l.vectorize(l.getLoopStmtsFor(tensor)[0]); Stmt* s = l.root_stmt(); @@ -1923,11 +1930,11 @@ TEST(Reductions, ReductionVectorizeRfactor) { Tensor* tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); LoopNest l_before({tensor}); + LoopNest l(l_before); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor}); cg_before.call({in_, out_before}); - LoopNest l({tensor}); ASSERT_THROWS_WITH( l.vectorize(l.getLoopStmtsFor(tensor)[1]), "reduction axis"); @@ -1935,7 +1942,8 @@ TEST(Reductions, ReductionVectorizeRfactor) { // loop. 
std::vector loops = l.getLoopStmtsFor(tensor); auto v = loops.at(1)->var(); - l.rfactor(tensor->body(), v); + auto tensor_body = NodeFinder::find(l.root_stmt())[0]; + l.rfactor(tensor_body, v); loops = NodeFinder::find(l.root_stmt()); l.vectorize(loops[2]); diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 31e05549186e..b935e5d2b6b6 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -118,33 +118,53 @@ int main(int argc, char* argv[]) { std::cout << "*** Tensors, Functions, and Placeholders ***" << std::endl; { - // A tensor computation is represented by objects of Tensor class and + // A tensor computation is represented by Tensor class objects and // consists of the following pieces: // - domain, which is specified by a Buf expression - // - an expression (or several expressions if we want to perform several - // independent computations over the same domain) for its elements, as a - // function of indices - // - // TODO: Update this section once Tensor/Function cleanup is done + // - a tensor statement, specified by a Stmt object, that computation to + // be performed in this domain + + // Let's start with defining a domain. We do this by creating a Buf object. + + // First, let's specify the sizes: std::vector dims = { new IntImm(64), new IntImm(32)}; // IntImm stands for Integer Immediate // and represents an integer constant - // Next we need to create arguments. The arguments are Vars, and they play - // role of placeholders. The computation that the tensor would describe - // would use these arguments. + // Now we can create a Buf object by providing a name, dimensions, and a + // data type of the elements: + const Buf* buf = new Buf("X", dims, kInt); + + // Next we need to spefify the computation. We can do that by either + // constructing a complete tensor statement for it (statements are + // examined in details in subsequent section), or by using a convenience + // method where we could specify axis and an element expression for the + // computation. In the latter case a corresponding statement would be + // constructed automatically. + + // Let's define two variables, i and j - they will be axis in our + // computation. const Var* i = new Var("i", kInt); const Var* j = new Var("j", kInt); std::vector args = {i, j}; // Now we can define the body of the tensor computation using these - // arguments. + // variables. What this means is that values in our tensor are: + // X[i, j] = i * j Expr* body = new Mul(i, j); // Finally, we pass all these pieces together to Tensor constructor: - Tensor* X = new Tensor("X", dims, args, body); + Tensor* X = new Tensor(buf, args, body); std::cout << "Tensor computation: " << *X << std::endl; - // Prints: Tensor computation: Tensor X(i[64], j[32]) = i * j + // Prints: + // Tensor computation: Tensor X[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = i * j; + // } + // } + + // TODO: Add an example of constructing a Tensor with a complete Stmt. 
// Similarly to how we provide a more convenient way of using handles for // constructing Exprs, Tensors also have a more convenient API for @@ -155,11 +175,17 @@ int main(int argc, char* argv[]) { {{64, "i"}, {32, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i / j; }); std::cout << "Tensor computation: " << *Z << std::endl; - // Prints: Tensor computation: Tensor Z(i[64], j[32]) = i / j + // Prints: + // Tensor computation: Tensor Z[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // Z[i, j] = i / j; + // } + // } // Tensors might access other tensors and external placeholders in their // expressions. It can be done like so: - Placeholder P("P", kFloat, {64, 32}); + Placeholder P("P", kInt, {64, 32}); Tensor* R = Compute( "R", {{64, "i"}, {32, "j"}}, @@ -167,7 +193,13 @@ int main(int argc, char* argv[]) { return Z->call(i, j) * P.load(i, j); }); std::cout << "Tensor computation: " << *R << std::endl; - // Prints: Tensor computation: Tensor R(i[64], j[32]) = Z(i, j) * P[i, j] + // Prints: + // Tensor computation: Tensor R[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // R[i, j] = (Z(i, j)) * (P[i, j]); + // } + // } // Placeholders could be thought of as external tensors, i.e. tensors for // which we don't have the element expression. In other words, for `Tensor` @@ -211,8 +243,19 @@ int main(int argc, char* argv[]) { std::cout << "Tensor computation X: " << *X << "Tensor computation Y: " << *Y << std::endl; // Prints: - // Tensor computation X: Tensor X(i[64], j[32]) = (A[i, j]) + (B[i, j]) - // Tensor computation Y: Tensor Y(i[64], j[32]) = sigmoid(X(i, j)) + // Tensor computation X: Tensor X[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = (A[i, j]) + (B[i, j]); + // } + // } + + // Tensor computation Y: Tensor Y[64, 32]: + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // Y[i, j] = sigmoid(X(i, j)); + // } + // } // Creating a loop nest is as quite simple, we just need to specify what are // the output tensors in our computation and LoopNest object will diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 1df2f96671df..96fb11d3a982 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -596,19 +596,15 @@ std::string to_string(const Tensor* t) { return "(null tensor)\n"; } std::ostringstream oss; - if (!t->body()) { - oss << "Tensor " << t->buf()->name_hint() << " = " << *t->ElementStmt() - << "\n"; - return oss.str(); - } - oss << "Tensor " << t->buf()->name_hint() << "("; - for (size_t i = 0; i < t->ndim(); i++) { + // TODO: move this to Buf printer + oss << "Tensor " << t->buf()->name_hint() << "["; + for (size_t i = 0; i < t->buf()->ndim(); i++) { if (i != 0) { oss << ", "; } - oss << *t->arg(i) << "[" << *t->dim(i) << "]"; + oss << *t->buf()->dim(i); } - oss << ") = " << *t->body() << "\n"; + oss << "]:\n" << *t->stmt() << "\n"; return oss.str(); } } // namespace std diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 5a727cc5a92e..1da0ab8beefb 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -119,7 +119,7 @@ size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) { } static at::ScalarType tensorType(Tensor* t) { - return static_cast(t->body()->dtype().scalar_type()); + return static_cast(t->buf()->dtype().scalar_type()); } static 
std::vector computeIndicesToBroadcast( @@ -608,7 +608,7 @@ std::vector TensorExprKernel::valueShape( if (it == tensors_.end()) { return {}; } - return ExprVectorToExprHandleVector(it->second->dims()); + return ExprVectorToExprHandleVector(it->second->buf()->dims()); } Tensor* TensorExprKernel::computeOneOperand( @@ -1125,7 +1125,7 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::type_as: { auto const& n = v->node(); Tensor* rhs = tensors_.at(n->inputs()[1]->unique()); - auto dtype = rhs->body()->dtype(); + auto dtype = rhs->buf()->dtype(); return computeOneOperand( "aten_type_as", v, [dtype](const ExprHandle& lhs) { return Cast::make(dtype, lhs); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 6e63743496cf..7d6e208291ba 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -24,6 +24,11 @@ namespace torch { namespace jit { namespace tensorexpr { +LoopNest::LoopNest(const LoopNest& other) + : root_stmt_(Stmt::clone(other.root_stmt_)), + output_bufs_(other.output_bufs_), + intermediate_bufs_(other.intermediate_bufs_) {} + class FunctionCallUseCount : public IRVisitor { public: std::unordered_map findUses(Stmt* s) { @@ -424,11 +429,7 @@ class DepTracker : public IRVisitor { public: std::vector findUsedTensors(Tensor* tensor) { used_tensors.clear(); - if (tensor->body()) { - tensor->body()->accept(this); - } else { - tensor->ElementStmt()->accept(this); - } + tensor->stmt()->accept(this); return used_tensors; } @@ -508,7 +509,12 @@ LoopNest::LoopNest(const std::vector& output_tensors) { std::vector loops; for (Tensor* t : tensors_to_compute) { - Stmt* loop = t->lowerToStmt(); + Stmt* loop = t->stmt(); + if (loop->get_parent()) { + std::cerr << "Error: creating a loopnest from already used Tensors\n"; + loops = {}; + break; + } // Flatten initializers. if (Block* block = dynamic_cast(loop)) { for (auto* s : block->stmts()) { @@ -544,6 +550,7 @@ class FunctionInliner : public IRMutator { throw std::logic_error("cannot inline Buf with compound indices"); } index_vars_.insert(index_var); + producer_index_vars_.push_back(index_var); } } @@ -563,9 +570,9 @@ class FunctionInliner : public IRMutator { } std::vector index_vars; - TORCH_INTERNAL_ASSERT(buf->ndim() == t->args().size()); + TORCH_INTERNAL_ASSERT(buf->ndim() == producer_index_vars_.size()); for (size_t i = 0; i < buf->ndim(); i++) { - const Var* func_callee_arg = dynamic_cast(t->arg(i)); + const Var* func_callee_arg = producer_index_vars_.at(i); const Expr* func_caller_param = v->param(i); auto iter = inline_mapping_.find(func_callee_arg); if (iter != inline_mapping_.end()) { @@ -686,6 +693,7 @@ class FunctionInliner : public IRMutator { // Index Vars present in the producer. 
std::unordered_set index_vars_; + std::vector producer_index_vars_; std::unordered_map inline_mapping_; diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 1bff2e96d415..7c27ca6968a5 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -37,6 +37,8 @@ class TORCH_API LoopNest { output_bufs_(output_bufs), intermediate_bufs_(intermediate_bufs) {} + LoopNest(const LoopNest& other); + Stmt* root_stmt() const { return root_stmt_; } diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index e32f9f293940..3eec21d13f0b 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -8,56 +8,60 @@ namespace torch { namespace jit { namespace tensorexpr { -Stmt* Tensor::lowerToStmt() const { - Stmt* s = ElementStmt(); +Stmt* Tensor::constructStmt( + const std::vector& args, + const Expr* body, + const std::vector& reduce_dims, + const std::vector& reduce_args) const { + std::vector indices(args.begin(), args.end()); - // If this Tensor has no functional body, it already has its axes expanded. - if (nullptr == body()) { - return s; - } + const Expr* mask = new IntImm(1); + Stmt* s = new Store(buf_, indices, body, mask); - if (ndim() == 0 && reduce_ndim() == 0) { + size_t ndim = buf()->ndim(); + size_t reduce_ndim = reduce_dims.size(); + + if (ndim == 0 && reduce_ndim == 0) { return s; } const Expr* init_expr = buf()->initializer(); - std::vector indices(args().begin(), args().end()); - - if (reduce_ndim() > 0) { - for (size_t i = 0; i < reduce_ndim(); i++) { + if (reduce_ndim > 0) { + for (size_t i = 0; i < reduce_ndim; i++) { // Going in reverse order: from innermost loop to the outermost - size_t dim_index = reduce_ndim() - i - 1; + size_t dim_index = reduce_ndim - i - 1; s = new For( - reduce_arg(dim_index), new IntImm(0), reduce_dim(dim_index), s); + reduce_args[dim_index], new IntImm(0), reduce_dims[dim_index], s); } if (init_expr) { - Store* init = new Store(buf(), indices, init_expr, new IntImm(1)); - s = new Block({init, s}); + Store* init_stmt = new Store(buf(), indices, init_expr, new IntImm(1)); + s = new Block({init_stmt, s}); } } - for (size_t i = 0; i < ndim(); i++) { + for (size_t i = 0; i < ndim; i++) { // Going in reverse order: from innermost loop to the outermost - size_t dim_index = ndim() - i - 1; - s = new For(arg(dim_index), new IntImm(0), dim(dim_index), s); + size_t dim_index = ndim - i - 1; + s = new For(args[dim_index], new IntImm(0), buf()->dim(dim_index), s); } return s; } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function&)>& body_func) { std::vector dims; std::vector args; unpack_dim_args(dim_args, &dims, &args); const Expr* body = body_func(VarVectorToVarHandleVector(args)).node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function& body_func) { if (dim_args.size() != 1) { @@ -68,11 +72,12 @@ Tensor* Compute( std::vector args; unpack_dim_args(dim_args, &dims, &args); const Expr* body = body_func(VarHandle(args[0])).node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const 
std::string& name, const std::vector& dim_args, const std::function& body_func) { @@ -83,11 +88,12 @@ Tensor* Compute( std::vector args; unpack_dim_args(dim_args, &dims, &args); const Expr* body = body_func(VarHandle(args[0]), VarHandle(args[1])).node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& @@ -101,11 +107,12 @@ Tensor* Compute( const Expr* body = body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2])) .node(); - return new Tensor(func_name, dims, args, body); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Compute( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const std::function dims; - std::vector args_nodes; - unpack_dim_args(dim_args, &dims, &args_nodes); - auto args = VarVectorToVarHandleVector(args_nodes); - const Expr* body = body_func(args[0], args[1], args[2], args[3]).node(); - return new Tensor(func_name, dims, args_nodes, body); -} - -Stmt* Tensor::ElementStmt() const { - std::vector indices; - for (size_t i = 0; i < buf_->ndim(); i++) { - indices.push_back(args_[i]); - } - - const Expr* mask = new IntImm(1); - Stmt* update_stmt = new Store(buf_, indices, body_, mask); - return update_stmt; + std::vector args; + unpack_dim_args(dim_args, &dims, &args); + const Expr* body = body_func( + VarHandle(args[0]), + VarHandle(args[1]), + VarHandle(args[2]), + VarHandle(args[3])) + .node(); + const Buf* buf = new Buf(name, dims, body->dtype()); + return new Tensor(buf, args, body); } Tensor* Reduce( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const Reducer& reducer, const Placeholder& buffer, const std::vector& reduce_args) { return Reduce( - func_name, + name, dim_args, reducer, [&](ParameterList& p) { return buffer.load(p); }, @@ -149,13 +150,13 @@ Tensor* Reduce( } Tensor* Reduce( - const std::string& func_name, + const std::string& name, const std::vector& dim_args, const Reducer& reducer, Tensor* tensor, const std::vector& reduce_args) { return Reduce( - func_name, + name, dim_args, reducer, [&](ParameterList& p) { return tensor->call(p); }, diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 31454f745944..609b5c30a839 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -14,17 +14,10 @@ namespace tensorexpr { class TORCH_API Tensor : KernelScopedObject { public: - Tensor( - const std::string& name, - const std::vector& dims, - const std::vector& args, - const Expr* body) - // TODO: Function should not create buffers, they should be created - // manually before constructing a function. 
- : buf_(new Buf(name, dims, body->dtype())), args_(args), body_(body) {} - - Tensor(Buf* buf, const std::vector& args, const Expr* body) - : buf_(buf), args_(args), body_(body) {} + Tensor(const Buf* buf, const std::vector& args, const Expr* body) + : buf_(buf) { + stmt_ = constructStmt(args, body, {}, {}); + } Tensor( const Buf* buf, @@ -32,65 +25,19 @@ class TORCH_API Tensor : KernelScopedObject { const std::vector& reduce_dims, const std::vector& reduce_args, const Expr* body) - : buf_(buf), - args_(args), - body_(body), - reduce_dims_(reduce_dims), - reduce_args_(reduce_args) {} + : buf_(buf) { + stmt_ = constructStmt(args, body, reduce_dims, reduce_args); + } - virtual ~Tensor() {} + Tensor(const Buf* buf, Stmt* stmt) : buf_(buf), stmt_(stmt) {} - // Wrappers over accessors to fields of the underlying function - const Expr* body() const { - return body_; - } const Buf* buf() const { return buf_; } - size_t ndim() const { - return buf()->ndim(); - } - const Expr* dim(size_t index) const { - if (index >= ndim()) { - throw out_of_range_index(); - } - return buf()->dim(index); - } - std::vector dims() const { - return buf()->dims(); - } - const Var* arg(size_t index) const { - if (index >= ndim()) { - throw out_of_range_index(); - } - return args_[index]; - } - const std::vector& args() const { - return args_; - } - size_t reduce_ndim() const { - return reduce_dims_.size(); - } - std::vector reduce_dims() const { - return reduce_dims_; - } - std::vector reduce_args() const { - return reduce_args_; - } - const Expr* reduce_dim(size_t index) const { - if (index >= reduce_ndim()) { - throw out_of_range_index(); - } - return reduce_dims_[index]; - } - const Var* reduce_arg(size_t index) const { - if (index >= reduce_ndim()) { - throw out_of_range_index(); - } - return reduce_args_[index]; - } - virtual Stmt* ElementStmt() const; + Stmt* stmt() const { + return stmt_; + } template inline ExprHandle operator()(const Ts&... ts); @@ -99,31 +46,14 @@ class TORCH_API Tensor : KernelScopedObject { template inline ExprHandle call(const Ts&... 
ts); - Stmt* lowerToStmt() const; - private: - const Buf* buf_; - std::vector args_; - const Expr* body_; - std::vector reduce_dims_; - std::vector reduce_args_; -}; - -class TORCH_API CompoundTensor : public Tensor { - public: - CompoundTensor( - const Buf* buf, + Stmt* constructStmt( const std::vector& args, - Stmt* stmt) - : Tensor(buf, args, {}, {}, nullptr), stmt_(stmt) {} - - virtual ~CompoundTensor() {} - - Stmt* ElementStmt() const override { - return stmt_; - } + const Expr* body, + const std::vector& reduce_dims, + const std::vector& reduce_args) const; - private: + const Buf* buf_; Stmt* stmt_; }; From dc2a44c4fc5f5efba16b8567ab970f8bcf1fe007 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Wed, 27 Jan 2021 16:47:01 -0800 Subject: [PATCH 25/41] Back out "Revert D25850783: Add torch::deploy, an embedded torch-python interpreter" (#51124) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51124 Original commit changeset: 1c7133627da2 Test Plan: Test locally with interpreter_test and on CI Reviewed By: suo Differential Revision: D26077905 fbshipit-source-id: fae83bf9822d79e9a9b5641bc5191a7f3fdea78d --- .github/workflows/lint.yml | 6 + .gitignore | 3 + .jenkins/pytorch/build.sh | 11 + .jenkins/pytorch/test.sh | 8 + CMakeLists.txt | 5 + torch/__init__.py | 10 +- torch/_ops.py | 3 +- torch/_utils_internal.py | 16 +- torch/csrc/Module.cpp | 2 + torch/csrc/deploy/.gitignore | 1 + torch/csrc/deploy/CMakeLists.txt | 3 + torch/csrc/deploy/README.md | 10 + torch/csrc/deploy/example/simple.pt | Bin 0 -> 2432 bytes torch/csrc/deploy/example/trace_simple.py | 20 ++ torch/csrc/deploy/interpreter/CMakeLists.txt | 115 +++++++ .../deploy/interpreter/CMakePythonModules.txt | 69 ++++ torch/csrc/deploy/interpreter/freeze.py | 269 +++++++++++++++ .../deploy/interpreter/hide_symbols.script | 5 + torch/csrc/deploy/interpreter/interpreter.cpp | 324 ++++++++++++++++++ torch/csrc/deploy/interpreter/interpreter.h | 67 ++++ .../deploy/interpreter/interpreter_impl.h | 26 ++ torch/csrc/deploy/interpreter/test_main.cpp | 49 +++ .../deploy/interpreter/third_party/README.md | 2 + torch/cuda/__init__.py | 4 + torch/utils/__init__.py | 8 +- 25 files changed, 1024 insertions(+), 12 deletions(-) create mode 100644 torch/csrc/deploy/.gitignore create mode 100644 torch/csrc/deploy/CMakeLists.txt create mode 100644 torch/csrc/deploy/README.md create mode 100644 torch/csrc/deploy/example/simple.pt create mode 100644 torch/csrc/deploy/example/trace_simple.py create mode 100644 torch/csrc/deploy/interpreter/CMakeLists.txt create mode 100644 torch/csrc/deploy/interpreter/CMakePythonModules.txt create mode 100644 torch/csrc/deploy/interpreter/freeze.py create mode 100644 torch/csrc/deploy/interpreter/hide_symbols.script create mode 100644 torch/csrc/deploy/interpreter/interpreter.cpp create mode 100644 torch/csrc/deploy/interpreter/interpreter.h create mode 100644 torch/csrc/deploy/interpreter/interpreter_impl.h create mode 100644 torch/csrc/deploy/interpreter/test_main.cpp create mode 100644 torch/csrc/deploy/interpreter/third_party/README.md diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 54acbe7b1c6a..9c215540108b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -170,6 +170,8 @@ jobs: # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed # in a follow up PR. # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
+ # deploy/interpreter files are excluded due to using macros and other techniquies + # that are not easily converted to accepted c++ python tools/clang_tidy.py \ --verbose \ --paths torch/csrc/ \ @@ -186,6 +188,10 @@ jobs: -g"-torch/csrc/autograd/FunctionsManual.cpp" \ -g"-torch/csrc/generic/*.cpp" \ -g"-torch/csrc/jit/codegen/cuda/runtime/*" \ + -g"-torch/csrc/deploy/interpreter/interpreter.cpp" \ + -g"-torch/csrc/deploy/interpreter/interpreter.h" \ + -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \ + -g"-torch/csrc/deploy/interpreter/test_main.cpp" \ "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt diff --git a/.gitignore b/.gitignore index e1fe94cb9bf9..a3a832ce7555 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,9 @@ torch/csrc/autograd/generated/* torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt torch/csrc/cudnn/cuDNN.cpp +torch/csrc/deploy/interpreter/cpython +torch/csrc/deploy/interpreter/frozen +torch/csrc/deploy/interpreter/third_party/typing_extensions.py torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index fad9c8e49e64..dfd359c1ddf4 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -23,6 +23,17 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-code-analysis* ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile-code-analysis.sh" "$@" fi +if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda10.2-cudnn7-py3-gcc7* ]]; then + # Enabling DEPLOY build (embedded torch python interpreter, experimental) + # only on one config for now, can expand later + export USE_DEPLOY=ON + + # Deploy feature builds cpython. It requires these packages. + # TODO move this to dockerfile? + sudo apt-get -qq update + sudo apt-get -qq install libffi-dev libbz2-dev libreadline-dev libncurses5-dev libncursesw5-dev libgdbm-dev libsqlite3-dev uuid-dev tk-dev +fi + echo "Python version:" python --version diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 73563f145eb8..d70a377ec086 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -354,6 +354,11 @@ test_vec256() { fi } +test_torch_deploy() { + SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test + assert_git_not_dirty +} + if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -371,6 +376,9 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then + if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test1 ]]; then + test_torch_deploy + fi install_torchvision test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index a23208752afb..c138f261c27b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -919,3 +919,8 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() + +# ---[ Torch Deploy +if(USE_DEPLOY) + add_subdirectory(torch/csrc/deploy) +endif() diff --git a/torch/__init__.py b/torch/__init__.py index 3f9df8bc009a..f27af91eb493 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -22,7 +22,11 @@ from ._utils import _import_dotted_name from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \ USE_RTLD_GLOBAL_WITH_LIBTORCH, USE_GLOBAL_DEPS -from .version import __version__ +# TODO(torch_deploy) figure out how to freeze version.py in fbcode build +if sys.executable == 'torch_deploy': + __version__ = "torch-deploy-1.8" +else: + from .version import __version__ from ._six import string_classes as _string_classes from typing import Set, Type, TYPE_CHECKING @@ -134,7 +138,7 @@ # See Note [Global dependencies] def _load_global_deps(): - if platform.system() == 'Windows': + if platform.system() == 'Windows' or sys.executable == 'torch_deploy': return lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so') @@ -516,7 +520,7 @@ class QUInt4x2Storage(_C.QUInt4x2StorageBase, _StorageBase): ################################################################################ def manager_path(): - if platform.system() == 'Windows': + if platform.system() == 'Windows' or sys.executable == 'torch_deploy': return b"" path = get_file_path('torch', 'bin', 'torch_shm_manager') prepare_multiprocessing_environment(get_file_path('torch')) diff --git a/torch/_ops.py b/torch/_ops.py index dd0c8cd19fde..96c8baac7838 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -2,7 +2,6 @@ import contextlib import ctypes -import os import sys import types @@ -67,7 +66,7 @@ def __getattr__(self, op_name): return op class _Ops(types.ModuleType): - __file__ = os.path.join(os.path.dirname(__file__), '_ops.py') + __file__ = '_ops.py' def __init__(self): super(_Ops, self).__init__('torch.ops') diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index be7d8fcaa685..c77e960ae659 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -1,6 +1,7 @@ import os import inspect +import sys import tempfile # this arbitrary-looking assortment of functionality is provided here @@ -8,11 +9,16 @@ # use is the FB build environment, where this source file is replaced # by an equivalent. -if os.path.basename(os.path.dirname(__file__)) == 'shared': - torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) +if sys.executable == 'torch_deploy': + # __file__ is meaningless in the context of frozen torch used in torch deploy. 
+ # setting empty torch_parent should allow below functions to operate without crashing, + # but it's unclear if there is a valid use case for them in the context of deploy. + torch_parent = "" else: - torch_parent = os.path.dirname(os.path.dirname(__file__)) - + if os.path.basename(os.path.dirname(__file__)) == 'shared': + torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + else: + torch_parent = os.path.dirname(os.path.dirname(__file__)) def get_file_path(*path_components): return os.path.join(torch_parent, *path_components) @@ -60,7 +66,7 @@ def get_source_lines_and_file(obj, error_msg=None): TEST_MASTER_ADDR = '127.0.0.1' TEST_MASTER_PORT = 29500 -# USE_GLOBAL_DEPS controls whether __init__.py tries to load +# USE_GLOBAL_DEPS controls whether __init__.py tries to load # libtorch_global_deps, see Note [Global dependencies] USE_GLOBAL_DEPS = True # USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index eb80fff32b81..bbd3ccef505f 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -692,6 +692,8 @@ extern "C" #ifdef _WIN32 __declspec(dllexport) #endif +TORCH_API PyObject* initModule(); +// separate decl and defn for msvc error C2491 PyObject* initModule() { HANDLE_TH_ERRORS at::internal::lazy_init_num_threads(); diff --git a/torch/csrc/deploy/.gitignore b/torch/csrc/deploy/.gitignore new file mode 100644 index 000000000000..aa484a97a20f --- /dev/null +++ b/torch/csrc/deploy/.gitignore @@ -0,0 +1 @@ +example/generated/* diff --git a/torch/csrc/deploy/CMakeLists.txt b/torch/csrc/deploy/CMakeLists.txt new file mode 100644 index 000000000000..9da314905860 --- /dev/null +++ b/torch/csrc/deploy/CMakeLists.txt @@ -0,0 +1,3 @@ +set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + +add_subdirectory(interpreter) diff --git a/torch/csrc/deploy/README.md b/torch/csrc/deploy/README.md new file mode 100644 index 000000000000..4fab5aa4ef56 --- /dev/null +++ b/torch/csrc/deploy/README.md @@ -0,0 +1,10 @@ +# Torch Deploy +This is an experimental feature to embed multiple python interpreters inside the torch library, +providing a solution to the 'GIL problem' for multithreading with the convenience of python +and eager or torchscripted pytorch programs. + +# libinterpreter +This is an internal library used behind the scenes to enable multiple python interpreters in +a single deploy runtime. libinterpreter.so is DLOPENed multiple times by the deploy library. +Each copy of libinterpreter exposes a simple interpreter interface but hides its python and other +internal symbols, preventing the different python instances from seeing each other. 
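
For illustration only (not part of this patch): the isolation mechanism the README above describes — loading a private copy of a shared library for each interpreter so every copy keeps its own hidden globals — can be sketched from Python with `ctypes`. The library name `libcounter.so` and its `bump` function below are hypothetical stand-ins; only the copy-then-`dlopen`-with-`RTLD_LOCAL` pattern reflects what libinterpreter actually relies on.

```
# Minimal sketch, assuming a hypothetical libcounter.so exposing int bump(void).
import ctypes
import shutil
import tempfile

def load_private_copy(lib_path):
    # Copy the .so to a distinct temp path so the dynamic loader treats it as a
    # separate library image, then open it with RTLD_LOCAL so its symbols stay
    # hidden from other copies.
    tmp = tempfile.NamedTemporaryFile(suffix=".so", delete=False)
    shutil.copyfile(lib_path, tmp.name)
    return ctypes.CDLL(tmp.name, mode=ctypes.RTLD_LOCAL)

lib_a = load_private_copy("libcounter.so")
lib_b = load_private_copy("libcounter.so")
# Each copy has independent global state, so their counters do not interfere.
print(lib_a.bump(), lib_a.bump())  # 1 2
print(lib_b.bump())                # 1
```
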
diff --git a/torch/csrc/deploy/example/simple.pt b/torch/csrc/deploy/example/simple.pt new file mode 100644 index 0000000000000000000000000000000000000000..50f9a087aa822821647a8acabe28bb84207475b2 GIT binary patch literal 2432 zcmah~4Ny~87JmE%jN!K_C{c`n7DSRLExN!vhoWYYRE&s2)-FvT36Ky7`GEqhO3~?p zL|_?ZsctLdMyZHOE5mjH5kaV>s1(uBHlpj!S}e8F-KAEovhVR{g6sCo+_~?)d%knu z`R@776^k4k3BtvNc+OmjmkEO^D@UW`D^iRpe1R~=lh``jhIZRyK=*I)!cZZq8){(n z@pmC?B^ynOG?YYeu>YNUa`b@-j{Wg<^dcXFL{|=d_n*LpxJB4{DiZ_0cLHYUD8#+K z1H*4J@SRPF(w;cn*y4?rHxI+^Z-0Txury4*pNcYmEbge-j72SNa8-N+niN~G_qF}N z<@_7^%N(&n#=@$r$*7cjV)N&FVM)6^KJ=2I;OA!2;OT%JqWUNghic$ze6Z|;`Ixk4 zK3-5SLHR&3X6&{=+nIhCDBc5>@l4=GrIPiSf`i2ftn~r#S$8M-R@H4N@;w^W9&!18t;$kz&B73@dhIeC+LVs` zlLq9;xM(`F6k0QwC>~BDzdtq({_3B=an%TTuO5a^Bl;k4EEHMiO5sOy27bIifQgN2 zB-}WWA20bC8nwOTsGTFcx{C#kmHi}KbDNxWcOnTn)#y)>Gs-O0PW1JTh zx_jV>{nvnD3B?xEILwnAALMW+lz(s#$;PQ1cyw7?Ix5xsJ`Ii&$ zMoAm|aArNmfEhSP6EJ$c5-*z~F>S0H@&#F7Svm^Qi5Wn6X(N+=iojq=D4xeZ!0GNS z_)UBu4zBgWo|~V*DY*)xx09rY-*HlLX#)}-b5IW_z$M%b<64iwf$(M6o6`(+8@-=5os3(8l*-#T1TEOK_~{^#dUxef$D5Ah52=1v4y>y4L_ zsS(C3dR}q13M{ghOQlAwUY;S9GB*@#&?<5@N~Vqoes5clgh7e=ZKWzb!>F^9PdM?U zxki=7z?ABhskthRLTXfI8?<_9-b$vgghr##}J)2Zqhl6h#(PD#5M+r*djY)iCnJLXq0lJN}Fw9ZqzH3dZl8m zN^WH8T!Isf3A#BFy5|`6DXMH$cDjyPlAEfV%cJS)jB2R3M6J<-ri)Frr|0|AJR)S8 zF1c2ryq!>>$|8?x(K!lPgZp|mzPV4UDpD?SS2)b9a*EAyjQ>UMh zppHCUI#E_GGxU(y`Uv~?R zH22tDTQeU2<}#k8qp-i&eDsqR-y4#3*9XQpA0E7W)HCUK1BpX}Y@ZMJ|J~a?yR+=^ z3hfWqUuo3J@RDhN-3~f$-}Idi{!HaOFqQZJzN^@T|FgWa4aSsgqhV4&elwX)pSbN@ zWFI_HbXoj4Nz_G1(Bs7#bt9su_sLW04ODFlW1@a#2iyH-zlAaBEWi3FeTzM{Gd;v2 zJ9?VAldZdlE4^T2HaGNasI`*ZZD2W+c&(r`SCimpVzaKec{bpK6iHLy=cG3~iuL*V z+eB${dU2FzwK_W`>vLk;#A~3s-OOKE)xhis)?B;UL|{|hZz{sn>54@R#_}m2KZnp6 pZ@RJ4Ha(l@o=900-9IU#a1&2{b0**EWwFR/configure --prefix ${PYTHON_INSTALL_DIR} + BUILD_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC make -j8 + INSTALL_COMMAND make install + BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN} + LOG_OUTPUT_ON_FAILURE True +) + +# We find the built python modules, this is confusing because python build already outputs +# the modules in a strange nested path, and then that path is relative to the +# Cmake ExternalProject root in the cmake build dir. 
+ExternalProject_Get_property(cpython SOURCE_DIR) +SET(PYTHON_MODULE_DIR "${SOURCE_DIR}/build/temp.linux-x86_64-3.8/${SOURCE_DIR}/Modules") +SET(PYTHON_STDLIB_DIR "${SOURCE_DIR}/Lib") +SET(PYTHON_STDLIB "${PYTHON_INSTALL_DIR}/lib/libpython_stdlib3.8.a") +# Then we use a hardcoded list of expected module names and include them in our lib +include("CMakePythonModules.txt") +ExternalProject_Add_Step( + cpython + archive_stdlib + DEPENDEES install + BYPRODUCTS ${PYTHON_STDLIB} + COMMAND ar -rc ${PYTHON_STDLIB} ${PYTHON_MODULES} + VERBATIM +) +# Get python typing extension, needed by torch +SET(TYPING_PKG "${INTERPRETER_DIR}/third_party/typing_extensions.py") +ExternalProject_Add( + typing + PREFIX typing + GIT_REPOSITORY https://github.com/python/typing.git + GIT_TAG 3.7.4.3 + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND cp ../typing/typing_extensions/src_py3/typing_extensions.py ${TYPING_PKG} + BYPRODUCTS ${TYPING_PKG} + LOG_OUTPUT_ON_FAILURE True +) + +# Output files generated by freeze script, containing frozen bytecode +SET(FROZEN_DIR "${INTERPRETER_DIR}/frozen") +set(FROZEN_FILES + ${FROZEN_DIR}/main.c + ${FROZEN_DIR}/bytecode_0.c + ${FROZEN_DIR}/bytecode_1.c + ${FROZEN_DIR}/bytecode_2.c + ${FROZEN_DIR}/bytecode_3.c + ${FROZEN_DIR}/bytecode_4.c +) +# Packages to freeze: python stdlib, typing extension, and torch +add_custom_command( + OUTPUT ${FROZEN_FILES} + WORKING_DIRECTORY ${INTERPRETER_DIR} + COMMAND mkdir -p ${FROZEN_DIR} + COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose + DEPENDS cpython typing + VERBATIM +) + +# instantiate a library based on the objects that make up torch_python +# make sure system python isn't used here +target_include_directories(torch_python_obj BEFORE PRIVATE ${PYTHON_INC_DIR}) +add_library(torch_python_static STATIC $) +# Build the interpreter lib, designed to be standalone and dlopened +# We bake the python and torch_python binding objs into libinterpreter +set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script") +set(INTERPRETER_LIB_SOURCES + ${INTERPRETER_DIR}/interpreter.cpp + ${FROZEN_FILES} + ${LINKER_SCRIPT} +) +add_library(interpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT}) +set_property(TARGET interpreter APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}") +# need to ensure headers are present before any .cpp in interpreter are compiled, +# but cpp themselves don't clearly depend on cpython so there is a race otherwise +add_dependencies(interpreter cpython) +target_compile_options( + interpreter PRIVATE + -fvisibility=hidden +) +target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR}) +target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR}) +target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static) +target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins +target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite) + +# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib +set(INTERPRETER_TEST_SOURCES + ${INTERPRETER_DIR}/test_main.cpp +) +add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES}) +target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch) +target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR}) 
+target_link_libraries(interpreter_test PUBLIC gtest dl) +# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen +# dependencies for libinterpreter, regardless of whether they are used in interpreter_test +target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite) diff --git a/torch/csrc/deploy/interpreter/CMakePythonModules.txt b/torch/csrc/deploy/interpreter/CMakePythonModules.txt new file mode 100644 index 000000000000..c6bc9cab76ff --- /dev/null +++ b/torch/csrc/deploy/interpreter/CMakePythonModules.txt @@ -0,0 +1,69 @@ +SET(PYTHON_MODULES + ${PYTHON_MODULE_DIR}/arraymodule.o + ${PYTHON_MODULE_DIR}/_asynciomodule.o + ${PYTHON_MODULE_DIR}/audioop.o + ${PYTHON_MODULE_DIR}/binascii.o + ${PYTHON_MODULE_DIR}/_bisectmodule.o + ${PYTHON_MODULE_DIR}/_blake2/blake2module.o ${PYTHON_MODULE_DIR}/_blake2/blake2b_impl.o ${PYTHON_MODULE_DIR}/_blake2/blake2s_impl.o + ${PYTHON_MODULE_DIR}/_bz2module.o + ${PYTHON_MODULE_DIR}/cmathmodule.o + # ${PYTHON_MODULE_DIR}/_math.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_cn.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_hk.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_iso2022.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_jp.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_kr.o + ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_tw.o + ${PYTHON_MODULE_DIR}/_contextvarsmodule.o + ${PYTHON_MODULE_DIR}/_cryptmodule.o + ${PYTHON_MODULE_DIR}/_csv.o + ${PYTHON_MODULE_DIR}/_ctypes/_ctypes.o ${PYTHON_MODULE_DIR}/_ctypes/callbacks.o ${PYTHON_MODULE_DIR}/_ctypes/callproc.o ${PYTHON_MODULE_DIR}/_ctypes/stgdict.o ${PYTHON_MODULE_DIR}/_ctypes/cfield.o + ${PYTHON_MODULE_DIR}/_ctypes/_ctypes_test.o + ${PYTHON_MODULE_DIR}/_cursesmodule.o + ${PYTHON_MODULE_DIR}/_curses_panel.o + ${PYTHON_MODULE_DIR}/_datetimemodule.o + ${PYTHON_MODULE_DIR}/_decimal/_decimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/basearith.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/constants.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/context.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/convolute.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/crt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/difradix2.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fnt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fourstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/io.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/memory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/mpdecimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/numbertheory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/sixstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/transpose.o + ${PYTHON_MODULE_DIR}/_elementtree.o + ${PYTHON_MODULE_DIR}/fcntlmodule.o + ${PYTHON_MODULE_DIR}/grpmodule.o + ${PYTHON_MODULE_DIR}/_hashopenssl.o + ${PYTHON_MODULE_DIR}/_heapqmodule.o + ${PYTHON_MODULE_DIR}/_json.o + ${PYTHON_MODULE_DIR}/_lsprof.o + ${PYTHON_MODULE_DIR}/_lzmamodule.o + ${PYTHON_MODULE_DIR}/mathmodule.o + ${PYTHON_MODULE_DIR}/md5module.o + ${PYTHON_MODULE_DIR}/mmapmodule.o + ${PYTHON_MODULE_DIR}/cjkcodecs/multibytecodec.o + ${PYTHON_MODULE_DIR}/_multiprocessing/multiprocessing.o ${PYTHON_MODULE_DIR}/_multiprocessing/semaphore.o + ${PYTHON_MODULE_DIR}/nismodule.o + ${PYTHON_MODULE_DIR}/_opcode.o + ${PYTHON_MODULE_DIR}/ossaudiodev.o + ${PYTHON_MODULE_DIR}/parsermodule.o + ${PYTHON_MODULE_DIR}/_pickle.o + ${PYTHON_MODULE_DIR}/_posixsubprocess.o + ${PYTHON_MODULE_DIR}/pyexpat.o ${PYTHON_MODULE_DIR}/expat/xmlparse.o ${PYTHON_MODULE_DIR}/expat/xmlrole.o ${PYTHON_MODULE_DIR}/expat/xmltok.o + ${PYTHON_MODULE_DIR}/_queuemodule.o + ${PYTHON_MODULE_DIR}/_randommodule.o + 
${PYTHON_MODULE_DIR}/readline.o + ${PYTHON_MODULE_DIR}/resource.o + ${PYTHON_MODULE_DIR}/selectmodule.o + ${PYTHON_MODULE_DIR}/sha1module.o + ${PYTHON_MODULE_DIR}/sha256module.o + ${PYTHON_MODULE_DIR}/_sha3/sha3module.o + ${PYTHON_MODULE_DIR}/sha512module.o + ${PYTHON_MODULE_DIR}/socketmodule.o + ${PYTHON_MODULE_DIR}/spwdmodule.o + ${PYTHON_MODULE_DIR}/_ssl.o + ${PYTHON_MODULE_DIR}/_struct.o + ${PYTHON_MODULE_DIR}/syslogmodule.o + ${PYTHON_MODULE_DIR}/termios.o + ${PYTHON_MODULE_DIR}/_testbuffer.o + ${PYTHON_MODULE_DIR}/_testcapimodule.o + ${PYTHON_MODULE_DIR}/_testimportmultiple.o + ${PYTHON_MODULE_DIR}/_testmultiphase.o + ${PYTHON_MODULE_DIR}/unicodedata.o + ${PYTHON_MODULE_DIR}/xxlimited.o + ${PYTHON_MODULE_DIR}/_xxtestfuzz/_xxtestfuzz.o ${PYTHON_MODULE_DIR}/_xxtestfuzz/fuzzer.o + ${PYTHON_MODULE_DIR}/zlibmodule.o +) diff --git a/torch/csrc/deploy/interpreter/freeze.py b/torch/csrc/deploy/interpreter/freeze.py new file mode 100644 index 000000000000..459b7be9381c --- /dev/null +++ b/torch/csrc/deploy/interpreter/freeze.py @@ -0,0 +1,269 @@ +""" +Freeze Python packages. + +Freezing makes it possible to ship arbitrary Python modules as part of a C++ +library. The Python source of the module is compiled to bytecode and written +to `.c` files, to be imported by Python's built-in FrozenImporter. + +In a normal Python installation, FrozenImporter is only used to bootstrap the +initialization of the import machinery. Python's importers are defined in +Python (see `_bootstrap.py` and `_bootstrap_external.py`) but need to be +retrieved before any importers are available. Freezing the module bytecode +resolves this circular dependency. + +This script will freeze the Python standard library. It produces two things: +- Bytecode files: A set of `.c` that define C variables containing Python bytecode. +- Main file: A `main.c` file listing all of these modules in the right form to be + consumed by FrozenImporter. + +The library that wishes to these modules make them available to the local +Python instance by extending `PyImport_FrozenModules` appropriately (see +https://docs.python.org/3/c-api/import.html#c.PyImport_FrozenModules). +""" + +import argparse +import functools +import itertools +import marshal +import os +from dataclasses import dataclass +from pathlib import Path +from typing import List + + +MAIN_INCLUDES = """#include + +""" + +MAIN_PREFIX = """ +// Compiled standard library modules. These should be appended to the existing +// `PyImport_FrozenModules` that ships with CPython. +struct _frozen _PyImport_FrozenModules_torch[] = { +""" + +FAKE_PREFIX = """ +// Compiled standard library modules. These should be appended to the existing +// `PyImport_FrozenModules` that ships with CPython. +struct _frozen _PyImport_FrozenModules[] = { +""" + +MAIN_SUFFIX = """\ + {0, 0, 0} /* sentinel */ +}; +""" + +# Exclude some standard library modules to: +# 1. Slim down the final frozen lib. +# 2. Remove functionality we don't want to support. +DENY_LIST = [ + # Interface to unix databases + "dbm", + # ncurses bindings (terminal interfaces) + "curses", + # Tcl/Tk GUI + "tkinter", + "tkinter", + # Tests for the standard library + "test", + "tests", + "idle_test", + "__phello__.foo.py", + # importlib frozen modules. These are already baked into CPython. 
+ "_bootstrap.py", + "_bootstrap_external.py", +] + +NUM_BYTECODE_FILES = 5 + + +def indent_msg(fn): + @functools.wraps(fn) + def wrapper(*args, **kwargs): + args[0].indent += 1 + ret = fn(*args, **kwargs) + args[0].indent -= 1 + return ret + + return wrapper + + +@dataclass +class FrozenModule: + # The fully qualified module name, e.g. 'foo.bar.baz' + module_name: str + # The name of the C variable that holds the bytecode, e.g. 'M_foo__bar__baz' + c_name: str + # The size of the C variable. Negative if this module is a package. + size: int + # The frozen bytecode + bytecode: bytes + + +class Freezer: + def __init__(self, verbose: bool): + self.frozen_modules: List[FrozenModule] = [] + self.indent: int = 0 + self.verbose: bool = verbose + + def msg(self, path: Path, code: str): + if not self.verbose: + return + # P: package dir + # F: python file + # S: skipped (not a package dir) + # X: skipped (deny-listed) + # N: skipped (not a python file) + for i in range(self.indent): + print(" ", end="") + print(f"{code} {path}") + + def write_bytecode(self, install_root): + """ + Write the `.c` files containing the frozen bytecode. Shard frozen + modules evenly across the files. + """ + bytecode_file_names = [ + f"bytecode_{i}.c" for i in range(NUM_BYTECODE_FILES) + ] + bytecode_files = [open(os.path.join(install_root, name), "w") for name in bytecode_file_names] + it = itertools.cycle(bytecode_files) + for m in self.frozen_modules: + self.write_frozen(m, next(it)) + + for f in bytecode_files: + f.close() + + def write_main(self, install_root, oss): + """ + Write the `main.c` file containing a table enumerating all the + frozen modules. + """ + with open(os.path.join(install_root, "main.c"), "w") as outfp: + outfp.write(MAIN_INCLUDES) + for m in self.frozen_modules: + outfp.write(f"extern unsigned char {m.c_name}[];\n") + + outfp.write(MAIN_PREFIX) + for m in self.frozen_modules: + outfp.write(f'\t{{"{m.module_name}", {m.c_name}, {m.size}}},\n') + outfp.write(MAIN_SUFFIX) + if oss: + outfp.write(FAKE_PREFIX) + outfp.write(MAIN_SUFFIX) + + def write_frozen(self, m: FrozenModule, outfp): + """ + Write a single frozen module's bytecode out to a C variable. + """ + outfp.write(f"unsigned char {m.c_name}[] = {{") + for i in range(0, len(m.bytecode), 16): + outfp.write("\n\t") + for c in bytes(m.bytecode[i : i + 16]): + outfp.write("%d," % c) + outfp.write("\n};\n") + + def compile_path(self, path: Path, top_package_path: Path): + """Generic entry point for compiling a Path object.""" + if path.is_dir(): + self.compile_package(path, top_package_path) + else: + self.compile_file(path, top_package_path) + + @indent_msg + def compile_package(self, path: Path, top_package_path: Path): + """Compile all the files within a Python package dir.""" + assert path.is_dir() + if path.name in DENY_LIST: + self.msg(path, "X") + return + + # Python packages are directories that have __init__.py in them. + is_package_dir = any([child.name == "__init__.py" for child in path.iterdir()]) + if not is_package_dir: + self.msg(path, "S") + return + + self.msg(path, "P") + # Recursively compile all children in this dir + for child in path.iterdir(): + self.compile_path(child, top_package_path) + + def get_module_qualname(self, file_path: Path, top_package_path: Path) -> List[str]: + # `path` looks like 'Lib/foo/bar/baz.py' + + # chop off 'Lib/' to get something that represents a Python module hierarchy. + # e.g. 
'foo/bar/baz.py', which maps to 'foo.bar.baz' + normalized_path = file_path.relative_to(top_package_path.parent) + + if normalized_path.name == "__init__.py": + # Special handling for `__init__.py`. In this case, this file + # specifies that the containing directory should be treated as a package. + # For 'foo/bar/baz/__init__.py': + # - The module name is 'baz' + module_basename = normalized_path.parent.name + # - The parent is foo.bar (need to shave off the 'baz') + module_parent = normalized_path.parent.parent.parts + else: + module_basename = normalized_path.stem + module_parent = normalized_path.parent.parts + return list(module_parent) + [module_basename] + + @indent_msg + def compile_file(self, path: Path, top_package_path: Path): + """ + Compile a Python source file to frozen bytecode. Append the result to + `self.frozen_modules`. + """ + assert path.is_file() + if path.suffix != ".py": + self.msg(path, "N") + return + + if path.name in DENY_LIST: + self.msg(path, "X") + return + + self.msg(path, "F") + module_qualname = self.get_module_qualname(path, top_package_path) + module_mangled_name = "__".join(module_qualname) + c_name = "M_" + module_mangled_name + + with open(path, "r") as src_file: + co = compile(src_file.read(), path, "exec") + + bytecode = marshal.dumps(co) + size = len(bytecode) + if path.name == '__init__.py': + # Python packages are signified by negative size. + size = -size + self.frozen_modules.append( + FrozenModule(".".join(module_qualname), c_name, size, bytecode) + ) + + +parser = argparse.ArgumentParser(description="Compile py source") +parser.add_argument("paths", nargs="*", help="Paths to freeze.") +parser.add_argument("--verbose", action="store_true", help="Print debug logs") +parser.add_argument("--install_dir", help="Root directory for all output files") +parser.add_argument("--fbcode_dir", help="Root directory for all output files") +parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules") + +args = parser.parse_args() + +f = Freezer(args.verbose) + +for p in args.paths: + if args.fbcode_dir: + p = os.path.join(args.fbcode_dir, p) + path = Path(p) + if path.is_dir() and not Path.exists(path / '__init__.py'): + # this 'top level path p' is a standard directory containing modules, + # not a module itself + # each 'mod' could be a dir containing __init__.py or .py file + for mod in path.glob("*"): + f.compile_path(mod, mod) + else: + f.compile_path(path, path) + +f.write_bytecode(args.install_dir) +f.write_main(args.install_dir, args.oss) diff --git a/torch/csrc/deploy/interpreter/hide_symbols.script b/torch/csrc/deploy/interpreter/hide_symbols.script new file mode 100644 index 000000000000..c748c8bfec95 --- /dev/null +++ b/torch/csrc/deploy/interpreter/hide_symbols.script @@ -0,0 +1,5 @@ +INTERPRETER_0.1 { + global: + initialize_interface; + local: *; # hide everything else +}; diff --git a/torch/csrc/deploy/interpreter/interpreter.cpp b/torch/csrc/deploy/interpreter/interpreter.cpp new file mode 100644 index 000000000000..7d685d33435c --- /dev/null +++ b/torch/csrc/deploy/interpreter/interpreter.cpp @@ -0,0 +1,324 @@ +#include + +#define PY_SSIZE_T_CLEAN +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace py::literals; + +// TODO this should come from cmake +#define DEBUG 0 +template +const auto PYOBJ_ASSERT(T obj) { +#if (DEBUG == 1) + if (NULL == obj) { + PyErr_Print(); + } +#endif + 
TORCH_INTERNAL_ASSERT(NULL != obj); +} + +static wchar_t* program; + +#define FOREACH_LIBRARY(_) \ + _(array) \ + _(_asyncio) \ + _(audioop) \ + _(binascii) \ + _(_bisect) \ + _(_blake2) \ + _(_bz2) \ + _(cmath) \ + _(_codecs_cn) \ + _(_codecs_hk) \ + _(_codecs_iso2022) \ + _(_codecs_jp) \ + _(_codecs_kr) \ + _(_codecs_tw) \ + _(_contextvars) \ + _(_crypt) \ + _(_csv) \ + _(_ctypes) \ + _(_ctypes_test) \ + _(_curses) \ + _(_curses_panel) \ + _(_datetime) \ + _(_decimal) \ + _(_elementtree) \ + _(fcntl) \ + _(grp) \ + _(_hashlib) \ + _(_heapq) \ + _(_json) \ + _(_lsprof) \ + _(_lzma) \ + _(math) \ + _(_md5) \ + _(mmap) \ + _(_multibytecodec) \ + _(_multiprocessing) \ + _(nis) \ + _(_opcode) \ + _(ossaudiodev) \ + _(parser) \ + _(_pickle) \ + _(_posixsubprocess) \ + _(pyexpat) \ + _(_queue) \ + _(_random) \ + _(readline) \ + _(resource) \ + _(select) \ + _(_sha1) \ + _(_sha256) \ + _(_sha3) \ + _(_sha512) \ + _(_socket) \ + _(spwd) \ + _(_ssl) \ + _(_struct) \ + _(syslog) \ + _(termios) \ + _(_testbuffer) \ + _(_testcapi) \ + _(_testimportmultiple) \ + _(_testmultiphase) \ + _(unicodedata) \ + _(xxlimited) \ + _(_xxtestfuzz) \ + _(zlib) + +#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void); +FOREACH_LIBRARY(DECLARE_LIBRARY_INIT) +#undef DECLARE_LIBRARY_INIT + +extern "C" __attribute__((visibility("default"))) void initialize_interface( + InterpreterImpl* s) { +#define INITIALIZE_MEMBER(func) s->func = func; + FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER) +#undef INITIALIZE_MEMBER +} + +// These numbers of modules should not change as long as the cpython version +// embedded in the build remains fixed +static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6; +static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680; + +// We need to preserve the existing FrozenModules list, since it includes +// important importlib machinery. This code is adapted from the similar +// `PyImport_ExtendInittab`. +int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) { + struct _frozen *p = nullptr; + size_t a = 0, b = 0, c = 0; + int res = 0; + + /* Count the number of entries in both tables */ + for (a = 0; frozenpython[a].name != nullptr; a++) { + // std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl; + } + for (b = 0; frozentorch[b].name != nullptr; b++) { + // std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl; + } + for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) { + // std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl; + } + + // Num frozen builtins shouldn't change (unless modifying the underlying cpython version) + TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules"); + // Check a+b together since in OSS a is empty and b contains stdlib+torch, while + // in fbcode they are separated due to thirdparty2 frozenpython. + // No fixed number of torch modules to check for, but there should be at least one. 
+ TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules"); + + /* Allocate new memory for the combined table */ + if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) { + size_t size = sizeof(struct _frozen) * (a + b + c + 1); + p = (_frozen*)PyMem_Realloc(p, size); + } + if (p == nullptr) { + return -1; + } + + /* Copy the tables into the new memory */ + memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen)); + memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen)); + memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen)); + PyImport_FrozenModules = p; + return res; +} + +// We need to register a custom finder because we are registering `torch._C` as +// a built-in module, and it will otherwise get skipped by the default importer. +const char* finder = R"RAW( +import sys +# Remove the path-based importer, as we don't want our isolated interpreter to read the file system +sys.meta_path = sys.meta_path[:-1] + +class F: + def find_spec(self, fullname, path, target=None): + if fullname == 'torch._C': + return sys.meta_path[1].find_spec('torch._C', None, None) + return None +sys.meta_path.insert(0, F()) + +# make loader importable +)RAW"; + +const char* sysprint = R"RAW( +import sys +print("exec_prefix:", sys.base_exec_prefix) +print("_base_executable:", sys._base_executable) +print("base_prefix:", sys.base_prefix) +print("exec_prefix:", sys.exec_prefix) +print("executable:", sys.executable) +print("path:", sys.path) +print("prefix:", sys.prefix) + +)RAW"; + +extern "C" PyObject* initModule(void); +extern "C" struct _frozen _PyImport_FrozenModules[]; +extern "C" struct _frozen _PyImport_FrozenModules_torch[]; + +static std::atomic s_id; +std::map forwards; + +__attribute__((constructor)) void init() { + +} + +void startup() { +#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name); + FOREACH_LIBRARY(APPEND_INIT) +#undef APPEND_INIT + PyImport_AppendInittab("torch._C", initModule); + + int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch); + TORCH_INTERNAL_ASSERT(ret == 0); + + PyPreConfig preconfig; + PyPreConfig_InitIsolatedConfig(&preconfig); + PyStatus status = Py_PreInitialize(&preconfig); + TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) + + PyConfig config; + PyConfig_InitIsolatedConfig(&config); + + // Completely blank out the path configuration. This ensures we have complete + // control of how our embedded Python searches for modules, and we will never + // consult the external filesystem. 
See: + // https://docs.python.org/3/c-api/init_config.html#path-configuration + config.site_import = 0; + + status = PyConfig_SetString(&config, &config.base_exec_prefix, L""); + status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy"); + status = PyConfig_SetString(&config, &config.base_prefix, L""); + status = PyConfig_SetString(&config, &config.exec_prefix, L""); + status = PyConfig_SetString(&config, &config.executable, L"torch_deploy"); + status = PyConfig_SetString(&config, &config.prefix, L""); + + + config.module_search_paths_set = 1; + std::array module_search_paths = {}; + status = PyConfig_SetWideStringList( + &config, &config.module_search_paths, 0, module_search_paths.data()); + + status = Py_InitializeFromConfig(&config); + PyConfig_Clear(&config); + TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) + + // Uncomment to debug python config + // PyRun_SimpleString(sysprint); + + PyRun_SimpleString(finder); + // Release the GIL that PyInitialize acquires + PyEval_SaveThread(); +} + +void teardown() { + PyGILState_Ensure(); + + if (Py_FinalizeEx() < 0) { + std::cout << "IT BROKE SO WE ARE EXITING\n"; + exit(120); + } + PyMem_RawFree(program); +} + +__attribute__((destructor)) void deinit() {} + +void run_some_python(const char* code) { + PyGILState_STATE gstate = PyGILState_Ensure(); + + if (PyRun_SimpleString(code) == -1) { + throw std::runtime_error("python eval failed\n"); + } + PyGILState_Release(gstate); +} + +void run_python_file(const char* code) { + PyGILState_STATE gstate = PyGILState_Ensure(); + + FILE* f = fopen(code, "r"); + if (PyRun_SimpleFile(f, code) == -1) { + throw std::runtime_error("python eval failed\n"); + } + fclose(f); + + PyGILState_Release(gstate); +} + + +size_t load_model(const char* filename, bool hermetic) { + PyGILState_STATE gstate = PyGILState_Ensure(); + TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); + std::string code; + + if (hermetic) { + code = fmt::format(R"( +from torch.package import PackageImporter + +i = PackageImporter('{}') +model = i.load_pickle('model', 'model.pkl') +)", filename); + } else { + code = std::string("model = torch.jit.load('") + + std::string(filename) + std::string("')"); + } + py::exec(code); + + auto id = ++s_id; + + PyGILState_Release(gstate); + return id; +} + +at::Tensor forward_model(size_t model_id, at::Tensor const & input) { + at::Tensor output; + PyGILState_STATE gstate = PyGILState_Ensure(); + { + TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); + auto forward = py::globals()["model"].attr("forward"); + + py::object py_output = forward(input); + // TODO is this going to leak? 
+ // added it to prevent crash wehn using 'output' tensor in callee of + // forward() + py_output.inc_ref(); + output = py::cast(py_output); + } + + PyGILState_Release(gstate); + + return output; + // return input; +} diff --git a/torch/csrc/deploy/interpreter/interpreter.h b/torch/csrc/deploy/interpreter/interpreter.h new file mode 100644 index 000000000000..29e435e44970 --- /dev/null +++ b/torch/csrc/deploy/interpreter/interpreter.h @@ -0,0 +1,67 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +class Interpreter : public InterpreterImpl { + private: + std::string library_name_; + void* handle_; + + public: + Interpreter() : handle_(nullptr) { + char library_name[L_tmpnam]; + library_name_ = library_name; + char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH"); + if (libinterpreter_path == nullptr) { + throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env."); + } + std::tmpnam(library_name); + { + std::ifstream src(libinterpreter_path, std::ios::binary); + std::ofstream dst(library_name, std::ios::binary); + dst << src.rdbuf(); + } + handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY); + if (!handle_) { + throw std::runtime_error(dlerror()); + } + + // technically, we can unlike the library right after dlopen, and this is + // better for cleanup because even if we crash the library doesn't stick + // around. However, its crap for debugging because gdb can't find the + // symbols if the library is no longer present. + unlink(library_name_.c_str()); + + void* initialize_interface = dlsym(handle_, "initialize_interface"); + if (!initialize_interface) { + throw std::runtime_error("Unable to load initialize_interface function from interpreter lib."); + } + ((void (*)(InterpreterImpl*))initialize_interface)(this); + + this->startup(); + + // the actual torch loading process is not thread safe, by doing it + // in the constructor before we have multiple worker threads, then we + // ensure it doesn't race. + run_some_python("import torch"); + } + ~Interpreter() { + if (handle_) { + this->teardown(); + + // it segfaults its face off trying to unload, but it's not clear + // if this is something we caused of if libtorch_python would also do the + // same if it were opened/closed a lot... + dlclose(handle_); + } + } + Interpreter(const Interpreter&) = delete; +}; diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.h b/torch/csrc/deploy/interpreter/interpreter_impl.h new file mode 100644 index 000000000000..82326bd370f1 --- /dev/null +++ b/torch/csrc/deploy/interpreter/interpreter_impl.h @@ -0,0 +1,26 @@ +#pragma once +#include + +// NOTE- if adding new interface functions, +// update interpreter.cpp initialize_interface. 
+size_t load_model(const char* model_file, bool hermetic=false); +at::Tensor forward_model(size_t model_id, at::Tensor const & input); +void run_some_python(const char* code); +void startup(); +void teardown(); +void run_python_file(const char* code); + + +#define FOREACH_INTERFACE_FUNCTION(_) \ + _(load_model) \ + _(forward_model) \ + _(run_some_python) \ + _(startup) \ + _(teardown) \ + _(run_python_file) + +struct InterpreterImpl { +#define DEFINE_POINTER(func) decltype(&::func) func; + FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER) +#undef DEFINE_POINTER +}; diff --git a/torch/csrc/deploy/interpreter/test_main.cpp b/torch/csrc/deploy/interpreter/test_main.cpp new file mode 100644 index 000000000000..6107267c9f29 --- /dev/null +++ b/torch/csrc/deploy/interpreter/test_main.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + + int rc = RUN_ALL_TESTS(); + + return rc; +} + +TEST(Interpreter, Sanity) { + ASSERT_TRUE(true); +} + +TEST(Interpreter, Hello) { + Interpreter interp; + interp.run_some_python("print('hello from first interpeter!')"); + + Interpreter interp2; + interp2.run_some_python("print('hello from second interpeter!')"); +} + +void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) { + Interpreter interp; + // Test + auto model_id = interp.load_model(model_filename, false); + at::Tensor output = interp.forward_model(model_id, input); + + // Reference + auto ref_model = torch::jit::load(model_filename); + std::vector ref_inputs; + ref_inputs.emplace_back(torch::jit::IValue(input)); + at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor(); + + ASSERT_TRUE(ref_output.equal(output)); +} + +TEST(Interpreter, SimpleModel) { + char* model_path = std::getenv("SIMPLE_MODEL_PATH"); + ASSERT_NE(model_path, nullptr); + const int A = 10, B = 20; + compare_torchpy_jit( + model_path, torch::ones(at::IntArrayRef({A, B}))); +} diff --git a/torch/csrc/deploy/interpreter/third_party/README.md b/torch/csrc/deploy/interpreter/third_party/README.md new file mode 100644 index 000000000000..2c5d9241d2bb --- /dev/null +++ b/torch/csrc/deploy/interpreter/third_party/README.md @@ -0,0 +1,2 @@ +Python libraries that we want to package along with the Python implementation +bundled in libinterpreter. diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 7286387644ad..faddb8cb16e2 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -113,6 +113,10 @@ def _lazy_call(callable): if is_initialized(): callable() else: + # TODO(torch_deploy): this accesses linecache, which attempts to read the + # file system to get traceback info. Patch linecache or do something + # else here if this ends up being important. 
+ # Don't store the actual traceback to avoid memory cycle _queued_calls.append((callable, traceback.format_stack())) diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py index df6a3793e90d..73eb7f93cf1c 100644 --- a/torch/utils/__init__.py +++ b/torch/utils/__init__.py @@ -2,6 +2,7 @@ from .throughput_benchmark import ThroughputBenchmark import os.path as _osp +import sys # Set the module for a given object for nicer printing def set_module(obj, mod): @@ -9,5 +10,8 @@ def set_module(obj, mod): raise TypeError("The mod argument should be a string") obj.__module__ = mod -#: Path to folder containing CMake definitions for Torch package -cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake') +if sys.executable == "torch_deploy": + # not valid inside torch_deploy interpreter, no paths exists for frozen modules + cmake_prefix_path = None +else: + cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake') From 1c9347c6666d0bb8b9793b504e9cb597b75f1401 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 26/41] [ONNX] Use parameter values in onnx shape inference (#49706) (#50905) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50905 Adds an additional run of onnx shape inference after constant folding, since initializer may have changed and affected shape inference. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050881 Pulled By: SplitInfinity fbshipit-source-id: 9e5d69c52b647133cd3a0781988e2ad1d1a9c09d --- ...erators.test_upsample_nearest_scale.expect | 8 +- ..._nearest_scale_default_scale_factor.expect | 8 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 3 +- torch/_C/__init__.pyi.in | 4 +- .../jit/passes/onnx/shape_type_inference.cpp | 95 ++++++++++++------- .../jit/passes/onnx/shape_type_inference.h | 7 +- torch/csrc/jit/python/init.cpp | 12 ++- torch/onnx/utils.py | 26 +++-- 8 files changed, 102 insertions(+), 61 deletions(-) diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect index 5355daf4f3ca..67d765831c1b 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect @@ -50,16 +50,16 @@ graph { elem_type: 1 shape { dim { - dim_param: "Upsample4_dim_0" + dim_value: 1 } dim { - dim_param: "Upsample4_dim_1" + dim_value: 2 } dim { - dim_param: "Upsample4_dim_2" + dim_value: 6 } dim { - dim_param: "Upsample4_dim_3" + dim_value: 8 } } } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect index 5355daf4f3ca..67d765831c1b 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect @@ -50,16 +50,16 @@ graph { elem_type: 1 shape { dim { - dim_param: "Upsample4_dim_0" + dim_value: 1 } dim { - dim_param: "Upsample4_dim_1" + dim_value: 2 } dim { - dim_param: "Upsample4_dim_2" + dim_value: 6 } dim { - dim_param: "Upsample4_dim_3" + dim_value: 8 } } } diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index d3d4ebef5f61..6cabcd728085 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -689,7 
+689,7 @@ def forward(self, input): # Without empty optional arguments dictionary x = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (x,), input_names=['input_x']) + self.run_test(NoOptionalModel(), (x,), input_names=['input_x']) # With empty optional arguments dictionary y = torch.randn(2, 3) self.run_test(NoOptionalModel(), (y, {})) @@ -3668,6 +3668,7 @@ def forward(self, input): self.run_test(SplitModel2(), x) @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() def test_chunk(self): class ChunkModel(torch.nn.Module): def __init__(self): diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index a7ebf35cc0b6..6ebae6b80c1f 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -261,7 +261,7 @@ def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... -def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, opset_version: _int) -> None: ... +def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ... def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool = False) -> None: ... def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph) -> None: ... def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... @@ -298,7 +298,7 @@ def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IV def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... def _jit_pass_filter_non_tensor_arguments(params: Dict[str, IValue]) -> Dict[str, Tensor]: ... def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... -def _jit_pass_onnx_node_shape_type_inference(n: Node, opset_version: _int) -> None: ... +def _jit_pass_onnx_node_shape_type_inference(n: Node, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ... def _jit_pass_onnx_block( old_block: Block, new_block: Block, diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index bc1cabf81ddf..8d2ff1e63cfc 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -258,34 +258,50 @@ Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { } // Clone the node n for the new graph. -Node* CloneNodeToGraph(Node* n, std::shared_ptr n_graph) { - auto clone_node = n_graph->createClone(n, [&n_graph](Value* v) { - auto v_n = v->node(); - switch (v_n->kind()) { - case ::c10::onnx::Constant: { - // Clone the input if it is constant. - auto constant_n = n_graph->insertNode( - n_graph->createClone(v_n, [](Value* v) { return v; })); - return constant_n->output(); - } - case ::c10::prim::ListConstruct: { - return CloneValueFromListConstruct(v, n_graph); - } - case ::c10::prim::PackPadded: { - auto input = n_graph->addInput(); - input->copyMetadata(v_n->input(0)); - return input; - } - default: { - // If the input is not constant, we cannot depend on its value - // in shape inference. Set it to graph input in the new graph, - // and copy over metadata, such as datatype and shape. 
- auto input = n_graph->addInput(); - input->copyMetadata(v); - return input; - } - } - }); +Node* CloneNodeToGraph( + Node* n, + std::shared_ptr n_graph, + const ParamMap& params_dict) { + auto vals_to_params_map = + buildValueToParamsMap(n->owningGraph()->block(), params_dict); + auto clone_node = + n_graph->createClone(n, [&n_graph, &vals_to_params_map](Value* v) { + auto v_n = v->node(); + switch (v_n->kind()) { + case ::c10::onnx::Constant: { + // Clone the input if it is constant. + auto constant_n = n_graph->insertNode( + n_graph->createClone(v_n, [](Value* v) { return v; })); + return constant_n->output(); + } + case ::c10::prim::ListConstruct: { + return CloneValueFromListConstruct(v, n_graph); + } + case ::c10::prim::PackPadded: { + auto input = n_graph->addInput(); + input->copyMetadata(v_n->input(0)); + return input; + } + default: { + if (vals_to_params_map.find(v) != vals_to_params_map.end()) { + // If the input is a parameter, insert a constant of its value as + // input. + auto val = vals_to_params_map.find(v)->second.second.toTensor(); + return n_graph + ->insertNode(n_graph->create(::c10::onnx::Constant) + ->t_(attr::value, val)) + ->output(); + } else { + // If the input is not constant, we cannot depend on its value + // in shape inference. Set it to graph input in the new graph, + // and copy over metadata, such as datatype and shape. + auto input = n_graph->addInput(); + input->copyMetadata(v); + return input; + } + } + } + }); return clone_node; } @@ -433,19 +449,25 @@ void FetchBlockInputMetadataFromParent(Block* b) { } } -void ONNXShapeTypeInference(Block* b, int opset_version) { +void ONNXShapeTypeInference( + Block* b, + const ParamMap& params_dict, + int opset_version) { FetchBlockInputMetadataFromParent(b); for (auto n : b->nodes()) { for (auto subblock : n->blocks()) { - ONNXShapeTypeInference(subblock, opset_version); + ONNXShapeTypeInference(subblock, params_dict, opset_version); } - ONNXShapeTypeInference(n, opset_version); + ONNXShapeTypeInference(n, params_dict, opset_version); } } } // namespace -void ONNXShapeTypeInference(Node* n, int opset_version) { +void ONNXShapeTypeInference( + Node* n, + const ParamMap& params_dict, + int opset_version) { GRAPH_UPDATE( "Running ONNX shape inference for node: ", n->kind().toDisplayString()); if (!IsSupportedNode(n)) { @@ -454,7 +476,7 @@ void ONNXShapeTypeInference(Node* n, int opset_version) { // Create a Graph containing only the single node n. // This graph is later converted to ONNX to run shape inference. auto n_graph = std::make_shared(); - auto clone_node = CloneNodeToGraph(n, n_graph); + auto clone_node = CloneNodeToGraph(n, n_graph, params_dict); n_graph->insertNode(clone_node); // Register all node outputs as graph outputs. 
@@ -690,8 +712,11 @@ void ONNXAssignOutputShape( Py_DECREF(py_obj); } -void ONNXShapeTypeInference(std::shared_ptr& graph, int opset_version) { - ONNXShapeTypeInference(graph->block(), opset_version); +void ONNXShapeTypeInference( + std::shared_ptr& graph, + const ParamMap& params_dict, + int opset_version) { + ONNXShapeTypeInference(graph->block(), params_dict, opset_version); } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.h b/torch/csrc/jit/passes/onnx/shape_type_inference.h index bac7e2439ca9..69fbff1d175c 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.h +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace torch { @@ -34,7 +35,10 @@ TORCH_API void ONNXAssignOutputShape( // The node must have ONNX namespace, and is valid ONNX node accroding to spec. // On successful ONNX shape inference runs, the function updates output types of // n with inferred shape and type. Otherwise n is unchanged. -TORCH_API void ONNXShapeTypeInference(Node* n, int opset_version); +TORCH_API void ONNXShapeTypeInference( + Node* n, + const ParamMap& params_dict, + int opset_version); // Utilize ONNX Shape Inference for graph. // Internally calls ONNXShapeTypeInference for each node, to achieve more @@ -42,6 +46,7 @@ TORCH_API void ONNXShapeTypeInference(Node* n, int opset_version); // the entire graph. TORCH_API void ONNXShapeTypeInference( std::shared_ptr& g, + const ParamMap& params_dict, int opset_version); } // namespace jit diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 2a91bd497e7b..197af7179361 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -210,13 +210,17 @@ void initJITBindings(PyObject* module) { PrepareInplaceOpsForONNX) .def( "_jit_pass_onnx_node_shape_type_inference", - [](Node* n, int opset_version) { - ONNXShapeTypeInference(n, opset_version); + [](Node* n, + std::map& params_dict, + int opset_version) { + ONNXShapeTypeInference(n, params_dict, opset_version); }) .def( "_jit_pass_onnx_graph_shape_type_inference", - [](std::shared_ptr& graph, int opset_version) { - ONNXShapeTypeInference(graph, opset_version); + [](std::shared_ptr& graph, + std::map& params_dict, + int opset_version) { + ONNXShapeTypeInference(graph, params_dict, opset_version); }) .def("_jit_pass_onnx_set_dynamic_input_shape", ONNXSetDynamicInputShape) .def("_jit_pass_fuse", FuseGraph) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 59d45de1f553..7a483c2f728b 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -29,6 +29,8 @@ def is_in_onnx_export(): global __IN_ONNX_EXPORT return __IN_ONNX_EXPORT +# Skip check due to cannot import IValue from torch._C +_params_dict = {} # type: ignore @contextlib.contextmanager def select_model_mode_for_export(model, mode): @@ -224,7 +226,7 @@ def _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=Fa torch._C._jit_pass_lint(graph) from torch.onnx.symbolic_helper import _onnx_shape_inference, _export_onnx_opset_version if _onnx_shape_inference: - torch._C._jit_pass_onnx_graph_shape_type_inference(graph, _export_onnx_opset_version) + torch._C._jit_pass_onnx_graph_shape_type_inference(graph, params_dict, _export_onnx_opset_version) return graph @@ -358,7 +360,7 @@ def _trace(func, args, operator_export_type, return_outs=False): torch.jit._get_trace_graph(func, args, strict=False, _force_outplace=False, _return_inputs_states=True) 
warn_on_static_input_change(inputs_states) - trace_graph = _optimize_graph(trace_graph, operator_export_type) + trace_graph = _optimize_graph(trace_graph, operator_export_type, params_dict={}) if return_outs: return trace_graph, torch_out return trace_graph @@ -422,6 +424,11 @@ def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): torch._C._jit_pass_onnx_function_substitution(graph) return graph, params, torch_out +def _get_named_param_dict(graph, params): + input_and_param_names = [val.debugName() for val in graph.inputs()] + param_names = input_and_param_names[len(input_and_param_names) - len(params):] + _params_dict = dict(zip(param_names, params)) + return _params_dict def _model_to_graph(model, args, verbose=False, input_names=None, output_names=None, @@ -443,9 +450,7 @@ def _model_to_graph(model, args, verbose=False, _retain_param_name, use_new_jit_passes) - input_and_param_names = [val.debugName() for val in graph.inputs()] - param_names = input_and_param_names[len(input_and_param_names) - len(params):] - params_dict = dict(zip(param_names, params)) + params_dict = _get_named_param_dict(graph, params) graph = _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=_disable_torch_constant_prop, @@ -479,9 +484,7 @@ def _model_to_graph(model, args, verbose=False, flatten_args, _ = torch._C._jit_flatten(args) assert len(params) + len(flatten_args) == sum(1 for _ in graph.inputs()) - input_and_param_names = [val.debugName() for val in graph.inputs()] - param_names = input_and_param_names[len(input_and_param_names) - len(params):] - params_dict = dict(zip(param_names, params)) + params_dict = _get_named_param_dict(graph, params) if training is None or training == TrainingMode.EVAL: params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) @@ -491,6 +494,9 @@ def _model_to_graph(model, args, verbose=False, _export_onnx_opset_version) torch._C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + if _onnx_shape_inference: + torch._C._jit_pass_onnx_graph_shape_type_inference(graph, params_dict, _export_onnx_opset_version) + params_dict = torch._C._jit_pass_onnx_eliminate_unused_items(graph, params_dict) # For ONNX opset < 9, constants only have three data types: float16, float, double. @@ -878,7 +884,7 @@ def const_if_tensor(arg): from torch.onnx.symbolic_helper import _onnx_shape_inference if _onnx_shape_inference: from torch.onnx.symbolic_helper import _export_onnx_opset_version as opset_version - torch._C._jit_pass_onnx_node_shape_type_inference(n, opset_version) + torch._C._jit_pass_onnx_node_shape_type_inference(n, _params_dict, opset_version) if outputs == 1: return n.output() @@ -1032,7 +1038,7 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor # Process Loop and If after subblock is converted. from torch.onnx.symbolic_helper import _onnx_shape_inference if _onnx_shape_inference: - torch._C._jit_pass_onnx_node_shape_type_inference(new_node, opset_version) + torch._C._jit_pass_onnx_node_shape_type_inference(new_node, _params_dict, opset_version) return new_op_outputs else: symbolic_name = 'prim_' + op_name From 7e4c95695539c0fe0675ab0c91de761247143d8a Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 27/41] [ONNX] Support opset13 Squeeze and Unsqueeze (#50150) (#50906) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50906 In opset 13, squeeze/unsqueeze is updated to take axes as input, instead of attribute. 
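
For illustration only (not part of the original change): a small sketch of the practical difference, assuming a torch build that includes this support and an installed `onnx` package. The output file names are arbitrary. Exporting the same squeeze at opset 11 and 13 shows `axes` moving from a node attribute to a second node input.

```
import torch
import onnx

class SqueezeModel(torch.nn.Module):
    def forward(self, x):
        return torch.squeeze(x, dim=1)

x = torch.randn(2, 1, 4)
for opset in (11, 13):
    fname = f"squeeze_opset{opset}.onnx"
    torch.onnx.export(SqueezeModel(), x, fname, opset_version=opset)
    node = next(n for n in onnx.load(fname).graph.node if n.op_type == "Squeeze")
    attrs = [a.name for a in node.attribute]
    # Opset 11: axes shows up in `attrs`; opset 13: axes arrives as an extra input.
    print(opset, "attributes:", attrs, "num inputs:", len(node.input))
```
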
Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050883 Pulled By: SplitInfinity fbshipit-source-id: 7b5faf0e016d476bc75cbf2bfee6918d77e8aecd --- test/onnx/test_pytorch_onnx_onnxruntime.py | 119 +-------------- test/onnx/test_utility_funs.py | 4 +- torch/csrc/jit/passes/onnx/helper.cpp | 23 +++ torch/csrc/jit/passes/onnx/helper.h | 9 ++ torch/csrc/jit/passes/onnx/peephole.cpp | 137 +++++++++--------- .../jit/passes/onnx/shape_type_inference.cpp | 28 ++-- torch/onnx/symbolic_helper.py | 24 ++- torch/onnx/symbolic_opset10.py | 14 +- torch/onnx/symbolic_opset11.py | 35 +++-- torch/onnx/symbolic_opset12.py | 8 +- torch/onnx/symbolic_opset9.py | 70 ++++----- 11 files changed, 205 insertions(+), 266 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 6cabcd728085..5b7091b6612f 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -302,7 +302,6 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) self.run_model_test_with_external_data(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. def test_mobilenet_v2_with_external_data(self): model = torchvision.models.mobilenet_v2(pretrained=True) @@ -444,7 +443,6 @@ def get_test_images(self): images = [image] return images - @skipIfUnsupportedOpsetVersion([13]) def test_paste_mask_in_image(self): # disable profiling torch._C._jit_set_profiling_executor(False) @@ -525,22 +523,18 @@ def test_keypoint_rcnn(self): dynamic_axes={"images_tensors": [0, 1, 2]}, rtol=1e-3, atol=1e-5) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_RNN_TANH(self): self.run_word_language_model("RNN_TANH") - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_RNN_RELU(self): self.run_word_language_model("RNN_RELU") - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_LSTM(self): self.run_word_language_model("LSTM") - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_word_language_model_GRU(self): self.run_word_language_model("GRU") @@ -768,7 +762,6 @@ def forward(self, x, y=None, z=None): z = torch.randn(2, 3) self.run_test(Model(), (x, None, z)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_cste_script(self): class MyModel(torch.jit.ScriptModule): @@ -1037,44 +1030,37 @@ def forward(self, x): else: self.run_test(Squeeze(d), x1) - @skipIfUnsupportedOpsetVersion([13]) def test_squeeze_without_no_op(self): x = torch.randn(2, 1, 4) self.squeeze_model_tests(1, x, None) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_dynamic(self): x_squeeze = torch.randn(2, 1, 4) x_noop = torch.randn(2, 2, 3) self.squeeze_model_tests(1, x_squeeze, x_noop) - @skipIfUnsupportedOpsetVersion([13]) def test_squeeze_neg_without_no_op(self): x = torch.randn(2, 1, 4) self.squeeze_model_tests(-2, x, None) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_neg(self): x_squeeze = torch.randn(2, 1, 4) x_noop = torch.randn(2, 2, 3) self.squeeze_model_tests(-2, x_squeeze, x_noop) - @skipIfUnsupportedOpsetVersion([13]) def test_squeeze_all_dims(self): x_squeeze = torch.randn(2, 1, 4) x_noop = torch.randn(2, 2, 3) self.squeeze_model_tests(None, x_squeeze, x_noop) - @skipIfUnsupportedOpsetVersion([13]) 
@skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_no_op(self): x_noop = torch.randn(2, 1, 4) x_squeeze = torch.randn(2, 2, 1) self.squeeze_model_tests(2, x_noop, x_squeeze) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_squeeze_runtime_dim(self): class Squeeze(torch.nn.Module): @@ -1088,7 +1074,14 @@ def forward(self, d1, d2): self.run_test(Squeeze(), (d1, d4), test_with_inputs=[(d3, d4)]) self.run_test(Squeeze(), (d3, d4), test_with_inputs=[(d1, d3)]) - @skipIfUnsupportedOpsetVersion([13]) + def test_squeeze(self): + class Squeeze(torch.nn.Module): + def forward(self, x): + return torch.squeeze(x, dim=-2) + + x = torch.randn(2, 1, 4) + self.run_test(Squeeze(), x) + def test_unsqueeze(self): class Unsqueeze(torch.nn.Module): def forward(self, x): @@ -1288,7 +1281,6 @@ def forward(self, x, y): y = torch.randn(2, 3, 4) self.run_test(FloorDivModule(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_floordiv(self): class FloordivModule(torch.nn.Module): @@ -1366,7 +1358,6 @@ def forward(self, x, y): y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.double) self.run_test(torch.jit.script(DivModule()), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_trace(self): class MyModule(torch.nn.Module): def forward(self, x): @@ -1375,7 +1366,6 @@ def forward(self, x): x = torch.randn(3) self.run_test(MyModule(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_neg(self): class NegSlice(torch.nn.Module): def forward(self, x): @@ -1384,7 +1374,6 @@ def forward(self, x): x = torch.randn(3, 4, 5) self.run_test(NegSlice(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_neg_large(self): class NegSlice(torch.nn.Module): def forward(self, x): @@ -1393,7 +1382,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_test(NegSlice(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_slice_neg_large_negone(self): class NegSlice(torch.nn.Module): def forward(self, x): @@ -1402,7 +1390,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_test(NegSlice(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_slice_with_input_index(self): class InputIndexSlice(torch.nn.Module): @@ -1414,7 +1401,6 @@ def forward(self, x, y): y = torch.rand((22, 256)) self.run_test(InputIndexSlice(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) @disableScriptTest() # scripting tuple/list append def test_slice_dynamic(self): @@ -1433,7 +1419,6 @@ def forward(self, x): dynamic_axes={'input_1': [0, 1, 2], 'output_1': [0, 1, 2]}) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) def test_slice_dynamic_script(self): class DynamicSliceModel(torch.jit.ScriptModule): @@ -1444,7 +1429,6 @@ def forward(self, x): x = torch.rand(1, 2) self.run_test(DynamicSliceModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) def test_slice_dynamic_shape_script(self): class DynamicSliceModel(torch.nn.Module): @@ -1454,7 +1438,6 @@ def forward(self, x): x = torch.rand(1, 2, 3, 4) self.run_test(DynamicSliceModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) @disableScriptTest() # scripting tuple/list append def test_slice_dynamic_to_end(self): @@ -1561,7 +1544,6 @@ def forward(self, end): x = torch.tensor(6.2, dtype=torch.float) self.run_test(ArangeModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def 
test_size(self): class SizeModel(torch.nn.Module): @@ -1571,7 +1553,6 @@ def forward(self, input): x = torch.randn(5, 3, 2) self.run_test(SizeModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # x.stride() not scriptable def test_as_strided(self): @@ -1586,7 +1567,6 @@ def forward(self, x): x = torch.randn(5, 8, 7) self.run_test(Model(), x) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() # Ellipses followed by tensor indexing not scriptable def test_tensor_index_advanced_indexing_ellipsis(self): class MyModel(torch.nn.Module): @@ -1596,7 +1576,6 @@ def forward(self, input): m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), (m1,)) - @skipIfUnsupportedOpsetVersion([13]) def test_tensor_index_advanced_indexing(self): class MyModel(torch.nn.Module): def forward(self, input): @@ -1617,7 +1596,6 @@ def forward(self, input): self.run_test(MyModel(), (m1,)) - @skipIfUnsupportedOpsetVersion([13]) def test_tensor_index_advanced_indexing_consecutive(self): class MyModel(torch.nn.Module): def forward(self, input): @@ -1626,7 +1604,6 @@ def forward(self, input): m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), (m1,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put(self): class IndexPutModel(torch.nn.Module): @@ -1639,7 +1616,6 @@ def forward(self, x, ind, update): update = torch.ones(4) self.run_test(IndexPutModel(), (x, ind, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_accumulate(self): class IndexPutModel(torch.nn.Module): @@ -1651,7 +1627,6 @@ def forward(self, x, ind, update): update = torch.ones(4) self.run_test(IndexPutModel(), (x, ind, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_slice_index(self): class IndexPutModel(torch.nn.Module): @@ -1726,7 +1701,6 @@ def forward(self, x, update): update = torch.arange(3 * 5).to(torch.float).view(3, 5) self.run_test(IndexPutModel8(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # Ellipses followed by tensor indexing not scriptable def test_index_put_ellipsis(self): @@ -1748,7 +1722,6 @@ def forward(self, x, update): update = torch.randn(4, 1, 3, 2) self.run_test(IndexPutModel2(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_loop(self): @torch.jit.script @@ -1831,7 +1804,6 @@ def forward(self, x, ind, data): data = torch.randn(4) self.run_test(CopyModel4(), (x, ind, data)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) def test_copy_tracing(self): @@ -1844,7 +1816,6 @@ def forward(self, x, data): update = torch.randn(1, 2) self.run_test(CopyModel(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_copy_ellipsis(self): class CopyModel(torch.nn.Module): @@ -1860,7 +1831,6 @@ def forward(self, x, update): update = torch.ones(1) self.run_test(CopyModel(), (x, update)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) # TODO: Limited scripting support with ellipsis indexing. # Due to dependency on input tensor rank being known. 
@@ -1899,7 +1869,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Rand(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_random_dynamic_size(self): class RandN(torch.nn.Module): @@ -2062,12 +2031,10 @@ def _interpolate_tests(self, is_upsample): self._interpolate_script(xi, mode_i, False, is_upsample, True) self._interpolate_script(xi, mode_i, False, is_upsample) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_interpolate_upsample(self): self._interpolate_tests(True) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() @skipIfUnsupportedMinOpsetVersion(9) def test_interpolate_function_substitution(self): @@ -2098,13 +2065,11 @@ def forward(self, x): self.run_test(TracingModule(), (x,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) @disableScriptTest() def test_interpolate_downsample(self): self._interpolate_tests(False) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() def test_interpolate_no_shape(self): @@ -2120,7 +2085,6 @@ def forward(self, x, y): y = torch.randn(16, 16, requires_grad=True) self.run_test(MyModel(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) def test_interpolate_adaptive_pooling_error(self): x = torch.randn(1, 2, 6, requires_grad=True) with self.assertRaises(RuntimeError) as cm: @@ -2129,7 +2093,6 @@ def test_interpolate_adaptive_pooling_error(self): with self.assertRaises(RuntimeError) as cm: self._interpolate(x, "area", False, True) - @skipIfUnsupportedOpsetVersion([13]) def test_groupnorm(self): model = torch.nn.GroupNorm(3, 6, 0.002) x = torch.randn(4, 6, 180, 180, 180) @@ -2143,7 +2106,6 @@ def test_groupnorm(self): x = torch.randn(4, 6, 180, 180) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_groupnorm_noaffine(self): model = torch.nn.GroupNorm(4, 8, 0.002, affine=False) @@ -2158,7 +2120,6 @@ def test_groupnorm_noaffine(self): x = torch.randn(4, 6, 180, 180) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_listunpack(self): class ListUnpack(torch.jit.ScriptModule): @@ -2462,7 +2423,6 @@ def forward(self, input, input2): input2 = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) self.run_test(BitshiftModel(), (input, input2)) - @skipIfUnsupportedOpsetVersion([13]) def test_narrow(self): class NarrowModel(torch.nn.Module): def forward(self, input): @@ -2471,7 +2431,6 @@ def forward(self, input): x = torch.randn(3, 3, requires_grad=True) self.run_test(NarrowModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_narrow_dynamic(self): class NarrowModel(torch.nn.Module): @@ -2481,7 +2440,6 @@ def forward(self, input): x = torch.randn(3, 3, requires_grad=True) self.run_test(NarrowModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_index_fill(self): class IndexFillModel(torch.nn.Module): @@ -2492,7 +2450,6 @@ def forward(self, input): x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(IndexFillModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_index_copy(self): class IndexCopyModel(torch.nn.Module): @@ -2734,7 +2691,6 @@ def forward(self, input, indices): indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(GatherModel(), input=(input, indices)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def 
test_expand(self): class ExpandModel(torch.nn.Module): @@ -2894,7 +2850,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(Model(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # scripting prim_dtype def test_lstm_no_hidden(self): @@ -2909,7 +2864,6 @@ def forward(self, x): input = torch.randn((10, 16, 16)) self.run_test(LSTMModel(), (input,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # scripting prim_dtype def test_lstm_proj_no_hidden(self): @@ -2926,7 +2880,6 @@ def forward(self, x): self.run_test(LSTMModel(), (input,)) @skipIfUnsupportedMinOpsetVersion(9) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_lstm(self): model = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) @@ -2935,7 +2888,6 @@ def test_lstm(self): c0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE) self.run_test(model, (input, (h0, c0))) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_lstm_default_init_state(self): @@ -2943,7 +2895,6 @@ def test_lstm_default_init_state(self): input = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) self.run_test(model, input) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # LSTMModel model not scriptable def test_lstm_fixed_batch_size(self): @@ -2965,7 +2916,6 @@ def forward(self, input): input2 = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) self.run_test(LSTMModel(), input, fixed_batch_size=True, test_with_inputs=[input2]) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_lstm_post_fix_init_state(self): @@ -2990,7 +2940,6 @@ def forward(self, input): self.run_test(model, input, dynamic_axes={'input' : {0 : 'seq', 1 : 'batch'}}, test_with_inputs=[input2]) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_lstm_constant_folding(self): class LstmNet(torch.nn.Module): @@ -3018,7 +2967,6 @@ def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size model2, input2 = get_LstmNet_model_and_inputs(5, 4, 3, batch_size2, 7, False) self.run_test(model2, input2, do_constant_folding=True) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_lstm_no_bias(self): @@ -3044,7 +2992,6 @@ def get_LstmNet_model_and_inputs(num_layers, bidirectional): for model, input in models_and_inputs: self.run_test(model, input) - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() def test_rnn_no_bias(self): def make_model(layers, packed_sequence): @@ -3084,7 +3031,6 @@ def make_input(batch_size, layers, packed_sequence): for model, input in zip(models, inputs): self.run_test(model, input, batch_size=RNN_BATCH_SIZE) - @skipIfUnsupportedOpsetVersion([13]) def test_gru_no_bias(self): class GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): @@ -3114,7 +3060,6 @@ def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, for model, input in models_and_inputs: self.run_test(model, input, do_constant_folding=True) - @skipIfUnsupportedOpsetVersion([13]) def test_gru_constant_folding(self): class GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): @@ -3377,7 +3322,6 @@ def test_argmin_argmax_select_last_index(self): input = torch.ones(7, 3, 5) 
self._argmin_argmax_model(input) - @skipIfUnsupportedOpsetVersion([13]) def test_repeat(self): class RepeatModel(torch.nn.Module): def forward(self, x, y): @@ -3397,7 +3341,6 @@ def forward(self, input): x = torch.randint(10, (4, 2, 3, 4), dtype=torch.int32) self.run_test(ViewModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_view_dynamic(self): class ViewModel(torch.nn.Module): def forward(self, input, other): @@ -3407,7 +3350,6 @@ def forward(self, input, other): shape = torch.randn(6, 4) self.run_test(ViewModel(), (x, shape)) - @skipIfUnsupportedOpsetVersion([13]) def test_view_dynamic_zero_dim(self): class ViewModel(torch.nn.Module): def forward(self, input): @@ -3567,7 +3509,6 @@ def forward(self, input): self.run_test(LenModel(), x, input_names=['input'], dynamic_axes={'input': {0: 'seq'}}, test_with_inputs=(torch.randn(5, 5),)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_len_list(self): class LenListModel(torch.jit.ScriptModule): @@ -3618,7 +3559,6 @@ def forward(self, input): x = torch.randn(5, 4, 3) self.run_test(SplitModel3(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() def test_split_size_as_list(self): @@ -3635,7 +3575,6 @@ def forward(self, input, split_sizes: List[int]): split_sizes = [torch.tensor(2), torch.tensor(4)] self.run_test(SplitModel(), (x, split_sizes)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_split_size_with_slice(self): class SplitModule(torch.nn.Module): @@ -3707,7 +3646,6 @@ def forward(self, x): x = torch.randn(4, 5, 6) self.run_test(ConcatDynamicModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_stack(self): class StackModel(torch.nn.Module): def forward(self, x, y, z): @@ -3718,7 +3656,6 @@ def forward(self, x, y, z): z = torch.randn(3, 4, 5) self.run_test(StackModel(), (x, y, z)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_stack_dynamic(self): class StackDynamicModel(torch.jit.ScriptModule): @@ -3795,7 +3732,6 @@ def forward(self, x): x = torch.randn(5, 3, 3) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_loop_multi_dim(self): class LoopMultiDimModel(torch.jit.ScriptModule): @@ -3810,7 +3746,6 @@ def forward(self, x, y): y = torch.ones(1, dtype=torch.long) self.run_test(model, (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_list(self): class ListModel(torch.jit.ScriptModule): @@ -3832,7 +3767,6 @@ def forward(self, x): inputs = torch.randn(16, 1) self.run_test(model, inputs) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_factories(self): class TensorFactory(torch.nn.Module): @@ -3842,7 +3776,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(TensorFactory(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @@ -3853,7 +3786,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(TensorFactory(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @@ -3866,7 +3798,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(TensorFactory(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_eye(self): class 
TensorFactory(torch.nn.Module): @@ -3889,7 +3820,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Zero_(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_zeros(self): class Zero_(torch.nn.Module): @@ -3913,7 +3843,6 @@ def forward(self, input): x = torch.randn(2, 3) self.run_test(List(), (x,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() def test_list_pass(self): @@ -3953,7 +3882,6 @@ def forward(self, x, y): y = torch.randn(1, 2, 3) self.run_test(List(), (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_empty(self): class Emtpy(torch.nn.Module): @@ -3963,7 +3891,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Emtpy(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_full(self): class Full(torch.nn.Module): @@ -3973,7 +3900,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Full(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_list(self): class Arithmetic(torch.jit.ScriptModule): @@ -4070,7 +3996,6 @@ def forward(self, x): x = torch.arange(16).view(2, 2, 4).to(torch.float32) self.run_test(MaskedFillModel2(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_masked_scatter(self): class MaskedScatterModel(torch.nn.Module): @@ -4089,7 +4014,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(MaskedSelectModel(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # dtype not available def test_index_put_to_masked_fill(self): @@ -4104,7 +4028,6 @@ def forward(self, input_mask, some_const): constant = torch.tensor(5, dtype=torch.float) self.run_test(MaskedFillModel(), (mask, constant)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # dtype not available def test_index_put_to_masked_scatter(self): @@ -4225,7 +4148,6 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) self.run_test(NormModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_unfold(self): class UnfoldModel(torch.nn.Module): def forward(self, x): @@ -4238,7 +4160,6 @@ def forward(self, x): input_names=['x'], test_with_inputs=[y]) - @skipIfUnsupportedOpsetVersion([13]) @skipIfONNXShapeInference(False) def test_unfold_infer_shape(self): class UnfoldModule(torch.jit.ScriptModule): @@ -4254,7 +4175,6 @@ def forward(self, x): x = torch.randn(32, 3, 64) self.run_test(UnfoldModule(), x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_unfold_dynamic_inputs(self): class UnfoldModel(torch.nn.Module): @@ -4264,7 +4184,6 @@ def forward(self, x): x = torch.randn(4, 2, 4, requires_grad=True) self.run_test(UnfoldModel(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_prelu(self): class PReluModel(torch.nn.Module): def __init__(self): @@ -4594,7 +4513,6 @@ def forward(self, input, other): model = MyModule() self.run_test(model, (x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_ones_bool(self): class MyModule(torch.nn.Module): @@ -4641,7 +4559,6 @@ def test_constant_pad(self): self.run_test(model, x) # Dynamic padding is added in opset 11 - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @disableScriptTest() # Functional module not scriptable def 
test_pad_types(self): @@ -4846,7 +4763,6 @@ def test_replication_pad(self): x = torch.randn(2, 2, 4, 4) self.run_test(model, x) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_im2col(self): class Unfold(torch.nn.Module): @@ -4870,7 +4786,6 @@ def forward(self, x): # This test checks output scalar type in the ONNX graph should not be null # https://github.com/pytorch/pytorch/issues/28607 - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(10) def test_trace_script(self): @torch.jit.script @@ -4944,7 +4859,6 @@ def forward(self, *tensor_list): x = torch.randn(3, 4) self.run_test(EinsumModelTranspose(), input=(x,)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_crossentropyloss(self): for ignore_index in [-100, 1]: @@ -5043,7 +4957,6 @@ def forward(self, input, target): self.run_test(CrossEntropyLossMeanWeight(ignore_index), input=(x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) def test_kldiv_loss(self): @@ -5110,7 +5023,6 @@ def forward(self, input, target): self.run_test(KLDivLossMiniBatchMean(), input=(x, y)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss(self): class NLLModel(torch.nn.Module): @@ -5131,7 +5043,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_none(self): class NLLModel(torch.nn.Module): @@ -5153,7 +5064,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean(self): class NLLModel(torch.nn.Module): @@ -5175,7 +5085,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_sum(self): class NLLModel(torch.nn.Module): @@ -5197,7 +5106,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean_weights(self): class NLLModel(torch.nn.Module): @@ -5219,7 +5127,6 @@ def forward(self, input, target): target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean_ignore_index(self): class NLLModel(torch.nn.Module): @@ -5238,7 +5145,6 @@ def forward(self, input, target): target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) self.run_test(NLLModel(), (input, target)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(12) def test_nllloss_2d_mean_ignore_index_weights(self): class NLLModel(torch.nn.Module): @@ -5267,7 +5173,6 @@ def forward(self, mat1, mat2): mat2 = torch.randn(3, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. def test_where_with_bool_tensor(self): class M(torch.nn.Module): @@ -5279,7 +5184,6 @@ def forward(self, mat1, mat2): mat2 = torch.ones(2, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. 
def test_where_with_byte_tensor(self): class M(torch.nn.Module): @@ -5478,7 +5382,6 @@ def forward(self, x): @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(13) - @skipIfUnsupportedOpsetVersion([13]) def test_if_list(self): class IfModel(torch.nn.Module): def forward(self, x, y, cond): @@ -5669,7 +5572,6 @@ def forward(self, input): x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerTensorModel(), (x)) - @skipIfUnsupportedOpsetVersion([13]) def test_batchnorm_training(self): class MyModule(torch.nn.Module): def __init__(self): @@ -5793,7 +5695,6 @@ def forward(self, x): np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01) - @skipIfUnsupportedOpsetVersion([13]) def test_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): @@ -5992,7 +5893,6 @@ def forward(self, boxes, scores): self.run_test(Module(), (boxes, scores)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_clip_boxes_to_image(self): boxes = torch.randn(5, 4) * 500 @@ -6050,7 +5950,6 @@ def test_roi_pool(self): model = ops.RoIPool((pool_h, pool_w), 2) self.run_test(model, (x, rois)) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_resize_images(self): class TransformModule(torch.nn.Module): @@ -6067,7 +5966,6 @@ def forward(self, images): input_names=["input1"], dynamic_axes={"input1": [0, 1, 2]}, test_with_inputs=[(input_test,)]) - @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_transform_images(self): @@ -6198,7 +6096,6 @@ def make_test(name, base, layer, bidirectional, initial_state, # Cannot export with older opsets because of 'ConstantFill' op # ConstantFill was a temp op removed at opset 8. This is no longer supported by onnxruntime - @skipIfUnsupportedOpsetVersion([13]) @disableScriptTest() # Test code not scriptable @skipIfUnsupportedMinOpsetVersion(9) def f(self): diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 5c1bfe8b5515..3aadabf85769 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -5,7 +5,7 @@ from torch.onnx import utils, OperatorExportTypes, TrainingMode from torch.onnx.symbolic_helper import _set_opset_version, _set_operator_export_type import torch.utils.cpp_extension -from test_pytorch_common import skipIfUnsupportedMinOpsetVersion +from test_pytorch_common import skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion import caffe2.python.onnx.backend as backend from verify import verify @@ -618,6 +618,8 @@ def forward(self, x): assert next(iter).kind() == "aten::quantize_per_tensor" assert next(iter).kind() == "aten::dequantize" + # prim::ListConstruct is exported as onnx::SequenceConstruct for opset >= 11 + @skipIfUnsupportedOpsetVersion([11, 12]) def test_prim_fallthrough(self): # Test prim op class PrimModule(torch.jit.ScriptModule): diff --git a/torch/csrc/jit/passes/onnx/helper.cpp b/torch/csrc/jit/passes/onnx/helper.cpp index a14dcd611dd8..aca08331183c 100644 --- a/torch/csrc/jit/passes/onnx/helper.cpp +++ b/torch/csrc/jit/passes/onnx/helper.cpp @@ -97,5 +97,28 @@ Value* addInputToBlock(Block* block) { return block->addInput(); } +Node* createONNXUnsqueeze( + Graph* graph, + Node* n_to_insert_before, + Value* input, + int axis, + int opset_version) { + Node* unsqueeze_node = graph->create(onnx::Unsqueeze, 1); + unsqueeze_node->addInput(input); + unsqueeze_node->insertBefore(n_to_insert_before); + if (opset_version >= OPSET_VERSION_13) { + // ONNX spec sets 
`axes` as input for opset >= 13. + Node* unsqueeze_axes = graph->create(onnx::Constant, 1); + unsqueeze_axes->insertBefore(unsqueeze_node); + unsqueeze_axes->t_( + attr::value, at::unsqueeze(at::scalar_to_tensor(at::Scalar(axis)), 0)); + unsqueeze_node->addInput(unsqueeze_axes->output()); + } else { + // ONNX spec sets `axes` as attribute for opset < 13. + unsqueeze_node->is_(attr::axes, {0}); + } + return unsqueeze_node; +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/helper.h b/torch/csrc/jit/passes/onnx/helper.h index e27909ff6362..43989bd8e6c3 100644 --- a/torch/csrc/jit/passes/onnx/helper.h +++ b/torch/csrc/jit/passes/onnx/helper.h @@ -13,6 +13,7 @@ static const int OPSET_VERSION_9 = 9; static const int OPSET_VERSION_10 = 10; static const int OPSET_VERSION_11 = 11; static const int OPSET_VERSION_12 = 12; +static const int OPSET_VERSION_13 = 13; using ValueToParamPairMap = std::map>; @@ -33,5 +34,13 @@ Node* addNodeToBlock(Block* block, Symbol kind, ArrayRef inputs); Value* addInputToBlock(Block* block); TORCH_API c10::optional ONNXTypeToATenType(int32_t onnx_type); + +Node* createONNXUnsqueeze( + Graph* graph, + Node* n_to_insert_before, + Value* input, + int axis, + int opset_version); + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index d488201a8f80..8aa07332cc65 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -416,10 +416,8 @@ void fixDefaultRNNState( batch_size->addInput(shape_of_input->outputs()[0]); batch_size->addInput(gather_indices->outputs()[0]); - Node* unsqueezed_batch_size = graph->create(onnx::Unsqueeze, 1); - unsqueezed_batch_size->insertBefore(n); - unsqueezed_batch_size->addInput(batch_size->outputs()[0]); - unsqueezed_batch_size->is_(attr::axes, {0}); + Node* unsqueezed_batch_size = + createONNXUnsqueeze(graph, n, batch_size->outputs()[0], 0, opset_version); Node* hidden_size = graph->create(onnx::Constant, 1); hidden_size->insertBefore(n); @@ -440,10 +438,8 @@ void fixDefaultRNNState( ? 
2 : 1))); - Node* unsqueezed_num_directions = graph->create(onnx::Unsqueeze, 1); - unsqueezed_num_directions->insertBefore(n); - unsqueezed_num_directions->addInput(num_directions->outputs()[0]); - unsqueezed_num_directions->is_(attr::axes, {0}); + Node* unsqueezed_num_directions = createONNXUnsqueeze( + graph, n, num_directions->outputs()[0], 0, opset_version); Node* concated_dims = graph->create(onnx::Concat, 1); concated_dims->insertBefore(n); @@ -555,6 +551,65 @@ static void replaceInputWithList(Node* node, size_t i, ArrayRef to) { } } +static void eraseListConstruct(Block* block, int opset_version); + +static void eraseListConstruct(Node* n, int opset_version) { + for (auto b : n->blocks()) { + eraseListConstruct(b, opset_version); + } + std::vector>> replacements; + + auto block = n->owningBlock(); + size_t i = 0; + for (auto* input : n->inputs()) { + if (input->node()->kind() == prim::ListConstruct) { + auto* lc_node = input->node(); + TypePtr elem = + lc_node->output()->type()->cast()->getElementType(); + if (elem->cast()) { + // ListConstruct Int[] output case, we need to transform to ONNX + // Concat to ensure the output is a single tensor(dynamic) type in + // order to be consumed as inputs + std::vector unsqueezed; + Graph* g = block->owningGraph(); + for (auto* input : lc_node->inputs()) { + Node* unsqueezed_node = + createONNXUnsqueeze(g, lc_node, input, 0, opset_version); + unsqueezed.emplace_back(unsqueezed_node->output()); + } + Node* concat_node = g->create(onnx::Concat, 1); + concat_node->i_(attr::axis, 0); + for (auto v : unsqueezed) { + concat_node->addInput(v); + } + concat_node->insertBefore(lc_node); + + // make concat node output as new input, then ListConstruct should + // become dead + replacements.emplace_back( + i, std::vector({concat_node->output()})); + + } else { + if (opset_version >= OPSET_VERSION_11) { + c10::Symbol seq_node_kind = lc_node->inputs().size() > 0 + ? onnx::SequenceConstruct + : onnx::SequenceEmpty; + Node* seq_node = block->owningGraph()->create( + seq_node_kind, {lc_node->inputs()}, 1); + seq_node->insertBefore(lc_node); + seq_node->output()->copyMetadata(lc_node->output()); + lc_node->replaceAllUsesWith(seq_node); + } + } + } + i++; + } + + for (auto ritr = replacements.rbegin(); ritr != replacements.rend(); ++ritr) { + replaceInputWithList(n, std::get<0>(*ritr), std::get<1>(*ritr)); + } +} + static void eraseListConstruct(Block* block, int opset_version) { // TODO: Fix this pass/maybe get rid of this part. // Tensor lists might be used for meshgrid and such ops as well. 
@@ -563,71 +618,9 @@ static void eraseListConstruct(Block* block, int opset_version) { Node* n = *it; ++it; - for (auto b : n->blocks()) { - eraseListConstruct(b, opset_version); - } - std::vector>> replacements; - - size_t i = 0; - for (auto* input : n->inputs()) { - if (input->node()->kind() == prim::ListConstruct) { - auto* lc_node = input->node(); - TypePtr elem = - lc_node->output()->type()->cast()->getElementType(); - if (elem->cast()) { - // ListConstruct Int[] output case, we need to transform to ONNX - // Concat to ensure the output is a single tensor(dynamic) type in - // order to be consumed as inputs - std::vector unsqueezed; - Graph* g = block->owningGraph(); - for (auto* input : lc_node->inputs()) { - Node* unsqueezed_node = g->create(onnx::Unsqueeze, 1); - unsqueezed_node->insertBefore(lc_node); - unsqueezed_node->addInput(input); - unsqueezed_node->is_(attr::axes, {0}); - unsqueezed.emplace_back(unsqueezed_node->output()); - } - Node* concat_node = g->create(onnx::Concat, 1); - concat_node->i_(attr::axis, 0); - for (auto v : unsqueezed) { - concat_node->addInput(v); - } - concat_node->insertBefore(lc_node); - - // make concat node output as new input, then ListConstruct should - // become dead - replacements.emplace_back( - i, std::vector({concat_node->output()})); - - } else { - if (opset_version < OPSET_VERSION_11) { - // Tensor lists are used mostly for inputs to cat/stack. They are - // already handled in those symbolics, and should become dead - // afterwards. - replacements.emplace_back( - i, - std::vector( - lc_node->inputs().begin(), lc_node->inputs().end())); - } else { - c10::Symbol seq_node_kind = lc_node->inputs().size() > 0 - ? onnx::SequenceConstruct - : onnx::SequenceEmpty; - Node* seq_node = block->owningGraph()->create( - seq_node_kind, {lc_node->inputs()}, 1); - seq_node->insertBefore(lc_node); - seq_node->output()->copyMetadata(lc_node->output()); - lc_node->replaceAllUsesWith(seq_node); - } - } - } - i++; - } - - for (auto ritr = replacements.rbegin(); ritr != replacements.rend(); - ++ritr) { - replaceInputWithList(n, std::get<0>(*ritr), std::get<1>(*ritr)); - } + eraseListConstruct(n, opset_version); } + eraseListConstruct(block->return_node(), opset_version); } // For ops such as meshgrid where output is a list of Tensors diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 8d2ff1e63cfc..07d9340ade3a 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -201,7 +201,10 @@ bool IsSupportedNode(const Node* n) { return true; } -Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { +Value* CloneValueFromListConstruct( + Value* v, + std::shared_ptr n_graph, + int opset_version) { auto lc_node = v->node(); TORCH_INTERNAL_ASSERT(lc_node->kind() == ::c10::prim::ListConstruct); // In jit/passes/onnx/peephole.cpp::eraseListConstruct, @@ -221,12 +224,10 @@ Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { // order to be consumed as inputs std::vector unsqueezed; for (auto* input : lc_node->inputs()) { - Node* unsqueezed_node = - n_graph->insertNode(n_graph->create(::c10::onnx::Unsqueeze, 1)); auto new_input = n_graph->addInput(); new_input->copyMetadata(input); - unsqueezed_node->addInput(new_input); - unsqueezed_node->is_(attr::axes, {0}); + Node* unsqueezed_node = createONNXUnsqueeze( + n_graph.get(), n_graph->return_node(), new_input, 0, opset_version); 
unsqueezed.emplace_back(unsqueezed_node->output()); } Node* concat_node = @@ -261,11 +262,12 @@ Value* CloneValueFromListConstruct(Value* v, std::shared_ptr n_graph) { Node* CloneNodeToGraph( Node* n, std::shared_ptr n_graph, - const ParamMap& params_dict) { + const ParamMap& params_dict, + int opset_version) { auto vals_to_params_map = buildValueToParamsMap(n->owningGraph()->block(), params_dict); - auto clone_node = - n_graph->createClone(n, [&n_graph, &vals_to_params_map](Value* v) { + auto clone_node = n_graph->createClone( + n, [&n_graph, &vals_to_params_map, opset_version](Value* v) { auto v_n = v->node(); switch (v_n->kind()) { case ::c10::onnx::Constant: { @@ -275,7 +277,7 @@ Node* CloneNodeToGraph( return constant_n->output(); } case ::c10::prim::ListConstruct: { - return CloneValueFromListConstruct(v, n_graph); + return CloneValueFromListConstruct(v, n_graph, opset_version); } case ::c10::prim::PackPadded: { auto input = n_graph->addInput(); @@ -476,7 +478,7 @@ void ONNXShapeTypeInference( // Create a Graph containing only the single node n. // This graph is later converted to ONNX to run shape inference. auto n_graph = std::make_shared(); - auto clone_node = CloneNodeToGraph(n, n_graph, params_dict); + auto clone_node = CloneNodeToGraph(n, n_graph, params_dict, opset_version); n_graph->insertNode(clone_node); // Register all node outputs as graph outputs. @@ -507,12 +509,16 @@ void ONNXShapeTypeInference( } catch (std::runtime_error& ex) { // TODO: include this as warning once we have a more consolidated warning // system. + GRAPH_DEBUG( + "ONNX shape inference fails with: ", + ex.what(), + " on graph: ", + n_graph->toString()); const char shape_err[] = "ShapeInferenceError"; const char type_err[] = "TypeInferenceError"; if ((strstr(ex.what(), shape_err) == NULL) && (strstr(ex.what(), type_err) == NULL)) throw; - GRAPH_DEBUG("ONNX shape inference fails with: ", ex.what()); } GRAPH_DEBUG( "ONNX graph after shape inference: ", prettyPrint(*model_proto)); diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 187dcfcb87e2..8794c5f115c5 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -321,9 +321,19 @@ def _interpolate_warning(interpolate_mode): "to support Pytorch's behavior (like coordinate_transformation_mode and nearest_mode).\n" "We recommend using opset 11 and above for models using this operator. 
") -def _unsqueeze_helper(g, input, dim): - from torch.onnx.symbolic_opset9 import unsqueeze - return unsqueeze(g, input, dim) +def _unsqueeze_helper(g, input, axes_i): + if _export_onnx_opset_version >= 13: + axes = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("Unsqueeze", input, axes) + else: + return g.op("Unsqueeze", input, axes_i=axes_i) + +def _squeeze_helper(g, input, axes_i): + if _export_onnx_opset_version >= 13: + axes = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("Squeeze", input, axes) + else: + return g.op("Squeeze", input, axes_i=axes_i) def _interpolate_size_to_scales(g, input, output_size, dim): output_size = _maybe_get_const(output_size, 'is') @@ -371,7 +381,7 @@ def _interpolate_get_scales(g, scale_factor, dim): if isinstance(scale_factor.type(), torch._C.ListType) or (scale_factor_rank is not None and scale_factor_rank > 0): return g.op("Concat", offsets, scale_factor, axis_i=0) else: - scale_factor = _unsqueeze_helper(g, scale_factor, 0) + scale_factor = _unsqueeze_helper(g, scale_factor, [0]) scale_factor = g.op("Cast", scale_factor, to_i=cast_pytorch_to_onnx["Float"]) scales = [scale_factor for i in range(dim - 2)] scale_factor = g.op("Concat", offsets, *scales, axis_i=0) @@ -400,7 +410,7 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ if not _is_packed_list(size): is_scalar = ((_maybe_get_const(size, 't').dim() == 0)) if is_scalar: - size = _unsqueeze_helper(g, size, 0) + size = _unsqueeze_helper(g, size, [0]) size = [size for i in range(dim - 2)] size = g.op("Concat", *size, axis_i=0) scale_factor = _interpolate_size_to_scales(g, input, size, dim) @@ -477,9 +487,9 @@ def _index_fill_reshape_helper(g, self, dim, index): return _unimplemented("index_fill", "input rank not accesible") self_dim = self.type().dim() dim_value = _parse_arg(dim, 'i') - unsqueezed_index = g.op("Unsqueeze", index, axes_i=[i for i in range(self_dim) if i != dim_value]) + unsqueezed_index = _unsqueeze_helper(g, index, [i for i in range(self_dim) if i != dim_value]) expanded_index_shape = scatter(g, g.op("Shape", self), 0, - g.op("Unsqueeze", dim, axes_i=[0]), g.op("Shape", index)) + _unsqueeze_helper(g, dim, [0]), g.op("Shape", index)) expanded_index = expand(g, unsqueezed_index, expanded_index_shape, None) return expanded_index_shape, expanded_index diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index 6558df6e3d4c..349eb8f35565 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -136,11 +136,11 @@ def __interpolate(g, input, size, scale_factor, mode , align_corners, recompute_ def _slice(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if dynamic_slice: - starts = g.op("Unsqueeze", starts, axes_i=[0]) - ends = g.op("Unsqueeze", ends, axes_i=[0]) + starts = sym_help._unsqueeze_helper(g, starts, [0]) + ends = sym_help._unsqueeze_helper(g, ends, [0]) if isinstance(axes, int): axes = g.op("Constant", value_t=torch.tensor(axes)) - axes = g.op("Unsqueeze", axes, axes_i=[0]) + axes = sym_help._unsqueeze_helper(g, axes, [0]) else: assert len(starts) == len(ends) assert len(starts) == len(axes) @@ -220,15 +220,15 @@ def embedding_bag(g, offsets_extended = g.op("Concat", *offsets_extended, axis_i=0) list_ = [] for i in range(offset_len): - start_ = g.op("Unsqueeze", select(g, offsets_extended, torch.tensor(0), torch.tensor(i)), axes_i=[0]) - end_ = g.op("Unsqueeze", select(g, offsets_extended, torch.tensor(0), torch.tensor(i 
+ 1)), axes_i=[0]) + start_ = sym_help._unsqueeze_helper(g, select(g, offsets_extended, torch.tensor(0), torch.tensor(i)), [0]) + end_ = sym_help._unsqueeze_helper(g, select(g, offsets_extended, torch.tensor(0), torch.tensor(i + 1)), [0]) axes_ = g.op("Constant", value_t=torch.tensor([0])) indices_row = g.op("Slice", indices, start_, end_, axes_) embeddings = g.op("Gather", embedding_matrix, indices_row) if not sym_help._is_none(per_sample_weights): per_sample_weights_row = g.op("Slice", per_sample_weights, start_, end_, axes_) - per_sample_weights_row = g.op("Unsqueeze", per_sample_weights_row, axes_i=[1]) + per_sample_weights_row = sym_help._unsqueeze_helper(g, per_sample_weights_row, [1]) embeddings = g.op("Mul", embeddings, per_sample_weights_row) if mode == 0: embeddings = g.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) @@ -237,7 +237,7 @@ def embedding_bag(g, else: embeddings = g.op("ReduceMax", embeddings, axes_i=[0], keepdims_i=0) - embeddings = g.op("Unsqueeze", embeddings, axes_i=[0]) + embeddings = sym_help._unsqueeze_helper(g, embeddings, [0]) list_.append(embeddings) output = g.op("Concat", *list_, axis_i=0) diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 85c7bf97c883..82394bfebe2f 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -76,7 +76,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False): index = add(g, index, ind) broadcast_index_shape = g.op("Shape", index) indices_list = [ - g.op("Unsqueeze", expand(g, ind, broadcast_index_shape, None), axes_i=[-1]) for ind in indices_list + sym_help._unsqueeze_helper(g, expand(g, ind, broadcast_index_shape, None), [-1]) for ind in indices_list ] index = g.op("Concat", *indices_list, axis_i=-1) else: @@ -180,7 +180,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False): return masked_fill(g, self, bool_inp, values) return masked_scatter(g, self, bool_inp, values) broadcast_index_shape = g.op("Shape", index) - index = g.op("Unsqueeze", index, axes_i=[-1]) + index = sym_help._unsqueeze_helper(g, index, [-1]) sub_data_shape = sym_help._slice_helper( g, g.op("Shape", self), axes=[0], starts=[len(indices_list)], ends=[maxsize]) values_shape = g.op("Concat", broadcast_index_shape, sub_data_shape, axis_i=0) @@ -284,7 +284,7 @@ def __interpolate(g, input, size, scale_factor, mode, align_corners, recompute_s if rank is None: return sym_help._unimplemented("interpolate (with a scalar output_size)", "missing input shape (try giving an array of output_size values)") - size = unsqueeze(g, size, 0) + size = sym_help._unsqueeze_helper(g, size, [0]) size = [size for i in range(rank - 2)] size = g.op("Concat", *size, axis_i=0) size = g.op("Cast", size, to_i=sym_help.cast_pytorch_to_onnx['Long']) @@ -376,7 +376,7 @@ def _len(g, self): if _is_tensor_list(self) or self.node().kind() == "onnx::SplitToSequence": return g.op("SequenceLength", self) sz_0 = size(g, self, g.op("Constant", value_t=torch.LongTensor([0]))) - return g.op('Squeeze', sz_0, axes_i=[0]) + return sym_help._squeeze_helper(g, sz_0, [0]) def __getitem_(g, self, i): @@ -489,7 +489,7 @@ def split(g, self, split_size_or_sizes, dim, _outputs=None): return split_out # Convert to multiple slice nodes iff number of splits and number of outputs are statically known. 
if sym_help._is_packed_list(split_size_or_sizes) and len(sym_help._unpack_list(split_size_or_sizes)) == _outputs: - split_sizes = [g.op("Unsqueeze", v, axes_i=[0]) for v in sym_help._unpack_list(split_size_or_sizes)] + split_sizes = [sym_help._unsqueeze_helper(g, v, [0]) for v in sym_help._unpack_list(split_size_or_sizes)] start = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) axis = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long)) res = [] @@ -658,7 +658,7 @@ def squeeze(g, self, dim=None): if_node_outputs = g.op("If", cond) if_node = if_node_outputs.node() if_block = torch.onnx.utils._add_block(if_node) - squeeze_ = if_block.op("Squeeze", self, axes_i=[dim]) + squeeze_ = sym_help._squeeze_helper(if_block, self, [dim]) torch.onnx.utils._add_output_to_block(if_block, squeeze_) else_block = torch.onnx.utils._add_block(if_node) identity_ = else_block.op("Identity", self) @@ -673,13 +673,12 @@ def squeeze(g, self, dim=None): "be exported without the squeeze node. If the model is intended to be used with dynamic " + "input shapes, please export with dynamic_axes argument.") return self - return g.op("Squeeze", self, axes_i=[dim]) + return sym_help._squeeze_helper(g, self, [dim]) @parse_args('v', 'i') def unsqueeze(g, self, dim): - return g.op("Unsqueeze", self, axes_i=[dim]) - + return sym_help._unsqueeze_helper(g, self, [dim]) def mm(g, self, other): return g.op("Gemm", self, other, beta_f=0.0, alpha_f=1.0) @@ -782,7 +781,7 @@ def _get_im2col_indices_along_dim(g, input_d, kernel_size_d, dilation_d, padding # Broadcast and add kernel staring positions (indices) with # kernel_grid along dim d, to get block indices along dim d - blocks_d_indices = g.op('Unsqueeze', blocks_d_indices, axes_i=[0]) # Reshape to [1, -1] + blocks_d_indices = sym_help._unsqueeze_helper(g, blocks_d_indices, [0]) # Reshape to [1, -1] kernel_mask = g.op('Reshape', kernel_grid, g.op('Constant', value_t=torch.tensor([-1, 1]))) block_mask = g.op("Add", blocks_d_indices, kernel_mask) @@ -804,8 +803,8 @@ def _get_im2col_output_shape(g, input, kernel_h, kernel_w): g.op("Constant", value_t=torch.tensor(kernel_h * kernel_w))) return g.op("Concat", - g.op("Unsqueeze", batch_dim, axes_i=[0]), - g.op("Unsqueeze", channel_unfolded, axes_i=[0]), + sym_help._unsqueeze_helper(g, batch_dim, [0]), + sym_help._unsqueeze_helper(g, channel_unfolded, [0]), g.op("Constant", value_t=torch.tensor([-1])), axis_i=0) @@ -901,9 +900,9 @@ def embedding_bag(g, loop_condition = g.op("Cast", loop_condition, to_i=9) zero = g.op("Constant", value_t=torch.tensor([0])) - indices_len = g.op("Unsqueeze", - sym_help._size_helper(g, indices, g.op("Constant", value_t=torch.tensor(0))), - axes_i=[0]) + indices_len = sym_help._unsqueeze_helper(g, + sym_help._size_helper(g, indices, g.op("Constant", value_t=torch.tensor(0))), + [0]) if not include_last_offset: offsets = [offsets, indices_len] offsets = g.op("Concat", *offsets, axis_i=0) @@ -923,8 +922,8 @@ def embedding_bag(g, indices_start = loop_block.op("Gather", offsets_starts, block_input_iter, axis_i=0) indices_end = loop_block.op("Gather", offsets_ends, block_input_iter, axis_i=0) - indices_start = loop_block.op("Unsqueeze", indices_start, axes_i=[0]) - indices_end = loop_block.op("Unsqueeze", indices_end, axes_i=[0]) + indices_start = sym_help._unsqueeze_helper(loop_block, indices_start, [0]) + indices_end = sym_help._unsqueeze_helper(loop_block, indices_end, [0]) indices_row = loop_block.op("Slice", indices, indices_start, indices_end, zero) embeddings = 
loop_block.op("Gather", embedding_matrix, indices_row, axis_i=0) @@ -933,7 +932,7 @@ def embedding_bag(g, indices_start, indices_end, zero) - per_sample_weights_row = loop_block.op("Unsqueeze", per_sample_weights_row, axes_i=[1]) + per_sample_weights_row = sym_help._unsqueeze_helper(loop_block, per_sample_weights_row, [1]) embeddings = loop_block.op("Mul", embeddings, per_sample_weights_row) if mode == 0: embeddings = loop_block.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py index cd67fd508fa2..63a40b555c8e 100644 --- a/torch/onnx/symbolic_opset12.py +++ b/torch/onnx/symbolic_opset12.py @@ -132,11 +132,11 @@ def unfold(g, input, dimension, size, step): starts = loop_block.op("Gather", low_indices, block_input_iter) ends = loop_block.op("Gather", hi_indices, block_input_iter) axes = loop_block.op("Constant", value_t=torch.tensor([2])) - starts = loop_block.op("Unsqueeze", starts, axes_i=[0]) - ends = loop_block.op("Unsqueeze", ends, axes_i=[0]) + starts = sym_help._unsqueeze_helper(loop_block, starts, [0]) + ends = sym_help._unsqueeze_helper(loop_block, ends, [0]) stack = loop_block.op("Slice", input, starts, ends, axes) - unsqueeze = loop_block.op("Unsqueeze", loop_block.op("Transpose", stack, perm_i=perm), axes_i=[dimension]) + unsqueeze = sym_help._unsqueeze_helper(loop_block, loop_block.op("Transpose", stack, perm_i=perm), [dimension]) unsqueeze_list.append(unsqueeze) concat = loop_block.op("Concat", *unsqueeze_list, axis_i=0) @@ -148,7 +148,7 @@ def unfold(g, input, dimension, size, step): perm = [0, 1, 2, 3, 4] perm[0], perm[dimension + 1] = perm[dimension + 1], perm[0] transpose = g.op("Transpose", loop_output, perm_i=perm) - squeeze = g.op("Squeeze", transpose, axes_i=[0]) + squeeze = sym_help._squeeze_helper(g, transpose, [0]) return squeeze else: diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index ada731884f76..a69a7be56850 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -186,7 +186,7 @@ def cat(g, tensor_list, dim): @parse_args('v', 'i') def stack(g, tensor_list, dim): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in sym_help._unpack_list(tensor_list)] + unsqueezed = [sym_help._unsqueeze_helper(g, t, [dim]) for t in sym_help._unpack_list(tensor_list)] return g.op("Concat", *unsqueezed, axis_i=dim) @@ -592,7 +592,7 @@ def unbind(g, self, dim=0, _outputs=None): outputs = g.op("Split", self, split_i=[1] * _outputs, axis_i=dim, outputs=_outputs) outputs = [outputs] if _outputs == 1 else outputs - squeezed_outputs = [g.op("Squeeze", out, axes_i=[dim]) for out in outputs] + squeezed_outputs = [sym_help._squeeze_helper(g, out, [dim]) for out in outputs] return squeezed_outputs @@ -605,7 +605,7 @@ def select(g, self, dim, index): else: end_index = index + 1 slice_node = sym_help._slice_helper(g, self, axes=[dim], starts=[index], ends=[end_index]) - return g.op("Squeeze", slice_node, axes_i=[dim]) + return sym_help._squeeze_helper(g, slice_node, [dim]) else: return g.op("Gather", self, index, axis_i=dim) @@ -640,7 +640,7 @@ def squeeze(g, self, dim=None): "is not 1, the ONNX model will return an error. 
Opset version 11 supports squeezing on " + "non-singleton dimensions, it is recommended to export this model using opset " + "version 11 or higher.") - return g.op("Squeeze", self, axes_i=[squeeze_dim]) + return sym_help._squeeze_helper(g, self, axes_i=[squeeze_dim]) if dim_size > 1: warnings.warn("This model contains a squeeze operation on dimension " + str(squeeze_dim) + ". The size of " + "this dimension in the given input is " + str(dim_size) + ". The model will " + @@ -651,12 +651,12 @@ def squeeze(g, self, dim=None): warnings.warn("This model contains a squeeze operation on dimension " + str(squeeze_dim) + ". If the model is " + "intended to be used with dynamic input shapes, please use opset version 11 to export the model.") - return g.op("Squeeze", self, axes_i=[squeeze_dim]) + return sym_help._squeeze_helper(g, self, axes_i=[squeeze_dim]) def prelu(g, self, weight): self_rank = sym_help._get_tensor_rank(self) if self_rank is not None and self_rank > 2: - weight = g.op("Unsqueeze", weight, axes_i=list(range(1, self_rank - 1))) + weight = sym_help._unsqueeze_helper(g, weight, list(range(1, self_rank - 1))) return g.op("PRelu", self, weight) @@ -674,7 +674,7 @@ def floor(g, input): def _len(g, self): sz_0 = size(g, self, g.op("Constant", value_t=torch.LongTensor([0]))) - return g.op('Squeeze', sz_0, axes_i=[0]) + return sym_help._squeeze_helper(g, sz_0, [0]) @parse_args('v', 't', 't') @@ -1356,7 +1356,7 @@ def unfold(g, input, dimension, size, step): ndim = len(sizes) perm = list(range(0, ndim)) perm.append(perm.pop(dimension)) - unsqueeze = [g.op("Unsqueeze", g.op("Transpose", t, perm_i=perm), axes_i=[dimension]) for t in stack] + unsqueeze = [sym_help._unsqueeze_helper(g, g.op("Transpose", t, perm_i=perm), [dimension]) for t in stack] return g.op("Concat", *unsqueeze, axis_i=dimension) else: return _unimplemented("Unfold", "input size not accessible") @@ -1732,14 +1732,14 @@ def eye(g, *args): if len(args) == 5: # aten::eye(n, dtype, layout, device, pin_memory) n, dtype, layout, device, pin_memory = args - dim_size = g.op("Unsqueeze", n, axes_i=[0]) + dim_size = sym_help._unsqueeze_helper(g, n, [0]) shape = g.op("Concat", dim_size, dim_size, axis_i=0) tensor = zeros(g, shape, dtype, layout, device) return g.op("EyeLike", tensor) elif len(args) == 6: # aten::eye(n, m, dtype, layout, device, pin_memory) n, m, dtype, layout, device, pin_memory = args - shape = g.op("Concat", g.op("Unsqueeze", n, axes_i=[0]), g.op("Unsqueeze", m, axes_i=[0]), axis_i=0) + shape = g.op("Concat", sym_help._unsqueeze_helper(g, n, [0]), sym_help._unsqueeze_helper(g, m, [0]), axis_i=0) tensor = zeros(g, shape, dtype, layout, device) return g.op("EyeLike", tensor) else: @@ -1760,9 +1760,9 @@ def slice(g, self, *args): 'is a deprecated experimental op. 
Please use statically allocated ' 'variables or export to a higher opset version.') else: - start_unsqueezed = g.op("Unsqueeze", start, axes_i=[0]) - end_unsqueezed = g.op("Unsqueeze", end, axes_i=[0]) - dim_unsqueezed = g.op("Unsqueeze", dim, axes_i=[0]) + start_unsqueezed = sym_help._unsqueeze_helper(g, start, [0]) + end_unsqueezed = sym_help._unsqueeze_helper(g, end, [0]) + dim_unsqueezed = sym_help._unsqueeze_helper(g, dim, [0]) return g.op("DynamicSlice", self, start_unsqueezed, end_unsqueezed, dim_unsqueezed) else: start = _parse_arg(start, 'i') @@ -1814,7 +1814,7 @@ def unsqueeze(g, self, dim): else: return _unimplemented('unsqueeze', 'negative axis with unknown input rank') - return g.op("Unsqueeze", self, axes_i=[dim]) + return sym_help._unsqueeze_helper(g, self, axes_i=[dim]) @parse_args('v', 'i', 'i', 'none') @@ -1973,7 +1973,7 @@ def transform_weights_no_bias(layer_index): elif variant == 'GRU' or variant == 'LSTM': weight_ih, weight_hh = \ [reform_weights(g, w, hidden_size, reform_permutation) for w in weights] - return tuple(g.op('Unsqueeze', x, axes_i=[0]) for x in (weight_ih, weight_hh)) + return tuple(sym_help._unsqueeze_helper(g, x, [0]) for x in (weight_ih, weight_hh)) def transform_weights(layer_index): weights = layer_weights[layer_index] @@ -1983,7 +1983,7 @@ def transform_weights(layer_index): weight_ih, weight_hh, bias_ih, bias_hh = \ [reform_weights(g, w, hidden_size, reform_permutation) for w in weights] bias_concat = g.op('Concat', bias_ih, bias_hh, axis_i=0) - return tuple(g.op('Unsqueeze', x, axes_i=[0]) for x in (weight_ih, weight_hh, bias_concat)) + return tuple(sym_help._unsqueeze_helper(g, x, [0]) for x in (weight_ih, weight_hh, bias_concat)) def retrieve_state(x, start, end): return x if num_layers == 1 else sym_help._slice_helper(g, x, axes=[0], starts=[start], ends=[end]) @@ -2050,7 +2050,7 @@ def retrieve_state(x, start, end): prev_output = g.op('Transpose', prev_output, perm_i=[0, 2, 1, 3]) prev_output = g.op('Reshape', prev_output, g.op('Constant', value_t=torch.LongTensor([0, 0, -1]))) else: - prev_output = g.op('Squeeze', prev_output, axes_i=[1]) + prev_output = sym_help._squeeze_helper(g, prev_output, [1]) h_outs.append(h_out) if variant == 'LSTM': @@ -2382,7 +2382,7 @@ def gather(g, self, dim, index, sparse_grad=False): values = g.op("Constant", value_t=torch.LongTensor([0, 1])) depth = size(g, self, g.op("Constant", value_t=torch.LongTensor([dim]))) index = g.op("Cast", g.op("OneHot", index, depth, values, axis_i=dim), to_i=sym_help.cast_pytorch_to_onnx[dtype]) - mul = g.op("Mul", g.op("Unsqueeze", self, axes_i=[dim + 1]), index) + mul = g.op("Mul", sym_help._unsqueeze_helper(g, self, [dim + 1]), index) return g.op("ReduceSum", mul, axes_i=[dim], keepdims_i=0) @@ -2477,42 +2477,42 @@ def _get_arange_dtype(dtype): if len(args) == 2: # aten::arange(Scalar end, Tensor out) - end = g.op("Unsqueeze", args[0], axes_i=[0]) + end = sym_help._unsqueeze_helper(g, args[0], [0]) dtype = 4 # default to int64 - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, end, dtype, None, None)), axes_i=[1]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, end, dtype, None, None)), [1]) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 4: # aten::arange(Scalar start, Scalar end, Scalar step, Tensor out) dtype = 4 # default to int64 - step = g.op("Unsqueeze", args[2], axes_i=[0]) - end = g.op("Unsqueeze", args[1], axes_i=[0]) - start = g.op("Unsqueeze", args[0], axes_i=[0]) + step = 
sym_help._unsqueeze_helper(g, args[2], [0]) + end = sym_help._unsqueeze_helper(g, args[1], [0]) + start = sym_help._unsqueeze_helper(g, args[0], [0]) range_tensor = g.op("Div", g.op("Sub", end, start), step) - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, range_tensor, None, None, None)), axes_i=[1]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, range_tensor, None, None, None)), [1]) arange_tensor = g.op("Add", g.op("Mul", arange_tensor, step), start) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 5: # aten::arange(Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) dtype = _get_arange_dtype(args[1]) - end = g.op("Unsqueeze", args[0], axes_i=[0]) - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, end, dtype, *(args[2:]))), axes_i=[1]) + end = sym_help._unsqueeze_helper(g, args[0], [0]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, end, dtype, *(args[2:]))), [1]) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 6: # aten::arange(Scalar start, Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) dtype = _get_arange_dtype(args[2]) - end = g.op("Unsqueeze", args[1], axes_i=[0]) - start = g.op("Unsqueeze", args[0], axes_i=[0]) + end = sym_help._unsqueeze_helper(g, args[1], [0]) + start = sym_help._unsqueeze_helper(g, args[0], [0]) range_tensor = g.op("Sub", end, start) - arange_tensor = g.op("Add", g.op("Squeeze", nonzero(g, ones(g, range_tensor, dtype, *(args[3:]))), axes_i=[1]), start) + arange_tensor = g.op("Add", sym_help._squeeze_helper(g, nonzero(g, ones(g, range_tensor, dtype, *(args[3:]))), [1]), start) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 7: # aten::arange(Scalar start, Scalar end, Scalar step, ScalarType dtype, Layout, Device, bool pin_memory) dtype = _get_arange_dtype(args[3]) - step = g.op("Unsqueeze", args[2], axes_i=[0]) - end = g.op("Unsqueeze", args[1], axes_i=[0]) - start = g.op("Unsqueeze", args[0], axes_i=[0]) + step = sym_help._unsqueeze_helper(g, args[2], [0]) + end = sym_help._unsqueeze_helper(g, args[1], [0]) + start = sym_help._unsqueeze_helper(g, args[0], [0]) range_tensor = g.op("Div", g.op("Sub", end, start), step) - arange_tensor = g.op("Squeeze", nonzero(g, ones(g, range_tensor, dtype, *(args[4:]))), axes_i=[1]) + arange_tensor = sym_help._squeeze_helper(g, nonzero(g, ones(g, range_tensor, dtype, *(args[4:]))), [1]) arange_tensor = g.op("Add", g.op("Mul", arange_tensor, step), start) return g.op("Cast", arange_tensor, to_i=sym_help.scalar_type_to_onnx[dtype]) else: @@ -2541,7 +2541,7 @@ def try_mask_to_index(index): warnings.warn("Exporting aten::index operator with indices of type Byte. " "Only 1-D indices are supported. 
In any other case, " "this will produce an incorrect ONNX graph.") - index = squeeze(g, nonzero(g, index), dim=1) + index = sym_help._squeeze_helper(g, nonzero(g, index), [1]) return index indices = [try_mask_to_index(idx) for idx in indices] @@ -2730,7 +2730,7 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): # Norm has shape [N, C, *] so we reshape weight and bias to [C, *] axes = list(range(1, input_rank - 1)) - return add(g, mul(g, norm, g.op("Unsqueeze", weight, axes_i=axes)), g.op("Unsqueeze", bias, axes_i=axes)) + return add(g, mul(g, norm, sym_help._unsqueeze_helper(g, weight, axes)), sym_help._unsqueeze_helper(g, bias, axes)) @parse_args('v', 'v', 'i') From 1723ab53c493cf0fb1a3510c7a19fb81f20bc972 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 28/41] [ONNX] Update Reducesum operator for opset 13 (#50532) (#50907) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50907 * udpate symbolic for squeeze/unsqueeze * update c++ unsqueeze/squeeze creation * clang format * enable tests * clang format * remove prints * remove magic number * add helper function * fix build issue * update opset9 symbolic with helper function * fix utility test * fix prim_fallthrough opset skip * enable reducesum opset 13 * enable embedding_bag which contain reducesum op * add ReduceSum helper * remove block_listed_operators * remove local test code * remove embedding_bag() in opset13 file * remove unuse import Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050888 Pulled By: SplitInfinity fbshipit-source-id: 88307af6a7880abf94eac126ec1638e962de8c1f Co-authored-by: BowenBao Co-authored-by: hwangdeyu --- test/onnx/test_pytorch_onnx_onnxruntime.py | 4 -- torch/onnx/symbolic_helper.py | 11 ++++++ torch/onnx/symbolic_opset10.py | 2 +- torch/onnx/symbolic_opset11.py | 2 +- torch/onnx/symbolic_opset13.py | 46 +++++++++++++++++++--- torch/onnx/symbolic_opset9.py | 8 ++-- 6 files changed, 57 insertions(+), 16 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 5b7091b6612f..1c9f97488f27 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2762,7 +2762,6 @@ def forward(self, input): x = torch.randn(4, 5, dtype=torch.float) self.run_test(ReducedOpModule(), x) - @skipIfUnsupportedOpsetVersion([13]) def test_reduced_sum(self): return self._test_reduced_ops(op=torch.sum) @@ -4319,7 +4318,6 @@ def forward(self, input): @disableScriptTest() # error in propagate as assign input shape @skipIfUnsupportedMinOpsetVersion(10) - @skipIfUnsupportedOpsetVersion([12, 13]) # Due to ONNX Loop shape inference issue def test_embedding_bag(self): model = torch.nn.EmbeddingBag(10, 5, mode='sum', scale_grad_by_freq=True) input = torch.randint(10, (7,)) @@ -4336,7 +4334,6 @@ def test_embedding_bag(self): self.run_test(model, (input)) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfUnsupportedOpsetVersion([12, 13]) # Due to ONNX Loop shape inference issue def test_embedding_bag_1d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, offset, weights): @@ -4351,7 +4348,6 @@ def forward(self, embedding_matrix, input, offset, weights): self.run_test(model, (embedding_matrix, x, offset, w)) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfUnsupportedOpsetVersion([12, 13]) # Due to ONNX Loop shape inference issue def 
test_embedding_bag_2d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, weights): diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 8794c5f115c5..5d11bce82135 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -335,6 +335,17 @@ def _squeeze_helper(g, input, axes_i): else: return g.op("Squeeze", input, axes_i=axes_i) +def _reducesum_helper(g, input, axes_i=None, keepdims_i=1, noop_with_empty_axes_i=0): + keepdims_i = _maybe_get_const(keepdims_i, 'i') + if _export_onnx_opset_version >= 13: + if axes_i: + if not _is_value(axes_i): + axes_i = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("ReduceSum", input, axes_i, keepdims_i=keepdims_i, noop_with_empty_axes_i=noop_with_empty_axes_i) + return g.op("ReduceSum", input, keepdims_i=keepdims_i, noop_with_empty_axes_i=noop_with_empty_axes_i) + else: + return g.op("ReduceSum", input, axes_i=axes_i, keepdims_i=keepdims_i) + def _interpolate_size_to_scales(g, input, output_size, dim): output_size = _maybe_get_const(output_size, 'is') if _is_value(output_size): diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index 349eb8f35565..b7f0bb6167b2 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -231,7 +231,7 @@ def embedding_bag(g, per_sample_weights_row = sym_help._unsqueeze_helper(g, per_sample_weights_row, [1]) embeddings = g.op("Mul", embeddings, per_sample_weights_row) if mode == 0: - embeddings = g.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) + embeddings = sym_help._reducesum_helper(g, embeddings, axes_i=[0], keepdims_i=0) elif mode == 1: embeddings = g.op("ReduceMean", embeddings, axes_i=[0], keepdims_i=0) else: diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 82394bfebe2f..86c1bd15b656 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -935,7 +935,7 @@ def embedding_bag(g, per_sample_weights_row = sym_help._unsqueeze_helper(loop_block, per_sample_weights_row, [1]) embeddings = loop_block.op("Mul", embeddings, per_sample_weights_row) if mode == 0: - embeddings = loop_block.op("ReduceSum", embeddings, axes_i=[0], keepdims_i=0) + embeddings = sym_help._reducesum_helper(loop_block, embeddings, axes_i=[0], keepdims_i=0) elif mode == 1: embeddings = loop_block.op("ReduceMean", embeddings, axes_i=[0], keepdims_i=0) else: diff --git a/torch/onnx/symbolic_opset13.py b/torch/onnx/symbolic_opset13.py index 001a20147c4f..9fffa23a1131 100644 --- a/torch/onnx/symbolic_opset13.py +++ b/torch/onnx/symbolic_opset13.py @@ -2,15 +2,16 @@ # see Note [Edit Symbolic Files] in symbolic_helper.py # This file exports ONNX ops for opset 13 -from torch.onnx.symbolic_helper import _block_list_in_opset import torch import torch.onnx.symbolic_helper as sym_help -from torch.onnx.symbolic_helper import parse_args +from torch.onnx.symbolic_helper import parse_args, _unimplemented +from torch.onnx.symbolic_opset9 import overload_by_arg_count, _maybe_cast_reduce_op_input -block_listed_operators = ['embedding_bag'] -for block_listed_op in block_listed_operators: - vars()[block_listed_op] = _block_list_in_opset(block_listed_op) +# EDITING THIS FILE? READ THIS FIRST! 
+# see Note [Edit Symbolic Files] in symbolic_helper.py + +# This file exports ONNX ops for opset 13 @parse_args('v', 'i', 'none') @@ -38,7 +39,7 @@ def frobenius_norm(g, self, dim=None, keepdim=False): if not sym_help._is_value(dim_val) and len(dim_val) == 0: return g.op("ReduceL2", self, keepdims_i=0) sqr = g.op('Mul', self, self) - sumsqr = g.op('ReduceSum', sqr, dim, keepdims_i=keepdim) + sumsqr = sym_help._reducesum_helper(g, sqr, dim, keepdims_i=keepdim) return g.op('Sqrt', sumsqr) @@ -108,3 +109,36 @@ def unbind(g, self, dim=0, _outputs=None): def glu(g, input, dim): first, second = g.op('Split', input, dim, outputs=2) return g.op('Mul', first, g.op('Sigmoid', second)) + + +def _reduce_op_symbolic(onnx_op_name): + def symbolic(g, self, dim=None, keepdim=None): + self = _maybe_cast_reduce_op_input(g, self) + if dim is None: + # all-reduce path + return g.op(onnx_op_name, self, keepdims_i=0) + else: + keepdim = sym_help._get_const(keepdim, 'i', 'keepdim') + return g.op(onnx_op_name, self, dim, keepdims_i=keepdim) + return symbolic + +def _reduce_with_dtype(onnx_op, name): + symbolic = _reduce_op_symbolic(onnx_op) + + @overload_by_arg_count + def reduce(g, *args, **kwargs): + @parse_args('v', 'none') + def reduce_nodim(g, self, dtype): + if dtype.node().kind() != 'prim::Constant': + return _unimplemented(name, "dtype") + return symbolic(g, self) + + @parse_args('v', 'v', 'i', 'none') + def reduce_dim(g, self, dim, keepdim, dtype): + if dtype.node().kind() != 'prim::Constant': + return _unimplemented(name, "dtype") + return symbolic(g, self, dim, keepdim) + return reduce_nodim, reduce_dim + return reduce + +sum = _reduce_with_dtype('ReduceSum', 'sum') diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index a69a7be56850..031f4505e655 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -753,7 +753,7 @@ def softmax(g, input, dim, dtype=None): input = g.op('Sub', input, g.op('ReduceMax', input, axes_i=[dim], keepdims_i=1)) exp = g.op('Exp', input) - sum = g.op('ReduceSum', exp, axes_i=[dim]) + sum = sym_help._reducesum_helper(g, exp, axes_i=[dim]) softmax = g.op('Div', exp, sum) if dtype and dtype.node().kind() != 'prim::Constant': parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype') @@ -2383,7 +2383,7 @@ def gather(g, self, dim, index, sparse_grad=False): depth = size(g, self, g.op("Constant", value_t=torch.LongTensor([dim]))) index = g.op("Cast", g.op("OneHot", index, depth, values, axis_i=dim), to_i=sym_help.cast_pytorch_to_onnx[dtype]) mul = g.op("Mul", sym_help._unsqueeze_helper(g, self, [dim + 1]), index) - return g.op("ReduceSum", mul, axes_i=[dim], keepdims_i=0) + return sym_help._reducesum_helper(g, mul, axes_i=[dim], keepdims_i=0) @parse_args('v', 'is', 'b', 'i') @@ -2639,7 +2639,7 @@ def try_mask_to_index(index): @parse_args('v', 'is', 'i') def frobenius_norm(g, self, dim=None, keepdim=False): sqr = g.op('Mul', self, self) - sumsqr = g.op('ReduceSum', sqr, axes_i=dim, keepdims_i=keepdim) + sumsqr = sym_help._reducesum_helper(g, sqr, axes_i=dim, keepdims_i=keepdim) return g.op('Sqrt', sumsqr) @@ -2805,7 +2805,7 @@ def kl_div(g, input, target, reduction, log_target): elif reduction == 1: return g.op("ReduceMean", output, keepdims_i=0) elif reduction == 2: - return g.op("ReduceSum", output, keepdims_i=0) + return sym_help._reducesum_helper(g, output, keepdims_i=0) else: return sym_help._onnx_unsupported("kl_div with reduction other than none, mean, or sum. 
Please open a bug to " "request ONNX export support for the missing reduction type.") From b308fb78d1be446bb20bf9b73ee3f6d21f080c31 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 29/41] [ONNX] Add binary_cross_entropy_with_logits op to ONNX opset version 12 (#49675) (#50908) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50908 Fixes #{#47997} Exporting the operator binary_cross_entropy_with_logits to ONNX opset version 12. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050885 Pulled By: SplitInfinity fbshipit-source-id: e4167895eed804739aa50481679500a4d564b360 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 46 ++++++++++++++++++++++ torch/onnx/symbolic_opset12.py | 28 +++++++++++++ 2 files changed, 74 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 1c9f97488f27..42e3e91b5ce4 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -5159,6 +5159,52 @@ def forward(self, input, target): target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) self.run_test(NLLModel(), (input, target)) + + @skipIfUnsupportedMinOpsetVersion(12) + def test_binary_cross_entropy_with_logits(self): + x = torch.randn(5) + y = torch.empty(5).random_(2) + self._bce_logits_loss(x, y) + + x = torch.randn(2, 3, 5, 7) + y = torch.empty(2, 3, 5, 7).random_(2) + weight = torch.tensor([2]) + self._bce_logits_loss(x, y, weight) + + x = torch.FloatTensor([[-0.4089, -1.2471, 0.5907], [-0.4897, -0.8267, -0.7349], [0.5241, -0.1246, -0.4751]]) + y = torch.FloatTensor([[0, 1, 1], [0, 0, 1], [1, 0, 1]]) + pos_weight = torch.empty([3]).random_(2) + self._bce_logits_loss(x, y, pos_weight) + + x = torch.randn(3, 3, 4) + y = torch.empty(3, 3, 4).random_(2) + weight = torch.tensor([3]) + pos_weight = torch.empty([3, 4]).random_(2) + self._bce_logits_loss(x, y, weight, pos_weight) + + def _bce_logits_loss(self, x, y, weight=None, pos_weight=None): + class BCEWithLogitsLossNoneWeights(torch.nn.Module): + def forward(self, input, target, weight, pos_weight): + return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, + pos_weight=pos_weight, reduction='none') + + self.run_test(BCEWithLogitsLossNoneWeights(), input=(x, y, weight, pos_weight)) + + class BCEWithLogitsLossMeanWeights(torch.nn.Module): + def forward(self, input, target, weight, pos_weight): + return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, + pos_weight=pos_weight, reduction='mean') + + self.run_test(BCEWithLogitsLossMeanWeights(), input=(x, y, weight, pos_weight)) + + class BCEWithLogitsLossSumWeights(torch.nn.Module): + def forward(self, input, target, weight, pos_weight): + return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, + pos_weight=pos_weight, reduction='sum') + + self.run_test(BCEWithLogitsLossSumWeights(), input=(x, y, weight, pos_weight)) + + def test_torch_mm(self): class M(torch.nn.Module): def forward(self, mat1, mat2): diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py index 63a40b555c8e..5a926eef5e1d 100644 --- a/torch/onnx/symbolic_opset12.py +++ b/torch/onnx/symbolic_opset12.py @@ -52,6 +52,34 @@ def nll_loss2d(g, self, target, weight, reduction, ignore_index): return nll_loss(g, self, target, weight, reduction, ignore_index) +@parse_args('v', 'v', 'v', 'v', 'i') +def 
binary_cross_entropy_with_logits(g, input, target, weight, pos_weight, reduction): + from torch.onnx.symbolic_opset9 import sigmoid, log, sub, neg, mul, add + p = g.op("Constant", value_t=torch.tensor([1])) + sig_x = sigmoid(g, input) + log_sig_x = log(g, sig_x) + sub_1_x = sub(g, p, sig_x) + sub_1_y = sub(g, p, target) + log_1_x = log(g, sub_1_x) + if pos_weight is None or sym_help._is_none(pos_weight): + output = neg(g, add(g, mul(g, target, log_sig_x), mul(g, sub_1_y, log_1_x))) + else: + output = neg(g, add(g, mul(g, mul(g, target, log_sig_x), pos_weight), mul(g, sub_1_y, log_1_x))) + + if weight is not None and not sym_help._is_none(weight): + output = mul(g, weight, output) + + reduction = sym_help._maybe_get_const(reduction, 'i') + if reduction == 0: + return output + elif reduction == 1: + return g.op("ReduceMean", output) + elif reduction == 2: + return g.op("ReduceSum", output) + else: + return sym_help._onnx_unsupported("binary_cross_entropy_with_logits with reduction other than none, mean, or sum") + + def celu(g, self, alpha): alpha = sym_help._maybe_get_const(alpha, 'f') # if the input is of type double cast it to float From e90a480d408336258bb7728d023825599587eb5c Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 30/41] [ONNX] Add logical_and, logical_or, logical_xor torch op support in pytorch exporter (#50570) (#50909) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50909 Fixes #{} Add logical_and, logical_or, logical_xor torch op support in pytorch exporter. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050884 Pulled By: SplitInfinity fbshipit-source-id: 2db564e9726c18a3477f9268a0ff862cd2c40e4d --- test/onnx/test_pytorch_onnx_onnxruntime.py | 66 ++++++++++++++++++++++ torch/onnx/symbolic_opset9.py | 15 +++++ 2 files changed, 81 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 42e3e91b5ce4..d4b6a9bf9a2b 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -3217,6 +3217,72 @@ def _test_compare_ops(self, model, num_inputs): self.run_test(model, x_float) self.run_test(model, x_int) + @skipIfUnsupportedMinOpsetVersion(9) + def test_logical_and(self): + class AndModel(torch.nn.Module): + def forward(self, x, y): + return torch.logical_and(x, y) + + x = torch.randint(0, 2, (5, 5), dtype=torch.bool) + y = torch.randint(0, 2, (5, 5), dtype=torch.bool) + self.run_test(AndModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int32) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(AndModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.double) + y = torch.randint(10, (5, 5), dtype=torch.double) + self.run_test(AndModel(), input=(x, y)) + + x = torch.randint(10, (2, 3, 5), dtype=torch.float32) + y = torch.randint(10, (2, 3, 5), dtype=torch.long) + self.run_test(AndModel(), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_logical_or(self): + class OrModel(torch.nn.Module): + def forward(self, x, y): + return torch.logical_or(x, y) + + x = torch.randint(0, 2, (5, 5), dtype=torch.bool) + y = torch.randint(0, 2, (5, 5), dtype=torch.bool) + self.run_test(OrModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int32) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(OrModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.double) + y = torch.randint(10, (5, 
5), dtype=torch.double) + self.run_test(OrModel(), input=(x, y)) + + x = torch.randint(10, (2, 3, 5), dtype=torch.float32) + y = torch.randint(10, (2, 3, 5), dtype=torch.long) + self.run_test(OrModel(), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_logical_xor(self): + class XorModel(torch.nn.Module): + def forward(self, x, y): + return torch.logical_xor(x, y) + + x = torch.randint(0, 2, (5, 5), dtype=torch.bool) + y = torch.randint(0, 2, (5, 5), dtype=torch.bool) + self.run_test(XorModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int32) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(XorModel(), input=(x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.double) + y = torch.randint(10, (5, 5), dtype=torch.double) + self.run_test(XorModel(), input=(x, y)) + + x = torch.randint(10, (2, 3, 5), dtype=torch.float32) + y = torch.randint(10, (2, 3, 5), dtype=torch.long) + self.run_test(XorModel(), input=(x, y)) + def test_gt(self): class GreaterModel(torch.nn.Module): def forward(self, input, other): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 031f4505e655..11aaeb57404e 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1105,6 +1105,21 @@ def __or_(g, input, other): return g.op('Or', input, other) +@wrap_logical_op_with_cast_to_and_from('Bool') +def logical_and(g, input, other): + return g.op('And', input, other) + + +@wrap_logical_op_with_cast_to_and_from('Bool') +def logical_or(g, input, other): + return g.op('Or', input, other) + + +@wrap_logical_op_with_cast_to_and_from('Bool') +def logical_xor(g, input, other): + return g.op('Xor', input, other) + + def __rshift_(g, self, other): # make sure to cast other to self's type # (when self is long, make sure that other is not float) From 70dcfe29919c5e695c5f2573c96f57fe47071aa9 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 31/41] [ONNX] Enable _jit_pass_onnx_fold_if only when dynamic_axes is None (#50582) (#50910) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50910 Fixing pytorch/vision#3251 (PR #49410 triggers the torch vision test build failure, on three tests test_faster_rcnn, test_mask_rcnn, test_keypoint_rcnn. ) The offending PR is fine on pytorch UT, because the torchvision and pytorch test has a gap when we merge them - we are using different test API on two sides, therefore causing some discrepancy. This PR bridge the gap for the above three tests, and disable _jit_pass_onnx_fold_if pass until it gets fixed. Allow _jit_pass_onnx_fold_if only when dynamic_axes is None. 
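
For illustration only (not part of this patch), a minimal sketch of an export call where `dynamic_axes` is non-empty and the If-folding pass is therefore skipped; the module, file name, and axis names below are made up:

```
import torch

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return x.relu()

x = torch.randn(2, 3, 200, 300)
# With a non-empty dynamic_axes dict, torch._C._jit_pass_onnx_fold_if is not
# run during export; with dynamic_axes omitted (None), the pass still runs.
torch.onnx.export(TinyModel(), (x,), "tiny.onnx", opset_version=11,
                  input_names=["images_tensors"], output_names=["outputs"],
                  dynamic_axes={"images_tensors": [0, 1, 2, 3], "outputs": [0, 1, 2, 3]})
```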
Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050886 Pulled By: SplitInfinity fbshipit-source-id: b765ffe30914261866dcc761f0d0999fd16169e3 --- .jenkins/caffe2/test.sh | 2 +- .jenkins/pytorch/common_utils.sh | 2 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 94 ++++++++++++++++------ torch/onnx/utils.py | 3 +- 4 files changed, 73 insertions(+), 28 deletions(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index e6f43b6452cf..ac131ba738ca 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -160,7 +160,7 @@ pip install --user pytest-sugar if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # Check out torch/vision at Jun 11 2020 commit # This hash must match one in .jenkins/pytorch/test.sh - pip install -q --user git+https://github.com/pytorch/vision.git@e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb + pip install -q --user git+https://github.com/pytorch/vision.git@ae0d80b3c52dc98b3a9763bdb974c3ef7b6eb83d pip install -q --user ninja # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index b28dcb2f41d8..38799ab782de 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -66,7 +66,7 @@ function get_bazel() { chmod +x tools/bazel } -TORCHVISION_COMMIT=e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb +TORCHVISION_COMMIT=ae0d80b3c52dc98b3a9763bdb974c3ef7b6eb83d function install_torchvision() { # Check out torch/vision at Jun 11 2020 commit diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index d4b6a9bf9a2b..0ddfcf1e40ad 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -406,19 +406,7 @@ def run_word_language_model(self, model_name): # Only support CPU version, since tracer is not working in GPU RNN. 
self.run_test(model, (x, model.hidden)) - @skipIfUnsupportedOpsetVersion([13]) - @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Faster RCNN model is not scriptable - def test_faster_rcnn(self): - model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=True, min_size=200, - max_size=300) - model.eval() - x = torch.randn(2, 3, 200, 300, requires_grad=True) - self.run_test(model, (x,), rtol=1e-3, atol=1e-5) - self.run_test(model, (x,), input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2, 3], "outputs": [0, 1, 2, 3]}, rtol=1e-3, atol=1e-5) - - def get_image_from_url(self, url): + def get_image_from_url(self, url, size=(300, 200)): import os from urllib.parse import urlsplit from urllib import request @@ -433,15 +421,40 @@ def get_image_from_url(self, url): with open(path, 'wb') as f: f.write(data) image = Image.open(path).convert("RGB") - image = image.resize((300, 200), Image.BILINEAR) + + image = image.resize(size, Image.BILINEAR) + to_tensor = transforms.ToTensor() return to_tensor(image) def get_test_images(self): image_url = "http://farm3.staticflickr.com/2469/3915380994_2e611b1779_z.jpg" - image = self.get_image_from_url(url=image_url) - images = [image] - return images + image = self.get_image_from_url(url=image_url, size=(100, 320)) + + image_url2 = "https://pytorch.org/tutorials/_static/img/tv_tutorial/tv_image05.png" + image2 = self.get_image_from_url(url=image_url2, size=(250, 380)) + + return [image], [image2] + + @skipIfUnsupportedOpsetVersion([13]) + @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # Faster RCNN model is not scriptable + def test_faster_rcnn(self): + model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=True, min_size=200, + max_size=300) + model.eval() + x = torch.randn(2, 3, 200, 300, requires_grad=True) + self.run_test(model, (x,), rtol=1e-3, atol=1e-5) + self.run_test(model, (x,), input_names=["images_tensors"], output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2, 3], "outputs": [0, 1, 2, 3]}, rtol=1e-3, atol=1e-5) + dummy_image = [torch.ones(3, 100, 100) * 0.3] + images, test_images = self.get_test_images() + self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) def test_paste_mask_in_image(self): # disable profiling @@ -480,11 +493,20 @@ def test_paste_mask_in_image(self): def test_mask_rcnn(self): model = torchvision.models.detection.mask_rcnn.maskrcnn_resnet50_fpn(pretrained=True, min_size=200, max_size=300) - images = self.get_test_images() + images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) self.run_test(model, (images,), input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + dummy_image = [torch.ones(3, 100, 100) * 0.3] + self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], output_names=["boxes", 
"labels", "scores", "masks"], + dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], + "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], + "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) def test_heatmaps_to_keypoints(self): # disable profiling @@ -516,12 +538,33 @@ def test_heatmaps_to_keypoints(self): def test_keypoint_rcnn(self): model = torchvision.models.detection.keypoint_rcnn.keypointrcnn_resnet50_fpn(pretrained=True, min_size=200, max_size=300) - images = self.get_test_images() + images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) self.run_test(model, (images,), input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + dummy_images = [torch.ones(3, 100, 100) * 0.3] + self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_images,)], + input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, atol=1e-5) + self.run_test(model, (dummy_images,), test_with_inputs=[(dummy_images,), (test_images,)], + input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, atol=1e-5) + + @skipIfUnsupportedOpsetVersion([13]) + @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() + def test_shufflenet_v2_dynamic_axes(self): + model = torchvision.models.shufflenet_v2_x0_5(pretrained=True) + dummy_input = torch.randn(1, 3, 224, 224, requires_grad=True) + test_inputs = torch.randn(3, 3, 224, 224, requires_grad=True) + self.run_test(model, (dummy_input,), test_with_inputs=[(dummy_input,), (test_inputs,)], + input_names=["input_images"], output_names=["outputs"], + dynamic_axes={"input_images": {0: 'batch_size'}, "output": {0: 'batch_size'}}, + rtol=1e-3, atol=1e-5) @disableScriptTest() def test_word_language_model_RNN_TANH(self): @@ -6016,7 +6059,7 @@ def forward(self, boxes, size): self.run_test(Module(), (boxes, size), input_names=["boxes", "size"], dynamic_axes={"size": [0, 1]}, - test_with_inputs=[(boxes, size_2)]) + test_with_inputs=[(boxes, size), (boxes, size_2)]) @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @@ -6072,7 +6115,7 @@ def forward(self, images): input_test = torch.rand(3, 100, 150) self.run_test(TransformModule(), (input,), input_names=["input1"], dynamic_axes={"input1": [0, 1, 2]}, - test_with_inputs=[(input_test,)]) + test_with_inputs=[(input,), (input_test,)]) @skipIfUnsupportedMinOpsetVersion(11) def test_transform_images(self): @@ -6087,7 +6130,7 @@ def forward(self, images): input = torch.rand(3, 100, 200), torch.rand(3, 200, 200) input_test = torch.rand(3, 100, 200), torch.rand(3, 200, 200) - self.run_test(TransformModule(), (input,), test_with_inputs=[(input_test,)]) + self.run_test(TransformModule(), (input,), test_with_inputs=[(input,), (input_test,)]) def get_features(self, images): s0, s1 = images.shape[-2:] @@ -6104,6 +6147,7 @@ def get_features(self, images): @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) def test_rpn(self): + class RPNModule(torch.nn.Module): def 
__init__(self): super(RPNModule, self).__init__() @@ -6126,7 +6170,7 @@ def forward(self, images, features): dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], "input3": [0, 1, 2, 3], "input4": [0, 1, 2, 3], "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images2, test_features)], + test_with_inputs=[(images, features), (images2, test_features)], dict_check=False) @skipIfUnsupportedOpsetVersion([13]) @@ -6154,7 +6198,7 @@ def forward(self, input, boxes): boxes1 = torch.rand(6, 4) * 256 boxes1[:, 2:] += boxes1[:, :2] - self.run_test(TransformModule(), (i, [boxes],), test_with_inputs=[(i1, [boxes1],)]) + self.run_test(TransformModule(), (i, [boxes],), test_with_inputs=[(i, [boxes],), (i1, [boxes1],)]) @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(11) @@ -6189,7 +6233,7 @@ def forward(self, images, features): input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], "input3": [0, 1, 2, 3], "input4": [0, 1, 2, 3], "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images2, test_features)], + test_with_inputs=[(images, features), (images2, test_features)], dict_check=False) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 7a483c2f728b..a17f2ea2eb2d 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -209,7 +209,8 @@ def _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=Fa torch._C._jit_pass_onnx_scalar_type_analysis(graph) torch._C._jit_pass_lint(graph) - torch._C._jit_pass_onnx_fold_if(graph) + if dynamic_axes is None or not bool(dynamic_axes): + torch._C._jit_pass_onnx_fold_if(graph) from torch.onnx.symbolic_helper import _export_onnx_opset_version torch._C._jit_pass_onnx_peephole(graph, _export_onnx_opset_version, fixed_batch_size) From 68034197e87a82988b27cad0c7c521868f8fe987 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 32/41] [ONNX] Support gelu for fp16 export (#50487) (#50911) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50911 Need to replace dtype of export created scalars from float to double. (In torch implicit conversion logic, python numbers are double) Test case skipped in CI due to that current CI job env does not have CUDA support. 
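
For context (and not part of this patch), a tiny eager-mode check of the implicit conversion behavior mentioned above: a Python number acts as a double-precision scalar and does not promote a half-precision tensor, which is the behavior the exporter-created constants need to mirror:

```
import torch

x = torch.randn(2, 4, dtype=torch.float16)
# The Python scalar is treated as double, but the result keeps the
# tensor's float16 dtype instead of being promoted to float32/float64.
y = x * 0.5
assert y.dtype == torch.float16
```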
Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050889 Pulled By: SplitInfinity fbshipit-source-id: 1fdde23a68d4793e6b9a82840acc213e5c3aa760 --- scripts/onnx/test.sh | 1 + .../test_pytorch_onnx_onnxruntime_cuda.py | 31 +++++++++++++++++++ torch/onnx/symbolic_opset9.py | 7 ++--- 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 test/onnx/test_pytorch_onnx_onnxruntime_cuda.py diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 3432ea434928..5e9cfa936064 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -58,6 +58,7 @@ pytest "${args[@]}" \ --ignore "$top_dir/test/onnx/test_utility_funs.py" \ --ignore "$top_dir/test/onnx/test_pytorch_onnx_caffe2.py" \ --ignore "$top_dir/test/onnx/test_pytorch_onnx_shape_inference.py" \ + --ignore "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py" \ "${test_paths[@]}" # onnxruntime only support py3 diff --git a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py new file mode 100644 index 000000000000..24017b125b10 --- /dev/null +++ b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py @@ -0,0 +1,31 @@ +import unittest +import onnxruntime # noqa +import torch + +from test_pytorch_common import skipIfUnsupportedMinOpsetVersion +from test_pytorch_common import skipIfNoCuda + +from test_pytorch_onnx_onnxruntime import TestONNXRuntime + +class TestONNXRuntime_cuda(unittest.TestCase): + from torch.onnx.symbolic_helper import _export_onnx_opset_version + opset_version = _export_onnx_opset_version + keep_initializers_as_inputs = True + use_new_jit_passes = True + onnx_shape_inference = True + + @skipIfUnsupportedMinOpsetVersion(9) + @skipIfNoCuda + def test_gelu_fp16(self): + class GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x) + + x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device('cuda')) + self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5) + +TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp +TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test + +if __name__ == '__main__': + unittest.main(TestONNXRuntime_cuda()) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 11aaeb57404e..043fbd041897 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2702,10 +2702,9 @@ def remainder(g, input, other): def gelu(g, self): _sqrt2 = 1.4142135623730951 - erf = g.op('Erf', g.op('Div', self, torch.tensor(_sqrt2))) - erf_plusone = add(g, erf, g.op('Constant', value_t=torch.tensor(1, dtype=torch.float))) - return mul(g, mul(g, self, erf_plusone), g.op('Constant', value_t=torch.tensor(0.5, dtype=torch.float))) - + erf = g.op('Erf', g.op('Div', self, torch.tensor(_sqrt2, dtype=torch.double))) + erf_plusone = add(g, erf, g.op('Constant', value_t=torch.tensor(1, dtype=torch.double))) + return mul(g, mul(g, self, erf_plusone), g.op('Constant', value_t=torch.tensor(0.5, dtype=torch.double))) @parse_args('v', 'i', 'v', 'v', 'f', 'i') def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): From 84e9bff85d22b4d23e24c70ef8b1d715502f7877 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 33/41] [ONNX] Replace optional parameters of Resize with placeholder for ops13. (#50574) (#50954) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50954 * Replace optional parameters of Resize with placeholder for ops13. 
* Use common methods to handle different versions. * Correct flake8 issue. * Update per comments. * Add something to trigger CI again. * Trigger another round of CI. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050882 Pulled By: SplitInfinity fbshipit-source-id: aea6205a1ba4a0621fe1ac9e0c7d94b92b6d8f21 --- torch/onnx/symbolic_helper.py | 127 +++++++++++++++++++++++++++++++++ torch/onnx/symbolic_opset11.py | 94 +----------------------- 2 files changed, 129 insertions(+), 92 deletions(-) diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 5d11bce82135..6b0184f3be4c 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -10,6 +10,7 @@ import torch.onnx.utils from functools import wraps +from torch._C import OptionalType # Note [Edit Symbolic Files] @@ -430,6 +431,126 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ return scale_factor, mode +def _interpolate_helper(name, dim, interpolate_mode): + def symbolic_fn(g, input, output_size, *args): + scales, align_corners = _get_interpolate_attributes(g, interpolate_mode, args) + align_corners = _maybe_get_scalar(align_corners) + coordinate_transformation_mode = "asymmetric" if interpolate_mode == "nearest" \ + else "align_corners" if align_corners else "pytorch_half_pixel" + + if scales is None: + input_size = g.op("Shape", input) + input_size_beg = _slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) + output_size = g.op("Cast", output_size, to_i=cast_pytorch_to_onnx['Long']) + output_size = g.op("Concat", input_size_beg, output_size, axis_i=0) + + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + empty_scales = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + empty_scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + return g.op("Resize", + input, + empty_roi, + empty_scales, + output_size, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + nearest_mode_s="floor") # only valid when mode="nearest" + else: + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + return g.op("Resize", + input, + empty_roi, + scales, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + nearest_mode_s="floor") # only valid when mode="nearest" + return symbolic_fn + + +def __interpolate_helper(g, input, size, scale_factor, mode, align_corners, recompute_scale_factor): + mode = _maybe_get_const(mode, 's') + if 'linear' in mode: + mode = 'linear' + if 'cubic' in mode: + mode = 'cubic' + align_corners = _maybe_get_const(align_corners, 'b') + align_corners = False if not isinstance(align_corners, bool) else align_corners + coordinate_transformation_mode = "asymmetric" if mode == "nearest" \ + else "align_corners" if align_corners else "pytorch_half_pixel" + + if not _is_none(size) : + input_size = g.op("Shape", input) + input_size = _slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) + # in some cases size is not a packed list but size is a scalar + # We need to also verify that (_maybe_get_const(size, 't').dim() == 0) + 
# but this information is not always available. Try to get the dim, + # and if not assume that it is not a scalar. + try: + is_scalar = not _is_packed_list(size) and ((_maybe_get_const(size, 't').dim() == 0)) + except AttributeError: + is_scalar = not _is_packed_list(size) + if not is_scalar: + warnings.warn("Cannot verify if the output_size is a scalar " + "while exporting interpolate. Assuming that it is not a scalar.") + + if is_scalar: + rank = _get_tensor_rank(input) + if rank is None: + return _unimplemented("interpolate (with a scalar output_size)", + "missing input shape (try giving an array of output_size values)") + size = _unsqueeze_helper(g, size, [0]) + size = [size for i in range(rank - 2)] + size = g.op("Concat", *size, axis_i=0) + size = g.op("Cast", size, to_i=cast_pytorch_to_onnx['Long']) + size = g.op("Concat", input_size, size, axis_i=0) + + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + empty_scales = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + empty_scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + return g.op("Resize", + input, + empty_roi, + empty_scales, + size, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=mode, # nearest, linear, or cubic + nearest_mode_s="floor") + else: # if not _is_none(scales) + rank = _get_tensor_rank(input) + if rank is None: + return _unimplemented("interpolate (with scales)", "missing input shape") + + if _export_onnx_opset_version >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + scales = _interpolate_get_scales(g, scale_factor, rank) + return g.op("Resize", + input, + empty_roi, + scales, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=mode, # nearest, linear, or cubic + nearest_mode_s="floor") # only valid when mode="nearest" + + def _unbind_helper(g, self, dim, _outputs): if _export_onnx_opset_version <= 9: from torch.onnx.symbolic_opset9 import unbind @@ -546,6 +667,12 @@ def _is_split_static(split_size_or_sizes, _outputs): return False return True +def _optional_input_placeholder_tensor(g): + n = g.op("prim::Constant") + n.setType(OptionalType.ofTensor()) + return n + + # --------------------------------------------------------------------- # ONNX operator version # --------------------------------------------------------------------- diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 86c1bd15b656..3792f77ae377 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -208,38 +208,7 @@ def pixel_shuffle(g, self, upscale_factor): def _interpolate(name, dim, interpolate_mode): - def symbolic_fn(g, input, output_size, *args): - scales, align_corners = sym_help._get_interpolate_attributes(g, interpolate_mode, args) - align_corners = sym_help._maybe_get_scalar(align_corners) - coordinate_transformation_mode = "asymmetric" if interpolate_mode == "nearest" \ - else "align_corners" if align_corners else "pytorch_half_pixel" - empty_tensor = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - - if scales is None: - input_size = g.op("Shape", input) - input_size_beg = sym_help._slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) - output_size = 
g.op("Cast", output_size, to_i=sym_help.cast_pytorch_to_onnx["Long"]) - output_size = g.op("Concat", input_size_beg, output_size, axis_i=0) - scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - return g.op("Resize", - input, - empty_tensor, # roi only takes effect whith coordinate_transformation_mode="tf_crop_and_resize" - scales, # scales is not needed since we are sending out_size - output_size, - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=interpolate_mode, # nearest, linear, or cubic - nearest_mode_s="floor") # only valid when mode="nearest" - else: - return g.op("Resize", - input, - empty_tensor, # roi only takes effect with coordinate_transformation_mode="tf_crop_and_resize" - scales, # scales is not needed since we are sending out_size - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=interpolate_mode, # nearest, linear, or cubic - nearest_mode_s="floor") # only valid when mode="nearest" - return symbolic_fn + return sym_help._interpolate_helper(name, dim, interpolate_mode) upsample_nearest1d = _interpolate('upsample_nearest1d', 3, "nearest") @@ -252,66 +221,7 @@ def symbolic_fn(g, input, output_size, *args): def __interpolate(g, input, size, scale_factor, mode, align_corners, recompute_scale_factor): - mode = sym_help._maybe_get_const(mode, 's') - if 'linear' in mode: - mode = 'linear' - if 'cubic' in mode: - mode = 'cubic' - align_corners = sym_help._maybe_get_const(align_corners, 'b') - align_corners = False if not isinstance(align_corners, bool) else align_corners - coordinate_transformation_mode = "asymmetric" if mode == "nearest" \ - else "align_corners" if align_corners else "pytorch_half_pixel" - # roi only takes effect with coordinate_transformation_mode="tf_crop_and_resize" - roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - - if not sym_help._is_none(size) : - input_size = g.op("Shape", input) - input_size = sym_help._slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) - # in some cases size is not a packed list but size is a scalar - # We need to also verify that (sym_help._maybe_get_const(size, 't').dim() == 0) - # but this information is not always available. Try to get the dim, - # and if not assume that it is not a scalar. - try: - is_scalar = not sym_help._is_packed_list(size) and ((sym_help._maybe_get_const(size, 't').dim() == 0)) - except AttributeError: - is_scalar = not sym_help._is_packed_list(size) - if not is_scalar: - warnings.warn("Cannot verify if the output_size is a scalar " - "while exporting interpolate. 
Assuming that it is not a scalar.") - - if is_scalar: - rank = sym_help._get_tensor_rank(input) - if rank is None: - return sym_help._unimplemented("interpolate (with a scalar output_size)", - "missing input shape (try giving an array of output_size values)") - size = sym_help._unsqueeze_helper(g, size, [0]) - size = [size for i in range(rank - 2)] - size = g.op("Concat", *size, axis_i=0) - size = g.op("Cast", size, to_i=sym_help.cast_pytorch_to_onnx['Long']) - size = g.op("Concat", input_size, size, axis_i=0) - scales = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) - return g.op("Resize", - input, - roi, - scales, - size, - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=mode, # nearest, linear, or cubic - nearest_mode_s="floor") - else: # if not sym_help._is_none(scales) - rank = sym_help._get_tensor_rank(input) - if rank is None: - return sym_help._unimplemented("interpolate (with scales)", "missing input shape") - scales = sym_help._interpolate_get_scales(g, scale_factor, rank) - return g.op("Resize", - input, - roi, - scales, - coordinate_transformation_mode_s=coordinate_transformation_mode, - cubic_coeff_a_f=-0.75, # only valid when mode="cubic" - mode_s=mode, # nearest, linear, or cubic - nearest_mode_s="floor") # only valid when mode="nearest" + return sym_help.__interpolate_helper(g, input, size, scale_factor, mode, align_corners, recompute_scale_factor) @parse_args('v', 'i', 'v', 'v') def gather(g, self, dim, index, sparse_grad=False): From e2eb97dd7682d2810071ce78b76543acc1584a9c Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 27 Jan 2021 17:41:50 -0800 Subject: [PATCH 34/41] [ONNX] Fix param names (#50764) (#50955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50955 Preserve name of parameters for ONNX. Looks like output->copyMetadata(input) API is giving the same debugName to the output. So the name of the original input is changed. This update avoid the name change by just copying the type. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D26050880 Pulled By: SplitInfinity fbshipit-source-id: 8b04e41e6df7f33c5c9c873fb323c21462fc125b --- .../onnx/remove_inplace_ops_for_onnx.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index bc26183a25bb..c9b42d76973a 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -571,29 +571,27 @@ static void PrepareForRemoveMutations(MutationRemover& mr, Block* b) { << "Warning: ONNX Preprocess - Removing mutation on block inputs. " << "This changes graph semantics." 
<< std::endl; + Node* newNode = nullptr; if (input->type()->kind() == TypeKind::ListType) { // Create an aten::list to clone the list in graph inputs - auto newNode = node->owningGraph()->create(aten::list, 1); - newNode->output()->copyMetadata(input); + newNode = node->owningGraph()->create(aten::list, 1); + newNode->output()->setType(input->type()); newNode->addInput(input); - newNode->insertBefore(node); - node->replaceInput(index, newNode->output()); - input->replaceAllUsesAfterNodeWith(node, newNode->output()); + b->prependNode(newNode); } else { // Create an aten::clone to clone the tensor in graph inputs - auto newNode = node->owningGraph()->create(aten::clone, 1); - newNode->output()->copyMetadata(input); + newNode = node->owningGraph()->create(aten::clone, 1); + newNode->output()->setType(input->type()); newNode->addInput(input); auto* noneNode = node->owningGraph()->create(prim::Constant); noneNode->output()->setType(NoneType::get()); newNode->addInput(noneNode->output()); - - newNode->insertBefore(node); + b->prependNode(newNode); noneNode->insertBefore(newNode); - node->replaceInput(index, newNode->output()); - input->replaceAllUsesAfterNodeWith(node, newNode->output()); } + node->replaceInput(index, newNode->output()); + input->replaceAllUsesAfterNodeWith(node, newNode->output()); } } } From 4fb33f1d3a95e29bc850f3415895a15eac93c38c Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 27 Jan 2021 19:09:14 -0800 Subject: [PATCH 35/41] Trim profiler file paths (#51192) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51192 Trim profiler file paths when using stack traces Test Plan: python test/test_profiler.py -k test_source ``` SumBackward0 0.02% 6.000us 0.51% 154.000us 154.000us 1 test/test_profiler.py(91): test_source ...conda3/envs/pytorch/lib/python3.8/unittest/case.py(633): _callTestMethod ...r/local/miniconda3/envs/pytorch/lib/python3.8/unittest/case.py(676): run ...al/miniconda3/envs/pytorch/lib/python3.8/unittest/case.py(736): __call__ .../local/miniconda3/envs/pytorch/lib/python3.8/unittest/suite.py(122): run ``` Reviewed By: ngimel Differential Revision: D26113905 Pulled By: ilia-cher fbshipit-source-id: 2b71c31b6c4437855d33013d42d977745e6f489f --- torch/autograd/profiler.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index a3d0da1aef9d..0221d80ba684 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -1569,6 +1569,14 @@ def append(s): append(header_sep) + def trim_path(path, src_column_width): + if len(path) > src_column_width: + offset = len(path) - src_column_width + path = path[offset:] + if len(path) > 3: + path = "..." 
+ path[3:] + return path + event_limit = 0 for evt in events: if event_limit == row_limit: @@ -1629,14 +1637,14 @@ def append(s): if has_stack: src_field = "" if len(evt.stack) > 0: - src_field = evt.stack[0][:src_column_width] + src_field = trim_path(evt.stack[0], src_column_width) row_values.append(src_field) append(row_format.format(*row_values)) if has_stack: empty_headers = [""] * (len(headers) - 1) for entry in evt.stack[1:MAX_STACK_ENTRY]: - append(row_format.format(*(empty_headers + [entry[:src_column_width]]))) + append(row_format.format(*(empty_headers + [trim_path(entry, src_column_width)]))) empty_headers.append("") append(row_format.format(*empty_headers)) From ea0d304e2eb28506eb6c5650dcf7ada039c9db7c Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 27 Jan 2021 19:09:14 -0800 Subject: [PATCH 36/41] Rewrite "ProfilerStep#" in profiler output (#51194) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51194 Aggregate all "ProfilerStep#" together Test Plan: python test/test_profiler.py -k test_kineto_profiler_api Reviewed By: ngimel Differential Revision: D26113907 Pulled By: ilia-cher fbshipit-source-id: 2bc803befc85153f07e770ea3c37b57e2870a1ba --- torch/autograd/profiler.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 0221d80ba684..0e50dcee702e 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -223,7 +223,7 @@ def export_chrome_trace(self, path): '"pid": "CPU functions", ' '"args": {}}, ' % ( - evt.name, + evt.trace_name, evt.time_range.start, evt.time_range.elapsed_us(), evt.thread @@ -241,7 +241,7 @@ def export_chrome_trace(self, path): '"pid": "CPU functions", ' '"id": %s, ' '"cat": "cpu_to_cuda", ' - '"args": {}}, ' % (evt.name, evt.time_range.start, + '"args": {}}, ' % (evt.trace_name, evt.time_range.start, evt.thread, next_id)) f.write('{"name": "%s", ' '"ph": "f", ' @@ -847,10 +847,11 @@ def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, is_remote=False, sequence_nr=-1, node_id=-1, device_type=DeviceType.CPU, device_index=0, - is_legacy=False, flops=None): + is_legacy=False, flops=None, trace_name=None): self.id: int = id self.node_id: int = node_id self.name: str = name + self.trace_name: str = trace_name if trace_name is not None else self.name self.time_range: Interval = Interval(start_us, end_us) self.thread: int = thread self.fwd_thread: Optional[int] = fwd_thread @@ -1101,6 +1102,18 @@ def filter_name(name): ] return name in filtered_out_names +# Demangles and optionally rewrites the provided event name, +# with_wildcard - whether to replace certain numbered event names +# with a wildcard name to aggregate them together in the profiler table +# output +def rewrite_name(name, with_wildcard=False): + string_table = StringTable() + name = string_table[name] + if with_wildcard: + if name.startswith("ProfilerStep#"): + name = "ProfilerStep*" + return name + # Parsing of kineto profiler events def parse_kineto_results(result): # result.events() has most of the events - PyTorch op-level and device-level events @@ -1120,7 +1133,6 @@ def parse_kineto_results(result): assert start_record is not None, "Invalid profiler output, __start_profile is missing" # Create and return FunctionEvent list - string_table = StringTable() function_events = [] cuda_corr_map: Dict[int, 
List[torch.autograd.KinetoEvent]] = {} for kineto_event in result.events(): @@ -1142,7 +1154,8 @@ def parse_kineto_results(result): is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id() fe = FunctionEvent( id=kineto_event.correlation_id(), - name=string_table[kineto_event.name()], + name=rewrite_name(name=kineto_event.name(), with_wildcard=True), + trace_name=rewrite_name(name=kineto_event.name(), with_wildcard=False), thread=kineto_event.start_thread_id(), start_us=rel_start_us, end_us=rel_end_us, @@ -1193,7 +1206,6 @@ def get_record_key(record): cuda_records = {} functions = [] record_stack = [] - string_table = StringTable() # cuda start events and the overall profiler start event don't happen # at exactly the same time because we need to record an event on each device @@ -1271,7 +1283,8 @@ def adjusted_time(cuda_record, cuda_records_map): fe = FunctionEvent( id=record.handle(), node_id=record.node_id(), - name=string_table[start.name()], + name=rewrite_name(name=start.name(), with_wildcard=True), + trace_name=rewrite_name(name=start.name(), with_wildcard=False), thread=start.thread_id(), start_us=start_record.cpu_elapsed_us(start), end_us=start_record.cpu_elapsed_us(record), From d14d8c7f7fb64982c082639b49b16d0818e93e7c Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 27 Jan 2021 19:09:14 -0800 Subject: [PATCH 37/41] Add convenience import (#51195) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51195 Add kineto_available to torch.profiler Test Plan: >>> import torch.profiler >>> torch.profiler.kineto_available() True Reviewed By: ngimel Differential Revision: D26113906 Pulled By: ilia-cher fbshipit-source-id: fe4502d29d10d8bd9459b0504aa0ee856af43acc --- torch/profiler/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/profiler/__init__.py b/torch/profiler/__init__.py index dabbf91dff90..e0f568d7cc4b 100644 --- a/torch/profiler/__init__.py +++ b/torch/profiler/__init__.py @@ -10,3 +10,4 @@ ''' from .profiler import profile, schedule, ProfilerAction, ProfilerActivity +from torch.autograd import kineto_available From 983b8e6b62fb9bc7260d1c52bbe27e99b771ad56 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 27 Jan 2021 19:33:26 -0800 Subject: [PATCH 38/41] fake_quant: add a more memory efficient version (#50561) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50561 Not for review yet, a bunch of TODOs need finalizing. tl;dr; add an alternative implementation of `fake_quantize` which saves a ask during the forward pass and uses it to calculate the backward. There are two benefits: 1. the backward function no longer needs the input Tensor, and it can be gc'ed earlier by autograd. On MobileNetV2, this reduces QAT overhead by ~15% (TODO: link, and absolute numbers). We add an additional mask Tensor to pass around, but its size is 4x smaller than the input tensor. A future optimization would be to pack the mask bitwise and unpack in the backward. 2. the computation of `qval` can be done only once in the forward and reused in the backward. No perf change observed, TODO verify with better matrics. 
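For reference, a rough Python sketch of the cachemask semantics described above (helper names here are illustrative only; the real implementations are the CPU and CUDA kernels added further down in this diff):

```python
import torch

def fake_quant_cachemask_reference(x, scale, zero_point, quant_min, quant_max):
    # Forward: quantize-dequantize and remember which elements landed inside
    # the representable range. The boolean mask is all the backward needs.
    q = torch.round(x / scale) + zero_point
    mask = (q >= quant_min) & (q <= quant_max)
    y = (torch.clamp(q, quant_min, quant_max) - zero_point) * scale
    return y, mask

def fake_quant_cachemask_backward_reference(dy, mask):
    # Backward: straight-through estimator gated by the saved mask; the
    # original input tensor is no longer needed, so autograd can free it early.
    return dy * mask
```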
TODO: describe in more detail Test Plan: OSS / torchvision / MobileNetV2 ``` python references/classification/train_quantization.py --print-freq 1 --data-path /data/local/packages/ai-group.imagenet-256-smallest-side/prod/ --output-dir ~/nfs/pytorch_vision_tests/ --backend qnnpack --epochs 5 TODO paste results here ``` TODO more Imported from OSS Reviewed By: ngimel Differential Revision: D25918519 fbshipit-source-id: ec544ca063f984de0f765bf833f205c99d6c18b6 --- aten/src/ATen/native/native_functions.yaml | 8 ++ .../cpu/kernels/QuantizedOpKernels.cpp | 34 ++++++++ .../quantized/cuda/fake_quantize_core.cu | 32 ++++++++ .../ATen/native/quantized/fake_quant_affine.h | 10 +++ .../fake_quant_per_tensor_affine.cpp | 81 ++++++++++++++++--- .../pt/quantization_test.py | 14 ++-- test/quantization/test_workflow_module.py | 59 ++++++++++++++ test/test_namedtuple_return_api.py | 4 +- tools/autograd/derivatives.yaml | 3 + torch/overrides.py | 1 + torch/quantization/fake_quantize.py | 16 +++- 11 files changed, 243 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 1856b9a9bf13..1fd04b4255b7 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4642,6 +4642,14 @@ - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor variants: function +- func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine_cachemask + +- func: fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor + variants: function + - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor variants: function dispatch: diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 8137049a75c8..5ed6e28663e0 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2091,6 +2091,38 @@ void fake_quantize_grad_tensor_kernel( }); } +void fake_quantize_tensor_cachemask_kernel( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max) { + float inv_scale = 1.0f / sc; + + auto iter_combined = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(output) + .add_output(mask) + .add_input(input) + .build(); + + // TODO(#51090): make it work for other dtypes + iter_combined.for_each([&](char** data, const int64_t* strides, int64_t n) { + for (int64_t i = 0; i < n; i++) { + float* output_val = (float*)(data[0] + i * strides[0]); + bool* mask_val = (bool*)(data[1] + i * strides[1]); + float* input_val = (float*)(data[2] + i * strides[2]); + + const auto qval = static_cast(z_point + std::nearbyint(*input_val * inv_scale)); + *output_val = (std::fmin(std::fmax(qval, quant_min), quant_max) - z_point) * sc; + *mask_val = ((quant_min <= qval) && (qval <= quant_max)); + } + }); + +} + void fake_quantize_learnable_tensor_grad_kernel_cpu( TensorIterator& iter, float scale, @@ -3054,6 +3086,8 @@ REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel); 
REGISTER_DISPATCH(fake_quant_per_channel_stub, &fake_quant_per_channel_cpu); REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel); +REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, + &fake_quantize_tensor_cachemask_kernel); REGISTER_DISPATCH(qadaptive_avg_pool2d_nhwc_stub, &qadaptive_avg_pool2d_nhwc_kernel); REGISTER_DISPATCH(qadaptive_avg_pool3d_ndhwc_stub, diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu index e2f51398b48f..87937df546a8 100644 --- a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu @@ -68,6 +68,37 @@ void fake_quantize_grad_tensor_kernel_cuda( }); } +void fake_quantize_tensor_cachemask_kernel_cuda( + Tensor& output, + Tensor& mask, + const Tensor& input, + float scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max) { + + float inv_scale = 1.0f / scale; + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(output) + .add_output(mask) + .add_input(input) + .build(); + + gpu_kernel_multiple_outputs( + iter, + [=] GPU_LAMBDA (float input_val) -> thrust::tuple { + const auto qval = static_cast(std::nearbyint(input_val * inv_scale) + zero_point); + return { + // fake_quantized value + (fminf(quant_max, fmaxf(quant_min, qval)) - zero_point) * scale, + // mask for grad + ((quant_min <= qval) && (qval <= quant_max)) + }; + } + ); +} + void _fake_quantize_grad_learnable_tensor_kernel_cuda( TensorIterator& iter, float scale, @@ -96,6 +127,7 @@ void _fake_quantize_grad_learnable_tensor_kernel_cuda( } REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel_cuda); +REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel_cuda); REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel_cuda); REGISTER_DISPATCH(fake_quant_grad_learnable_tensor_stub, &_fake_quantize_grad_learnable_tensor_kernel_cuda); diff --git a/aten/src/ATen/native/quantized/fake_quant_affine.h b/aten/src/ATen/native/quantized/fake_quant_affine.h index 7a90ff57ae1b..6865c75f4a49 100644 --- a/aten/src/ATen/native/quantized/fake_quant_affine.h +++ b/aten/src/ATen/native/quantized/fake_quant_affine.h @@ -26,6 +26,15 @@ using fake_quant_grad_tensor_fn = void (*)( int64_t quant_min, int64_t quant_max); +using fake_quant_tensor_cachemask_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max); + using fake_quant_learnable_grad_tensor_fn = void (*)( TensorIterator& iter, float scale, @@ -36,6 +45,7 @@ using fake_quant_learnable_grad_tensor_fn = void (*)( DECLARE_DISPATCH(fake_quant_tensor_fn, fake_quant_tensor_stub); DECLARE_DISPATCH(fake_quant_grad_tensor_fn, fake_quant_grad_tensor_stub); +DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub); DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub); using fake_quant_per_channel_fn = void (*)( diff --git a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp index fb0853cf2ff2..a782033d5002 100644 --- a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp +++ b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp @@ -12,16 +12,19 @@ namespace native { // Use REGISTER_DISPATCH to run CPU and CUDA backend. 
DEFINE_DISPATCH(fake_quant_tensor_stub); DEFINE_DISPATCH(fake_quant_grad_tensor_stub); +DEFINE_DISPATCH(fake_quant_tensor_cachemask_stub); DEFINE_DISPATCH(fake_quant_grad_learnable_tensor_stub); /* Fake-quantizes the 'inputs' tensor. + Args: - X: Forward input tensor. + self: Forward input tensor. dY: Backward input tensor (_backward op only). scale: scale of per tensor affine quantization zero_point: zero_point of per tensor affine quantization quant_min: minimum quantized value quant_max: maximum quantized value + Returns: Quantized tensor (double dtype). @@ -50,22 +53,15 @@ Tensor fake_quantize_per_tensor_affine( /* Backward path to fake-quantize the 'inputs' tensor. Args: - X: Forward input tensor. dY: Backward input tensor. + X: Forward input tensor. scale: scale of per tensor affine quantization zero_point: zero_point of per tensor affine quantization quant_min: minimum quantized value quant_max: maximum quantized value - quant_delay: Count of global steps for which to delay the quantization. - See note in forward. - iter: The current quantization iteration used for `quant_delay`. + Returns: Quantized tensor (double dtype). - -Notes: - - quant_delay might be set to non-zero to help weights stabilize in the - beginning of the training. - - quantization range [0, 2^bits - 1] */ Tensor fake_quantize_per_tensor_affine_backward( @@ -95,6 +91,71 @@ Tensor fake_quantize_per_tensor_affine_backward( return dX; } +/* Fake-quantizes the 'inputs' tensor, saving a mask for the backward pass. + +This is numerically equivalent to `fake_quantize_per_tensor_affine`, +but has a lower memory overhead in the backward pass. + +Args: + self: Forward input tensor. + scale: scale of per tensor affine quantization + zero_point: zero_point of per tensor affine quantization + quant_min: minimum quantized value + quant_max: maximum quantized value + +Returns: + Quantized tensor (double dtype). + Mask (bool dtype). +*/ +std::tuple fake_quantize_per_tensor_affine_cachemask( + const Tensor& self, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max) { + TORCH_CHECK(self.scalar_type() == ScalarType::Float); + TORCH_CHECK( + quant_min <= quant_max, + "`quant_min` should be less than or \ + equal to `quant_max`."); + TORCH_CHECK( + zero_point >= quant_min && zero_point <= quant_max, + "`zero_point` must be between `quant_min` and `quant_max`."); + + auto Y = at::empty_like(self, self.options(), MemoryFormat::Preserve); + auto mask = at::empty_like(self, at::kBool, MemoryFormat::Preserve); + fake_quant_tensor_cachemask_stub( + self.device().type(), Y, mask, self, scale, zero_point, quant_min, quant_max); + // TODO(future, optional): look into packing the mask further (BoolTensor uses + // 1 byte per element, we only need 1 bit per element). + return std::make_tuple(Y, mask); +} + +/* Backward path to fake-quantize the 'inputs' tensor, with mask. + +Args: + dY: output grad. + mask: mask tensor from the forward pass. + +Returns: + dX (input grad). +*/ +Tensor fake_quantize_per_tensor_affine_cachemask_backward( + const Tensor& dY, + const Tensor& mask) { + TORCH_CHECK(dY.scalar_type() == ScalarType::Float); + TORCH_CHECK(mask.scalar_type() == ScalarType::Bool); + TORCH_CHECK(mask.numel() == dY.numel(), + "`mask` and `dY` are not the same size: ", + "`mask` is size ", mask.numel(), " and `dY` is size ", dY.numel()); + if (dY.numel() <= 0) { + return dY; + } + // Note: no additional kernels needed, since mask is pre-computed + // and we can use the existing tensor multiplication kernels. 
+ return dY * mask; +} + int64_t _get_zero_point_from_tensor( const Tensor& zero_point, int64_t quant_min, diff --git a/benchmarks/operator_benchmark/pt/quantization_test.py b/benchmarks/operator_benchmark/pt/quantization_test.py index af09a5fa2523..a8377fb3e488 100644 --- a/benchmarks/operator_benchmark/pt/quantization_test.py +++ b/benchmarks/operator_benchmark/pt/quantization_test.py @@ -130,35 +130,38 @@ def forward(self, input, scales, zero_points, axis: int, dtype: int): 'attr_names': ['N', 'C', 'H', 'W'], 'attrs': [ [1, 3, 512, 512], - [1, 3, 512, 512] ], 'tags': ['short'] } fake_quantize_configs_long_dict = { 'N': [1], - 'C': [1, 3, 8], + 'C': [1, 3, 8, 32], 'H': [256, 1024], 'W': [256, 1024], 'tags': ['long'] } fake_quantize_configs_short = op_bench.config_list( + cross_product_configs={ + 'device': ('cpu', 'cuda'), + }, **fake_quantize_configs_short_dict ) fake_quantize_configs_long = op_bench.cross_product_configs( + device=('cpu', 'cuda'), **fake_quantize_configs_long_dict ) class FakeQuantizeBenchmark(op_bench.TorchBenchmarkBase): r"""Benchmarks fake quantization with default parameters.""" - def init(self, N, C, H, W): + def init(self, N, C, H, W, device): self.inputs = { - "input": torch.rand(N, C, H, W) + "input": torch.rand(N, C, H, W).to(device) } - self.op = tq.FakeQuantize() + self.op = tq.FakeQuantize().to(device) self.set_module_name('FakeQuantize') def forward(self, input): @@ -169,6 +172,7 @@ def forward(self, input): fake_quantize_configs_short + fake_quantize_configs_long, FakeQuantizeBenchmark) + # op_type is used to describe the type of operator used in benchmarking: # py_module represents the operator written in Python that can # backpropagate on scale and zero point. diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 866e1971ab19..869cd2cf3715 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -861,6 +861,65 @@ def test_backward_per_tensor(self, device, X): Y_prime.backward(dout) np.testing.assert_allclose(dX.cpu(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) + def _test_forward_per_tensor_cachemask_impl(self, device): + for torch_type in (torch.qint8, torch.quint8): + X = torch.randn(4, 8).to(device) + # pick the scale + zp so that some values get clipped + obs = torch.quantization.MinMaxObserver(torch_type) + obs(X * 0.75) + scale, zero_point = obs.calculate_qparams() + scale, zero_point = float(scale), int(zero_point) + quant_min, quant_max = obs._calculate_qmin_qmax() + + Y_test, _mask = torch.fake_quantize_per_tensor_affine_cachemask( + X, scale, zero_point, quant_min, quant_max) + Y_ref = _fake_quantize_per_tensor_affine_reference( + X.cpu(), scale, zero_point, quant_min, quant_max).to(device) + self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) + + def test_forward_per_tensor_cachemask_cpu(self): + device = torch.device('cpu') + self._test_forward_per_tensor_cachemask_impl(device) + + @unittest.skipIf(not TEST_CUDA, "No gpu is not available.") + def test_forward_per_tensor_cachemask_cuda(self): + device = torch.device('cuda') + self._test_forward_per_tensor_cachemask_impl(device) + + def _test_backward_per_tensor_cachemask_impl(self, device): + for torch_type in (torch.qint8, torch.quint8): + X = torch.randn(4, 8).to(device) + X.requires_grad_() + # pick the scale + zp so that some values get clipped + obs = torch.quantization.MinMaxObserver(torch_type) + obs(X * 0.75) + scale, zero_point = 
obs.calculate_qparams() + scale, zero_point = float(scale), int(zero_point) + quant_min, quant_max = obs._calculate_qmin_qmax() + + # forward pass + Y_test, mask = torch.fake_quantize_per_tensor_affine_cachemask( + X, scale, zero_point, quant_min, quant_max) + Y_ref = _fake_quantize_per_tensor_affine_reference( + X.cpu(), scale, zero_point, quant_min, quant_max).to(device) + self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance)) + + # backward pass + dout = torch.rand(X.shape, dtype=torch.float).to(device) + dX = _fake_quantize_per_tensor_affine_grad_reference( + dout, X, scale, zero_point, quant_min, quant_max) + Y_test.backward(dout) + self.assertTrue(torch.allclose(dX, X.grad)) + + def test_backward_per_tensor_cachemask_cpu(self): + device = torch.device('cpu') + self._test_backward_per_tensor_cachemask_impl(device) + + @unittest.skipIf(not TEST_CUDA, "No gpu is not available.") + def test_backward_per_tensor_cachemask_cuda(self): + device = torch.device('cuda') + self._test_backward_per_tensor_cachemask_impl(device) + @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), X=hu.tensor(shapes=hu.array_shapes(1, 5,), elements=hu.floats(-1e3, 1e3, allow_nan=False, allow_infinity=False), diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py index 00432c9e71cd..5ee70c0dacd1 100644 --- a/test/test_namedtuple_return_api.py +++ b/test/test_namedtuple_return_api.py @@ -14,7 +14,7 @@ 'max', 'min', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd', 'symeig', 'eig', 'qr', 'geqrf', 'solve', 'slogdet', 'sort', 'topk', 'lstsq', 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "unpack_dual", 'linalg_qr', - '_svd_helper', 'linalg_svd', 'linalg_slogdet', + '_svd_helper', 'linalg_svd', 'linalg_slogdet', 'fake_quantize_per_tensor_affine_cachemask', } @@ -68,6 +68,8 @@ def test_namedtuple_return(self): op(operators=['lstsq'], input=(a,), names=('solution', 'QR'), hasout=True), op(operators=['linalg_eigh'], input=("L",), names=('eigenvalues', 'eigenvectors'), hasout=True), op(operators=['linalg_slogdet'], input=(), names=('sign', 'logabsdet'), hasout=True), + op(operators=['fake_quantize_per_tensor_affine_cachemask'], + input=(0.1, 0, 0, 255), names=('output', 'mask',), hasout=False), op(operators=['unpack_dual'], input=(0,), names=('primal', 'tangent'), hasout=False), ] diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index c199d5a4e9df..a9a751abd260 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -461,6 +461,9 @@ - name: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor self: fake_quantize_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max) +- name: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) + self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) + - name: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor self, scale, zero_point: "grad.defined() ? 
_fake_quantize_learnable_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max) : std::tuple()" diff --git a/torch/overrides.py b/torch/overrides.py index 1a5ebfb9a133..187bb0425dc7 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -391,6 +391,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.expm1: lambda input, out=None: -1, torch.fake_quantize_per_channel_affine: lambda input, scale, zero_point, axis, quant_min, quant_max: -1, torch.fake_quantize_per_tensor_affine: lambda input, scale, zero_point, quant_min, quant_max: -1, + torch.fake_quantize_per_tensor_affine_cachemask: lambda input, scale, zero_point, quant_min, quant_max: -1, torch.fbgemm_linear_fp16_weight: lambda input, packed_weight, bias: -1, torch.fbgemm_linear_fp16_weight_fp32_activation: lambda input, packed_weight, bias: -1, torch.fbgemm_linear_int8_weight: lambda input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias: -1, diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index 46dba803a1ff..4d29db46acc4 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -140,9 +140,19 @@ def forward(self, X): X = torch.fake_quantize_per_channel_affine(X, self.scale, self.zero_point, self.ch_axis, self.quant_min, self.quant_max) else: - X = torch.fake_quantize_per_tensor_affine(X, float(self.scale), - int(self.zero_point), self.quant_min, - self.quant_max) + if self.training: + # During training, use the memory optimized fake_quant + # forward. It has a reduced memory overhead in the backward + # pass compared to fake_quantize_per_tensor_affine. + X, _mask = torch.fake_quantize_per_tensor_affine_cachemask( + X, float(self.scale), int(self.zero_point), + self.quant_min, self.quant_max) + else: + # During inference, use the fastest fake_quant + # which does not compute any extra info for the backward. + X = torch.fake_quantize_per_tensor_affine( + X, float(self.scale), int(self.zero_point), + self.quant_min, self.quant_max) return X @torch.jit.export From 0335222a4ae3219061b36257dde411694ffb3f67 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Wed, 27 Jan 2021 19:33:26 -0800 Subject: [PATCH 39/41] memory efficient fq: use it everywhere, delete the old version (#51159) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/51159 This PR is the cleanup after #50561. High level, we make the new definition of fake_quant be the definition used by autograd, but keep the old function around as a thin wrapper to keep the user facing API the same. In detail: 1. point `fake_quantize_per_tensor_affine`'s implementation to be `fake_quantize_per_tensor_affine_cachemask` 2. delete the `fake_quantize_per_tensor_affine` backward, autograd will automatically use the cachemask backward 3. delete all the `fake_quantize_per_tensor_affine` kernels, since they are no longer used by anything Test Plan: ``` python test/test_quantization.py TestFakeQuantize ``` performance testing was done in the previous PR. 
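A quick sanity check that the user-facing call and its gradient still behave the same after the rewire (minimal sketch; the values are arbitrary):

```python
import torch

x = torch.randn(4, 8, requires_grad=True)
# Same public API as before; internally this now routes through the
# cachemask op, and autograd picks up the mask-based backward.
y = torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255)
y.sum().backward()
# Gradients are zero wherever the fake-quantized value was clipped.
print(x.grad)
```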
Imported from OSS Reviewed By: jerryzh168 Differential Revision: D26090869 fbshipit-source-id: fda042881f77a993a9d15dafabea7cfaf9dc7c9c --- aten/src/ATen/native/native_functions.yaml | 5 +- .../cpu/kernels/QuantizedOpKernels.cpp | 40 ------------- .../quantized/cuda/fake_quantize_core.cu | 50 ---------------- .../ATen/native/quantized/fake_quant_affine.h | 19 ------ .../fake_quant_per_tensor_affine.cpp | 59 +------------------ .../check_backward_compatibility.py | 1 + tools/autograd/derivatives.yaml | 3 - torch/quantization/fake_quantize.py | 16 +---- 8 files changed, 8 insertions(+), 185 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 1fd04b4255b7..0f86bc4d3819 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4637,10 +4637,7 @@ - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor variants: function dispatch: - CPU, CUDA: fake_quantize_per_tensor_affine - -- func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - variants: function + Math: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) variants: function diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 5ed6e28663e0..f4c1b1d572f4 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2054,43 +2054,6 @@ void q_batch_norm_kernel( } -void fake_quantize_tensor_kernel( - Tensor& output, - const Tensor& input, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max) { - float inv_scale = 1.0f / sc; - auto iter = TensorIterator::unary_op(output, input); - cpu_kernel(iter, [&](float self) -> float { - return (std::fmin( - std::fmax( - static_cast( - z_point + std::nearbyint(self * inv_scale)), - quant_min), - quant_max) - - z_point) * - sc; - }); -} - -void fake_quantize_grad_tensor_kernel( - Tensor& input_grad, - const Tensor& input, - const Tensor& output_grad, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max) { - float inv_scale = 1.0f / sc; - auto iter = TensorIterator::binary_op(input_grad, input, output_grad); - cpu_kernel(iter, [&](float x, float dy) -> float { - int64_t xq = static_cast(z_point + std::nearbyint(x * inv_scale)); - return dy * (xq >= quant_min && xq <= quant_max); - }); -} - void fake_quantize_tensor_cachemask_kernel( Tensor& output, Tensor& mask, @@ -3082,10 +3045,7 @@ REGISTER_DISPATCH(fake_quant_grad_learnable_tensor_stub, &fake_quantize_learnable_tensor_grad_kernel_cpu); REGISTER_DISPATCH(fake_quant_grad_per_channel_stub, &fake_quant_grad_per_channel_cpu); -REGISTER_DISPATCH(fake_quant_grad_tensor_stub, - &fake_quantize_grad_tensor_kernel); REGISTER_DISPATCH(fake_quant_per_channel_stub, &fake_quant_per_channel_cpu); -REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel); REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel); REGISTER_DISPATCH(qadaptive_avg_pool2d_nhwc_stub, diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu index 
87937df546a8..9d74bc9a3400 100644 --- a/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu @@ -20,54 +20,6 @@ Returns: */ namespace at { namespace native { -void fake_quantize_tensor_kernel_cuda( - Tensor& output, - const Tensor& input, - float scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max) { - // scalar type of this function is guaranteed to be float - float inv_scale = 1.0f / scale; - auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(output) - .add_input(input) - .build(); - gpu_kernel(iter, [=] GPU_LAMBDA(float input_val) -> float { - return (fminf( - quant_max, - fmaxf( - quant_min, - static_cast( - std::nearbyint(input_val * inv_scale) + zero_point))) - - zero_point) * - scale; - }); -} - -void fake_quantize_grad_tensor_kernel_cuda( - Tensor& input_grad, - const Tensor& input, - const Tensor& output_grad, - float scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max) { - // scalar type of this function is guaranteed to be float - float inv_scale = 1.0f / scale; - auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(input_grad) - .add_input(output_grad) - .add_input(input) - .build(); - gpu_kernel(iter, [=] GPU_LAMBDA(float dy, float x) -> float { - int64_t Xq = std::nearbyint(x * inv_scale) + zero_point; - return (Xq >= quant_min && Xq <= quant_max) * dy; - }); -} - void fake_quantize_tensor_cachemask_kernel_cuda( Tensor& output, Tensor& mask, @@ -126,9 +78,7 @@ void _fake_quantize_grad_learnable_tensor_kernel_cuda( }); } -REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel_cuda); REGISTER_DISPATCH(fake_quant_tensor_cachemask_stub, &fake_quantize_tensor_cachemask_kernel_cuda); -REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel_cuda); REGISTER_DISPATCH(fake_quant_grad_learnable_tensor_stub, &_fake_quantize_grad_learnable_tensor_kernel_cuda); // Fake quantize per channel diff --git a/aten/src/ATen/native/quantized/fake_quant_affine.h b/aten/src/ATen/native/quantized/fake_quant_affine.h index 6865c75f4a49..0215f521eb1f 100644 --- a/aten/src/ATen/native/quantized/fake_quant_affine.h +++ b/aten/src/ATen/native/quantized/fake_quant_affine.h @@ -9,23 +9,6 @@ struct TensorIterator; namespace native { -using fake_quant_tensor_fn = void (*)( - Tensor& output, - const Tensor& input, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max); - -using fake_quant_grad_tensor_fn = void (*)( - Tensor& input_grad, - const Tensor& input, - const Tensor& output_grad, - float sc, - int64_t z_point, - int64_t quant_min, - int64_t quant_max); - using fake_quant_tensor_cachemask_fn = void (*)( Tensor& output, Tensor& mask, @@ -43,8 +26,6 @@ using fake_quant_learnable_grad_tensor_fn = void (*)( int64_t quant_min, int64_t quant_max); -DECLARE_DISPATCH(fake_quant_tensor_fn, fake_quant_tensor_stub); -DECLARE_DISPATCH(fake_quant_grad_tensor_fn, fake_quant_grad_tensor_stub); DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub); DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub); diff --git a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp index a782033d5002..128db5ebb7b4 100644 --- a/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp +++ b/aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp @@ -10,8 +10,6 @@ 
namespace at { namespace native { // Use REGISTER_DISPATCH to run CPU and CUDA backend. -DEFINE_DISPATCH(fake_quant_tensor_stub); -DEFINE_DISPATCH(fake_quant_grad_tensor_stub); DEFINE_DISPATCH(fake_quant_tensor_cachemask_stub); DEFINE_DISPATCH(fake_quant_grad_learnable_tensor_stub); @@ -35,60 +33,9 @@ Tensor fake_quantize_per_tensor_affine( int64_t zero_point, int64_t quant_min, int64_t quant_max) { - TORCH_CHECK(self.scalar_type() == ScalarType::Float); - TORCH_CHECK( - quant_min <= quant_max, - "`quant_min` should be less than or \ - equal to `quant_max`."); - TORCH_CHECK( - zero_point >= quant_min && zero_point <= quant_max, - "`zero_point` must be between `quant_min` and `quant_max`."); - - auto Y = at::empty_like(self, self.options(), MemoryFormat::Preserve); - fake_quant_tensor_stub( - self.device().type(), Y, self, scale, zero_point, quant_min, quant_max); - return Y; -} - -/* Backward path to fake-quantize the 'inputs' tensor. - -Args: - dY: Backward input tensor. - X: Forward input tensor. - scale: scale of per tensor affine quantization - zero_point: zero_point of per tensor affine quantization - quant_min: minimum quantized value - quant_max: maximum quantized value - -Returns: - Quantized tensor (double dtype). -*/ - -Tensor fake_quantize_per_tensor_affine_backward( - const Tensor& dY, - const Tensor& X, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max) { - TORCH_CHECK(dY.scalar_type() == ScalarType::Float); - TORCH_CHECK(X.scalar_type() == ScalarType::Float); - TORCH_CHECK(X.numel() == dY.numel(), "`X` and `dY` are not the same size"); - TORCH_CHECK( - quant_min <= quant_max, - "`quant_min` should be less than or \ - equal to `quant_max`."); - TORCH_CHECK( - zero_point >= quant_min && zero_point <= quant_max, - "`zero_point` must be between `quant_min` and `quant_max`."); - if (X.numel() <= 0) { - return X; - } - - auto dX = at::empty_like(X, X.options(), MemoryFormat::Preserve); - fake_quant_grad_tensor_stub( - X.device().type(), dX, X, dY, scale, zero_point, quant_min, quant_max); - return dX; + const auto res = at::fake_quantize_per_tensor_affine_cachemask( + self, scale, zero_point, quant_min, quant_max); + return std::get<0>(res); } /* Fake-quantizes the 'inputs' tensor, saving a mask for the backward pass. diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 981f21d5de67..658c61dd4425 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -56,6 +56,7 @@ ("aten::_multinomial_alias_setup", datetime.date(2021, 1, 31)), ("aten::_multinomial_alias_draw", datetime.date(2021, 1, 31)), ("prim::profile_optional", datetime.date(2021, 1, 31)), + ("aten::fake_quantize_per_tensor_affine_backward", datetime.date(2021, 2, 20)), ] def allow_listed(schema, allow_list): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index a9a751abd260..31136610ed4a 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -458,9 +458,6 @@ - name: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) 
self: zeros_like(grad) -- name: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - self: fake_quantize_per_tensor_affine_backward(grad, self, scale, zero_point, quant_min, quant_max) - - name: fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask) self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index 4d29db46acc4..58d0ad7a055a 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -140,19 +140,9 @@ def forward(self, X): X = torch.fake_quantize_per_channel_affine(X, self.scale, self.zero_point, self.ch_axis, self.quant_min, self.quant_max) else: - if self.training: - # During training, use the memory optimized fake_quant - # forward. It has a reduced memory overhead in the backward - # pass compared to fake_quantize_per_tensor_affine. - X, _mask = torch.fake_quantize_per_tensor_affine_cachemask( - X, float(self.scale), int(self.zero_point), - self.quant_min, self.quant_max) - else: - # During inference, use the fastest fake_quant - # which does not compute any extra info for the backward. - X = torch.fake_quantize_per_tensor_affine( - X, float(self.scale), int(self.zero_point), - self.quant_min, self.quant_max) + X = torch.fake_quantize_per_tensor_affine( + X, float(self.scale), int(self.zero_point), + self.quant_min, self.quant_max) return X @torch.jit.export From dfdb1547b9c1934904bfd137b4007d6a46a6f597 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 27 Jan 2021 19:42:12 -0800 Subject: [PATCH 40/41] Revert D26094906: Add serialization logic for complex numbers Test Plan: revert-hammer Differential Revision: D26094906 (https://github.com/pytorch/pytorch/commit/2de4ecd4ebc99d509b8f13ff12ed241c7433a0ad) Original commit changeset: 7b2614f3ee4a fbshipit-source-id: 6f32a9fc6bb2a904ca1a282bbc6b2df0aee50068 --- test/test_complex.py | 12 ------------ torch/csrc/jit/python/pybind_utils.h | 2 -- torch/csrc/jit/serialization/pickler.cpp | 10 ---------- torch/csrc/jit/serialization/pickler.h | 1 - torch/csrc/jit/serialization/unpickler.cpp | 13 +++---------- 5 files changed, 3 insertions(+), 35 deletions(-) diff --git a/test/test_complex.py b/test/test_complex.py index cc8ed7f62398..e6a705032d74 100644 --- a/test/test_complex.py +++ b/test/test_complex.py @@ -12,18 +12,6 @@ def fn(a: complex): self.checkScript(fn, (3 + 5j,)) - def test_pickle(self): - class ComplexModule(torch.jit.ScriptModule): - def __init__(self): - super().__init__() - self.a = 3 + 5j - - def forward(self, b: int): - return b - - loaded = self.getExportImportCopy(ComplexModule()) - self.assertEqual(loaded.a, 3 + 5j) - class TestComplexTensor(TestCase): @dtypes(*torch.testing.get_all_complex_dtypes()) def test_to_list(self, device, dtype): diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index eca56876999f..06c57f32f15c 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -293,8 +293,6 @@ inline InferredType tryToInferType(py::handle input) { return InferredType(IntType::get()); } else if (py::isinstance(input)) { return InferredType(FloatType::get()); - } else if (PyComplex_CheckExact(input.ptr())) { - return InferredType(ComplexDoubleType::get()); } else if (py::isinstance(input)) { return InferredType(StringType::get()); } else if 
(THPLayout_Check(input.ptr())) { diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index a0dd826c4267..e2118af2019d 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -49,8 +49,6 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { pushTuple(ivalue); } else if (ivalue.isDouble()) { pushDouble(ivalue.toDouble()); - } else if (ivalue.isComplexDouble()) { - pushComplexDouble(ivalue); } else if (ivalue.isInt()) { pushInt(ivalue.toInt()); } else if (ivalue.isBool()) { @@ -466,14 +464,6 @@ void Pickler::pushDouble(double value) { // Python pickle format is big endian, swap. push(swapDouble(value)); } -void Pickler::pushComplexDouble(const IValue& value) { - c10::complex d = value.toComplexDouble(); - pushGlobal("builtins", "complex"); - pushIValue(d.real()); - pushIValue(d.imag()); - push(PickleOpCode::TUPLE2); - push(PickleOpCode::REDUCE); -} void Pickler::pushLong(const std::string& data) { uint64_t size = data.size(); diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 4dc216ec702b..21d0f61a18eb 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -160,7 +160,6 @@ class TORCH_API Pickler { void endTypeTag(const IValue& value); void pushBool(bool value); void pushDouble(double value); - void pushComplexDouble(const IValue& value); void pushGenericList(const IValue& ivalue); void pushIntList(const IValue& ivalue); void pushList(const IValue& ivalue); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index efeaac75c41c..f363fe73f1e9 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -57,7 +57,6 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case StorageType::Kind: case NumberType::Kind: case FloatType::Kind: - case ComplexDoubleType::Kind: case IntType::Kind: case NoneType::Kind: case GeneratorType::Kind: @@ -81,6 +80,9 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { case AnyEnumType::Kind: // no op, there is nothing to tag break; + // TODO(@anjali411): Implement serialization/deserialization for complex + // numbers + case ComplexDoubleType::Kind: case EnumType::Kind: // TODO(gmagogsfm): Implement serialization/deserialization of Enum. 
AT_ASSERT(false); @@ -541,15 +543,6 @@ void Unpickler::readGlobal( // Unpickle a tensor bool quantized = class_name == "_rebuild_qtensor"; rebuildTensor(quantized); - } else if (module_name == "builtins" && class_name == "complex") { - globals_.emplace_back([this] { - auto elems = pop(stack_).toTuple()->elements(); - AT_ASSERT(elems.size() == 2); - auto complex = - c10::complex(elems.at(0).toDouble(), elems.at(1).toDouble()); - stack_.emplace_back(complex); - }); - } else if (module_name == "collections" && class_name == "OrderedDict") { // collections.OrderedDict is used in tensor serialization for a tensor's // backward hooks (but they are not actually saved with this Pickler) From 12a434abbc322b6e1103c7fb0c0afb35447dafb7 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 27 Jan 2021 19:49:03 -0800 Subject: [PATCH 41/41] Revert D26077905: Back out "Revert D25850783: Add torch::deploy, an embedded torch-python interpreter" Test Plan: revert-hammer Differential Revision: D26077905 (https://github.com/pytorch/pytorch/commit/dc2a44c4fc5f5efba16b8567ab970f8bcf1fe007) Original commit changeset: fae83bf9822d fbshipit-source-id: b70185916502ba9ebe16d781cf0659b9f7865c9a --- .github/workflows/lint.yml | 6 - .gitignore | 3 - .jenkins/pytorch/build.sh | 11 - .jenkins/pytorch/test.sh | 8 - CMakeLists.txt | 5 - torch/__init__.py | 10 +- torch/_ops.py | 3 +- torch/_utils_internal.py | 16 +- torch/csrc/Module.cpp | 2 - torch/csrc/deploy/.gitignore | 1 - torch/csrc/deploy/CMakeLists.txt | 3 - torch/csrc/deploy/README.md | 10 - torch/csrc/deploy/example/simple.pt | Bin 2432 -> 0 bytes torch/csrc/deploy/example/trace_simple.py | 20 -- torch/csrc/deploy/interpreter/CMakeLists.txt | 115 ------- .../deploy/interpreter/CMakePythonModules.txt | 69 ---- torch/csrc/deploy/interpreter/freeze.py | 269 --------------- .../deploy/interpreter/hide_symbols.script | 5 - torch/csrc/deploy/interpreter/interpreter.cpp | 324 ------------------ torch/csrc/deploy/interpreter/interpreter.h | 67 ---- .../deploy/interpreter/interpreter_impl.h | 26 -- torch/csrc/deploy/interpreter/test_main.cpp | 49 --- .../deploy/interpreter/third_party/README.md | 2 - torch/cuda/__init__.py | 4 - torch/utils/__init__.py | 8 +- 25 files changed, 12 insertions(+), 1024 deletions(-) delete mode 100644 torch/csrc/deploy/.gitignore delete mode 100644 torch/csrc/deploy/CMakeLists.txt delete mode 100644 torch/csrc/deploy/README.md delete mode 100644 torch/csrc/deploy/example/simple.pt delete mode 100644 torch/csrc/deploy/example/trace_simple.py delete mode 100644 torch/csrc/deploy/interpreter/CMakeLists.txt delete mode 100644 torch/csrc/deploy/interpreter/CMakePythonModules.txt delete mode 100644 torch/csrc/deploy/interpreter/freeze.py delete mode 100644 torch/csrc/deploy/interpreter/hide_symbols.script delete mode 100644 torch/csrc/deploy/interpreter/interpreter.cpp delete mode 100644 torch/csrc/deploy/interpreter/interpreter.h delete mode 100644 torch/csrc/deploy/interpreter/interpreter_impl.h delete mode 100644 torch/csrc/deploy/interpreter/test_main.cpp delete mode 100644 torch/csrc/deploy/interpreter/third_party/README.md diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 9c215540108b..54acbe7b1c6a 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -170,8 +170,6 @@ jobs: # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed # in a follow up PR. # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
- # deploy/interpreter files are excluded due to using macros and other techniquies - # that are not easily converted to accepted c++ python tools/clang_tidy.py \ --verbose \ --paths torch/csrc/ \ @@ -188,10 +186,6 @@ jobs: -g"-torch/csrc/autograd/FunctionsManual.cpp" \ -g"-torch/csrc/generic/*.cpp" \ -g"-torch/csrc/jit/codegen/cuda/runtime/*" \ - -g"-torch/csrc/deploy/interpreter/interpreter.cpp" \ - -g"-torch/csrc/deploy/interpreter/interpreter.h" \ - -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \ - -g"-torch/csrc/deploy/interpreter/test_main.cpp" \ "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt diff --git a/.gitignore b/.gitignore index a3a832ce7555..e1fe94cb9bf9 100644 --- a/.gitignore +++ b/.gitignore @@ -66,9 +66,6 @@ torch/csrc/autograd/generated/* torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt torch/csrc/cudnn/cuDNN.cpp -torch/csrc/deploy/interpreter/cpython -torch/csrc/deploy/interpreter/frozen -torch/csrc/deploy/interpreter/third_party/typing_extensions.py torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index dfd359c1ddf4..fad9c8e49e64 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -23,17 +23,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-code-analysis* ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile-code-analysis.sh" "$@" fi -if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda10.2-cudnn7-py3-gcc7* ]]; then - # Enabling DEPLOY build (embedded torch python interpreter, experimental) - # only on one config for now, can expand later - export USE_DEPLOY=ON - - # Deploy feature builds cpython. It requires these packages. - # TODO move this to dockerfile? - sudo apt-get -qq update - sudo apt-get -qq install libffi-dev libbz2-dev libreadline-dev libncurses5-dev libncursesw5-dev libgdbm-dev libsqlite3-dev uuid-dev tk-dev -fi - echo "Python version:" python --version diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index d70a377ec086..73563f145eb8 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -354,11 +354,6 @@ test_vec256() { fi } -test_torch_deploy() { - SIMPLE_MODEL_PATH=torch/csrc/deploy/example/simple.pt LIBINTERPRETER_PATH=build/lib/libinterpreter.so build/bin/interpreter_test - assert_git_not_dirty -} - if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -376,9 +371,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then - if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test1 ]]; then - test_torch_deploy - fi install_torchvision test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index c138f261c27b..a23208752afb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -919,8 +919,3 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() - -# ---[ Torch Deploy -if(USE_DEPLOY) - add_subdirectory(torch/csrc/deploy) -endif() diff --git a/torch/__init__.py b/torch/__init__.py index f27af91eb493..3f9df8bc009a 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -22,11 +22,7 @@ from ._utils import _import_dotted_name from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \ USE_RTLD_GLOBAL_WITH_LIBTORCH, USE_GLOBAL_DEPS -# TODO(torch_deploy) figure out how to freeze version.py in fbcode build -if sys.executable == 'torch_deploy': - __version__ = "torch-deploy-1.8" -else: - from .version import __version__ +from .version import __version__ from ._six import string_classes as _string_classes from typing import Set, Type, TYPE_CHECKING @@ -138,7 +134,7 @@ # See Note [Global dependencies] def _load_global_deps(): - if platform.system() == 'Windows' or sys.executable == 'torch_deploy': + if platform.system() == 'Windows': return lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so') @@ -520,7 +516,7 @@ class QUInt4x2Storage(_C.QUInt4x2StorageBase, _StorageBase): ################################################################################ def manager_path(): - if platform.system() == 'Windows' or sys.executable == 'torch_deploy': + if platform.system() == 'Windows': return b"" path = get_file_path('torch', 'bin', 'torch_shm_manager') prepare_multiprocessing_environment(get_file_path('torch')) diff --git a/torch/_ops.py b/torch/_ops.py index 96c8baac7838..dd0c8cd19fde 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -2,6 +2,7 @@ import contextlib import ctypes +import os import sys import types @@ -66,7 +67,7 @@ def __getattr__(self, op_name): return op class _Ops(types.ModuleType): - __file__ = '_ops.py' + __file__ = os.path.join(os.path.dirname(__file__), '_ops.py') def __init__(self): super(_Ops, self).__init__('torch.ops') diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index c77e960ae659..be7d8fcaa685 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -1,7 +1,6 @@ import os import inspect -import sys import tempfile # this arbitrary-looking assortment of functionality is provided here @@ -9,16 +8,11 @@ # use is the FB build environment, where this source file is replaced # by an equivalent. -if sys.executable == 'torch_deploy': - # __file__ is meaningless in the context of frozen torch used in torch deploy. - # setting empty torch_parent should allow below functions to operate without crashing, - # but it's unclear if there is a valid use case for them in the context of deploy. 
- torch_parent = "" +if os.path.basename(os.path.dirname(__file__)) == 'shared': + torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) else: - if os.path.basename(os.path.dirname(__file__)) == 'shared': - torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - else: - torch_parent = os.path.dirname(os.path.dirname(__file__)) + torch_parent = os.path.dirname(os.path.dirname(__file__)) + def get_file_path(*path_components): return os.path.join(torch_parent, *path_components) @@ -66,7 +60,7 @@ def get_source_lines_and_file(obj, error_msg=None): TEST_MASTER_ADDR = '127.0.0.1' TEST_MASTER_PORT = 29500 -# USE_GLOBAL_DEPS controls whether __init__.py tries to load +# USE_GLOBAL_DEPS controls whether __init__.py tries to load # libtorch_global_deps, see Note [Global dependencies] USE_GLOBAL_DEPS = True # USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index bbd3ccef505f..eb80fff32b81 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -692,8 +692,6 @@ extern "C" #ifdef _WIN32 __declspec(dllexport) #endif -TORCH_API PyObject* initModule(); -// separate decl and defn for msvc error C2491 PyObject* initModule() { HANDLE_TH_ERRORS at::internal::lazy_init_num_threads(); diff --git a/torch/csrc/deploy/.gitignore b/torch/csrc/deploy/.gitignore deleted file mode 100644 index aa484a97a20f..000000000000 --- a/torch/csrc/deploy/.gitignore +++ /dev/null @@ -1 +0,0 @@ -example/generated/* diff --git a/torch/csrc/deploy/CMakeLists.txt b/torch/csrc/deploy/CMakeLists.txt deleted file mode 100644 index 9da314905860..000000000000 --- a/torch/csrc/deploy/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}") - -add_subdirectory(interpreter) diff --git a/torch/csrc/deploy/README.md b/torch/csrc/deploy/README.md deleted file mode 100644 index 4fab5aa4ef56..000000000000 --- a/torch/csrc/deploy/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Torch Deploy -This is an experimental feature to embed multiple python interpreters inside the torch library, -providing a solution to the 'GIL problem' for multithreading with the convenience of python -and eager or torchscripted pytorch programs. - -# libinterpreter -This is an internal library used behind the scenes to enable multiple python interpreters in -a single deploy runtime. libinterpreter.so is DLOPENed multiple times by the deploy library. -Each copy of libinterpreter exposes a simple interpreter interface but hides its python and other -internal symbols, preventing the different python instances from seeing each other. 
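The deleted README above describes the isolation trick only at a high level: each interpreter lives in a private copy of the shared library whose symbols are kept local. A minimal, hypothetical sketch of that general pattern (not the actual torch::deploy implementation; the library name and helper are made up for illustration):

```python
import ctypes
import os
import shutil
import tempfile

def load_private_copy(lib_path):
    # Copy the library to a unique path and dlopen it with RTLD_LOCAL so its
    # symbols and global state stay invisible to every other loaded copy.
    tmp = tempfile.NamedTemporaryFile(suffix=".so", delete=False)
    shutil.copyfile(lib_path, tmp.name)
    return ctypes.CDLL(tmp.name, mode=os.RTLD_LOCAL)

# Hypothetical usage: two handles, each with its own interpreter state.
# interp_a = load_private_copy("libinterpreter.so")
# interp_b = load_private_copy("libinterpreter.so")
```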
diff --git a/torch/csrc/deploy/example/simple.pt b/torch/csrc/deploy/example/simple.pt deleted file mode 100644 index 50f9a087aa822821647a8acabe28bb84207475b2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2432 zcmah~4Ny~87JmE%jN!K_C{c`n7DSRLExN!vhoWYYRE&s2)-FvT36Ky7`GEqhO3~?p zL|_?ZsctLdMyZHOE5mjH5kaV>s1(uBHlpj!S}e8F-KAEovhVR{g6sCo+_~?)d%knu z`R@776^k4k3BtvNc+OmjmkEO^D@UW`D^iRpe1R~=lh``jhIZRyK=*I)!cZZq8){(n z@pmC?B^ynOG?YYeu>YNUa`b@-j{Wg<^dcXFL{|=d_n*LpxJB4{DiZ_0cLHYUD8#+K z1H*4J@SRPF(w;cn*y4?rHxI+^Z-0Txury4*pNcYmEbge-j72SNa8-N+niN~G_qF}N z<@_7^%N(&n#=@$r$*7cjV)N&FVM)6^KJ=2I;OA!2;OT%JqWUNghic$ze6Z|;`Ixk4 zK3-5SLHR&3X6&{=+nIhCDBc5>@l4=GrIPiSf`i2ftn~r#S$8M-R@H4N@;w^W9&!18t;$kz&B73@dhIeC+LVs` zlLq9;xM(`F6k0QwC>~BDzdtq({_3B=an%TTuO5a^Bl;k4EEHMiO5sOy27bIifQgN2 zB-}WWA20bC8nwOTsGTFcx{C#kmHi}KbDNxWcOnTn)#y)>Gs-O0PW1JTh zx_jV>{nvnD3B?xEILwnAALMW+lz(s#$;PQ1cyw7?Ix5xsJ`Ii&$ zMoAm|aArNmfEhSP6EJ$c5-*z~F>S0H@&#F7Svm^Qi5Wn6X(N+=iojq=D4xeZ!0GNS z_)UBu4zBgWo|~V*DY*)xx09rY-*HlLX#)}-b5IW_z$M%b<64iwf$(M6o6`(+8@-=5os3(8l*-#T1TEOK_~{^#dUxef$D5Ah52=1v4y>y4L_ zsS(C3dR}q13M{ghOQlAwUY;S9GB*@#&?<5@N~Vqoes5clgh7e=ZKWzb!>F^9PdM?U zxki=7z?ABhskthRLTXfI8?<_9-b$vgghr##}J)2Zqhl6h#(PD#5M+r*djY)iCnJLXq0lJN}Fw9ZqzH3dZl8m zN^WH8T!Isf3A#BFy5|`6DXMH$cDjyPlAEfV%cJS)jB2R3M6J<-ri)Frr|0|AJR)S8 zF1c2ryq!>>$|8?x(K!lPgZp|mzPV4UDpD?SS2)b9a*EAyjQ>UMh zppHCUI#E_GGxU(y`Uv~?R zH22tDTQeU2<}#k8qp-i&eDsqR-y4#3*9XQpA0E7W)HCUK1BpX}Y@ZMJ|J~a?yR+=^ z3hfWqUuo3J@RDhN-3~f$-}Idi{!HaOFqQZJzN^@T|FgWa4aSsgqhV4&elwX)pSbN@ zWFI_HbXoj4Nz_G1(Bs7#bt9su_sLW04ODFlW1@a#2iyH-zlAaBEWi3FeTzM{Gd;v2 zJ9?VAldZdlE4^T2HaGNasI`*ZZD2W+c&(r`SCimpVzaKec{bpK6iHLy=cG3~iuL*V z+eB${dU2FzwK_W`>vLk;#A~3s-OOKE)xhis)?B;UL|{|hZz{sn>54@R#_}m2KZnp6 pZ@RJ4Ha(l@o=900-9IU#a1&2{b0**EWwFR/configure --prefix ${PYTHON_INSTALL_DIR} - BUILD_COMMAND CFLAGS=-fPIC CPPFLAGS=-fPIC make -j8 - INSTALL_COMMAND make install - BYPRODUCTS ${PYTHON_MODULES} ${PYTHON_LIB} ${PYTHON_BIN} - LOG_OUTPUT_ON_FAILURE True -) - -# We find the built python modules, this is confusing because python build already outputs -# the modules in a strange nested path, and then that path is relative to the -# Cmake ExternalProject root in the cmake build dir. 
-ExternalProject_Get_property(cpython SOURCE_DIR) -SET(PYTHON_MODULE_DIR "${SOURCE_DIR}/build/temp.linux-x86_64-3.8/${SOURCE_DIR}/Modules") -SET(PYTHON_STDLIB_DIR "${SOURCE_DIR}/Lib") -SET(PYTHON_STDLIB "${PYTHON_INSTALL_DIR}/lib/libpython_stdlib3.8.a") -# Then we use a hardcoded list of expected module names and include them in our lib -include("CMakePythonModules.txt") -ExternalProject_Add_Step( - cpython - archive_stdlib - DEPENDEES install - BYPRODUCTS ${PYTHON_STDLIB} - COMMAND ar -rc ${PYTHON_STDLIB} ${PYTHON_MODULES} - VERBATIM -) -# Get python typing extension, needed by torch -SET(TYPING_PKG "${INTERPRETER_DIR}/third_party/typing_extensions.py") -ExternalProject_Add( - typing - PREFIX typing - GIT_REPOSITORY https://github.com/python/typing.git - GIT_TAG 3.7.4.3 - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND cp ../typing/typing_extensions/src_py3/typing_extensions.py ${TYPING_PKG} - BYPRODUCTS ${TYPING_PKG} - LOG_OUTPUT_ON_FAILURE True -) - -# Output files generated by freeze script, containing frozen bytecode -SET(FROZEN_DIR "${INTERPRETER_DIR}/frozen") -set(FROZEN_FILES - ${FROZEN_DIR}/main.c - ${FROZEN_DIR}/bytecode_0.c - ${FROZEN_DIR}/bytecode_1.c - ${FROZEN_DIR}/bytecode_2.c - ${FROZEN_DIR}/bytecode_3.c - ${FROZEN_DIR}/bytecode_4.c -) -# Packages to freeze: python stdlib, typing extension, and torch -add_custom_command( - OUTPUT ${FROZEN_FILES} - WORKING_DIRECTORY ${INTERPRETER_DIR} - COMMAND mkdir -p ${FROZEN_DIR} - COMMAND ${PYTHON_BIN} freeze.py ${PYTHON_STDLIB_DIR} ${TYPING_PKG} ${PYTORCH_ROOT}/torch --oss --install_dir ${FROZEN_DIR} --verbose - DEPENDS cpython typing - VERBATIM -) - -# instantiate a library based on the objects that make up torch_python -# make sure system python isn't used here -target_include_directories(torch_python_obj BEFORE PRIVATE ${PYTHON_INC_DIR}) -add_library(torch_python_static STATIC $) -# Build the interpreter lib, designed to be standalone and dlopened -# We bake the python and torch_python binding objs into libinterpreter -set(LINKER_SCRIPT "${INTERPRETER_DIR}/hide_symbols.script") -set(INTERPRETER_LIB_SOURCES - ${INTERPRETER_DIR}/interpreter.cpp - ${FROZEN_FILES} - ${LINKER_SCRIPT} -) -add_library(interpreter ${INTERPRETER_LIB_SOURCES} ${LINKER_SCRIPT}) -set_property(TARGET interpreter APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,--version-script=${LINKER_SCRIPT}") -# need to ensure headers are present before any .cpp in interpreter are compiled, -# but cpp themselves don't clearly depend on cpython so there is a race otherwise -add_dependencies(interpreter cpython) -target_compile_options( - interpreter PRIVATE - -fvisibility=hidden -) -target_include_directories(interpreter PRIVATE ${INTERPRETER_DIR}) -target_include_directories(interpreter PUBLIC ${PYTHON_INC_DIR}) -target_link_libraries(interpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STDLIB} torch_python_static) -target_link_libraries(interpreter PRIVATE crypt crypto ssl pthread dl util m z ffi lzma readline nsl ncursesw panelw) # for python builtins -target_link_libraries(interpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite) - -# handy to have a standalone app to verify linkage and usage of interpreter before embedding it in another lib -set(INTERPRETER_TEST_SOURCES - ${INTERPRETER_DIR}/test_main.cpp -) -add_executable(interpreter_test ${INTERPRETER_TEST_SOURCES}) -target_include_directories(interpreter_test PRIVATE ${PYTORCH_ROOT}/torch) -target_include_directories(interpreter_test PRIVATE ${PYTHON_INC_DIR}) 
-target_link_libraries(interpreter_test PUBLIC gtest dl) -# no-as-needed to ensure shm and torch are included to satisfy runtime dlopen -# dependencies for libinterpreter, regardless of whether they are used in interpreter_test -target_link_libraries(interpreter_test PUBLIC "-Wl,--no-as-needed" shm torch protobuf::libprotobuf-lite) diff --git a/torch/csrc/deploy/interpreter/CMakePythonModules.txt b/torch/csrc/deploy/interpreter/CMakePythonModules.txt deleted file mode 100644 index c6bc9cab76ff..000000000000 --- a/torch/csrc/deploy/interpreter/CMakePythonModules.txt +++ /dev/null @@ -1,69 +0,0 @@ -SET(PYTHON_MODULES - ${PYTHON_MODULE_DIR}/arraymodule.o - ${PYTHON_MODULE_DIR}/_asynciomodule.o - ${PYTHON_MODULE_DIR}/audioop.o - ${PYTHON_MODULE_DIR}/binascii.o - ${PYTHON_MODULE_DIR}/_bisectmodule.o - ${PYTHON_MODULE_DIR}/_blake2/blake2module.o ${PYTHON_MODULE_DIR}/_blake2/blake2b_impl.o ${PYTHON_MODULE_DIR}/_blake2/blake2s_impl.o - ${PYTHON_MODULE_DIR}/_bz2module.o - ${PYTHON_MODULE_DIR}/cmathmodule.o - # ${PYTHON_MODULE_DIR}/_math.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_cn.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_hk.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_iso2022.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_jp.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_kr.o - ${PYTHON_MODULE_DIR}/cjkcodecs/_codecs_tw.o - ${PYTHON_MODULE_DIR}/_contextvarsmodule.o - ${PYTHON_MODULE_DIR}/_cryptmodule.o - ${PYTHON_MODULE_DIR}/_csv.o - ${PYTHON_MODULE_DIR}/_ctypes/_ctypes.o ${PYTHON_MODULE_DIR}/_ctypes/callbacks.o ${PYTHON_MODULE_DIR}/_ctypes/callproc.o ${PYTHON_MODULE_DIR}/_ctypes/stgdict.o ${PYTHON_MODULE_DIR}/_ctypes/cfield.o - ${PYTHON_MODULE_DIR}/_ctypes/_ctypes_test.o - ${PYTHON_MODULE_DIR}/_cursesmodule.o - ${PYTHON_MODULE_DIR}/_curses_panel.o - ${PYTHON_MODULE_DIR}/_datetimemodule.o - ${PYTHON_MODULE_DIR}/_decimal/_decimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/basearith.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/constants.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/context.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/convolute.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/crt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/difradix2.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fnt.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/fourstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/io.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/memory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/mpdecimal.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/numbertheory.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/sixstep.o ${PYTHON_MODULE_DIR}/_decimal/libmpdec/transpose.o - ${PYTHON_MODULE_DIR}/_elementtree.o - ${PYTHON_MODULE_DIR}/fcntlmodule.o - ${PYTHON_MODULE_DIR}/grpmodule.o - ${PYTHON_MODULE_DIR}/_hashopenssl.o - ${PYTHON_MODULE_DIR}/_heapqmodule.o - ${PYTHON_MODULE_DIR}/_json.o - ${PYTHON_MODULE_DIR}/_lsprof.o - ${PYTHON_MODULE_DIR}/_lzmamodule.o - ${PYTHON_MODULE_DIR}/mathmodule.o - ${PYTHON_MODULE_DIR}/md5module.o - ${PYTHON_MODULE_DIR}/mmapmodule.o - ${PYTHON_MODULE_DIR}/cjkcodecs/multibytecodec.o - ${PYTHON_MODULE_DIR}/_multiprocessing/multiprocessing.o ${PYTHON_MODULE_DIR}/_multiprocessing/semaphore.o - ${PYTHON_MODULE_DIR}/nismodule.o - ${PYTHON_MODULE_DIR}/_opcode.o - ${PYTHON_MODULE_DIR}/ossaudiodev.o - ${PYTHON_MODULE_DIR}/parsermodule.o - ${PYTHON_MODULE_DIR}/_pickle.o - ${PYTHON_MODULE_DIR}/_posixsubprocess.o - ${PYTHON_MODULE_DIR}/pyexpat.o ${PYTHON_MODULE_DIR}/expat/xmlparse.o ${PYTHON_MODULE_DIR}/expat/xmlrole.o ${PYTHON_MODULE_DIR}/expat/xmltok.o - ${PYTHON_MODULE_DIR}/_queuemodule.o - ${PYTHON_MODULE_DIR}/_randommodule.o - 
${PYTHON_MODULE_DIR}/readline.o - ${PYTHON_MODULE_DIR}/resource.o - ${PYTHON_MODULE_DIR}/selectmodule.o - ${PYTHON_MODULE_DIR}/sha1module.o - ${PYTHON_MODULE_DIR}/sha256module.o - ${PYTHON_MODULE_DIR}/_sha3/sha3module.o - ${PYTHON_MODULE_DIR}/sha512module.o - ${PYTHON_MODULE_DIR}/socketmodule.o - ${PYTHON_MODULE_DIR}/spwdmodule.o - ${PYTHON_MODULE_DIR}/_ssl.o - ${PYTHON_MODULE_DIR}/_struct.o - ${PYTHON_MODULE_DIR}/syslogmodule.o - ${PYTHON_MODULE_DIR}/termios.o - ${PYTHON_MODULE_DIR}/_testbuffer.o - ${PYTHON_MODULE_DIR}/_testcapimodule.o - ${PYTHON_MODULE_DIR}/_testimportmultiple.o - ${PYTHON_MODULE_DIR}/_testmultiphase.o - ${PYTHON_MODULE_DIR}/unicodedata.o - ${PYTHON_MODULE_DIR}/xxlimited.o - ${PYTHON_MODULE_DIR}/_xxtestfuzz/_xxtestfuzz.o ${PYTHON_MODULE_DIR}/_xxtestfuzz/fuzzer.o - ${PYTHON_MODULE_DIR}/zlibmodule.o -) diff --git a/torch/csrc/deploy/interpreter/freeze.py b/torch/csrc/deploy/interpreter/freeze.py deleted file mode 100644 index 459b7be9381c..000000000000 --- a/torch/csrc/deploy/interpreter/freeze.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Freeze Python packages. - -Freezing makes it possible to ship arbitrary Python modules as part of a C++ -library. The Python source of the module is compiled to bytecode and written -to `.c` files, to be imported by Python's built-in FrozenImporter. - -In a normal Python installation, FrozenImporter is only used to bootstrap the -initialization of the import machinery. Python's importers are defined in -Python (see `_bootstrap.py` and `_bootstrap_external.py`) but need to be -retrieved before any importers are available. Freezing the module bytecode -resolves this circular dependency. - -This script will freeze the Python standard library. It produces two things: -- Bytecode files: A set of `.c` files that define C variables containing Python bytecode. -- Main file: A `main.c` file listing all of these modules in the right form to be - consumed by FrozenImporter. - -The library that wishes to use these modules makes them available to the local -Python instance by extending `PyImport_FrozenModules` appropriately (see -https://docs.python.org/3/c-api/import.html#c.PyImport_FrozenModules). -""" - -import argparse -import functools -import itertools -import marshal -import os -from dataclasses import dataclass -from pathlib import Path -from typing import List - - -MAIN_INCLUDES = """#include <Python.h> - -""" - -MAIN_PREFIX = """ -// Compiled standard library modules. These should be appended to the existing -// `PyImport_FrozenModules` that ships with CPython. -struct _frozen _PyImport_FrozenModules_torch[] = { -""" - -FAKE_PREFIX = """ -// Compiled standard library modules. These should be appended to the existing -// `PyImport_FrozenModules` that ships with CPython. -struct _frozen _PyImport_FrozenModules[] = { -""" - -MAIN_SUFFIX = """\ - {0, 0, 0} /* sentinel */ -}; -""" - -# Exclude some standard library modules to: -# 1. Slim down the final frozen lib. -# 2. Remove functionality we don't want to support. -DENY_LIST = [ - # Interface to unix databases - "dbm", - # ncurses bindings (terminal interfaces) - "curses", - # Tcl/Tk GUI - "tkinter", - # Tests for the standard library - "test", - "tests", - "idle_test", - "__phello__.foo.py", - # importlib frozen modules. These are already baked into CPython.
- "_bootstrap.py", - "_bootstrap_external.py", -] - -NUM_BYTECODE_FILES = 5 - - -def indent_msg(fn): - @functools.wraps(fn) - def wrapper(*args, **kwargs): - args[0].indent += 1 - ret = fn(*args, **kwargs) - args[0].indent -= 1 - return ret - - return wrapper - - -@dataclass -class FrozenModule: - # The fully qualified module name, e.g. 'foo.bar.baz' - module_name: str - # The name of the C variable that holds the bytecode, e.g. 'M_foo__bar__baz' - c_name: str - # The size of the C variable. Negative if this module is a package. - size: int - # The frozen bytecode - bytecode: bytes - - -class Freezer: - def __init__(self, verbose: bool): - self.frozen_modules: List[FrozenModule] = [] - self.indent: int = 0 - self.verbose: bool = verbose - - def msg(self, path: Path, code: str): - if not self.verbose: - return - # P: package dir - # F: python file - # S: skipped (not a package dir) - # X: skipped (deny-listed) - # N: skipped (not a python file) - for i in range(self.indent): - print(" ", end="") - print(f"{code} {path}") - - def write_bytecode(self, install_root): - """ - Write the `.c` files containing the frozen bytecode. Shard frozen - modules evenly across the files. - """ - bytecode_file_names = [ - f"bytecode_{i}.c" for i in range(NUM_BYTECODE_FILES) - ] - bytecode_files = [open(os.path.join(install_root, name), "w") for name in bytecode_file_names] - it = itertools.cycle(bytecode_files) - for m in self.frozen_modules: - self.write_frozen(m, next(it)) - - for f in bytecode_files: - f.close() - - def write_main(self, install_root, oss): - """ - Write the `main.c` file containing a table enumerating all the - frozen modules. - """ - with open(os.path.join(install_root, "main.c"), "w") as outfp: - outfp.write(MAIN_INCLUDES) - for m in self.frozen_modules: - outfp.write(f"extern unsigned char {m.c_name}[];\n") - - outfp.write(MAIN_PREFIX) - for m in self.frozen_modules: - outfp.write(f'\t{{"{m.module_name}", {m.c_name}, {m.size}}},\n') - outfp.write(MAIN_SUFFIX) - if oss: - outfp.write(FAKE_PREFIX) - outfp.write(MAIN_SUFFIX) - - def write_frozen(self, m: FrozenModule, outfp): - """ - Write a single frozen module's bytecode out to a C variable. - """ - outfp.write(f"unsigned char {m.c_name}[] = {{") - for i in range(0, len(m.bytecode), 16): - outfp.write("\n\t") - for c in bytes(m.bytecode[i : i + 16]): - outfp.write("%d," % c) - outfp.write("\n};\n") - - def compile_path(self, path: Path, top_package_path: Path): - """Generic entry point for compiling a Path object.""" - if path.is_dir(): - self.compile_package(path, top_package_path) - else: - self.compile_file(path, top_package_path) - - @indent_msg - def compile_package(self, path: Path, top_package_path: Path): - """Compile all the files within a Python package dir.""" - assert path.is_dir() - if path.name in DENY_LIST: - self.msg(path, "X") - return - - # Python packages are directories that have __init__.py in them. - is_package_dir = any([child.name == "__init__.py" for child in path.iterdir()]) - if not is_package_dir: - self.msg(path, "S") - return - - self.msg(path, "P") - # Recursively compile all children in this dir - for child in path.iterdir(): - self.compile_path(child, top_package_path) - - def get_module_qualname(self, file_path: Path, top_package_path: Path) -> List[str]: - # `path` looks like 'Lib/foo/bar/baz.py' - - # chop off 'Lib/' to get something that represents a Python module hierarchy. - # e.g. 
'foo/bar/baz.py', which maps to 'foo.bar.baz' - normalized_path = file_path.relative_to(top_package_path.parent) - - if normalized_path.name == "__init__.py": - # Special handling for `__init__.py`. In this case, this file - # specifies that the containing directory should be treated as a package. - # For 'foo/bar/baz/__init__.py': - # - The module name is 'baz' - module_basename = normalized_path.parent.name - # - The parent is foo.bar (need to shave off the 'baz') - module_parent = normalized_path.parent.parent.parts - else: - module_basename = normalized_path.stem - module_parent = normalized_path.parent.parts - return list(module_parent) + [module_basename] - - @indent_msg - def compile_file(self, path: Path, top_package_path: Path): - """ - Compile a Python source file to frozen bytecode. Append the result to - `self.frozen_modules`. - """ - assert path.is_file() - if path.suffix != ".py": - self.msg(path, "N") - return - - if path.name in DENY_LIST: - self.msg(path, "X") - return - - self.msg(path, "F") - module_qualname = self.get_module_qualname(path, top_package_path) - module_mangled_name = "__".join(module_qualname) - c_name = "M_" + module_mangled_name - - with open(path, "r") as src_file: - co = compile(src_file.read(), path, "exec") - - bytecode = marshal.dumps(co) - size = len(bytecode) - if path.name == '__init__.py': - # Python packages are signified by negative size. - size = -size - self.frozen_modules.append( - FrozenModule(".".join(module_qualname), c_name, size, bytecode) - ) - - -parser = argparse.ArgumentParser(description="Compile py source") -parser.add_argument("paths", nargs="*", help="Paths to freeze.") -parser.add_argument("--verbose", action="store_true", help="Print debug logs") -parser.add_argument("--install_dir", help="Root directory for all output files") -parser.add_argument("--fbcode_dir", help="Root directory for all output files") -parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules") - -args = parser.parse_args() - -f = Freezer(args.verbose) - -for p in args.paths: - if args.fbcode_dir: - p = os.path.join(args.fbcode_dir, p) - path = Path(p) - if path.is_dir() and not Path.exists(path / '__init__.py'): - # this 'top level path p' is a standard directory containing modules, - # not a module itself - # each 'mod' could be a dir containing __init__.py or .py file - for mod in path.glob("*"): - f.compile_path(mod, mod) - else: - f.compile_path(path, path) - -f.write_bytecode(args.install_dir) -f.write_main(args.install_dir, args.oss) diff --git a/torch/csrc/deploy/interpreter/hide_symbols.script b/torch/csrc/deploy/interpreter/hide_symbols.script deleted file mode 100644 index c748c8bfec95..000000000000 --- a/torch/csrc/deploy/interpreter/hide_symbols.script +++ /dev/null @@ -1,5 +0,0 @@ -INTERPRETER_0.1 { - global: - initialize_interface; - local: *; # hide everything else -}; diff --git a/torch/csrc/deploy/interpreter/interpreter.cpp b/torch/csrc/deploy/interpreter/interpreter.cpp deleted file mode 100644 index 7d685d33435c..000000000000 --- a/torch/csrc/deploy/interpreter/interpreter.cpp +++ /dev/null @@ -1,324 +0,0 @@ -#include - -#define PY_SSIZE_T_CLEAN -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; -using namespace py::literals; - -// TODO this should come from cmake -#define DEBUG 0 -template -const auto PYOBJ_ASSERT(T obj) { -#if (DEBUG == 1) - if (NULL == obj) { - PyErr_Print(); - } -#endif - 
TORCH_INTERNAL_ASSERT(NULL != obj); -} - -static wchar_t* program; - -#define FOREACH_LIBRARY(_) \ - _(array) \ - _(_asyncio) \ - _(audioop) \ - _(binascii) \ - _(_bisect) \ - _(_blake2) \ - _(_bz2) \ - _(cmath) \ - _(_codecs_cn) \ - _(_codecs_hk) \ - _(_codecs_iso2022) \ - _(_codecs_jp) \ - _(_codecs_kr) \ - _(_codecs_tw) \ - _(_contextvars) \ - _(_crypt) \ - _(_csv) \ - _(_ctypes) \ - _(_ctypes_test) \ - _(_curses) \ - _(_curses_panel) \ - _(_datetime) \ - _(_decimal) \ - _(_elementtree) \ - _(fcntl) \ - _(grp) \ - _(_hashlib) \ - _(_heapq) \ - _(_json) \ - _(_lsprof) \ - _(_lzma) \ - _(math) \ - _(_md5) \ - _(mmap) \ - _(_multibytecodec) \ - _(_multiprocessing) \ - _(nis) \ - _(_opcode) \ - _(ossaudiodev) \ - _(parser) \ - _(_pickle) \ - _(_posixsubprocess) \ - _(pyexpat) \ - _(_queue) \ - _(_random) \ - _(readline) \ - _(resource) \ - _(select) \ - _(_sha1) \ - _(_sha256) \ - _(_sha3) \ - _(_sha512) \ - _(_socket) \ - _(spwd) \ - _(_ssl) \ - _(_struct) \ - _(syslog) \ - _(termios) \ - _(_testbuffer) \ - _(_testcapi) \ - _(_testimportmultiple) \ - _(_testmultiphase) \ - _(unicodedata) \ - _(xxlimited) \ - _(_xxtestfuzz) \ - _(zlib) - -#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void); -FOREACH_LIBRARY(DECLARE_LIBRARY_INIT) -#undef DECLARE_LIBRARY_INIT - -extern "C" __attribute__((visibility("default"))) void initialize_interface( - InterpreterImpl* s) { -#define INITIALIZE_MEMBER(func) s->func = func; - FOREACH_INTERFACE_FUNCTION(INITIALIZE_MEMBER) -#undef INITIALIZE_MEMBER -} - -// These numbers of modules should not change as long as the cpython version -// embedded in the build remains fixed -static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6; -static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680; - -// We need to preserve the existing FrozenModules list, since it includes -// important importlib machinery. This code is adapted from the similar -// `PyImport_ExtendInittab`. -int extendFrozenModules(struct _frozen *frozenpython, struct _frozen *frozentorch) { - struct _frozen *p = nullptr; - size_t a = 0, b = 0, c = 0; - int res = 0; - - /* Count the number of entries in both tables */ - for (a = 0; frozenpython[a].name != nullptr; a++) { - // std::cout << "frozenpython[" << a << "]: " << frozenpython[a].name << std::endl; - } - for (b = 0; frozentorch[b].name != nullptr; b++) { - // std::cout << "frozentorch[" << b << "]: " << frozentorch[b].name << std::endl; - } - for (c = 0; PyImport_FrozenModules[c].name != nullptr; c++) { - // std::cout << "oldfrozen[" << c << "]: " << PyImport_FrozenModules[c].name << std::endl; - } - - // Num frozen builtins shouldn't change (unless modifying the underlying cpython version) - TORCH_INTERNAL_ASSERT(c == NUM_FROZEN_PY_BUILTIN_MODULES, "Missing python builtin frozen modules"); - // Check a+b together since in OSS a is empty and b contains stdlib+torch, while - // in fbcode they are separated due to thirdparty2 frozenpython. - // No fixed number of torch modules to check for, but there should be at least one. 
- TORCH_INTERNAL_ASSERT(a + b > NUM_FROZEN_PY_STDLIB_MODULES + 1, "Missing frozen python stdlib or torch modules"); - - /* Allocate new memory for the combined table */ - if (a + b + c <= SIZE_MAX / sizeof(struct _frozen) - 1) { - size_t size = sizeof(struct _frozen) * (a + b + c + 1); - p = (_frozen*)PyMem_Realloc(p, size); - } - if (p == nullptr) { - return -1; - } - - /* Copy the tables into the new memory */ - memcpy(p, PyImport_FrozenModules, (c + 1) * sizeof(struct _frozen)); - memcpy(p + c, frozenpython, (a + 1) * sizeof(struct _frozen)); - memcpy(p + a + c, frozentorch, (b + 1) * sizeof(struct _frozen)); - PyImport_FrozenModules = p; - return res; -} - -// We need to register a custom finder because we are registering `torch._C` as -// a built-in module, and it will otherwise get skipped by the default importer. -const char* finder = R"RAW( -import sys -# Remove the path-based importer, as we don't want our isolated interpreter to read the file system -sys.meta_path = sys.meta_path[:-1] - -class F: - def find_spec(self, fullname, path, target=None): - if fullname == 'torch._C': - return sys.meta_path[1].find_spec('torch._C', None, None) - return None -sys.meta_path.insert(0, F()) - -# make loader importable -)RAW"; - -const char* sysprint = R"RAW( -import sys -print("exec_prefix:", sys.base_exec_prefix) -print("_base_executable:", sys._base_executable) -print("base_prefix:", sys.base_prefix) -print("exec_prefix:", sys.exec_prefix) -print("executable:", sys.executable) -print("path:", sys.path) -print("prefix:", sys.prefix) - -)RAW"; - -extern "C" PyObject* initModule(void); -extern "C" struct _frozen _PyImport_FrozenModules[]; -extern "C" struct _frozen _PyImport_FrozenModules_torch[]; - -static std::atomic s_id; -std::map forwards; - -__attribute__((constructor)) void init() { - -} - -void startup() { -#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name); - FOREACH_LIBRARY(APPEND_INIT) -#undef APPEND_INIT - PyImport_AppendInittab("torch._C", initModule); - - int ret = extendFrozenModules(_PyImport_FrozenModules, _PyImport_FrozenModules_torch); - TORCH_INTERNAL_ASSERT(ret == 0); - - PyPreConfig preconfig; - PyPreConfig_InitIsolatedConfig(&preconfig); - PyStatus status = Py_PreInitialize(&preconfig); - TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) - - PyConfig config; - PyConfig_InitIsolatedConfig(&config); - - // Completely blank out the path configuration. This ensures we have complete - // control of how our embedded Python searches for modules, and we will never - // consult the external filesystem. 
See: - // https://docs.python.org/3/c-api/init_config.html#path-configuration - config.site_import = 0; - - status = PyConfig_SetString(&config, &config.base_exec_prefix, L""); - status = PyConfig_SetString(&config, &config.base_executable, L"torch_deploy"); - status = PyConfig_SetString(&config, &config.base_prefix, L""); - status = PyConfig_SetString(&config, &config.exec_prefix, L""); - status = PyConfig_SetString(&config, &config.executable, L"torch_deploy"); - status = PyConfig_SetString(&config, &config.prefix, L""); - - - config.module_search_paths_set = 1; - std::array module_search_paths = {}; - status = PyConfig_SetWideStringList( - &config, &config.module_search_paths, 0, module_search_paths.data()); - - status = Py_InitializeFromConfig(&config); - PyConfig_Clear(&config); - TORCH_INTERNAL_ASSERT(!PyStatus_Exception(status)) - - // Uncomment to debug python config - // PyRun_SimpleString(sysprint); - - PyRun_SimpleString(finder); - // Release the GIL that PyInitialize acquires - PyEval_SaveThread(); -} - -void teardown() { - PyGILState_Ensure(); - - if (Py_FinalizeEx() < 0) { - std::cout << "IT BROKE SO WE ARE EXITING\n"; - exit(120); - } - PyMem_RawFree(program); -} - -__attribute__((destructor)) void deinit() {} - -void run_some_python(const char* code) { - PyGILState_STATE gstate = PyGILState_Ensure(); - - if (PyRun_SimpleString(code) == -1) { - throw std::runtime_error("python eval failed\n"); - } - PyGILState_Release(gstate); -} - -void run_python_file(const char* code) { - PyGILState_STATE gstate = PyGILState_Ensure(); - - FILE* f = fopen(code, "r"); - if (PyRun_SimpleFile(f, code) == -1) { - throw std::runtime_error("python eval failed\n"); - } - fclose(f); - - PyGILState_Release(gstate); -} - - -size_t load_model(const char* filename, bool hermetic) { - PyGILState_STATE gstate = PyGILState_Ensure(); - TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); - std::string code; - - if (hermetic) { - code = fmt::format(R"( -from torch.package import PackageImporter - -i = PackageImporter('{}') -model = i.load_pickle('model', 'model.pkl') -)", filename); - } else { - code = std::string("model = torch.jit.load('") + - std::string(filename) + std::string("')"); - } - py::exec(code); - - auto id = ++s_id; - - PyGILState_Release(gstate); - return id; -} - -at::Tensor forward_model(size_t model_id, at::Tensor const & input) { - at::Tensor output; - PyGILState_STATE gstate = PyGILState_Ensure(); - { - TORCH_INTERNAL_ASSERT(PyGILState_Check() == 1); - auto forward = py::globals()["model"].attr("forward"); - - py::object py_output = forward(input); - // TODO is this going to leak? 
- // added it to prevent crash when using 'output' tensor in the caller of - // forward() - py_output.inc_ref(); - output = py::cast(py_output); - } - - PyGILState_Release(gstate); - - return output; - // return input; -} diff --git a/torch/csrc/deploy/interpreter/interpreter.h b/torch/csrc/deploy/interpreter/interpreter.h deleted file mode 100644 index 29e435e44970..000000000000 --- a/torch/csrc/deploy/interpreter/interpreter.h +++ /dev/null @@ -1,67 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -class Interpreter : public InterpreterImpl { - private: - std::string library_name_; - void* handle_; - - public: - Interpreter() : handle_(nullptr) { - char library_name[L_tmpnam]; - char* libinterpreter_path = std::getenv("LIBINTERPRETER_PATH"); - if (libinterpreter_path == nullptr) { - throw std::runtime_error("libinterpreter_path is NULL, set LIBINTERPRETER_PATH env."); - } - std::tmpnam(library_name); - library_name_ = library_name; - { - std::ifstream src(libinterpreter_path, std::ios::binary); - std::ofstream dst(library_name, std::ios::binary); - dst << src.rdbuf(); - } - handle_ = dlopen(library_name, RTLD_LOCAL | RTLD_LAZY); - if (!handle_) { - throw std::runtime_error(dlerror()); - } - - // technically, we can unlink the library right after dlopen, and this is - // better for cleanup because even if we crash the library doesn't stick - // around. However, it's crap for debugging because gdb can't find the - // symbols if the library is no longer present. - unlink(library_name_.c_str()); - - void* initialize_interface = dlsym(handle_, "initialize_interface"); - if (!initialize_interface) { - throw std::runtime_error("Unable to load initialize_interface function from interpreter lib."); - } - ((void (*)(InterpreterImpl*))initialize_interface)(this); - - this->startup(); - - // the actual torch loading process is not thread safe; by doing it - // in the constructor before we have multiple worker threads, we - // ensure it doesn't race. - run_some_python("import torch"); - } - ~Interpreter() { - if (handle_) { - this->teardown(); - - // it segfaults its face off trying to unload, but it's not clear - // if this is something we caused or if libtorch_python would also do the - // same if it were opened/closed a lot... - dlclose(handle_); - } - } - Interpreter(const Interpreter&) = delete; -}; diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.h b/torch/csrc/deploy/interpreter/interpreter_impl.h deleted file mode 100644 index 82326bd370f1..000000000000 --- a/torch/csrc/deploy/interpreter/interpreter_impl.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include - -// NOTE- if adding new interface functions, -// update interpreter.cpp initialize_interface.
-size_t load_model(const char* model_file, bool hermetic=false); -at::Tensor forward_model(size_t model_id, at::Tensor const & input); -void run_some_python(const char* code); -void startup(); -void teardown(); -void run_python_file(const char* code); - - -#define FOREACH_INTERFACE_FUNCTION(_) \ - _(load_model) \ - _(forward_model) \ - _(run_some_python) \ - _(startup) \ - _(teardown) \ - _(run_python_file) - -struct InterpreterImpl { -#define DEFINE_POINTER(func) decltype(&::func) func; - FOREACH_INTERFACE_FUNCTION(DEFINE_POINTER) -#undef DEFINE_POINTER -}; diff --git a/torch/csrc/deploy/interpreter/test_main.cpp b/torch/csrc/deploy/interpreter/test_main.cpp deleted file mode 100644 index 6107267c9f29..000000000000 --- a/torch/csrc/deploy/interpreter/test_main.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) { - ::testing::InitGoogleTest(&argc, argv); - - int rc = RUN_ALL_TESTS(); - - return rc; -} - -TEST(Interpreter, Sanity) { - ASSERT_TRUE(true); -} - -TEST(Interpreter, Hello) { - Interpreter interp; - interp.run_some_python("print('hello from first interpreter!')"); - - Interpreter interp2; - interp2.run_some_python("print('hello from second interpreter!')"); -} - -void compare_torchpy_jit(const char* model_filename, at::Tensor const & input) { - Interpreter interp; - // Test - auto model_id = interp.load_model(model_filename, false); - at::Tensor output = interp.forward_model(model_id, input); - - // Reference - auto ref_model = torch::jit::load(model_filename); - std::vector<torch::jit::IValue> ref_inputs; - ref_inputs.emplace_back(torch::jit::IValue(input)); - at::Tensor ref_output = ref_model.forward(ref_inputs).toTensor(); - - ASSERT_TRUE(ref_output.equal(output)); -} - -TEST(Interpreter, SimpleModel) { - char* model_path = std::getenv("SIMPLE_MODEL_PATH"); - ASSERT_NE(model_path, nullptr); - const int A = 10, B = 20; - compare_torchpy_jit( - model_path, torch::ones(at::IntArrayRef({A, B}))); -} diff --git a/torch/csrc/deploy/interpreter/third_party/README.md b/torch/csrc/deploy/interpreter/third_party/README.md deleted file mode 100644 index 2c5d9241d2bb..000000000000 --- a/torch/csrc/deploy/interpreter/third_party/README.md +++ /dev/null @@ -1,2 +0,0 @@ -Python libraries that we want to package along with the Python implementation -bundled in libinterpreter. diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index faddb8cb16e2..7286387644ad 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -113,10 +113,6 @@ def _lazy_call(callable): if is_initialized(): callable() else: - # TODO(torch_deploy): this accesses linecache, which attempts to read the - # file system to get traceback info. Patch linecache or do something - # else here if this ends up being important.
- # Don't store the actual traceback to avoid memory cycle _queued_calls.append((callable, traceback.format_stack())) diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py index 73eb7f93cf1c..df6a3793e90d 100644 --- a/torch/utils/__init__.py +++ b/torch/utils/__init__.py @@ -2,7 +2,6 @@ from .throughput_benchmark import ThroughputBenchmark import os.path as _osp -import sys # Set the module for a given object for nicer printing def set_module(obj, mod): @@ -10,8 +9,5 @@ def set_module(obj, mod): raise TypeError("The mod argument should be a string") obj.__module__ = mod -if sys.executable == "torch_deploy": - # not valid inside torch_deploy interpreter, no paths exists for frozen modules - cmake_prefix_path = None -else: - cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake') +#: Path to folder containing CMake definitions for Torch package +cmake_prefix_path = _osp.join(_osp.dirname(_osp.dirname(__file__)), 'share', 'cmake')
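Note on the restored cmake_prefix_path above: downstream C++ builds typically hand this directory to CMake so that find_package(Torch) can locate TorchConfig.cmake. A minimal sketch of that usage (illustrative only, not part of this patch; the external project and its CMake invocation are assumed):

    # Print the prefix CMake needs on its search path to find the Torch package.
    import torch.utils
    print(torch.utils.cmake_prefix_path)
    # Typical consumption when configuring an out-of-tree C++ project:
    #   cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch; print(torch.utils.cmake_prefix_path)')" ..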