From 22c3ae8b576674e52e1972abe13891aaf4e99089 Mon Sep 17 00:00:00 2001 From: pbialecki Date: Wed, 2 Dec 2020 20:23:45 -0800 Subject: [PATCH 001/132] Disable autocast cache for tensor views as fix for #48049 (#48696) Summary: Fixes https://github.com/pytorch/pytorch/issues/48049 Root cause of the issue explained [here](https://github.com/pytorch/pytorch/issues/48049#issuecomment-736701769). This PR implements albanD's suggestion to add the `!t.is_view()` check and disable autocast caching for views of tensors. The added test checks for an increase in memory usage by comparing the initially allocated memory with the memory after 3 iterations using a single `nn.Linear` layer in a `no_grad` and `autocast` context. After this PR the memory usage in the original issue doesn't grow anymore and yields: ```python autocast: True 0: 0MB (peak 1165MB) 1: 0MB (peak 1264MB) 2: 0MB (peak 1265MB) 3: 0MB (peak 1265MB) 4: 0MB (peak 1265MB) 5: 0MB (peak 1265MB) 6: 0MB (peak 1265MB) 7: 0MB (peak 1265MB) 8: 0MB (peak 1265MB) 9: 0MB (peak 1265MB) ``` CC ngimel mcarilli Pull Request resolved: https://github.com/pytorch/pytorch/pull/48696 Reviewed By: bdhirsh Differential Revision: D25276231 Pulled By: ngimel fbshipit-source-id: e2571e9f166c0a6f6f569b0c28e8b9ca34132743 --- aten/src/ATen/autocast_mode.cpp | 2 +- test/test_cuda.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 8f19cebb1f52..39264beccfa0 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -68,7 +68,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg) { if (is_eligible(arg) && (arg.scalar_type() != to_type)) { // Heuristic: Do what Apex does, and cache fp16 casts of fp32 model weights (leaves). // See cached_casts declaration above for detailed strategy. - bool can_try_cache = (to_type == at::kHalf && arg.scalar_type() == at::kFloat && arg.requires_grad() && arg.is_leaf()); + bool can_try_cache = (to_type == at::kHalf && arg.scalar_type() == at::kFloat && arg.requires_grad() && arg.is_leaf() && !arg.is_view()); if (can_try_cache) { auto it = cached_casts.find(arg.unsafeGetTensorImpl()); if (it != cached_casts.end()) { diff --git a/test/test_cuda.py b/test/test_cuda.py index b8da6db0714b..17e3942c001d 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -2838,6 +2838,22 @@ def test_autocast_rnn(self): for grad, grad_control in zip(grads, grads_control): self.assertEqual(grad.half(), grad_control) + def test_autocast_cache_leak(self): + # Reported at https://github.com/pytorch/pytorch/issues/48049 + # Test is used to check, if autocast recaches the same parameters + # when executed in a `torch.no_grad()` block. 
+ + linear = torch.nn.Linear(10, 10).to('cuda') + data = torch.randn(1, 10, device='cuda') + + with torch.cuda.amp.autocast(): + with torch.no_grad(): + out = linear(data) + first_iter_mem = torch.cuda.memory_allocated() + for _ in range(3): + out = linear(data) + self.assertTrue(first_iter_mem == torch.cuda.memory_allocated()) + @slowTest @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") def test_max_large_axis(self): From ea573ea944839ed866e0167938c155f5fd615bf9 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 2 Dec 2020 20:33:07 -0800 Subject: [PATCH 002/132] [qunat][graphmode][fx] Standalone module takes float as input and output (#48671) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48671 Standalone module might be called separately so it's better to use float as interface. Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D25256184 fbshipit-source-id: e209492a180ce1f81f31c8d6057956a74bad20b1 --- test/quantization/test_quantize_fx.py | 14 +-- torch/quantization/fx/observed_module.py | 8 -- .../quantization/fx/quantization_patterns.py | 3 +- torch/quantization/fx/quantize.py | 95 +++++-------------- torch/quantization/quantize_fx.py | 17 +--- 5 files changed, 39 insertions(+), 98 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 4ce84c4f827e..27064c41805a 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -550,9 +550,9 @@ def forward(self, x): ns.call_module(torch.quantization.MinMaxObserver): 2 } self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for output of conv in the standalone module + # for input and output of conv in the standalone module count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 1 + ns.call_module(torch.quantization.MinMaxObserver): 2 } self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) @@ -565,11 +565,11 @@ def forward(self, x): } self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) count_check = { - # quantization of input happens in parent module - # quantization of output happens in the quantized conv module - ns.call_function(torch.quantize_per_tensor) : 0, - # dequantization for output happens in parent module - ns.call_method('dequantize') : 0, + # standalone module will take float as input and output + # so we'll see quantize and dequantize in the modoule + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d): 1, + ns.call_method('dequantize') : 1, } self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) res = m(data) diff --git a/torch/quantization/fx/observed_module.py b/torch/quantization/fx/observed_module.py index 4c97881444e0..780808ded0c3 100644 --- a/torch/quantization/fx/observed_module.py +++ b/torch/quantization/fx/observed_module.py @@ -32,14 +32,6 @@ def is_observed_module(module): return isinstance(module, ObservedGraphModule) class ObservedStandaloneGraphModule(ObservedGraphModule): - - def get_preserved_attr_names(self): - return ['_activation_post_process_map', - '_patterns', - '_qconfig_map', - '_standalone_module_observed_input_idxs', - '_output_is_observed'] - def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 5e548c8f79b9..72e165f8351e 100644 --- 
a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -676,4 +676,5 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ # update the modules dict setattr(quantizer.modules[parent_name], name, quantized_standalone_module) quantizer.modules[node.target] = quantized_standalone_module - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + # standalone module takes float input + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 432ce82d90d7..68f560c95096 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -155,6 +155,11 @@ def is_submodule_of_fake_quant(name, module, named_modules): parent_name, _ = _parent_name(name) return is_activation_post_process(named_modules[parent_name]) +def is_observed_standalone_module_node(node, modules): + return node.op == 'call_module' and \ + is_observed_standalone_module(modules[node.target]) + + def get_flattened_qconfig_dict(qconfig_dict): """ flatten the global, object_type and module_name qconfig to the same qconfig_dict so that it can be used by @@ -338,16 +343,9 @@ def _prepare(self, model, qconfig_dict, prepare_custom_config_dict, parent module, and will be quantized separately as one unit. When we are preparing a standalone module: - input of the module is observed in parent module, output of the module - is observed in the standalone module. + both input and output are observed in prepared standalone module Returns: - model(GraphModule): prepared standalone module with following - attributes: - _standalone_module_observed_input_idxs(List[Int]): a list of - indexes for the graph inputs that needs to be observed in - parent module - _output_is_observed(Bool): a boolean variable indicate whether - the output of the custom module is observed or not + model(GraphModule): prepared standalone module """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} @@ -434,7 +432,6 @@ def insert_observer_for_special_module(quantize_handler): Returns: standalone_module_input_idxs: the indexs for inputs that needs to be observed by parent module """ - standalone_module_input_idxs = None assert self.modules is not None if isinstance(quantize_handler, CustomModuleQuantizeHandler): custom_module = self.modules[node.target] @@ -455,21 +452,17 @@ def insert_observer_for_special_module(quantize_handler): observed_standalone_module = \ prepare(standalone_module, {"": qconfig}) observed_standalone_module.qconfig = qconfig - standalone_module_input_idxs = observed_standalone_module.\ - _standalone_module_observed_input_idxs observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) setattr(self.modules[parent_name], name, observed_standalone_module) self.modules[node.target] = observed_standalone_module - return standalone_module_input_idxs def insert_observer_for_output_of_the_node( node, quantize_handler, - qconfig, - standalone_module_input_idxs): + qconfig): """ Insert observer/fake_quantize module for output of the observed module if needed """ @@ -526,24 +519,14 @@ def input_is_observed(arg): observed_node_names_set.add(node.name) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - assert node.op == 'call_module' - output_is_observed = \ - self.modules[node.target]._output_is_observed - if 
output_is_observed: - observed_node_names_set.add(node.name) + # output is observed in the standalone module + return elif (quantize_handler.all_node_args and input_output_observed(quantize_handler)): # observer for outputs new_observer = qconfig.activation() insert_observer(node, new_observer) - # insert observer for input of standalone module - if standalone_module_input_idxs is not None: - for idx in standalone_module_input_idxs: - if node.args[idx].name not in observed_node_names_set: - new_observer = qconfig.activation() - insert_observer(node.args[idx], new_observer) - def insert_observer_for_input_arg_of_observed_node(arg): """ Input: @@ -551,12 +534,6 @@ def insert_observer_for_input_arg_of_observed_node(arg): input activaiton for functional linear node """ if node.name not in observed_node_names_set and node.name in quants: - if is_standalone_module and node.name in graph_inputs: - # we'll insert observer for input of standalone module - # in parent graph - standalone_module_observed_input_idxs.append( - graph_inputs.index(node.name)) - return _, activation_post_process_ctr = quants[node.name] if activation_post_process_ctr is not None: insert_observer(node, activation_post_process_ctr()) @@ -579,10 +556,9 @@ def insert_observer_for_input_arg_of_observed_node(arg): # index for input of custom module that needs to be observed in # parent if qconfig is not None: - standalone_module_input_idxs = \ - insert_observer_for_special_module(obj) + insert_observer_for_special_module(obj) insert_observer_for_output_of_the_node( - node, obj, qconfig, standalone_module_input_idxs) + node, obj, qconfig) else: env[node.name] = observed_graph.node_copy(node, load_arg) insert_observer_for_input_arg_of_observed_node(node) @@ -591,17 +567,6 @@ def insert_observer_for_input_arg_of_observed_node(arg): model = GraphModule(model, observed_graph) self.save_state(model) model = mark_observed_module(model) - if is_standalone_module: - assert result_node is not None - assert isinstance(result_node.args[0], Node), \ - 'standalone module returning dict is not yet supported' - # indicator for whether output is observed or not. - # This used for correctly quantize standalone modules - output_is_observed = \ - result_node.args[0].name in observed_node_names_set - model._standalone_module_observed_input_idxs = \ - standalone_module_observed_input_idxs - model._output_is_observed = output_is_observed return model def save_state(self, observed): @@ -646,12 +611,9 @@ def _convert(self, model, debug=False, convert_custom_config_dict=None, is_standalone_module=False): """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - For standalone module: the inputs will be quantized by parent module, - checks `_standalone_module_observed_input_idxs` of - input observed model and will treat these inputs as quantized - also will not dequantize the final output. - Returns a quantized standalone module which accepts quantized input - (if needed) and produces quantized output (if needed). + + Returns a quantized standalone module which accepts float input + and produces float output. 
""" if convert_custom_config_dict is None: convert_custom_config_dict = {} @@ -769,11 +731,8 @@ def is_quantized(node): def is_output_quantized(node) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None - if node.op == 'call_module' and \ - is_observed_standalone_module(self.modules[node.target]): - quantized = bool(self.modules[node.target]._output_is_observed) - else: - quantized = True + # by default the output is expected to be quantized + quantized = True # Need to get correct quantized/non-quantized state for the output # of CopyNode @@ -833,10 +792,9 @@ def insert_quantize_node(node): if node.op == 'output': cur_output_node_idx = output_node_seen_cnt output_node_seen_cnt += 1 - if is_standalone_module or (cur_output_node_idx in output_quantized_idxs): - # Result are kept quantized in the quantized standalone - # module, or if the user specified the output_quantized_idxs - # override. + if cur_output_node_idx in output_quantized_idxs: + # Result are kept quantized if the user specified the + # output_quantized_idxs override. graph_output = map_arg(node.args[0], load_x) else: graph_output = map_arg(node.args[0], load_non_quantized) @@ -851,10 +809,15 @@ def insert_quantize_node(node): quantized = False else: assert obj is not None + is_standalone_module_node = is_observed_standalone_module_node( + node, self.modules) result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) - quantized = is_output_quantized(node) + if is_standalone_module_node: + quantized = False + else: + quantized = is_output_quantized(node) if quantized: quant_env[node.name] = result @@ -868,12 +831,6 @@ def insert_quantize_node(node): if node.op == 'call_module' and \ is_activation_post_process(self.modules[node.target]): insert_quantize_node(node) - elif (is_standalone_module and node.op == 'placeholder' and - graph_inputs.index(node.name) in - model._standalone_module_observed_input_idxs): - # the node is quantized in parent module - quant_env[node.name] = \ - self.quantized_graph.node_copy(node, load_non_quantized) elif node.op == 'placeholder': cur_placeholder_node_idx = placeholder_node_seen_cnt placeholder_node_seen_cnt += 1 diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 9db2e8be86f0..7d12a7316896 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -99,14 +99,8 @@ def _prepare_standalone_module_fx(model, qconfig_dict, prepare_custom_config_dic standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - input of the module is quantized in parent module, output of the module - is quantized in the standalone module. - Extra attributes in output GraphModule while preparing a standalone module: - _standalone_module_observed_input_idxs(List[Int]): a list of indexs for the graph inputs that - needs to be observed in parent module - _output_is_observed(Bool): a boolean variable indicate whether the output of the - custom module is observed or not - + Both input and output of the module are observed in the + standalone module. 
""" return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) @@ -356,11 +350,8 @@ def _convert_standalone_module_fx(graph_module, debug=False, convert_custom_conf r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model - The inputs will be quantized by parent module, checks `_standalone_module_observed_input_idxs` of - input model and will treat these inputs as quantized - also will not dequantize the final output Return: - A quantized standalone module which accepts quantized input(if needed) - and produces quantized output (if needed). + A quantized standalone module which accepts float input + and produces float output. """ return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) From 5f105e2aa6d021029811c7085512a3b1bba3f578 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 2 Dec 2020 20:44:26 -0800 Subject: [PATCH 003/132] Add test for empty tensors for batch matmuls (#47700) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/47700 Reviewed By: malfet Differential Revision: D24874754 Pulled By: ngimel fbshipit-source-id: 41ba837740ff7d5bd49d5f7277ad2064985aba2f --- aten/src/ATen/native/LinearAlgebra.cpp | 27 ++++++--- aten/src/ATen/native/cuda/LinearAlgebra.cu | 65 ---------------------- aten/src/ATen/native/native_functions.yaml | 9 +-- test/test_linalg.py | 40 ++++++++++++- 4 files changed, 60 insertions(+), 81 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index afd4ec15d25f..39eb6faf23d4 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -442,7 +442,7 @@ static void addmm_impl_cpu_( } } -static void addbmm_impl_cpu_( +static void addbmm_impl_( Tensor &result, const Tensor &self, const Tensor &batch1, const Tensor &batch2, Scalar beta, Scalar alpha) { TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -467,29 +467,38 @@ static void addbmm_impl_cpu_( const int64_t num_batches = batch1.size(0); + if (num_batches == 0) { + if (beta.to>() != 0.0) { + result.mul_(beta); + } else { + result.zero_(); + } + return; + } + for (int64_t batch = 0; batch < num_batches; ++batch) { - addmm_impl_cpu_(result, result, batch1[batch], batch2[batch], beta, alpha); + result.addmm_(batch1[batch], batch2[batch], beta, alpha); beta = 1; // accumulate output once } } -Tensor& addbmm_cpu_out(Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { +Tensor& addbmm_out(Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { Tensor b_self = std::get<0>(expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm_out")); { at::NoNamesGuard guard; - addbmm_impl_cpu_(result, b_self, batch1, batch2, beta, alpha); + addbmm_impl_(result, b_self, batch1, batch2, beta, alpha); } at::namedinference::propagate_names_for_addmm(result, batch1, batch2, self); return result; } -Tensor &addbmm_cpu_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { - return addbmm_cpu_out(self, self, batch1, batch2, beta, alpha); +Tensor &addbmm_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return native::addbmm_out(self, self, batch1, batch2, beta, alpha); } -Tensor 
addbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { +Tensor addbmm(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { Tensor result = at::empty({0}, self.options()); - return addbmm_cpu_out(result, self, batch1, batch2, beta, alpha); + return native::addbmm_out(result, self, batch1, batch2, beta, alpha); } Tensor& addmm_cpu_out(Tensor &result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { @@ -608,7 +617,7 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& if (self_or_result.numel() == 0) { return self_or_result; } else if (contraction_size == 0) { - if (is_bmm_out) { + if (is_bmm_out || (beta.to>() == 0.0)) { return self_or_result.zero_(); } else { return self_or_result.mul_(beta); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index e155f9d367bc..57ee8e0be738 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -313,71 +313,6 @@ Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) { return native::bmm_out_cuda(result, self, mat2); } -Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self, - const Tensor& batch1, const Tensor& batch2, - Scalar beta, Scalar alpha) { - TORCH_CHECK(batch1.dim() == 3 && batch2.dim() == 3, - "Batch tensors should be 3D, got dimensions ", batch1.dim(), - " and ", batch2.dim()); - - Tensor self_; - if (&out != &self) { - std::tie(self_) = expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm"); - } else { - self_ = self; - } - - TORCH_CHECK(out.device() == self_.device() && - out.device() == batch1.device() && - out.device() == batch2.device(), - "Expected all tensors to be on the same device. Found: ", - out.device(), ", ", self_.device(), ", ", - batch1.device(), " and ", batch2.device()); - TORCH_CHECK(self_.dim() == 2, - "2D tensor expected, got ", self_.dim(), "D tensor for input"); - int64_t batchnum = batch1.size(0); - int64_t m1d1 = batch1.size(1); - int64_t innerdim = batch1.size(2); - int64_t m2d2 = batch2.size(2); - TORCH_CHECK(batchnum == batch2.size(0), - "equal number of batches expected"); - TORCH_CHECK(m1d1 == self_.size(0), - "first dimension of batch1 must match first dimension of input"); - TORCH_CHECK(m2d2 == self_.size(1), - "second dimension of batch2 must match second dimension of input"); - TORCH_CHECK(innerdim == batch2.size(1), - "second dimension of batch1 must match first dimension of batch2"); - - if (&out != &self) { - at::native::resize_as_(out, self_); - if (beta.to>() != 0.0) { - at::native::copy_(out, self_); - } - } - - for (int64_t i=0; i Tensor(a!) dispatch: - CPU: addbmm_cpu_out - CUDA: addbmm_out_cuda + CPU, CUDA: addbmm_out - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: addbmm_cpu - CUDA: addbmm_cuda + CPU, CUDA: addbmm - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) 
use_c10_dispatcher: full diff --git a/test/test_linalg.py b/test/test_linalg.py index 71c3cf654c1b..f1d521bb30be 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -3606,7 +3606,7 @@ def test_strided_mm_bmm(self, device, dtype): torch_fn = lambda x: torch.mm(x, x) # noqa: E731 self.compare_with_numpy(torch_fn, np_fn, sx[0]) - @precisionOverride({torch.half: 0.005, torch.bfloat16: 0.05}) + @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) @skipCUDAIf(torch.version.cuda == "10.1", "flaky on CUDA 10.1") @onlyOnCPUAndCUDA @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes()) @@ -3632,18 +3632,31 @@ def invert_perm(p): return (d[0], d[1], d[2]) def generate_inputs(): + # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 + # broadcasting tensors for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) yield b1, b2 + # zero-sized tensors + bug = (self.device_type == 'cuda' and dtype == torch.half and torch.version.cuda is not None and + float(torch.version.cuda) < 11) + if bug: + return + for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): + shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) + shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) + b1 = torch.randn(shape1, dtype=dtype, device=device) + b2 = torch.randn(shape2, dtype=dtype, device=device) + yield b1, b2 for (b1, b2), perm3 in itertools.product(generate_inputs(), itertools.permutations((0, 1, 2))): res1 = torch.bmm(b1, b2) @@ -3824,6 +3837,17 @@ def generate_tensor(): ).to(device=device, dtype=dtype).sum(0) out_tensor = torch.zeros_like(ref) yield b1, b2, ref, out_tensor + # zero-sized tensors + for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): + shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) + shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) + b1 = make_tensor(shape1, device, dtype, low=-1, high=1) + b2 = make_tensor(shape2, device, dtype, low=-1, high=1) + ref = torch.from_numpy( + b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() + ).to(device=device, dtype=dtype).sum(0) + out_tensor = torch.zeros_like(ref) + yield b1, b2, ref, out_tensor for b1, b2, ref, out_tensor in generate_tensor(): self._test_addbmm_baddbmm("addbmm", b1, b2, ref, out_tensor) @@ -3875,6 +3899,20 @@ def generate_tensor(): b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) out_tensor = torch.zeros_like(ref) yield b1, b2, ref, out_tensor + # zero-sized tensors + bug = (self.device_type == 'cuda' and dtype == torch.half and torch.version.cuda is not None and + float(torch.version.cuda) < 11) + if bug: + return + for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): + shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) + shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 
else 0) + b1 = make_tensor(shape1, device, dtype, low=-2, high=2) + b2 = make_tensor(shape2, device, dtype, low=-2, high=2) + ref = torch.from_numpy( + b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) + out_tensor = torch.zeros_like(ref) + yield b1, b2, ref, out_tensor for b1, b2, ref, out_tensor in generate_tensor(): self._test_addbmm_baddbmm("baddbmm", b1, b2, ref, out_tensor) From 85c1e8acdc7b4f0f30db02fabfb1f5552840da84 Mon Sep 17 00:00:00 2001 From: Lemo Date: Wed, 2 Dec 2020 21:20:48 -0800 Subject: [PATCH 004/132] Replace kernel resource strings with real .cu source files (#48283) Summary: Convert the NVFUSER's runtime CUDA sources (under `.../jit/codegen/cuda/runtime`) to string literals, then include the headers with the generated literals. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48283 Reviewed By: mrshenli Differential Revision: D25163362 Pulled By: ngimel fbshipit-source-id: 4e6c181688ddea78ce6f3c754fee62fa6df16641 --- .github/workflows/lint.yml | 1 + .gitignore | 1 + caffe2/CMakeLists.txt | 43 ++ tools/build_variables.bzl | 13 + .../csrc/jit/codegen/cuda/executor_utils.cpp | 28 +- torch/csrc/jit/codegen/cuda/executor_utils.h | 3 + .../codegen/cuda/kernel_resource_strings.h | 664 ------------------ .../codegen/cuda/runtime/block_reduction.cu | 104 +++ .../jit/codegen/cuda/runtime/broadcast.cu | 41 ++ .../jit/codegen/cuda/runtime/fp16_support.cu | 21 + .../codegen/cuda/runtime/grid_reduction.cu | 374 ++++++++++ .../csrc/jit/codegen/cuda/runtime/helpers.cu | 47 ++ .../codegen/cuda/runtime/random_numbers.cu | 104 +++ torch/csrc/jit/codegen/cuda/runtime/tensor.cu | 26 + .../jit/codegen/cuda/tools/stringify_file.py | 29 + 15 files changed, 827 insertions(+), 672 deletions(-) delete mode 100644 torch/csrc/jit/codegen/cuda/kernel_resource_strings.h create mode 100644 torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu create mode 100644 torch/csrc/jit/codegen/cuda/runtime/broadcast.cu create mode 100644 torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu create mode 100644 torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu create mode 100644 torch/csrc/jit/codegen/cuda/runtime/helpers.cu create mode 100644 torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu create mode 100644 torch/csrc/jit/codegen/cuda/runtime/tensor.cu create mode 100644 torch/csrc/jit/codegen/cuda/tools/stringify_file.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ba43f4cf77d6..4a0fb9cbf819 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -175,6 +175,7 @@ jobs: -g"-torch/csrc/cuda/python_nccl.cpp" \ -g"-torch/csrc/autograd/FunctionsManual.cpp" \ -g"-torch/csrc/generic/*.cpp" \ + -g"-torch/csrc/jit/codegen/cuda/runtime/*" \ "$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt diff --git a/.gitignore b/.gitignore index d1f06437acee..10994ba7a64b 100644 --- a/.gitignore +++ b/.gitignore @@ -192,6 +192,7 @@ build_ios /build_* .build_debug/* .build_release/* +.build_profile/* distribute/* *.testbin *.bin diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4dd3f6c6569c..61bca7c6ffc0 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -781,6 +781,49 @@ elseif(USE_CUDA) endif() endif() +if(USE_CUDA OR USE_ROCM) + if(USE_CUDA) + set(TORCHLIB_FLAVOR torch_cuda) + elseif(USE_ROCM) + set(TORCHLIB_FLAVOR torch_hip) + endif() + + # The list of NVFUSER runtime files + list(APPEND NVFUSER_RUNTIME_FILES + 
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu + ) + + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources") + + # "stringify" NVFUSER runtime sources + # (generate C++ header files embedding the original input as a string literal) + set(NVFUSER_STRINGIFY_TOOL "${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tools/stringify_file.py") + foreach(src ${NVFUSER_RUNTIME_FILES}) + get_filename_component(filename ${src} NAME_WE) + set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h") + add_custom_command( + COMMENT "Stringify NVFUSER runtime source file" + OUTPUT ${dst} + DEPENDS ${src} + COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst} + ) + add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst}) + add_dependencies(${TORCHLIB_FLAVOR} nvfuser_rt_${filename}) + + # also generate the resource headers during the configuration step + # (so tools like clang-tidy can run w/o requiring a real build) + execute_process(COMMAND + ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}) + endforeach() + + target_include_directories(${TORCHLIB_FLAVOR} PRIVATE "${CMAKE_BINARY_DIR}/include") +endif() if(NOT MSVC AND USE_XNNPACK) TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 6181239bd5b3..accd810b7085 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -23,6 +23,19 @@ GENERATED_CPP = [ "autograd/generated/python_variable_methods.cpp", ] +# NVFuser runtime library +libtorch_nvfuser_runtime_sources = [ + "torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu", + "torch/csrc/jit/codegen/cuda/runtime/broadcast.cu", + "torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu", + "torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu", + "torch/csrc/jit/codegen/cuda/runtime/helpers.cu", + "torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu", + "torch/csrc/jit/codegen/cuda/runtime/tensor.cu", +] + +libtorch_nvfuser_generated_headers = ["{}.h".format(name[36:-3]) for name in libtorch_nvfuser_runtime_sources] + def libtorch_generated_sources(gencode_pattern): return [gencode_pattern.format(name) for name in [ "autograd/generated/Functions.cpp", diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 4d7f843ad383..61ca4ef3db89 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -7,10 +7,17 @@ #include #include #include -#include #include #include +#include +#include +#include +#include +#include +#include +#include + #include namespace torch { @@ -21,13 +28,18 @@ namespace executor_utils { std::string kernelPreamble() { std::stringstream ss; - ss << code_template_tensor_struct << "\n" - << code_fp16_support << "\n" - << code_random_number_gen << "\n" - << code_helper_funcs << "\n" - << code_template_block_reduction << "\n" - << code_template_grid_reduction << "\n" - << code_template_block_broadcast << "\n"; + +#ifndef __HIP_PLATFORM_HCC__ + ss << nvfuser_resources::fp16_support_cu; +#endif + + ss << nvfuser_resources::tensor_cu; + ss << 
nvfuser_resources::random_numbers_cu; + ss << nvfuser_resources::helpers_cu; + ss << nvfuser_resources::block_reduction_cu; + ss << nvfuser_resources::grid_reduction_cu; + ss << nvfuser_resources::broadcast_cu; + return ss.str(); } diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index b306cf04da0a..28a702b98d73 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -14,6 +14,9 @@ #include #include +#include +#include + namespace torch { namespace jit { namespace fuser { diff --git a/torch/csrc/jit/codegen/cuda/kernel_resource_strings.h b/torch/csrc/jit/codegen/cuda/kernel_resource_strings.h deleted file mode 100644 index a601a956c175..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_resource_strings.h +++ /dev/null @@ -1,664 +0,0 @@ -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { - -// IO data structure for kernel code; -static auto code_template_tensor_struct = R"( -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef short int int16_t; -typedef long long int int64_t; - -template -struct Tensor { - __device__ T& operator[](int64_t ind) { - return data[ind]; - }; - - T* data; - int64_t size[N]; - int64_t stride[N]; -}; - -// Specialization for 0-dim case as it does not need size and stride arrays. -// They will be an error as well since zero-length arrays are not allowed. -template -struct Tensor { - __device__ T& operator[](int64_t) { - return *data; - }; - - T* data; -}; -)"; - -// Code support for FP16 __half type and intrinsics -#ifdef __HIP_PLATFORM_HCC__ -static auto code_fp16_support = R"()"; -#else -static auto code_fp16_support = R"( -#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) -#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) -struct __align__(2) __half { - __host__ __device__ __half() { } -protected: - unsigned short __x; -}; - -/* Definitions of intrinsics */ -__device__ __half __float2half(const float f) { - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; -} -__device__ float __half2float(const __half h) { - float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); - return val; -} -)"; -#endif -// struct and code for functions that need random number generation -static auto code_random_number_gen = R"( -class Philox { -public: - __device__ inline Philox(unsigned long long seed, - unsigned long long subsequence, - unsigned long long offset) { - key.x = (unsigned int)seed; - key.y = (unsigned int)(seed >> 32); - counter = make_uint4(0, 0, 0, 0); - counter.z = (unsigned int)(subsequence); - counter.w = (unsigned int)(subsequence >> 32); - STATE = 0; - incr_n(offset / 4); - } - __device__ inline unsigned long operator()() { - if(STATE == 0) { - uint4 counter_ = counter; - uint2 key_ = key; - for(int i = 0; i < 9; i++) { - counter_ = single_round(counter_, key_); - key_.x += (kPhilox10A); key_.y += (kPhilox10B); - } - output = single_round(counter_, key_); - incr(); - } - unsigned long ret; - switch(STATE) { - case 0: ret = output.x; break; - case 1: ret = output.y; break; - case 2: ret = output.z; break; - case 3: ret = output.w; break; - } - STATE = (STATE + 1) % 4; - return ret; - } -private: - uint4 counter; - uint4 output; - uint2 key; - unsigned int STATE; - __device__ inline void incr_n(unsigned long long n) { - unsigned int nlo = (unsigned int)(n); - unsigned int nhi = (unsigned int)(n >> 32); - counter.x += nlo; - if 
(counter.x < nlo) - nhi++; - counter.y += nhi; - if (nhi <= counter.y) - return; - if (++counter.z) - return; - ++counter.w; - } - __device__ inline void incr() { - if (++counter.x) - return; - if (++counter.y) - return; - if (++counter.z) - return; - ++counter.w; - } - __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, - unsigned int *result_high) { - *result_high = __umulhi(a, b); - return a*b; - } - __device__ inline uint4 single_round(uint4 ctr, uint2 key) { - unsigned int hi0; - unsigned int hi1; - unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); - unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); - uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; - return ret; - } - static const unsigned long kPhilox10A = 0x9E3779B9; - static const unsigned long kPhilox10B = 0xBB67AE85; - static const unsigned long kPhiloxSA = 0xD2511F53; - static const unsigned long kPhiloxSB = 0xCD9E8D57; -}; -// Inverse of 2^32. -#define M_RAN_INVM32 2.3283064e-10f -__device__ __inline__ float uniform(unsigned int x) { - return x * M_RAN_INVM32; -} -)"; - -// Helper functions for Operations -static auto code_helper_funcs = R"( -__device__ constexpr int ceilDiv(const int a, const int b) { - return (a + b - 1) / b; -} -__device__ constexpr int alignBufferSize(const int buffer, const int size) { - return (buffer + (size-1)) & ~(size-1); -} -__device__ float clamp(const float x, const float minv, const float maxv) { - return x < minv ? minv : (x > maxv ? maxv : x); -} -__device__ float frac(const float x) { - return x - truncf(x); -} -__device__ float gelu(const float x) { - return x * normcdf(x); -} -__device__ float reciprocal(const float x) { - return 1.f / x; -} -__device__ float relu(const float x) { - return x <= 0.f ? 0.f : x; -} -__device__ float remainder(const float a, const float b) { - return a - b * floorf(a / b); -} -__device__ float sigmoid(const float x) { - return 1.f / (1.f + expf(-x)); -} -__device__ float threshold(const float x, const float t, const float v) { - return x <= t ? v : x; -} -__device__ float where(const bool c, const float a, const float b) { - return c ? a : b; -} -__device__ float randLike(Philox rnd) { - return uniform(rnd()); -}; -)"; - -// Note: We agressively template functions taking dim3 in the functions below -// because ROCM uses different types for the various dim3 and maps them -// directly to intrinsics, but they're dim3 when used after modification. -/* - * EXAMPLE USAGE: - * blockReduceSum - * (output[output_index], inputs[input_index], [] __device__ (T& a, const T - * b) { a += b; } ); - */ -static auto code_template_block_reduction = R"( -// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x -// dimension of the block. If set to 0 it means that dimension doesn't -// participate, otherwise it is the number of threads. We could start with warp -// reductions, then reduce the warps, this could save some shared memory, but -// may actually be slower. -template -__inline__ __device__ -void blockReduce( - T& out, - const T inp_val, - Func reduction_op, - const _dim3ti& thread_idx, - const _dim3bd& block_dim, - T* shared_mem, - bool read_write_pred, - T init_val) { - - unsigned int reduction_size - = (X_REDUCE ? block_dim.x : 1) - * (Y_REDUCE ? block_dim.y : 1) - * (Z_REDUCE ? 
block_dim.z : 1); - - // If this thread will output a final result - bool should_write = true; - - if (X_REDUCE) - should_write = should_write && thread_idx.x == 0; - if (Y_REDUCE) - should_write = should_write && thread_idx.y == 0; - if (Z_REDUCE) - should_write = should_write && thread_idx.z == 0; - - unsigned int reduction_stride; - unsigned int reduction_tid; - unsigned int linear_tid; - - if(X_REDUCE && !Y_REDUCE && Z_REDUCE){ - // Transpose Z and Y in the shared memory so Z and X dims are contiguous in smem - reduction_stride = 1; - linear_tid = threadIdx.y * blockDim.z * blockDim.x + threadIdx.z * blockDim.x + threadIdx.x; - reduction_tid = threadIdx.z * blockDim.x + threadIdx.x; - } else { - // Normal reduction in order - reduction_stride - = (X_REDUCE ? 1 - : (Y_REDUCE ? block_dim.x - : (Z_REDUCE ? block_dim.x * block_dim.y : 0))); - - linear_tid = thread_idx.z * block_dim.y * block_dim.x + thread_idx.y * block_dim.x + thread_idx.x; - - reduction_tid - = ( Z_REDUCE ? thread_idx.z : 0 ) * ( Y_REDUCE ? block_dim.y : 1 ) * ( X_REDUCE ? block_dim.x : 1 ) - + ( Y_REDUCE ? thread_idx.y : 0 ) * ( X_REDUCE ? block_dim.x : 1 ) - + ( X_REDUCE ? thread_idx.x : 0 ); - } - - assert( reduction_stride != 0 ); - - if(read_write_pred){ - shared_mem[linear_tid] = inp_val; - } else { - shared_mem[linear_tid] = init_val; - } - __syncthreads(); - // Reduce down to nearest power of 2: - int np2 = 1 << (31 - __clz(reduction_size)); - - if( reduction_tid < np2 ){ - if( reduction_tid + np2 < reduction_size){ - reduction_op( shared_mem[linear_tid], shared_mem[linear_tid + np2 * reduction_stride] ); - } - } - __syncthreads(); - //for (int factor = np2/2; factor > contig_threads / 2; factor>>=1) { - for (int factor = np2/2; factor > 0; factor>>=1) { - if (reduction_tid < factor) { - reduction_op( shared_mem[linear_tid], shared_mem[linear_tid + factor * reduction_stride] ); - } - __syncthreads(); - } - - if(should_write && read_write_pred) - out = shared_mem[linear_tid]; - -} -)"; - -/** - Inter-block reduction. - - Function gridReduce performs point-wise reductions of scalars across thread - blocks. Thread blocks are disjointly partitioned into groups of thread blocks, - "reduction segments," that are collectively defined by boolean template - parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines - whether thread blocks along the dimension should be grouped into the same - reduction segment. Cross-block reducitons are independently done within each - segment and generates distinctive results per segment. For instance, if all of - X/Y/Z_BLOCK are true, reductions will be done across all thread blocks since - there will be just a single segment consisting of all thread blocks. If none - of them are true, each thread block will become a segment by itself, so no - reduction will be performed. - - The input scalars to reduce within each segment are a certain subset of - thread-private scalars provided as part of the gridReduce function parameters. - Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD, determine which - subset of the scalars should be used for inter-block reductions. Specifically, - all the input scalars of threads along each dimension will be used when - X/Y/Z_THREAD are true. Otherwise, only the value held at offset 0 of each - dimension will be used. Thus, for example, if all of X/Y/Z_THREAD are true, - the scalars of all threads in each block will participate in inter-block - reductions. 
If all of them are false, only one scalar of the thread at - threadIdx.x == threadIdx.y == threadIdx.z == 0 will be used. In the code - below, we call the subset of threads a "reduction block." - - Inter-block reductions perform point-wise reductions of scalars of reduction - blocks within each reduction segment. More specifically, let rb be a reduction - block and rs be a reduction segment. Let IN(thread_idx, block_idx) denote the - input scalar of thread at thread_idx and block_idx. The result of each - reduction segment, OUT(thread_idx, block_idx_out), is defined only for each - thread_idx in thread block block_idx_out in the segment as follows: - - OUT(thread_idx, block_idx_out) = Reduction of IN(thread_idx, block_idx) for - all block_idx in a reduction segment - - OUT is not given for all threads that are not in block_idx_out and the - reduction block. - - See also the function comment of gridReduce. -*/ -static auto code_template_grid_reduction = R"( -namespace reduction { - -// Utility functions -template -__host__ __device__ __forceinline__ size_t size(const _dim3& d) { - return (size_t)d.x * (size_t)d.y * (size_t)d.z; -} - -#define isize(d) d.x * d.y * d.z - -template -__host__ __device__ __forceinline__ size_t offset(const _dim3pos& pos, const _dim3dim& dim) { - return (size_t)pos.x + (size_t)pos.y * (size_t)dim.x + - (size_t)pos.z * (size_t)dim.x * (size_t)dim.y; -} - -#define ioffset(pos, dim) pos.x + pos.y * dim.x + pos.z * dim.x * dim.y - -// Returns dim3 of each reduction segment. -template -__host__ __device__ dim3 dimension_of_reduction_segment(const _dim3& grid_dim) { - return dim3{X_BLOCK ? grid_dim.x : 1, - Y_BLOCK ? grid_dim.y : 1, - Z_BLOCK ? grid_dim.z : 1}; -} - -// Returns the number of blocks in each reduction segment. -template -__host__ __device__ size_t size_of_reduction_segment(const _dim3& grid_dim) { - return size(dimension_of_reduction_segment(grid_dim)); -} - -// Returns the total number of reduction segments. -template -__host__ __device__ size_t number_of_reduction_segments(const _dim3& grid_dim) { - return (X_BLOCK ? 1: grid_dim.x) * - (Y_BLOCK ? 1 : grid_dim.y) * - (Z_BLOCK ? 1 : grid_dim.z); -} - -// Returns the 1-D index of the segment of thread block of block_idx. -template -__host__ __device__ size_t index_of_reduction_segment(const _dim3bi& block_idx, - const _dim3gd& grid_dim) { - size_t seg_idx = 0; - if (!Z_BLOCK) - seg_idx += block_idx.z; - if (!Y_BLOCK) - seg_idx = seg_idx * grid_dim.y + block_idx.y; - if (!X_BLOCK) - seg_idx = seg_idx * grid_dim.x + block_idx.x; - return seg_idx; -} - -// Returns the offset of thread block in its reduction segment. -template -__host__ __device__ size_t offset_in_reduction_segment(const _dim3bi& block_idx, - const _dim3gd& grid_dim) { - size_t offset = 0; - if (Z_BLOCK) - offset = offset * grid_dim.z + block_idx.z; - if (Y_BLOCK) - offset = offset * grid_dim.y + block_idx.y; - if (X_BLOCK) - offset = offset * grid_dim.x + block_idx.x; - return offset; -} - -// Returns dim3 of each reduction block. -template -__host__ __device__ dim3 dimension_of_reduction_block(const _dim3& block_dim) { - return dim3{X_THREAD ? block_dim.x : 1, - Y_THREAD ? block_dim.y : 1, - Z_THREAD ? block_dim.z : 1}; -} - -// Returns the number of threads of each reduction block. -template -__host__ __device__ int size_of_reduction_block(const _dim3& block_dim) { - auto tmp_dim = dimension_of_reduction_block(block_dim); - return isize(tmp_dim); -} - -// Returns the linear offset of a thread in a reduction block. 
-template -__host__ __device__ int offset_in_reduction_block(const _dim3ti& thread_idx, - const _dim3bd& block_dim) { - int offset = 0; - if (Z_THREAD) - offset += thread_idx.z; - if (Y_THREAD) - offset = offset * block_dim.y + thread_idx.y; - if (X_THREAD) - offset = offset * block_dim.x + thread_idx.x; - return offset; -} - -/** Reduces all the reduction blocks in each reduction segment. - - This is only used by one thread block per reduction segment. The input - reduction blocks of the segment are stored in an intermediate buffer pointed - by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction - block is formed. - - The size of a reduction block is by definition smaller or equal to the size of - a thread block. We use the remaining threads to parallelize reductions across - reduction blocks. For example, when X/Y/Z_THREAD = {true, false, false}, we - use blockDim.y*blockDim.z threads for each output value. This is done first by - loading the input values in parallel and then by reducing across threads of - dimensions whose XYZ_THREAD are false. - - Note that what is done here after the loading from global memory is similar to - what the existing blockReduce function does. The main difference is that the - logical block to reduce is a 2D domain where the leading dimension is the size - of a reduction block and the second dimension is the remaining factor in each - thread block. For example, when X/Y/Z_THREAD = {false, true, false}, the - threads are arranged as (blockDim.y, blockDim.x*blockDim.z). We do not reduce - along the first dimension but only the second dimension. So, it is possible to - reuse the existing blockReduce with dim3{blockDim.y, blockDim.x*blockDim.z} - instead of blockDim and with X_THREAD and Y_THREAD being false and true, - respectively. Also, it still need to shuffle the final output values to their - actual corresponding threads. In the case of when X/Y/Z_THREAD = {false, true, - false}, after the intra-block reduction, the final results will still be held - by the first blockDim.y threads, which need to be transferred to threads at - threadIdx.x == 0 and threadIdx.z == 0. -*/ -template -__device__ void gridReduceLastBlock( - T& out, - const T *in, - const size_t in_size, - Func reduction_op, - T* shared_buf, - bool read_write_pred, - T init_val) { - - const int tid = ioffset(threadIdx, blockDim); - const int block_size = isize(blockDim); - const int rblock_size = size_of_reduction_block(blockDim); - - T inp = init_val; - if (tid < in_size) { - inp = in[tid]; - } - for (size_t i = tid + block_size; i < in_size; i += block_size) { - reduction_op(inp, in[i]); - } - - const auto should_write = (X_THREAD || threadIdx.x == 0) && - (Y_THREAD || threadIdx.y == 0) && - (Z_THREAD || threadIdx.z == 0); - - auto rem_size = block_size / rblock_size; - - if (rem_size > 1) { - const int rblock_offset = tid % rblock_size; - const int rblock_idx = tid / rblock_size; - blockReduce( - inp, inp, reduction_op, - dim3{(unsigned)rblock_offset, (unsigned)rblock_idx, 0}, - dim3{(unsigned)rblock_size, (unsigned)rem_size}, - shared_buf, true, init_val); - __syncthreads(); - if (tid < rblock_size) { - shared_buf[tid] = inp; - } - __syncthreads(); - if (should_write) { - inp = shared_buf[offset_in_reduction_block( - threadIdx, blockDim)]; - } - } - - if (should_write && read_write_pred) { - out = inp; - } -} - -/** Reduces per-thread values across thread blocks. 
- -Function parameters: -- out: Per-thread output location -- inp_val: Per-thread input value -- reduction_op: Scalar reduction function -- work_buf: Temporary buffer for cross-block reductions -- sync_flags: A vector of integers for synchronizations -- shared_buf: Shared memory buffer for intra-block reduction - -Return true when the thread block has the valid result. - -Template parameters: -- X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z - dimensions -- X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate in - the cross-block reduction. Otherwise, only threads at offset 0 do. -- T: Scalar data type of input/output data -- Func: Type of scalara reduction function - -Template parameters X/Y/Z_BLOCK define a group of thread blocks that are reduced together. We call -it a reduction segment. Some examples are: - -Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which includes all - thread blocks. It is effecively the same as the grid. -Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an individual - segment by itself. -Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread blocks that have - the same blockDim.x. There will be blockDim.y*blockDim.z such segments. - -X/Y/Z_THREAD defines a sub region of a thread block that should be reduced with -the sub regions of other thread blocks. We call it a reduction block. E.g., - -Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in the - cross-block reductions. The reduction block is 1x1x1 with thread 0. -Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block participate in - the cross-block reductions. The reduction block in this case is equivalent to - the thread block. - -After the function completes, only one thread block per reduction segment gets -valid reduction results. There is no guarantee which particular block gets the -final results. 
-*/ -template -__device__ bool gridReduce(T& out, T inp_val, Func reduction_op, - volatile T* work_buf, - Tensor sync_flags, - T* shared_buf, bool read_write_pred, T init_val) { - - // Number of values to reduce in the grid dimensions - const auto seg_size = - size_of_reduction_segment(gridDim); - - // Index of the reduction we're performing out of the seg_size - const auto seg_idx = - index_of_reduction_segment(blockIdx, gridDim); - - // Number of threads we can use in final reduction, Seems to assume all threads in the block participate - const auto rblock_size = - size_of_reduction_block(blockDim); - - // advance to the offset for this segment - // index of reduction * size of the reduction * size of threads - work_buf += seg_idx * seg_size * rblock_size; - - if ((X_THREAD || threadIdx.x == 0) && - (Y_THREAD || threadIdx.y == 0) && - (Z_THREAD || threadIdx.z == 0)) { - auto rblock_offset = - offset_in_reduction_segment(blockIdx, gridDim); - auto thread_offset = - offset_in_reduction_block(threadIdx, blockDim); - auto work_buf_offset = rblock_size * rblock_offset + thread_offset; - if(read_write_pred){ - work_buf[work_buf_offset] = inp_val; - } else { - work_buf[work_buf_offset] = init_val; - } - } - __syncthreads(); - - __shared__ bool last_block; - if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { - __threadfence(); - // printf("%ld\n", sync_flags[seg_idx]); - auto old = (int64_t) atomicAdd( (unsigned long long*) &sync_flags[seg_idx], 1); - last_block = old + 1 == seg_size; - // printf("Last_block = %d + 1 == %d\n", (int)old, (int)seg_size); - } - __syncthreads(); - - if (last_block) { - // printf("Last block %d %d %d %d\n", blockIdx.x, blockIdx.y, blockIdx.z); - // final reduction - gridReduceLastBlock( - out, (T*)work_buf, seg_size * rblock_size, - reduction_op, shared_buf, read_write_pred, init_val); - return true; - } else { - // printf("Not last block %d %d %d\n", blockIdx.x, blockIdx.y, blockIdx.z); - return false; - } -} -} // namespace reduction -)"; - -static auto code_template_block_broadcast = R"( -namespace broadcast { - -template -__host__ __device__ unsigned offset_of_source(const dim3& block_dim, const dim3& thread_idx) { - unsigned offset = 0; - if (!Z_THREAD) - offset = offset * block_dim.z + thread_idx.z; - if (!Y_THREAD) - offset = offset * block_dim.y + thread_idx.y; - if (!X_THREAD) - offset = offset * block_dim.x + thread_idx.x; - return offset; -} - -/** Broadcasts within partitioned groups of threads. - - X_THREAD: Broadcast from threadIdx.x == 0 if true - Y_THREAD: Broadcast from threadIdx.y == 0 if true - Z_THREAD: Broadcast from threadIdx.z == 0 if true - inp_val: Per-thread source value. Only valid when the thread is a source. 
- out: Per-thread output location - */ -template - __device__ void blockBroadcast(T& out, T inp_val, T* shared_mem) { - - const bool has_valid_data = - (!X_THREAD || threadIdx.x == 0) && - (!Y_THREAD || threadIdx.y == 0) && - (!Z_THREAD || threadIdx.z == 0); - - const auto shared_offset = offset_of_source(blockDim, threadIdx); - - if (has_valid_data) - shared_mem[shared_offset] = inp_val; - - __syncthreads(); - - out = shared_mem[shared_offset]; -} - -} // namespace broadcast -)"; - -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu new file mode 100644 index 000000000000..480a99efdc42 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu @@ -0,0 +1,104 @@ +// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x +// dimension of the block. If set to 0 it means that dimension doesn't +// participate, otherwise it is the number of threads. We could start with warp +// reductions, then reduce the warps, this could save some shared memory, but +// may actually be slower. +// +// EXAMPLE USAGE: +// blockReduceSum +// (output[output_index], inputs[input_index], +// [] __device__ (T& a, const T b) { a += b; }); +// +// Note: We agressively template functions taking dim3 in the functions below +// because ROCM uses different types for the various dim3 and maps them +// directly to intrinsics, but they're dim3 when used after modification. +// +template < + bool X_REDUCE, + bool Y_REDUCE, + bool Z_REDUCE, + typename T, + typename Func, + typename _dim3ti, + typename _dim3bd> +__device__ void blockReduce( + T& out, + const T inp_val, + Func reduction_op, + const _dim3ti& thread_idx, + const _dim3bd& block_dim, + T* shared_mem, + bool read_write_pred, + T init_val) { + unsigned int reduction_size = (X_REDUCE ? block_dim.x : 1) * + (Y_REDUCE ? block_dim.y : 1) * (Z_REDUCE ? block_dim.z : 1); + + // If this thread will output a final result + bool should_write = true; + + if (X_REDUCE) + should_write = should_write && thread_idx.x == 0; + if (Y_REDUCE) + should_write = should_write && thread_idx.y == 0; + if (Z_REDUCE) + should_write = should_write && thread_idx.z == 0; + + unsigned int reduction_stride; + unsigned int reduction_tid; + unsigned int linear_tid; + + if (X_REDUCE && !Y_REDUCE && Z_REDUCE) { + // Transpose Z and Y in the shared memory so Z and X dims are contiguous in + // smem + reduction_stride = 1; + linear_tid = threadIdx.y * blockDim.z * blockDim.x + + threadIdx.z * blockDim.x + threadIdx.x; + reduction_tid = threadIdx.z * blockDim.x + threadIdx.x; + } else { + // Normal reduction in order + reduction_stride = + (X_REDUCE ? 1 + : (Y_REDUCE ? block_dim.x + : (Z_REDUCE ? block_dim.x * block_dim.y : 0))); + + linear_tid = thread_idx.z * block_dim.y * block_dim.x + + thread_idx.y * block_dim.x + thread_idx.x; + + reduction_tid = (Z_REDUCE ? thread_idx.z : 0) * + (Y_REDUCE ? block_dim.y : 1) * (X_REDUCE ? block_dim.x : 1) + + (Y_REDUCE ? thread_idx.y : 0) * (X_REDUCE ? block_dim.x : 1) + + (X_REDUCE ? 
thread_idx.x : 0); + } + + assert(reduction_stride != 0); + + if (read_write_pred) { + shared_mem[linear_tid] = inp_val; + } else { + shared_mem[linear_tid] = init_val; + } + __syncthreads(); + // Reduce down to nearest power of 2: + int np2 = 1 << (31 - __clz(reduction_size)); + + if (reduction_tid < np2) { + if (reduction_tid + np2 < reduction_size) { + reduction_op( + shared_mem[linear_tid], + shared_mem[linear_tid + np2 * reduction_stride]); + } + } + __syncthreads(); + // for (int factor = np2/2; factor > contig_threads / 2; factor>>=1) { + for (int factor = np2 / 2; factor > 0; factor >>= 1) { + if (reduction_tid < factor) { + reduction_op( + shared_mem[linear_tid], + shared_mem[linear_tid + factor * reduction_stride]); + } + __syncthreads(); + } + + if (should_write && read_write_pred) + out = shared_mem[linear_tid]; +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/broadcast.cu b/torch/csrc/jit/codegen/cuda/runtime/broadcast.cu new file mode 100644 index 000000000000..9a13b021f101 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/broadcast.cu @@ -0,0 +1,41 @@ +namespace broadcast { + +template +__host__ __device__ unsigned offset_of_source( + const dim3& block_dim, + const dim3& thread_idx) { + unsigned offset = 0; + if (!Z_THREAD) + offset = offset * block_dim.z + thread_idx.z; + if (!Y_THREAD) + offset = offset * block_dim.y + thread_idx.y; + if (!X_THREAD) + offset = offset * block_dim.x + thread_idx.x; + return offset; +} + +// Broadcasts within partitioned groups of threads. +// +// X_THREAD: Broadcast from threadIdx.x == 0 if true +// Y_THREAD: Broadcast from threadIdx.y == 0 if true +// Z_THREAD: Broadcast from threadIdx.z == 0 if true +// inp_val: Per-thread source value. Only valid when the thread is a source. +// out: Per-thread output location +// +template +__device__ void blockBroadcast(T& out, T inp_val, T* shared_mem) { + const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) && + (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0); + + const auto shared_offset = + offset_of_source(blockDim, threadIdx); + + if (has_valid_data) + shared_mem[shared_offset] = inp_val; + + __syncthreads(); + + out = shared_mem[shared_offset]; +} + +} // namespace broadcast diff --git a/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu b/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu new file mode 100644 index 000000000000..ba236784ed74 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu @@ -0,0 +1,21 @@ +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) + +struct __align__(2) __half { + __host__ __device__ __half() {} + + protected: + unsigned short __x; +}; + +__device__ __half __float2half(const float f) { + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; +} + +__device__ float __half2float(const __half h) { + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); + return val; +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu new file mode 100644 index 000000000000..15e20413f672 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu @@ -0,0 +1,374 @@ +// Inter-block reduction. +// +// Function gridReduce performs point-wise reductions of scalars across thread +// blocks. 
Thread blocks are disjointly partitioned into groups of thread +// blocks, "reduction segments," that are collectively defined by boolean +// template parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK +// determines whether thread blocks along the dimension should be grouped into +// the same reduction segment. Cross-block reducitons are independently done +// within each segment and generates distinctive results per segment. For +// instance, if all of X/Y/Z_BLOCK are true, reductions will be done across all +// thread blocks since there will be just a single segment consisting of all +// thread blocks. If none of them are true, each thread block will become a +// segment by itself, so no reduction will be performed. +// +// The input scalars to reduce within each segment are a certain subset of +// thread-private scalars provided as part of the gridReduce function +// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD, +// determine which subset of the scalars should be used for inter-block +// reductions. Specifically, all the input scalars of threads along each +// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value +// held at offset 0 of each dimension will be used. Thus, for example, if all of +// X/Y/Z_THREAD are true, the scalars of all threads in each block will +// participate in inter-block reductions. If all of them are false, only one +// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will +// be used. In the code below, we call the subset of threads a "reduction +// block." +// +// Inter-block reductions perform point-wise reductions of scalars of reduction +// blocks within each reduction segment. More specifically, let rb be a +// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx) +// denote the input scalar of thread at thread_idx and block_idx. The result of +// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for +// each thread_idx in thread block block_idx_out in the segment as follows: +// +// OUT(thread_idx, block_idx_out) = +// Reduction of IN(thread_idx, block_idx) for +// all block_idx in a reduction segment +// +// OUT is not given for all threads that are not in block_idx_out and the +// reduction block. +// +// See also the function comment of gridReduce. + +namespace reduction { + +// Utility functions +template +__device__ __forceinline__ size_t size(const _dim3& d) { + return (size_t)d.x * (size_t)d.y * (size_t)d.z; +} + +#define isize(d) d.x* d.y* d.z + +template +__device__ __forceinline__ size_t +offset(const _dim3pos& pos, const _dim3dim& dim) { + return (size_t)pos.x + (size_t)pos.y * (size_t)dim.x + + (size_t)pos.z * (size_t)dim.x * (size_t)dim.y; +} + +#define ioffset(pos, dim) pos.x + pos.y* dim.x + pos.z* dim.x* dim.y + +// Returns dim3 of each reduction segment. +template +__device__ dim3 dimension_of_reduction_segment(const _dim3& grid_dim) { + return dim3{X_BLOCK ? grid_dim.x : 1, + Y_BLOCK ? grid_dim.y : 1, + Z_BLOCK ? grid_dim.z : 1}; +} + +// Returns the number of blocks in each reduction segment. +template +__device__ size_t size_of_reduction_segment(const _dim3& grid_dim) { + return size( + dimension_of_reduction_segment(grid_dim)); +} + +// Returns the total number of reduction segments. +template +__device__ size_t number_of_reduction_segments(const _dim3& grid_dim) { + return (X_BLOCK ? 1 : grid_dim.x) * (Y_BLOCK ? 1 : grid_dim.y) * + (Z_BLOCK ? 
1 : grid_dim.z); +} + +// Returns the 1-D index of the segment of thread block of block_idx. +template < + bool X_BLOCK, + bool Y_BLOCK, + bool Z_BLOCK, + typename _dim3bi, + typename _dim3gd> +__device__ size_t +index_of_reduction_segment(const _dim3bi& block_idx, const _dim3gd& grid_dim) { + size_t seg_idx = 0; + if (!Z_BLOCK) + seg_idx += block_idx.z; + if (!Y_BLOCK) + seg_idx = seg_idx * grid_dim.y + block_idx.y; + if (!X_BLOCK) + seg_idx = seg_idx * grid_dim.x + block_idx.x; + return seg_idx; +} + +// Returns the offset of thread block in its reduction segment. +template < + bool X_BLOCK, + bool Y_BLOCK, + bool Z_BLOCK, + typename _dim3bi, + typename _dim3gd> +__device__ size_t +offset_in_reduction_segment(const _dim3bi& block_idx, const _dim3gd& grid_dim) { + size_t offset = 0; + if (Z_BLOCK) + offset = offset * grid_dim.z + block_idx.z; + if (Y_BLOCK) + offset = offset * grid_dim.y + block_idx.y; + if (X_BLOCK) + offset = offset * grid_dim.x + block_idx.x; + return offset; +} + +// Returns dim3 of each reduction block. +template +__device__ dim3 dimension_of_reduction_block(const _dim3& block_dim) { + return dim3{X_THREAD ? block_dim.x : 1, + Y_THREAD ? block_dim.y : 1, + Z_THREAD ? block_dim.z : 1}; +} + +// Returns the number of threads of each reduction block. +template +__device__ int size_of_reduction_block(const _dim3& block_dim) { + auto tmp_dim = + dimension_of_reduction_block(block_dim); + return isize(tmp_dim); +} + +// Returns the linear offset of a thread in a reduction block. +template < + bool X_THREAD, + bool Y_THREAD, + bool Z_THREAD, + typename _dim3ti, + typename _dim3bd> +__device__ int offset_in_reduction_block( + const _dim3ti& thread_idx, + const _dim3bd& block_dim) { + int offset = 0; + if (Z_THREAD) + offset += thread_idx.z; + if (Y_THREAD) + offset = offset * block_dim.y + thread_idx.y; + if (X_THREAD) + offset = offset * block_dim.x + thread_idx.x; + return offset; +} + +// Reduces all the reduction blocks in each reduction segment. +// +// This is only used by one thread block per reduction segment. The input +// reduction blocks of the segment are stored in an intermediate buffer pointed +// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction +// block is formed. +// +// The size of a reduction block is by definition smaller or equal to the size +// of a thread block. We use the remaining threads to parallelize reductions +// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false, +// false}, we use blockDim.y*blockDim.z threads for each output value. This is +// done first by loading the input values in parallel and then by reducing +// across threads of dimensions whose XYZ_THREAD are false. +// +// Note that what is done here after the loading from global memory is similar +// to what the existing blockReduce function does. The main difference is that +// the logical block to reduce is a 2D domain where the leading dimension is the +// size of a reduction block and the second dimension is the remaining factor in +// each thread block. For example, when X/Y/Z_THREAD = {false, true, false}, the +// threads are arranged as (blockDim.y, blockDim.x*blockDim.z). We do not reduce +// along the first dimension but only the second dimension. So, it is possible +// to reuse the existing blockReduce with dim3{blockDim.y, +// blockDim.x*blockDim.z} instead of blockDim and with X_THREAD and Y_THREAD +// being false and true, respectively. 
Also, it still need to shuffle the final +// output values to their actual corresponding threads. In the case of when +// X/Y/Z_THREAD = {false, true, false}, after the intra-block reduction, the +// final results will still be held by the first blockDim.y threads, which need +// to be transferred to threads at threadIdx.x == 0 and threadIdx.z == 0. +template < + bool X_THREAD, + bool Y_THREAD, + bool Z_THREAD, + typename T, + typename Func> +__device__ void gridReduceLastBlock( + T& out, + const T* in, + const size_t in_size, + Func reduction_op, + T* shared_buf, + bool read_write_pred, + T init_val) { + const int tid = ioffset(threadIdx, blockDim); + const int block_size = isize(blockDim); + const int rblock_size = + size_of_reduction_block(blockDim); + + T inp = init_val; + if (tid < in_size) { + inp = in[tid]; + } + for (size_t i = tid + block_size; i < in_size; i += block_size) { + reduction_op(inp, in[i]); + } + + const auto should_write = (X_THREAD || threadIdx.x == 0) && + (Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0); + + auto rem_size = block_size / rblock_size; + + if (rem_size > 1) { + const int rblock_offset = tid % rblock_size; + const int rblock_idx = tid / rblock_size; + blockReduce( + inp, + inp, + reduction_op, + dim3{(unsigned)rblock_offset, (unsigned)rblock_idx, 0}, + dim3{(unsigned)rblock_size, (unsigned)rem_size}, + shared_buf, + true, + init_val); + __syncthreads(); + if (tid < rblock_size) { + shared_buf[tid] = inp; + } + __syncthreads(); + if (should_write) { + inp = shared_buf[offset_in_reduction_block( + threadIdx, blockDim)]; + } + } + + if (should_write && read_write_pred) { + out = inp; + } +} + +// Reduces per-thread values across thread blocks. +// +// Function parameters: +// - out: Per-thread output location +// - inp_val: Per-thread input value +// - reduction_op: Scalar reduction function +// - work_buf: Temporary buffer for cross-block reductions +// - sync_flags: A vector of integers for synchronizations +// - shared_buf: Shared memory buffer for intra-block reduction +// +// Return true when the thread block has the valid result. +// +// Template parameters: +// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z +// dimensions +// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate +// in the cross-block reduction. Otherwise, only threads at offset 0 do. +// - T: Scalar data type of input/output data +// - Func: Type of scalara reduction function +// +// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are +// reduced together. We call it a reduction segment. Some examples are: +// +// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which +// includes all thread blocks. It is effecively the same as the grid. +// +// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an +// individual segment by itself. +// +// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread +// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z +// such segments. +// +// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced +// with the sub regions of other thread blocks. We call it a reduction block. +// E.g., +// +// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in +// the cross-block reductions. The reduction block is 1x1x1 with thread 0. +// +// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block +// participate in the cross-block reductions. 
The reduction block in this case +// is equivalent to the thread block. +// +// After the function completes, only one thread block per reduction segment +// gets valid reduction results. There is no guarantee which particular block +// gets the final results. +// +template < + bool X_BLOCK, + bool Y_BLOCK, + bool Z_BLOCK, + bool X_THREAD, + bool Y_THREAD, + bool Z_THREAD, + typename T, + typename Func> +__device__ bool gridReduce( + T& out, + T inp_val, + Func reduction_op, + volatile T* work_buf, + Tensor sync_flags, + T* shared_buf, + bool read_write_pred, + T init_val) { + // Number of values to reduce in the grid dimensions + const auto seg_size = + size_of_reduction_segment(gridDim); + + // Index of the reduction we're performing out of the seg_size + const auto seg_idx = + index_of_reduction_segment(blockIdx, gridDim); + + // Number of threads we can use in final reduction, Seems to assume all + // threads in the block participate + const auto rblock_size = + size_of_reduction_block(blockDim); + + // advance to the offset for this segment + // index of reduction * size of the reduction * size of threads + work_buf += seg_idx * seg_size * rblock_size; + + if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) && + (Z_THREAD || threadIdx.z == 0)) { + auto rblock_offset = offset_in_reduction_segment( + blockIdx, gridDim); + auto thread_offset = + offset_in_reduction_block( + threadIdx, blockDim); + auto work_buf_offset = rblock_size * rblock_offset + thread_offset; + if (read_write_pred) { + work_buf[work_buf_offset] = inp_val; + } else { + work_buf[work_buf_offset] = init_val; + } + } + __syncthreads(); + + __shared__ bool last_block; + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { + __threadfence(); + // printf("%ld\n", sync_flags[seg_idx]); + auto old = (int64_t)atomicAdd((unsigned long long*)&sync_flags[seg_idx], 1); + last_block = old + 1 == seg_size; + // printf("Last_block = %d + 1 == %d\n", (int)old, (int)seg_size); + } + __syncthreads(); + + if (last_block) { + // printf("Last block %d %d %d %d\n", blockIdx.x, blockIdx.y, blockIdx.z); + // final reduction + gridReduceLastBlock( + out, + (T*)work_buf, + seg_size * rblock_size, + reduction_op, + shared_buf, + read_write_pred, + init_val); + return true; + } else { + // printf("Not last block %d %d %d\n", blockIdx.x, blockIdx.y, blockIdx.z); + return false; + } +} + +} // namespace reduction diff --git a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu new file mode 100644 index 000000000000..15b33b25634d --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu @@ -0,0 +1,47 @@ +__device__ constexpr int ceilDiv(int a, int b) { + return (a + b - 1) / b; +} + +__device__ constexpr int alignBufferSize(int buffer, int size) { + return (buffer + (size - 1)) & ~(size - 1); +} + +__device__ float clamp(float x, float minv, float maxv) { + return x < minv ? minv : (x > maxv ? maxv : x); +} + +__device__ float frac(float x) { + return x - truncf(x); +} + +__device__ float gelu(float x) { + return x * normcdf(x); +} + +__device__ float reciprocal(float x) { + return 1.f / x; +} + +__device__ float relu(float x) { + return x <= 0.f ? 0.f : x; +} + +__device__ float remainder(float a, float b) { + return a - b * floorf(a / b); +} + +__device__ float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +__device__ float threshold(float x, float t, float v) { + return x <= t ? v : x; +} + +__device__ float where(bool c, float a, float b) { + return c ? 
a : b; +} + +__device__ float randLike(Philox rnd) { + return uniform(rnd()); +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu b/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu new file mode 100644 index 000000000000..d690145e61bd --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu @@ -0,0 +1,104 @@ +class Philox { + public: + __device__ Philox( + unsigned long long seed, + unsigned long long subsequence, + unsigned long long offset) { + key.x = (unsigned int)seed; + key.y = (unsigned int)(seed >> 32); + counter = make_uint4(0, 0, 0, 0); + counter.z = (unsigned int)(subsequence); + counter.w = (unsigned int)(subsequence >> 32); + STATE = 0; + incr_n(offset / 4); + } + + __device__ unsigned long operator()() { + if (STATE == 0) { + uint4 counter_ = counter; + uint2 key_ = key; + for (int i = 0; i < 9; i++) { + counter_ = single_round(counter_, key_); + key_.x += (kPhilox10A); + key_.y += (kPhilox10B); + } + output = single_round(counter_, key_); + incr(); + } + unsigned long ret = 0; + switch (STATE) { + case 0: + ret = output.x; + break; + case 1: + ret = output.y; + break; + case 2: + ret = output.z; + break; + case 3: + ret = output.w; + break; + } + STATE = (STATE + 1) % 4; + return ret; + } + + private: + __device__ void incr_n(unsigned long long n) { + unsigned int nlo = (unsigned int)(n); + unsigned int nhi = (unsigned int)(n >> 32); + counter.x += nlo; + if (counter.x < nlo) + nhi++; + counter.y += nhi; + if (nhi <= counter.y) + return; + if (++counter.z) + return; + ++counter.w; + } + + __device__ void incr() { + if (++counter.x) + return; + if (++counter.y) + return; + if (++counter.z) + return; + ++counter.w; + } + + __device__ unsigned int mulhilo32( + unsigned int a, + unsigned int b, + unsigned int* result_high) { + *result_high = __umulhi(a, b); + return a * b; + } + + __device__ uint4 single_round(uint4 ctr, uint2 key) { + unsigned int hi0; + unsigned int hi1; + unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); + unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); + uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; + return ret; + } + + private: + static constexpr unsigned long kPhilox10A = 0x9E3779B9; + static constexpr unsigned long kPhilox10B = 0xBB67AE85; + static constexpr unsigned long kPhiloxSA = 0xD2511F53; + static constexpr unsigned long kPhiloxSB = 0xCD9E8D57; + + uint4 counter = {}; + uint4 output = {}; + uint2 key = {}; + unsigned int STATE = 0; +}; + +__device__ float uniform(unsigned int x) { + constexpr float kRanInvM32 = 2.3283064e-10f; // Inverse of 2^32. + return x * kRanInvM32; +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu new file mode 100644 index 000000000000..76731c8c4482 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu @@ -0,0 +1,26 @@ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef short int int16_t; +typedef long long int int64_t; + +template +struct Tensor { + __device__ T& operator[](int64_t ind) { + return data[ind]; + }; + + T* data; + int64_t size[N]; + int64_t stride[N]; +}; + +// Specialization for 0-dim case as it does not need size and stride arrays. +// They will be an error as well since zero-length arrays are not allowed. 
+template +struct Tensor { + __device__ T& operator[](int64_t) { + return *data; + }; + + T* data; +}; diff --git a/torch/csrc/jit/codegen/cuda/tools/stringify_file.py b/torch/csrc/jit/codegen/cuda/tools/stringify_file.py new file mode 100644 index 000000000000..9f4e74e9c1e6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/tools/stringify_file.py @@ -0,0 +1,29 @@ + +# Generates a C++ header files embedding the original input as a string literal + +import argparse +import pathlib +from datetime import datetime + +arg_parser = argparse.ArgumentParser( + description='Converts source files to C++ string literals', allow_abbrev=False) + +arg_parser.add_argument('-i', '--input', required=True, + help='Input source file') + +arg_parser.add_argument('-o', '--output', required=True, + help='Name of the generated header file') + +args = arg_parser.parse_args() + +with open(args.input, 'r') as fin: + with open(args.output, 'w') as fout: + literal_name = f'{pathlib.Path(args.input).stem}_cu' + fout.write(f'// Generated from "{args.input}"\n') + fout.write(f'// {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n') + fout.write('namespace nvfuser_resources {\n\n') + fout.write(f'constexpr const char* {literal_name} = R"(\n') + for line in fin: + fout.write(line) + fout.write(')";\n') + fout.write('\n} // namespace nvfuser_resources\n') From 1112773cf533a2ab6246fbb4d9a0ee0d0d53d9ea Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 2 Dec 2020 21:40:37 -0800 Subject: [PATCH 005/132] Fix unintended error when worker force kill happens #43455 (#43462) Summary: Fixes https://github.com/pytorch/pytorch/issues/43455 Pull Request resolved: https://github.com/pytorch/pytorch/pull/43462 Reviewed By: bdhirsh Differential Revision: D25277759 Pulled By: VitalyFedyunin fbshipit-source-id: 0bb0d87374c0403853d71aac2c242374bfc7acf2 --- torch/utils/data/dataloader.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index d1025c02cc9b..1eb60c81f7d0 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -1292,13 +1292,10 @@ def _shutdown_workers(self): if self._persistent_workers or self._workers_status[worker_id]: self._mark_worker_as_unavailable(worker_id, shutdown=True) for w in self._workers: + # We should be able to join here, but in case anything went + # wrong, we set a timeout and if the workers fail to join, + # they are killed in the `finally` block. w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) - if w.is_alive(): - # Existing mechanisms try to make the workers exit - # peacefully, but in case that we unfortunately reach - # here, which we shouldn't, (e.g., pytorch/pytorch#39570), - # we kill the worker. - w.terminate() for q in self._index_queues: q.cancel_join_thread() q.close() @@ -1316,6 +1313,13 @@ def _shutdown_workers(self): if self._worker_pids_set: _utils.signal_handling._remove_worker_pids(id(self)) self._worker_pids_set = False + for w in self._workers: + if w.is_alive(): + # Existing mechanisms try to make the workers exit + # peacefully, but in case that we unfortunately reach + # here, which we shouldn't, (e.g., pytorch/pytorch#39570), + # we kill the worker. 
+ w.terminate() def __del__(self): self._shutdown_workers() From 79b9c034656652cad838a15094536fe38344ab2c Mon Sep 17 00:00:00 2001 From: Edvard Ghazaryan Date: Wed, 2 Dec 2020 22:49:41 -0800 Subject: [PATCH 006/132] Optimize torch zeros (#45636) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45636 After creating empty tensor 'memset' used to zero out items of tensor Test Plan: pytorch benchmark tool results: timer = benchmark_utils.Timer(stmt="torch.zeros((1024, 4096))") Before: 1007 us After: 841.26 us 1 measurement, 10000 runs , 1 thread timer = benchmark_utils.Timer(stmt="torch.zeros((128))") Before: 4 - 7.6 us After: 2.4 - 2.8 us 1 measurement, 10000 runs , 1 thread torch.int8 | 1 | 4096 | 8192 | 16384 | 32768 | 1 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 500 | 600 | 700 | 2000 | (Reference) x.zero_() | 800 | 1000 | 1000 | 2000 | 2000 | 2 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 500 | 600 | 700 | 2000 | (Reference) x.zero_() | 800 | 1000 | 1000 | 2000 | 3000 | 4 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 500 | 600 | 700 | 2000 | (Reference) x.zero_() | 800 | 1000 | 1000 | 2000 | 3000 | torch.int32 | 1 | 4096 | 8192 | 16384 | 32768 | 1 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 400 | 700 | 2000 | 2900 | 5500 | (Reference) x.zero_() | 800 | 2000 | 3000 | 4400 | 7300 | 2 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 700 | 2000 | 3000 | 5600 | (Reference) x.zero_() | 900 | 2000 | 2000 | 3600 | 7200 | 4 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 400 | 700 | 2000 | 3000 | 5700 | (Reference) x.zero_() | 800 | 2000 | 3100 | 4300 | 9000 | torch.float16 | 1 | 4096 | 8192 | 16384 | 32768 | 1 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 500 | 700 | 2000 | 3000 | (Reference) x.zero_() | 800 | 1000 | 2000 | 2000 | 3300 | 2 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 600 | 700 | 2000 | 3000 | (Reference) x.zero_() | 800 | 1000 | 2000 | 2000 | 4300 | 4 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 600 | 700 | 2000 | 3300 | (Reference) x.zero_() | 900 | 1000 | 2000 | 2000 | 4400 | torch.float32 | 1 | 4096 | 8192 | 16384 | 32768 | 1 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 700 | 2000 | 3200 | 6100 | (Reference) x.zero_() | 800 | 2000 | 2000 | 3500 | 6100 | 2 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 700 | 2000 | 3100 | 5600 | (Reference) x.zero_() | 800 | 2000 | 2000 | 3300 | 7000 | 4 threads: -------------------------------------------------------------- (PR #45636) x.zero_() | 500 | 700 | 2000 | 3000 | 5600 | (Reference) x.zero_() | 900 | 2000 | 2000 | 3600 | 7500 | Reviewed By: ngimel Differential Revision: D23925113 fbshipit-source-id: 04e97ff6d67c52a8e7a21449113e1a0a7443098f --- aten/src/ATen/native/Fill.cpp | 18 ++++++++++++++++++ aten/src/ATen/test/basic.cpp | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index b466ca26fc0c..fbafe2577f93 100644 --- 
a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -91,7 +91,25 @@ Tensor& fill_diagonal_(Tensor& self, Scalar fill_value, bool wrap) { return self; } +Tensor& zero_cpu_(Tensor &self, int64_t nelements) { + void* ptr = self.data_ptr(); + if (nullptr == ptr) { + return self.fill_(0); + } + int64_t size_bytes = nelements * self.dtype().itemsize(); + if (size_bytes > 0) { + std::memset(ptr, 0, size_bytes); + } + return self; +} + Tensor& zero_(Tensor &self) { + int64_t nelements = at::prod_intlist(self.sizes()); + if (self.device() == at::kCPU && + self.is_non_overlapping_and_dense() && + nelements < internal::GRAIN_SIZE) { + return zero_cpu_(self, nelements); + } return self.fill_(0); } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index a81a9a06cea6..1055eec9833a 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -80,6 +80,23 @@ void TestAdd(DeprecatedTypeProperties& type) { } } +void TestZeros(DeprecatedTypeProperties& type) { + auto begin = std::chrono::high_resolution_clock::now(); + Tensor a = zeros({1024, 1024}, type); + for (int i = 1; i < 1000; ++i) { + a = zeros({128, 128}, type); + } + auto end = std::chrono::high_resolution_clock::now(); + std::cout << std::dec << " " + << std::chrono::duration_cast( + end - begin) + .count() + << " ms" << std::endl; + + std::srand(std::time(nullptr)); + ASSERT_EQ(norm(a).item(), 0.0); +} + void TestLoadsOfAdds(DeprecatedTypeProperties& type) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); @@ -309,6 +326,7 @@ void test(DeprecatedTypeProperties& type) { TestSort(type); TestRandperm(type); TestAdd(type); + TestZeros(type); TestLoadsOfAdds(type); TestLoadOfAddsWithCopy(type); TestIsContiguous(type); From c7746adbc6e6ace9d4c2b54e32c8d36a7b7b0e31 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 2 Dec 2020 23:36:18 -0800 Subject: [PATCH 007/132] Revert D24874754: [pytorch][PR] Add test for empty tensors for batch matmuls Test Plan: revert-hammer Differential Revision: D24874754 (https://github.com/pytorch/pytorch/commit/5f105e2aa6d021029811c7085512a3b1bba3f578) Original commit changeset: 41ba837740ff fbshipit-source-id: d6cb31cbc4a2a386aab0a5f24710f218f9a561ca --- aten/src/ATen/native/LinearAlgebra.cpp | 27 +++------ aten/src/ATen/native/cuda/LinearAlgebra.cu | 65 ++++++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 9 ++- test/test_linalg.py | 40 +------------ 4 files changed, 81 insertions(+), 60 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 39eb6faf23d4..afd4ec15d25f 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -442,7 +442,7 @@ static void addmm_impl_cpu_( } } -static void addbmm_impl_( +static void addbmm_impl_cpu_( Tensor &result, const Tensor &self, const Tensor &batch1, const Tensor &batch2, Scalar beta, Scalar alpha) { TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -467,38 +467,29 @@ static void addbmm_impl_( const int64_t num_batches = batch1.size(0); - if (num_batches == 0) { - if (beta.to>() != 0.0) { - result.mul_(beta); - } else { - result.zero_(); - } - return; - } - for (int64_t batch = 0; batch < num_batches; ++batch) { - result.addmm_(batch1[batch], batch2[batch], beta, alpha); + addmm_impl_cpu_(result, result, batch1[batch], batch2[batch], beta, alpha); beta = 1; // accumulate output once } } -Tensor& 
addbmm_out(Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { +Tensor& addbmm_cpu_out(Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { Tensor b_self = std::get<0>(expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm_out")); { at::NoNamesGuard guard; - addbmm_impl_(result, b_self, batch1, batch2, beta, alpha); + addbmm_impl_cpu_(result, b_self, batch1, batch2, beta, alpha); } at::namedinference::propagate_names_for_addmm(result, batch1, batch2, self); return result; } -Tensor &addbmm_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { - return native::addbmm_out(self, self, batch1, batch2, beta, alpha); +Tensor &addbmm_cpu_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return addbmm_cpu_out(self, self, batch1, batch2, beta, alpha); } -Tensor addbmm(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { +Tensor addbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { Tensor result = at::empty({0}, self.options()); - return native::addbmm_out(result, self, batch1, batch2, beta, alpha); + return addbmm_cpu_out(result, self, batch1, batch2, beta, alpha); } Tensor& addmm_cpu_out(Tensor &result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { @@ -617,7 +608,7 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& if (self_or_result.numel() == 0) { return self_or_result; } else if (contraction_size == 0) { - if (is_bmm_out || (beta.to>() == 0.0)) { + if (is_bmm_out) { return self_or_result.zero_(); } else { return self_or_result.mul_(beta); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index 57ee8e0be738..e155f9d367bc 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -313,6 +313,71 @@ Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) { return native::bmm_out_cuda(result, self, mat2); } +Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self, + const Tensor& batch1, const Tensor& batch2, + Scalar beta, Scalar alpha) { + TORCH_CHECK(batch1.dim() == 3 && batch2.dim() == 3, + "Batch tensors should be 3D, got dimensions ", batch1.dim(), + " and ", batch2.dim()); + + Tensor self_; + if (&out != &self) { + std::tie(self_) = expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm"); + } else { + self_ = self; + } + + TORCH_CHECK(out.device() == self_.device() && + out.device() == batch1.device() && + out.device() == batch2.device(), + "Expected all tensors to be on the same device. 
Found: ", + out.device(), ", ", self_.device(), ", ", + batch1.device(), " and ", batch2.device()); + TORCH_CHECK(self_.dim() == 2, + "2D tensor expected, got ", self_.dim(), "D tensor for input"); + int64_t batchnum = batch1.size(0); + int64_t m1d1 = batch1.size(1); + int64_t innerdim = batch1.size(2); + int64_t m2d2 = batch2.size(2); + TORCH_CHECK(batchnum == batch2.size(0), + "equal number of batches expected"); + TORCH_CHECK(m1d1 == self_.size(0), + "first dimension of batch1 must match first dimension of input"); + TORCH_CHECK(m2d2 == self_.size(1), + "second dimension of batch2 must match second dimension of input"); + TORCH_CHECK(innerdim == batch2.size(1), + "second dimension of batch1 must match first dimension of batch2"); + + if (&out != &self) { + at::native::resize_as_(out, self_); + if (beta.to>() != 0.0) { + at::native::copy_(out, self_); + } + } + + for (int64_t i=0; i Tensor(a!) dispatch: - CPU, CUDA: addbmm_out + CPU: addbmm_cpu_out + CUDA: addbmm_out_cuda - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU, CUDA: addbmm + CPU: addbmm_cpu + CUDA: addbmm_cuda - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) use_c10_dispatcher: full diff --git a/test/test_linalg.py b/test/test_linalg.py index f1d521bb30be..71c3cf654c1b 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -3606,7 +3606,7 @@ def test_strided_mm_bmm(self, device, dtype): torch_fn = lambda x: torch.mm(x, x) # noqa: E731 self.compare_with_numpy(torch_fn, np_fn, sx[0]) - @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) + @precisionOverride({torch.half: 0.005, torch.bfloat16: 0.05}) @skipCUDAIf(torch.version.cuda == "10.1", "flaky on CUDA 10.1") @onlyOnCPUAndCUDA @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes()) @@ -3632,31 +3632,18 @@ def invert_perm(p): return (d[0], d[1], d[2]) def generate_inputs(): - # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 - # broadcasting tensors for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) yield b1, b2 - # zero-sized tensors - bug = (self.device_type == 'cuda' and dtype == torch.half and torch.version.cuda is not None and - float(torch.version.cuda) < 11) - if bug: - return - for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): - shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) - shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = torch.randn(shape1, dtype=dtype, device=device) - b2 = torch.randn(shape2, dtype=dtype, device=device) - yield b1, b2 for (b1, b2), perm3 in itertools.product(generate_inputs(), itertools.permutations((0, 1, 2))): res1 = torch.bmm(b1, b2) @@ -3837,17 +3824,6 @@ def generate_tensor(): ).to(device=device, dtype=dtype).sum(0) 
out_tensor = torch.zeros_like(ref) yield b1, b2, ref, out_tensor - # zero-sized tensors - for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): - shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) - shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1) - ref = torch.from_numpy( - b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() - ).to(device=device, dtype=dtype).sum(0) - out_tensor = torch.zeros_like(ref) - yield b1, b2, ref, out_tensor for b1, b2, ref, out_tensor in generate_tensor(): self._test_addbmm_baddbmm("addbmm", b1, b2, ref, out_tensor) @@ -3899,20 +3875,6 @@ def generate_tensor(): b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) out_tensor = torch.zeros_like(ref) yield b1, b2, ref, out_tensor - # zero-sized tensors - bug = (self.device_type == 'cuda' and dtype == torch.half and torch.version.cuda is not None and - float(torch.version.cuda) < 11) - if bug: - return - for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): - shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) - shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = make_tensor(shape1, device, dtype, low=-2, high=2) - b2 = make_tensor(shape2, device, dtype, low=-2, high=2) - ref = torch.from_numpy( - b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) - out_tensor = torch.zeros_like(ref) - yield b1, b2, ref, out_tensor for b1, b2, ref, out_tensor in generate_tensor(): self._test_addbmm_baddbmm("baddbmm", b1, b2, ref, out_tensor) From 313e77fc06aded10ea07f9807f8736ec033cb574 Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Thu, 3 Dec 2020 02:40:23 -0800 Subject: [PATCH 008/132] Add broadcast_shapes() function and use it in MultivariateNormal (#43935) Summary: Fixes https://github.com/pytorch/pytorch/issues/43837 This adds a `torch.broadcast_shapes()` function similar to Pyro's [broadcast_shape()](https://github.com/pyro-ppl/pyro/blob/7c2c22c10dffda8a33ffbd593cc8d58819959e40/pyro/distributions/util.py#L151) and JAX's [lax.broadcast_shapes()](https://jax.readthedocs.io/en/test-docs/_modules/jax/lax/lax.html). This helper is useful e.g. in multivariate distributions that are parameterized by multiple tensors and we want to `torch.broadcast_tensors()` but the parameter tensors have different "event shape" (e.g. mean vectors and covariance matrices). This helper is already heavily used in Pyro's distribution codebase, and we would like to start using it in `torch.distributions`. 
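As a quick illustration of the intended use (a minimal sketch; the shapes below are made up for the example and are not taken from this PR):

```python
import torch

# Hypothetical parameters: a batch of mean vectors and a batch of covariance
# matrices whose batch dimensions need to broadcast against each other.
loc = torch.randn(5, 1, 3)          # batch shape (5, 1), event shape (3,)
cov = torch.eye(3).expand(4, 3, 3)  # batch shape (4,),  event shape (3, 3)

# Broadcast only the batch portions of the shapes, without materializing
# intermediate broadcasted tensors.
batch_shape = torch.broadcast_shapes(loc.shape[:-1], cov.shape[:-2])
print(batch_shape)  # torch.Size([5, 4])

# Expand each parameter to the common batch shape, keeping its event shape.
loc = loc.expand(batch_shape + (-1,))
cov = cov.expand(batch_shape + (-1, -1))
```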
- [x] refactor `MultivariateNormal`'s expansion logic to use `torch.broadcast_shapes()` - [x] add unit tests for `torch.broadcast_shapes()` - [x] add docs cc neerajprad Pull Request resolved: https://github.com/pytorch/pytorch/pull/43935 Reviewed By: bdhirsh Differential Revision: D25275213 Pulled By: neerajprad fbshipit-source-id: 1011fdd597d0a7a4ef744ebc359bbb3c3be2aadc --- docs/source/torch.rst | 1 + test/test_view_ops.py | 17 +++++++++++ torch/distributions/multivariate_normal.py | 14 +++++---- torch/functional.py | 34 ++++++++++++++++++++++ torch/overrides.py | 1 + 5 files changed, 61 insertions(+), 6 deletions(-) diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 4399e63c3b01..b16f14de9bf6 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -447,6 +447,7 @@ Other Operations bincount block_diag broadcast_tensors + broadcast_shapes bucketize cartesian_prod cdist diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 6722a55588ed..d4e59a3dbf23 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -1021,6 +1021,23 @@ def test_broadcast_tensors(self, device, dtype): self.assertTrue(y1.size() == expected_size) self.assertTrue(y2.size() == expected_size) + + @onlyCPU + def test_broadcast_shapes(self, device): + examples = [(), (1,), (2,), (1, 1), (3, 1), (3, 2), (4, 1, 1), (4, 3, 2)] + for s0 in examples: + x0 = torch.randn(s0) + expected = torch.broadcast_tensors(x0)[0].shape + actual = torch.broadcast_shapes(s0) + self.assertEqual(expected, actual) + + for s1 in examples: + x1 = torch.randn(s1) + expected = torch.broadcast_tensors(x0, x1)[0].shape + actual = torch.broadcast_shapes(s0, s1) + self.assertEqual(expected, actual) + + def test_view(self, device): tensor = torch.rand(15, device=device) template = torch.rand(3, 5, device=device) diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index de997f49a94f..4845d4742dfc 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -122,25 +122,27 @@ def __init__(self, loc, covariance_matrix=None, precision_matrix=None, scale_tri if (covariance_matrix is not None) + (scale_tril is not None) + (precision_matrix is not None) != 1: raise ValueError("Exactly one of covariance_matrix or precision_matrix or scale_tril may be specified.") - loc_ = loc.unsqueeze(-1) # temporarily add dim on right if scale_tril is not None: if scale_tril.dim() < 2: raise ValueError("scale_tril matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self.scale_tril, loc_ = torch.broadcast_tensors(scale_tril, loc_) + batch_shape = torch.broadcast_shapes(scale_tril.shape[:-2], loc.shape[:-1]) + self.scale_tril = scale_tril.expand(batch_shape + (-1, -1)) elif covariance_matrix is not None: if covariance_matrix.dim() < 2: raise ValueError("covariance_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self.covariance_matrix, loc_ = torch.broadcast_tensors(covariance_matrix, loc_) + batch_shape = torch.broadcast_shapes(covariance_matrix.shape[:-2], loc.shape[:-1]) + self.covariance_matrix = covariance_matrix.expand(batch_shape + (-1, -1)) else: if precision_matrix.dim() < 2: raise ValueError("precision_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self.precision_matrix, loc_ = torch.broadcast_tensors(precision_matrix, loc_) - self.loc = loc_[..., 0] # drop rightmost dim + batch_shape = 
torch.broadcast_shapes(precision_matrix.shape[:-2], loc.shape[:-1]) + self.precision_matrix = precision_matrix.expand(batch_shape + (-1, -1)) + self.loc = loc.expand(batch_shape + (-1,)) - batch_shape, event_shape = self.loc.shape[:-1], self.loc.shape[-1:] + event_shape = self.loc.shape[-1:] super(MultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) if scale_tril is not None: diff --git a/torch/functional.py b/torch/functional.py index 29af0b662ccd..62076a9dc29a 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -19,6 +19,7 @@ 'atleast_2d', 'atleast_3d', 'align_tensors', + 'broadcast_shapes', 'broadcast_tensors', 'cartesian_prod', 'block_diag', @@ -72,6 +73,39 @@ def broadcast_tensors(*tensors): return _VF.broadcast_tensors(tensors) # type: ignore +def broadcast_shapes(*shapes): + r"""broadcast_shapes(*shapes) -> Size + + Similar to :func:`broadcast_tensors` but for shapes. + + This is equivalent to + ``torch.broadcast_tensors(*map(torch.empty, shapes))[0].shape`` + but avoids the need create to intermediate tensors. This is useful for + broadcasting tensors of common batch shape but different rightmost shape, + e.g. to broadcast mean vectors with covariance matrices. + + Example:: + + >>> torch.broadcast_shapes((2,), (3, 1), (1, 1, 1)) + torch.Size([1, 3, 2]) + + Args: + \*shapes (torch.Size): Shapes of tensors. + + Returns: + shape (torch.Size): A shape compatible with all input shapes. + + Raises: + RuntimeError: If shapes are incompatible. + """ + # TODO Movie this to C++ once the jit has better support for torch.Size. + with torch.no_grad(): + scalar = torch.zeros((), device="cpu") + tensors = [scalar.expand(shape) for shape in shapes] + tensors = broadcast_tensors(*tensors) + return tensors[0].shape + + def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. diff --git a/torch/overrides.py b/torch/overrides.py index 36ae037ed557..f6a49376ab53 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -119,6 +119,7 @@ def get_ignored_functions() -> Set[Callable]: torch.as_strided, torch.bartlett_window, torch.blackman_window, + torch.broadcast_shapes, torch.can_cast, torch.cudnn_affine_grid_generator, torch.cudnn_batch_norm, From 5489a98cd381fd6ed564cdf186bbc7d0ca9b8587 Mon Sep 17 00:00:00 2001 From: neerajprad Date: Thu, 3 Dec 2020 03:19:50 -0800 Subject: [PATCH 009/132] Add support for CorrCholeskyTransform (#48041) Summary: This adds a transform to convert a real vector of (D * (D-1))/2 dimension into the cholesky factor of a D x D correlation matrix. This follows the implementation in [NumPyro](https://github.com/pyro-ppl/numpyro/blob/master/numpyro/distributions/transforms.py) by fehiepsi. This is needed for the LKJDistribution which will be added in a subsequent PR. Also in line with the ongoing effort to refactor distributions test, this moves the transforms test into its own file that uses pytest with parametrized fixtures. For review: fehiepsi - could you help review the math? fritzo - do you have any suggestions for what to do about the event dimension (more details are in the comment below)? ezyang - could you review the changes in `run_test.py`? Instead of a separate `PYTEST_TESTS`, I have clubbed these tests in `USE_PYTEST_LIST` to avoid duplicate logic. The only difference is that we do not anymore check if pytest is not installed and exclude the tests in the list. I figured that if existing tests are already using pytest, this should not matter. 
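For reviewers, a rough usage sketch of the new transform (not part of the PR's tests; D = 4 here, so the unconstrained vector has D * (D - 1) / 2 = 6 entries):

```python
import torch
from torch.distributions.transforms import CorrCholeskyTransform

t = CorrCholeskyTransform()

x = torch.randn(6)   # unconstrained vector for D = 4
L = t(x)             # 4 x 4 lower-triangular Cholesky factor

# L @ L.T is a correlation matrix: positive semi-definite with unit diagonal.
corr = L @ L.t()
print(corr.diagonal())  # should be all ones (up to numerical error)

# The transform is bijective, so the inverse should recover x
# (again up to numerical error).
print(t.inv(L))

# log|det J| is reduced over the event dims, so a single vector gives a scalar.
print(t.log_abs_det_jacobian(x, L).shape)  # torch.Size([])
```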
TODOs (probably not all can be satisfied at the same time): - [x] Use operations that are JIT friendly, i.e. the transform works with different sized input under JIT. - [x] Resolve test failures - currently `arange(scalar_tensor)` fails on certain backends but this is needed for JIT. Maybe we should only support same sized tensor under JIT? - [x] Add tests to check that the transform gives correct gradients and is in agreement with the `log_det_jacobian`. - [x] Add `input_event_dim` and `output_event_dim` to `CorrCholeskyTransform`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48041 Reviewed By: zhangguanheng66 Differential Revision: D25262505 Pulled By: neerajprad fbshipit-source-id: 5a57e1c19d8230b53592437590b9169bdf2f71e9 --- test/distributions/test_constraints.py | 15 +- test/distributions/test_distributions.py | 322 +----------------- test/distributions/test_transforms.py | 365 +++++++++++++++++++++ test/distributions/test_utils.py | 24 ++ test/run_test.py | 12 +- torch/distributions/constraint_registry.py | 6 + torch/distributions/constraints.py | 14 + torch/distributions/transforms.py | 90 ++++- torch/distributions/utils.py | 33 ++ 9 files changed, 548 insertions(+), 333 deletions(-) create mode 100644 test/distributions/test_transforms.py create mode 100644 test/distributions/test_utils.py diff --git a/test/distributions/test_constraints.py b/test/distributions/test_constraints.py index b4f75fb58de8..d4dd9239920d 100644 --- a/test/distributions/test_constraints.py +++ b/test/distributions/test_constraints.py @@ -27,6 +27,7 @@ (constraints.half_open_interval, -2, -1), (constraints.half_open_interval, 1, 2), (constraints.simplex,), + (constraints.corr_cholesky,), (constraints.lower_cholesky,), ] @@ -49,7 +50,11 @@ def test_biject_to(constraint_fn, args, is_cuda): except NotImplementedError: pytest.skip('`biject_to` not implemented.') assert t.bijective, "biject_to({}) is not bijective".format(constraint) - x = torch.randn(5, 5, dtype=torch.double) + if constraint_fn is constraints.corr_cholesky: + # (D * (D-1)) / 2 (where D = 4) = 6 (size of last dim) + x = torch.randn(6, 6, dtype=torch.double) + else: + x = torch.randn(5, 5, dtype=torch.double) if is_cuda: x = x.cuda() y = t(x) @@ -62,7 +67,7 @@ def test_biject_to(constraint_fn, args, is_cuda): assert torch.allclose(x, x2), "Error in biject_to({}) inverse".format(constraint) j = t.log_abs_det_jacobian(x, y) - assert j.shape == x.shape[:x.dim() - t.event_dim] + assert j.shape == x.shape[:x.dim() - t.input_event_dim] @pytest.mark.parametrize('constraint_fn, args', [(c[0], c[1:]) for c in CONSTRAINTS]) @@ -72,7 +77,11 @@ def test_biject_to(constraint_fn, args, is_cuda): def test_transform_to(constraint_fn, args, is_cuda): constraint = build_constraint(constraint_fn, args, is_cuda=is_cuda) t = transform_to(constraint) - x = torch.randn(5, 5, dtype=torch.double) + if constraint_fn is constraints.corr_cholesky: + # (D * (D-1)) / 2 (where D = 4) = 6 (size of last dim) + x = torch.randn(6, 6, dtype=torch.double) + else: + x = torch.randn(5, 5, dtype=torch.double) if is_cuda: x = x.cuda() y = t(x) diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b4b8b6e81462..abba69eb472f 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -56,13 +56,8 @@ from torch.distributions.constraints import Constraint, is_dependent from torch.distributions.dirichlet import _Dirichlet_backward from torch.distributions.kl import 
_kl_expfamily_expfamily -from torch.distributions.transforms import (AbsTransform, AffineTransform, - CatTransform, ComposeTransform, ExpTransform, - LowerCholeskyTransform, - PowerTransform, SigmoidTransform, - TanhTransform, SoftmaxTransform, - StickBreakingTransform, - identity_transform, StackTransform) +from torch.distributions.transforms import (AffineTransform, CatTransform, ExpTransform, + StackTransform, identity_transform) from torch.distributions.utils import probs_to_logits, lazy_property from torch.nn.functional import softmax @@ -4300,319 +4295,6 @@ def test_icdf(self): self.assertEqual(icdf, scipy_dist.ppf(samples), msg=pytorch_dist) -class TestTransforms(TestCase): - def setUp(self): - super(TestTransforms, self).setUp() - self.transforms = [] - transforms_by_cache_size = {} - for cache_size in [0, 1]: - transforms = [ - AbsTransform(cache_size=cache_size), - ExpTransform(cache_size=cache_size), - PowerTransform(exponent=2, - cache_size=cache_size), - PowerTransform(exponent=torch.tensor(5.).normal_(), - cache_size=cache_size), - SigmoidTransform(cache_size=cache_size), - TanhTransform(cache_size=cache_size), - AffineTransform(0, 1, cache_size=cache_size), - AffineTransform(1, -2, cache_size=cache_size), - AffineTransform(torch.randn(5), - torch.randn(5), - cache_size=cache_size), - AffineTransform(torch.randn(4, 5), - torch.randn(4, 5), - cache_size=cache_size), - SoftmaxTransform(cache_size=cache_size), - StickBreakingTransform(cache_size=cache_size), - LowerCholeskyTransform(cache_size=cache_size), - ComposeTransform([ - AffineTransform(torch.randn(4, 5), - torch.randn(4, 5), - cache_size=cache_size), - ]), - ComposeTransform([ - AffineTransform(torch.randn(4, 5), - torch.randn(4, 5), - cache_size=cache_size), - ExpTransform(cache_size=cache_size), - ]), - ComposeTransform([ - AffineTransform(0, 1, cache_size=cache_size), - AffineTransform(torch.randn(4, 5), - torch.randn(4, 5), - cache_size=cache_size), - AffineTransform(1, -2, cache_size=cache_size), - AffineTransform(torch.randn(4, 5), - torch.randn(4, 5), - cache_size=cache_size), - ]), - ] - for t in transforms[:]: - transforms.append(t.inv) - transforms.append(identity_transform) - self.transforms += transforms - if cache_size == 0: - self.unique_transforms = transforms[:] - - def _generate_data(self, transform): - domain = transform.domain - codomain = transform.codomain - x = torch.empty(4, 5) - if domain is constraints.lower_cholesky or codomain is constraints.lower_cholesky: - x = torch.empty(6, 6) - x = x.normal_() - return x - elif domain is constraints.real: - return x.normal_() - elif domain is constraints.positive: - return x.normal_().exp() - elif domain is constraints.unit_interval: - return x.uniform_() - elif isinstance(domain, constraints.interval): - x = x.uniform_() - x = x.mul_(domain.upper_bound - domain.lower_bound).add_(domain.lower_bound) - return x - elif domain is constraints.simplex: - x = x.normal_().exp() - x /= x.sum(-1, True) - return x - raise ValueError('Unsupported domain: {}'.format(domain)) - - def test_inv_inv(self): - for t in self.transforms: - self.assertTrue(t.inv.inv is t) - - def test_equality(self): - transforms = self.unique_transforms - for x, y in product(transforms, transforms): - if x is y: - self.assertTrue(x == y) - self.assertFalse(x != y) - else: - self.assertFalse(x == y) - self.assertTrue(x != y) - - self.assertTrue(identity_transform == identity_transform.inv) - self.assertFalse(identity_transform != identity_transform.inv) - - def test_with_cache(self): - for 
transform in self.transforms: - if transform._cache_size == 0: - transform = transform.with_cache(1) - self.assertTrue(transform._cache_size == 1) - - x = self._generate_data(transform).requires_grad_() - try: - y = transform(x) - except NotImplementedError: - continue - y2 = transform(x) - self.assertTrue(y2 is y) - - def test_forward_inverse_cache(self): - for transform in self.transforms: - x = self._generate_data(transform).requires_grad_() - try: - y = transform(x) - except NotImplementedError: - continue - x2 = transform.inv(y) # should be implemented at least by caching - y2 = transform(x2) # should be implemented at least by caching - if transform.bijective: - # verify function inverse - self.assertEqual(x2, x, msg='\n'.join([ - '{} t.inv(t(-)) error'.format(transform), - 'x = {}'.format(x), - 'y = t(x) = {}'.format(y), - 'x2 = t.inv(y) = {}'.format(x2), - ])) - else: - # verify weaker function pseudo-inverse - self.assertEqual(y2, y, msg='\n'.join([ - '{} t(t.inv(t(-))) error'.format(transform), - 'x = {}'.format(x), - 'y = t(x) = {}'.format(y), - 'x2 = t.inv(y) = {}'.format(x2), - 'y2 = t(x2) = {}'.format(y2), - ])) - - def test_forward_inverse_no_cache(self): - for transform in self.transforms: - x = self._generate_data(transform).requires_grad_() - try: - y = transform(x) - x2 = transform.inv(y.clone()) # bypass cache - y2 = transform(x2) - except NotImplementedError: - continue - if transform.bijective: - # verify function inverse - self.assertEqual(x2, x, msg='\n'.join([ - '{} t.inv(t(-)) error'.format(transform), - 'x = {}'.format(x), - 'y = t(x) = {}'.format(y), - 'x2 = t.inv(y) = {}'.format(x2), - ])) - else: - # verify weaker function pseudo-inverse - self.assertEqual(y2, y, msg='\n'.join([ - '{} t(t.inv(t(-))) error'.format(transform), - 'x = {}'.format(x), - 'y = t(x) = {}'.format(y), - 'x2 = t.inv(y) = {}'.format(x2), - 'y2 = t(x2) = {}'.format(y2), - ])) - - def test_univariate_forward_jacobian(self): - for transform in self.transforms: - if transform.event_dim > 0: - continue - x = self._generate_data(transform).requires_grad_() - try: - y = transform(x) - actual = transform.log_abs_det_jacobian(x, y) - except NotImplementedError: - continue - expected = torch.abs(grad([y.sum()], [x])[0]).log() - self.assertEqual(actual, expected, msg='\n'.join([ - 'Bad {}.log_abs_det_jacobian() disagrees with ()'.format(transform), - 'Expected: {}'.format(expected), - 'Actual: {}'.format(actual), - ])) - - def test_univariate_inverse_jacobian(self): - for transform in self.transforms: - if transform.event_dim > 0: - continue - y = self._generate_data(transform.inv).requires_grad_() - try: - x = transform.inv(y) - actual = transform.log_abs_det_jacobian(x, y) - except NotImplementedError: - continue - expected = -torch.abs(grad([x.sum()], [y])[0]).log() - self.assertEqual(actual, expected, msg='\n'.join([ - '{}.log_abs_det_jacobian() disagrees with .inv()'.format(transform), - 'Expected: {}'.format(expected), - 'Actual: {}'.format(actual), - ])) - - def test_jacobian_shape(self): - for transform in self.transforms: - x = self._generate_data(transform) - try: - y = transform(x) - actual = transform.log_abs_det_jacobian(x, y) - except NotImplementedError: - continue - self.assertEqual(actual.shape, x.shape[:x.dim() - transform.event_dim]) - - def test_transform_shapes(self): - transform0 = ExpTransform() - transform1 = SoftmaxTransform() - transform2 = LowerCholeskyTransform() - - self.assertEqual(transform0.event_dim, 0) - self.assertEqual(transform1.event_dim, 1) - 
self.assertEqual(transform2.event_dim, 2) - self.assertEqual(ComposeTransform([transform0, transform1]).event_dim, 1) - self.assertEqual(ComposeTransform([transform0, transform2]).event_dim, 2) - self.assertEqual(ComposeTransform([transform1, transform2]).event_dim, 2) - - def test_transformed_distribution_shapes(self): - transform0 = ExpTransform() - transform1 = SoftmaxTransform() - transform2 = LowerCholeskyTransform() - base_dist0 = Normal(torch.zeros(4, 4), torch.ones(4, 4)) - base_dist1 = Dirichlet(torch.ones(4, 4)) - base_dist2 = Normal(torch.zeros(3, 4, 4), torch.ones(3, 4, 4)) - examples = [ - ((4, 4), (), base_dist0), - ((4,), (4,), base_dist1), - ((4, 4), (), TransformedDistribution(base_dist0, [transform0])), - ((4,), (4,), TransformedDistribution(base_dist0, [transform1])), - ((4,), (4,), TransformedDistribution(base_dist0, [transform0, transform1])), - ((), (4, 4), TransformedDistribution(base_dist0, [transform0, transform2])), - ((4,), (4,), TransformedDistribution(base_dist0, [transform1, transform0])), - ((), (4, 4), TransformedDistribution(base_dist0, [transform1, transform2])), - ((), (4, 4), TransformedDistribution(base_dist0, [transform2, transform0])), - ((), (4, 4), TransformedDistribution(base_dist0, [transform2, transform1])), - ((4,), (4,), TransformedDistribution(base_dist1, [transform0])), - ((4,), (4,), TransformedDistribution(base_dist1, [transform1])), - ((), (4, 4), TransformedDistribution(base_dist1, [transform2])), - ((4,), (4,), TransformedDistribution(base_dist1, [transform0, transform1])), - ((), (4, 4), TransformedDistribution(base_dist1, [transform0, transform2])), - ((4,), (4,), TransformedDistribution(base_dist1, [transform1, transform0])), - ((), (4, 4), TransformedDistribution(base_dist1, [transform1, transform2])), - ((), (4, 4), TransformedDistribution(base_dist1, [transform2, transform0])), - ((), (4, 4), TransformedDistribution(base_dist1, [transform2, transform1])), - ((3, 4, 4), (), base_dist2), - ((3,), (4, 4), TransformedDistribution(base_dist2, [transform2])), - ((3,), (4, 4), TransformedDistribution(base_dist2, [transform0, transform2])), - ((3,), (4, 4), TransformedDistribution(base_dist2, [transform1, transform2])), - ((3,), (4, 4), TransformedDistribution(base_dist2, [transform2, transform0])), - ((3,), (4, 4), TransformedDistribution(base_dist2, [transform2, transform1])), - ] - for batch_shape, event_shape, dist in examples: - self.assertEqual(dist.batch_shape, batch_shape) - self.assertEqual(dist.event_shape, event_shape) - x = dist.rsample() - try: - dist.log_prob(x) # this should not crash - except NotImplementedError: - continue - - def test_jit_fwd(self): - for transform in self.unique_transforms: - x = self._generate_data(transform).requires_grad_() - - def f(x): - return transform(x) - - try: - traced_f = torch.jit.trace(f, (x,)) - except NotImplementedError: - continue - - # check on different inputs - x = self._generate_data(transform).requires_grad_() - self.assertEqual(f(x), traced_f(x)) - - def test_jit_inv(self): - for transform in self.unique_transforms: - y = self._generate_data(transform.inv).requires_grad_() - - def f(y): - return transform.inv(y) - - try: - traced_f = torch.jit.trace(f, (y,)) - except NotImplementedError: - continue - - # check on different inputs - y = self._generate_data(transform.inv).requires_grad_() - self.assertEqual(f(y), traced_f(y)) - - def test_jit_jacobian(self): - for transform in self.unique_transforms: - x = self._generate_data(transform).requires_grad_() - - def f(x): - y = 
transform(x) - return transform.log_abs_det_jacobian(x, y) - - try: - traced_f = torch.jit.trace(f, (x,)) - except NotImplementedError: - continue - - # check on different inputs - x = self._generate_data(transform).requires_grad_() - self.assertEqual(f(x), traced_f(x)) - - class TestFunctors(TestCase): def test_cat_transform(self): x1 = -1 * torch.arange(1, 101, dtype=torch.float).view(-1, 100) diff --git a/test/distributions/test_transforms.py b/test/distributions/test_transforms.py new file mode 100644 index 000000000000..b5e9144f0bd8 --- /dev/null +++ b/test/distributions/test_transforms.py @@ -0,0 +1,365 @@ +from numbers import Number + +import pytest + +import torch +from torch.autograd.functional import jacobian +from torch.distributions import Dirichlet, Normal, TransformedDistribution, constraints +from torch.distributions.transforms import (AbsTransform, AffineTransform, ComposeTransform, + CorrCholeskyTransform, ExpTransform, + LowerCholeskyTransform, PowerTransform, + SigmoidTransform, TanhTransform, SoftmaxTransform, + StickBreakingTransform, identity_transform, Transform, + _InverseTransform) +from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix + + +def get_transforms(cache_size): + transforms = [ + AbsTransform(cache_size=cache_size), + ExpTransform(cache_size=cache_size), + PowerTransform(exponent=2, + cache_size=cache_size), + PowerTransform(exponent=torch.tensor(5.).normal_(), + cache_size=cache_size), + SigmoidTransform(cache_size=cache_size), + TanhTransform(cache_size=cache_size), + AffineTransform(0, 1, cache_size=cache_size), + AffineTransform(1, -2, cache_size=cache_size), + AffineTransform(torch.randn(5), + torch.randn(5), + cache_size=cache_size), + AffineTransform(torch.randn(4, 5), + torch.randn(4, 5), + cache_size=cache_size), + SoftmaxTransform(cache_size=cache_size), + StickBreakingTransform(cache_size=cache_size), + LowerCholeskyTransform(cache_size=cache_size), + CorrCholeskyTransform(cache_size=cache_size), + ComposeTransform([ + AffineTransform(torch.randn(4, 5), + torch.randn(4, 5), + cache_size=cache_size), + ]), + ComposeTransform([ + AffineTransform(torch.randn(4, 5), + torch.randn(4, 5), + cache_size=cache_size), + ExpTransform(cache_size=cache_size), + ]), + ComposeTransform([ + AffineTransform(0, 1, cache_size=cache_size), + AffineTransform(torch.randn(4, 5), + torch.randn(4, 5), + cache_size=cache_size), + AffineTransform(1, -2, cache_size=cache_size), + AffineTransform(torch.randn(4, 5), + torch.randn(4, 5), + cache_size=cache_size), + ]), + ] + transforms += [t.inv for t in transforms] + return transforms + + +def reshape_transform(transform, shape): + # Needed to squash batch dims for testing jacobian + if isinstance(transform, AffineTransform): + if isinstance(transform.loc, Number): + return transform + try: + return AffineTransform(transform.loc.expand(shape), transform.scale.expand(shape), cache_size=transform._cache_size) + except RuntimeError: + return AffineTransform(transform.loc.reshape(shape), transform.scale.reshape(shape), cache_size=transform._cache_size) + if isinstance(transform, ComposeTransform): + reshaped_parts = [] + for p in transform.parts: + reshaped_parts.append(reshape_transform(p, shape)) + return ComposeTransform(reshaped_parts, cache_size=transform._cache_size) + if isinstance(transform.inv, AffineTransform): + return reshape_transform(transform.inv, shape).inv + if isinstance(transform.inv, ComposeTransform): + return reshape_transform(transform.inv, shape).inv + return transform + + +# 
Generate pytest ids +def transform_id(x): + assert isinstance(x, Transform) + name = f'Inv({type(x._inv).__name__})' if isinstance(x, _InverseTransform) else f'{type(x).__name__}' + return f'{name}(cache_size={x._cache_size})' + + +def generate_data(transform): + torch.manual_seed(1) + domain = transform.domain + codomain = transform.codomain + x = torch.empty(4, 5) + if domain is constraints.lower_cholesky or codomain is constraints.lower_cholesky: + x = torch.empty(6, 6) + x = x.normal_() + return x + elif domain is constraints.real: + return x.normal_() + elif domain is constraints.real_vector: + # For corr_cholesky the last dim in the vector + # must be of size (dim * dim) // 2 + x = torch.empty(3, 6) + x = x.normal_() + return x + elif domain is constraints.positive: + return x.normal_().exp() + elif domain is constraints.unit_interval: + return x.uniform_() + elif isinstance(domain, constraints.interval): + x = x.uniform_() + x = x.mul_(domain.upper_bound - domain.lower_bound).add_(domain.lower_bound) + return x + elif domain is constraints.simplex: + x = x.normal_().exp() + x /= x.sum(-1, True) + return x + elif domain is constraints.corr_cholesky: + x = torch.empty(4, 5, 5) + x = x.normal_().tril() + x /= x.norm(dim=-1, keepdim=True) + x.diagonal(dim1=-1).copy_(x.diagonal(dim1=-1).abs()) + return x + raise ValueError('Unsupported domain: {}'.format(domain)) + + +TRANSFORMS_CACHE_ACTIVE = get_transforms(cache_size=1) +TRANSFORMS_CACHE_INACTIVE = get_transforms(cache_size=0) +ALL_TRANSFORMS = TRANSFORMS_CACHE_ACTIVE + TRANSFORMS_CACHE_INACTIVE + [identity_transform] + + +@pytest.mark.parametrize('transform', ALL_TRANSFORMS, ids=transform_id) +def test_inv_inv(transform, ids=transform_id): + assert transform.inv.inv is transform + + +@pytest.mark.parametrize('x', TRANSFORMS_CACHE_INACTIVE, ids=transform_id) +@pytest.mark.parametrize('y', TRANSFORMS_CACHE_INACTIVE, ids=transform_id) +def test_equality(x, y): + if x is y: + assert x == y + else: + assert x != y + assert identity_transform == identity_transform.inv + + +@pytest.mark.parametrize('transform', ALL_TRANSFORMS, ids=transform_id) +def test_with_cache(transform): + if transform._cache_size == 0: + transform = transform.with_cache(1) + assert transform._cache_size == 1 + x = generate_data(transform).requires_grad_() + try: + y = transform(x) + except NotImplementedError: + pytest.skip('Not implemented.') + y2 = transform(x) + assert y2 is y + + +@pytest.mark.parametrize('transform', ALL_TRANSFORMS, ids=transform_id) +@pytest.mark.parametrize('test_cached', [True, False]) +def test_forward_inverse(transform, test_cached): + x = generate_data(transform).requires_grad_() + try: + y = transform(x) + except NotImplementedError: + pytest.skip('Not implemented.') + if test_cached: + x2 = transform.inv(y) # should be implemented at least by caching + else: + try: + x2 = transform.inv(y.clone()) # bypass cache + except NotImplementedError: + pytest.skip('Not implemented.') + y2 = transform(x2) + if transform.bijective: + # verify function inverse + assert torch.allclose(x2, x, atol=1e-4, equal_nan=True), '\n'.join([ + '{} t.inv(t(-)) error'.format(transform), + 'x = {}'.format(x), + 'y = t(x) = {}'.format(y), + 'x2 = t.inv(y) = {}'.format(x2), + ]) + else: + # verify weaker function pseudo-inverse + assert torch.allclose(y2, y, atol=1e-4, equal_nan=True), '\n'.join([ + '{} t(t.inv(t(-))) error'.format(transform), + 'x = {}'.format(x), + 'y = t(x) = {}'.format(y), + 'x2 = t.inv(y) = {}'.format(x2), + 'y2 = t(x2) = {}'.format(y2), + ]) + + 
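As a rough standalone sketch of the round-trip property the parametrized `test_forward_inverse` above exercises (illustrative only; `ExpTransform` is one of the transforms already imported in this file, and the shapes are arbitrary):

```python
import torch
from torch.distributions.transforms import ExpTransform

t = ExpTransform()
x = torch.randn(4, 5)
y = t(x)                    # forward: y = exp(x)
x2 = t.inv(y.clone())       # .clone() mirrors the cache-bypass path in the test above
assert torch.allclose(x2, x, atol=1e-4)   # bijective transforms recover x up to tolerance
```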
+def test_compose_transform_shapes(): + transform0 = ExpTransform() + transform1 = SoftmaxTransform() + transform2 = LowerCholeskyTransform() + + assert transform0.event_dim == 0 + assert transform1.event_dim == 1 + assert transform2.event_dim == 2 + assert ComposeTransform([transform0, transform1]).event_dim == 1 + assert ComposeTransform([transform0, transform2]).event_dim == 2 + assert ComposeTransform([transform1, transform2]).event_dim == 2 + + +transform0 = ExpTransform() +transform1 = SoftmaxTransform() +transform2 = LowerCholeskyTransform() +base_dist0 = Normal(torch.zeros(4, 4), torch.ones(4, 4)) +base_dist1 = Dirichlet(torch.ones(4, 4)) +base_dist2 = Normal(torch.zeros(3, 4, 4), torch.ones(3, 4, 4)) + + +@pytest.mark.parametrize('batch_shape, event_shape, dist', [ + ((4, 4), (), base_dist0), + ((4,), (4,), base_dist1), + ((4, 4), (), TransformedDistribution(base_dist0, [transform0])), + ((4,), (4,), TransformedDistribution(base_dist0, [transform1])), + ((4,), (4,), TransformedDistribution(base_dist0, [transform0, transform1])), + ((), (4, 4), TransformedDistribution(base_dist0, [transform0, transform2])), + ((4,), (4,), TransformedDistribution(base_dist0, [transform1, transform0])), + ((), (4, 4), TransformedDistribution(base_dist0, [transform1, transform2])), + ((), (4, 4), TransformedDistribution(base_dist0, [transform2, transform0])), + ((), (4, 4), TransformedDistribution(base_dist0, [transform2, transform1])), + ((4,), (4,), TransformedDistribution(base_dist1, [transform0])), + ((4,), (4,), TransformedDistribution(base_dist1, [transform1])), + ((), (4, 4), TransformedDistribution(base_dist1, [transform2])), + ((4,), (4,), TransformedDistribution(base_dist1, [transform0, transform1])), + ((), (4, 4), TransformedDistribution(base_dist1, [transform0, transform2])), + ((4,), (4,), TransformedDistribution(base_dist1, [transform1, transform0])), + ((), (4, 4), TransformedDistribution(base_dist1, [transform1, transform2])), + ((), (4, 4), TransformedDistribution(base_dist1, [transform2, transform0])), + ((), (4, 4), TransformedDistribution(base_dist1, [transform2, transform1])), + ((3, 4, 4), (), base_dist2), + ((3,), (4, 4), TransformedDistribution(base_dist2, [transform2])), + ((3,), (4, 4), TransformedDistribution(base_dist2, [transform0, transform2])), + ((3,), (4, 4), TransformedDistribution(base_dist2, [transform1, transform2])), + ((3,), (4, 4), TransformedDistribution(base_dist2, [transform2, transform0])), + ((3,), (4, 4), TransformedDistribution(base_dist2, [transform2, transform1])), +]) +def test_transformed_distribution_shapes(batch_shape, event_shape, dist): + assert dist.batch_shape == batch_shape + assert dist.event_shape == event_shape + x = dist.rsample() + try: + dist.log_prob(x) # this should not crash + except NotImplementedError: + pytest.skip('Not implemented.') + + +@pytest.mark.parametrize('transform', TRANSFORMS_CACHE_INACTIVE, ids=transform_id) +def test_jit_fwd(transform): + x = generate_data(transform).requires_grad_() + + def f(x): + return transform(x) + + try: + traced_f = torch.jit.trace(f, (x,)) + except NotImplementedError: + pytest.skip('Not implemented.') + + # check on different inputs + x = generate_data(transform).requires_grad_() + assert torch.allclose(f(x), traced_f(x), atol=1e-5, equal_nan=True) + + +@pytest.mark.parametrize('transform', TRANSFORMS_CACHE_INACTIVE, ids=transform_id) +def test_jit_inv(transform): + y = generate_data(transform.inv).requires_grad_() + + def f(y): + return transform.inv(y) + + try: + traced_f = 
torch.jit.trace(f, (y,)) + except NotImplementedError: + pytest.skip('Not implemented.') + + # check on different inputs + y = generate_data(transform.inv).requires_grad_() + assert torch.allclose(f(y), traced_f(y), atol=1e-5, equal_nan=True) + + +@pytest.mark.parametrize('transform', TRANSFORMS_CACHE_INACTIVE, ids=transform_id) +def test_jit_jacobian(transform): + x = generate_data(transform).requires_grad_() + + def f(x): + y = transform(x) + return transform.log_abs_det_jacobian(x, y) + + try: + traced_f = torch.jit.trace(f, (x,)) + except NotImplementedError: + pytest.skip('Not implemented.') + + # check on different inputs + x = generate_data(transform).requires_grad_() + assert torch.allclose(f(x), traced_f(x), atol=1e-5, equal_nan=True) + + +@pytest.mark.parametrize('transform', ALL_TRANSFORMS, ids=transform_id) +def test_jacobian(transform): + x = generate_data(transform) + try: + y = transform(x) + actual = transform.log_abs_det_jacobian(x, y) + except NotImplementedError: + pytest.skip('Not implemented.') + # Test shape + target_shape = x.shape[:x.dim() - transform.input_event_dim] + assert actual.shape == target_shape + + # Expand if required + transform = reshape_transform(transform, x.shape) + ndims = len(x.shape) + event_dim = ndims - transform.input_event_dim + x_ = x.view((-1,) + x.shape[event_dim:]) + n = x_.shape[0] + # Reshape to squash batch dims to a single batch dim + transform = reshape_transform(transform, x_.shape) + + # 1. Transforms with 0 off-diagonal elements + if transform.input_event_dim == 0: + jac = jacobian(transform, x_) + # assert off-diagonal elements are zero + assert torch.allclose(jac, jac.diagonal().diag_embed()) + expected = jac.diagonal().abs().log().reshape(x.shape) + # 2. Transforms with non-0 off-diagonal elements + else: + if isinstance(transform, CorrCholeskyTransform): + jac = jacobian(lambda x: tril_matrix_to_vec(transform(x), diag=-1), x_) + elif isinstance(transform.inv, CorrCholeskyTransform): + jac = jacobian(lambda x: transform(vec_to_tril_matrix(x, diag=-1)), + tril_matrix_to_vec(x_, diag=-1)) + elif isinstance(transform, StickBreakingTransform): + jac = jacobian(lambda x: transform(x)[..., :-1], x_) + else: + jac = jacobian(transform, x_) + + # Note that jacobian will have shape (batch_dims, y_event_dims, batch_dims, x_event_dims) + # However, batches are independent so this can be converted into a (batch_dims, event_dims, event_dims) + # after reshaping the event dims (see above) to give a batched square matrix whose determinant + # can be computed. + gather_idx_shape = list(jac.shape) + gather_idx_shape[-2] = 1 + gather_idxs = torch.arange(n).reshape((n,) + (1,) * (len(jac.shape) - 1)).expand(gather_idx_shape) + jac = jac.gather(-2, gather_idxs).squeeze(-2) + out_ndims = jac.shape[-2] + jac = jac[..., :out_ndims] # Remove extra zero-valued dims (for inverse stick-breaking). 
+ expected = torch.slogdet(jac).logabsdet + + assert torch.allclose(actual, expected, atol=1e-5) + + +if __name__ == '__main__': + pytest.main([__file__]) diff --git a/test/distributions/test_utils.py b/test/distributions/test_utils.py new file mode 100644 index 000000000000..b58cfe39fc1c --- /dev/null +++ b/test/distributions/test_utils.py @@ -0,0 +1,24 @@ +import pytest + +import torch +from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix + + +@pytest.mark.parametrize('shape', [ + (2, 2), + (3, 3), + (2, 4, 4), + (2, 2, 4, 4), +]) +def test_tril_matrix_to_vec(shape): + mat = torch.randn(shape) + n = mat.shape[-1] + for diag in range(-n + 1, n): + actual = mat.tril(diag) + vec = tril_matrix_to_vec(actual, diag) + tril_mat = vec_to_tril_matrix(vec, diag) + assert torch.allclose(tril_mat, actual) + + +if __name__ == '__main__': + pytest.main([__file__]) diff --git a/test/run_test.py b/test/run_test.py index 2bf1353ecd34..070b6103ab54 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -3,7 +3,6 @@ import argparse import copy from datetime import datetime -import importlib import modulefinder import os import shutil @@ -152,6 +151,9 @@ 'distributed/_pipeline/sync/test_stream', 'distributed/_pipeline/sync/test_transparency', 'distributed/_pipeline/sync/test_worker', + 'distributions/test_constraints', + 'distributions/test_transforms', + 'distributions/test_utils', ] WINDOWS_BLOCKLIST = [ @@ -188,11 +190,6 @@ 'test_cuda_primary_ctx', ] + [test for test in TESTS if test.startswith('distributed/')] -# These tests use some specific pytest feature like parameterized testing or -# fixtures that cannot be run by unittest -PYTEST_TESTS = [ - 'distributions/test_constraints' -] # These tests are slow enough that it's worth calculating whether the patch # touched any related files first. @@ -647,9 +644,6 @@ def get_selected_tests(options): options.exclude.extend(JIT_EXECUTOR_TESTS) selected_tests = exclude_tests(options.exclude, selected_tests) - # exclude PYTEST_TESTS if pytest not installed. - if importlib.util.find_spec('pytest') is None: - selected_tests = exclude_tests(PYTEST_TESTS, selected_tests, 'PyTest not found.') if sys.platform == 'win32' and not options.ignore_win_blocklist: target_arch = os.environ.get('VSCMD_ARG_TGT_ARCH') diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index 6587631c4cfe..4675b8ceaca8 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -215,6 +215,12 @@ def _transform_to_lower_cholesky(constraint): return transforms.LowerCholeskyTransform() +@biject_to.register(constraints.corr_cholesky) +@transform_to.register(constraints.corr_cholesky) +def _transform_to_corr_cholesky(constraint): + return transforms.CorrCholeskyTransform() + + @biject_to.register(constraints.cat) def _biject_to_cat(constraint): return transforms.CatTransform([biject_to(c) diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 7bcbc586434d..630c192ffed0 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -26,6 +26,7 @@ 'Constraint', 'boolean', 'cat', + 'corr_cholesky', 'dependent', 'dependent_property', 'greater_than', @@ -275,6 +276,18 @@ def check(self, value): return lower_triangular & positive_diagonal +class _CorrCholesky(Constraint): + """ + Constrain to lower-triangular square matrices with positive diagonals and each + row vector being of unit length. 
+ """ + def check(self, value): + tol = torch.finfo(value.dtype).eps * value.size(-1) * 10 # 10 is an adjustable fudge factor + row_norm = torch.linalg.norm(value.detach(), dim=-1) + unit_row_norm = (row_norm - 1.).abs().le(tol).all(dim=-1) + return _LowerCholesky().check(value) & unit_row_norm + + class _PositiveDefinite(Constraint): """ Constrain to positive-definite matrices. @@ -360,6 +373,7 @@ def check(self, value): simplex = _Simplex() lower_triangular = _LowerTriangular() lower_cholesky = _LowerCholesky() +corr_cholesky = _CorrCholesky() positive_definite = _PositiveDefinite() cat = _Cat stack = _Stack diff --git a/torch/distributions/transforms.py b/torch/distributions/transforms.py index f4de4b15b0bb..a0412d52df0d 100644 --- a/torch/distributions/transforms.py +++ b/torch/distributions/transforms.py @@ -6,7 +6,8 @@ import torch.nn.functional as F from torch.distributions import constraints from torch.distributions.utils import (_sum_rightmost, broadcast_all, - lazy_property) + lazy_property, tril_matrix_to_vec, + vec_to_tril_matrix) from torch.nn.functional import pad from torch.nn.functional import softplus from typing import List @@ -16,6 +17,7 @@ 'AffineTransform', 'CatTransform', 'ComposeTransform', + 'CorrCholeskyTransform', 'ExpTransform', 'LowerCholeskyTransform', 'PowerTransform', @@ -92,6 +94,14 @@ def __init__(self, cache_size=0): raise ValueError('cache_size must be 0 or 1') super(Transform, self).__init__() + @property + def input_event_dim(self): + return self.event_dim + + @property + def output_event_dim(self): + return self.event_dim + @property def inv(self): """ @@ -195,6 +205,16 @@ def codomain(self): assert self._inv is not None return self._inv.domain + @property + def input_event_dim(self): + assert self._inv is not None + return self._inv.output_event_dim + + @property + def output_event_dim(self): + assert self._inv is not None + return self._inv.input_event_dim + @property def bijective(self): assert self._inv is not None @@ -535,6 +555,74 @@ def log_abs_det_jacobian(self, x, y): return result.expand(shape) +class CorrCholeskyTransform(Transform): + r""" + Transforms an uncontrained real vector :math:`x` with length :math:`D*(D-1)/2` into the + Cholesky factor of a D-dimension correlation matrix. This Cholesky factor is a lower + triangular matrix with positive diagonals and unit Euclidean norm for each row. + The transform is processed as follows: + + 1. First we convert x into a lower triangular matrix in row order. + 2. For each row :math:`X_i` of the lower triangular part, we apply a *signed* version of + class :class:`StickBreakingTransform` to transform :math:`X_i` into a + unit Euclidean length vector using the following steps: + - Scales into the interval :math:`(-1, 1)` domain: :math:`r_i = \tanh(X_i)`. + - Transforms into an unsigned domain: :math:`z_i = r_i^2`. + - Applies :math:`s_i = StickBreakingTransform(z_i)`. + - Transforms back into signed domain: :math:`y_i = sign(r_i) * \sqrt{s_i}`. 
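A rough numeric illustration of the per-row construction listed above, using plain tensor ops that mirror the arithmetic in `_call` below (a sketch only; the input values are arbitrary):

```python
import torch

# One row X_i with three unconstrained entries, i.e. row 3 of a 4 x 4 factor.
x = torch.tensor([0.3, -1.2, 0.7])
r = torch.tanh(x)                             # scale into (-1, 1)
z = r ** 2                                    # unsigned domain
stick_sqrt = (1 - z).sqrt().cumprod(-1)       # sqrt of the remaining stick length
off_diag = r * torch.cat([torch.ones(1), stick_sqrt[:-1]])
diag = stick_sqrt[-1:]                        # leftover stick goes on the diagonal
row = torch.cat([off_diag, diag])
print(row.norm())                             # ~1.0: each row has unit Euclidean norm
```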
+ """ + domain = constraints.real_vector + codomain = constraints.corr_cholesky + input_event_dim = 1 + output_event_dim = 2 + bijective = True + + @property + def event_dim(self): + raise ValueError("Please use `.input_event_dim` or `.output_event_dim` instead.") + + def _call(self, x): + x = torch.tanh(x) + eps = torch.finfo(x.dtype).eps + x = x.clamp(min=-1 + eps, max=1 - eps) + r = vec_to_tril_matrix(x, diag=-1) + # apply stick-breaking on the squared values + # Note that y = sign(r) * sqrt(z * z1m_cumprod) + # = (sign(r) * sqrt(z)) * sqrt(z1m_cumprod) = r * sqrt(z1m_cumprod) + z = r ** 2 + z1m_cumprod_sqrt = (1 - z).sqrt().cumprod(-1) + # Diagonal elements must be 1. + r = r + torch.eye(r.shape[-1], dtype=r.dtype, device=r.device) + y = r * pad(z1m_cumprod_sqrt[..., :-1], [1, 0], value=1) + return y + + def _inverse(self, y): + # inverse stick-breaking + # See: https://mc-stan.org/docs/2_18/reference-manual/cholesky-factors-of-correlation-matrices-1.html + y_cumsum = 1 - torch.cumsum(y * y, dim=-1) + y_cumsum_shifted = pad(y_cumsum[..., :-1], [1, 0], value=1) + y_vec = tril_matrix_to_vec(y, diag=-1) + y_cumsum_vec = tril_matrix_to_vec(y_cumsum_shifted, diag=-1) + t = y_vec / (y_cumsum_vec).sqrt() + # inverse of tanh + x = ((1 + t) / (1 - t)).log() / 2 + return x + + def log_abs_det_jacobian(self, x, y, intermediates=None): + # Because domain and codomain are two spaces with different dimensions, determinant of + # Jacobian is not well-defined. We return `log_abs_det_jacobian` of `x` and the + # flattened lower triangular part of `y`. + + # See: https://mc-stan.org/docs/2_18/reference-manual/cholesky-factors-of-correlation-matrices-1.html + y1m_cumsum = 1 - (y * y).cumsum(dim=-1) + # by taking diagonal=-2, we don't need to shift z_cumprod to the right + # also works for 2 x 2 matrix + y1m_cumsum_tril = tril_matrix_to_vec(y1m_cumsum, diag=-2) + stick_breaking_logdet = 0.5 * (y1m_cumsum_tril).log().sum(-1) + tanh_logdet = -2 * (x + softplus(-2 * x) - math.log(2.)).sum(dim=-1) + return stick_breaking_logdet + tanh_logdet + + class SoftmaxTransform(Transform): r""" Transform from unconstrained space to the simplex via :math:`y = \exp(x)` then diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 36ff1f71c35b..05500f22c344 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -108,3 +108,36 @@ def __get__(self, instance, obj_type=None): value = self.wrapped(instance) setattr(instance, self.wrapped.__name__, value) return value + + +def tril_matrix_to_vec(mat, diag=0): + r""" + Convert a `D x D` matrix or a batch of matrices into a (batched) vector + which comprises of lower triangular elements from the matrix in row order. + """ + n = mat.shape[-1] + if not torch._C._get_tracing_state() and (diag <= -n or diag >= n): + raise ValueError(f'diag ({diag}) provided is outside [{-n+1}, {n-1}].') + arange = torch.arange(n, device=mat.device) + tril_mask = arange < arange.view(-1, 1) + (diag + 1) + vec = mat[..., tril_mask] + return vec + + +def vec_to_tril_matrix(vec, diag=0): + r""" + Convert a vector or a batch of vectors into a batched `D x D` + lower triangular matrix containing elements from the vector in row order. 
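Taken together, a minimal usage sketch of the new transform and the helper pair above (illustrative only; the length-6 input and the resulting 4 x 4 factor are arbitrary choices):

```python
import torch
from torch.distributions.transforms import CorrCholeskyTransform
from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix

t = CorrCholeskyTransform()
x = torch.randn(6)                    # D*(D-1)/2 = 6  ->  D = 4
L = t(x)                              # 4 x 4 Cholesky factor of a correlation matrix
assert L.shape == (4, 4)
assert torch.allclose(L.norm(dim=-1), torch.ones(4), atol=1e-5)  # unit-norm rows
assert torch.allclose(t.inv(L), x, atol=1e-4)                    # bijective

# The helpers round-trip the strictly lower-triangular part in row order.
vec = tril_matrix_to_vec(L, diag=-1)
assert torch.equal(vec_to_tril_matrix(vec, diag=-1), L.tril(-1))
```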
+ """ + # +ve root of D**2 + (1+2*diag)*D - |diag| * (diag+1) - 2*vec.shape[-1] = 0 + n = (-(1 + 2 * diag) + ((1 + 2 * diag)**2 + 8 * vec.shape[-1] + 4 * abs(diag) * (diag + 1))**0.5) / 2 + eps = torch.finfo(vec.dtype).eps + if not torch._C._get_tracing_state() and (round(n) - n > eps): + raise ValueError(f'The size of last dimension is {vec.shape[-1]} which cannot be expressed as ' + + 'the lower triangular part of a square D x D matrix.') + n = torch.round(n).long() if isinstance(n, torch.Tensor) else round(n) + mat = vec.new_zeros(vec.shape[:-1] + torch.Size((n, n))) + arange = torch.arange(n, device=vec.device) + tril_mask = arange < arange.view(-1, 1) + (diag + 1) + mat[..., tril_mask] = vec + return mat From 0484b048d050ea5b10cb0efd147148aa893f2a4b Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 3 Dec 2020 05:34:53 -0800 Subject: [PATCH 010/132] Replace constexpr with CONSTEXPR_EXCEPT_WIN_CUDA (#48717) Summary: Fixes https://github.com/pytorch/pytorch/issues/48716 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48717 Reviewed By: ezyang Differential Revision: D25277886 Pulled By: datumbox fbshipit-source-id: eb845db35d31b64d3e4401ed56843814192ce5a6 --- torch/csrc/jit/ir/ir.h | 4 ++-- torch/csrc/jit/serialization/pickler.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 9db2dbdf2516..37cb31f0967d 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -1326,7 +1326,7 @@ inline const Graph* Value::owningGraph() const { /************* All nodes not required to be defined before Graph **************/ struct ProfileOp : public Node { - static constexpr Symbol Kind = ::c10::prim::profile; + static CONSTEXPR_EXCEPT_WIN_CUDA Symbol Kind = ::c10::prim::profile; ProfileOp(Graph* graph, std::function&)> callback) : Node(graph, ::c10::prim::profile), callback_(std::move(callback)) {} @@ -1346,7 +1346,7 @@ struct ProfileOp : public Node { }; struct TORCH_API ProfileOptionalOp : public Node { - static constexpr Symbol Kind = ::c10::prim::profile_optional; + static CONSTEXPR_EXCEPT_WIN_CUDA Symbol Kind = ::c10::prim::profile_optional; ProfileOptionalOp( Graph* graph, std::function&)> callback) diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 4473b0cb50dd..6a557e6e53f3 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -209,7 +209,7 @@ class TORCH_API Pickler { // the left of a '::', its type cannot be deduced by the compiler so one must // explicitly instantiate the template, i.e. push(int) works, push(int) // does not) - static constexpr size_t kBufferSize = 256; + static CONSTEXPR_EXCEPT_WIN_CUDA size_t kBufferSize = 256; template void push(typename std::common_type::type value) { const char* begin = reinterpret_cast(&value); From ef50c94e7ccb67129935b5c64747f2a5645790f8 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 3 Dec 2020 06:48:51 -0800 Subject: [PATCH 011/132] reenabling MPI test (#48725) Summary: fixes https://github.com/pytorch/pytorch/issues/47443. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48725 Reviewed By: mrshenli Differential Revision: D25278758 Pulled By: walterddr fbshipit-source-id: a02d0fef99a7941c8e98da16a45d840e12b8b0c3 --- test/distributed/test_distributed_fork.py | 2 +- test/run_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/distributed/test_distributed_fork.py b/test/distributed/test_distributed_fork.py index 293eba1f278a..84d23e71af95 100644 --- a/test/distributed/test_distributed_fork.py +++ b/test/distributed/test_distributed_fork.py @@ -54,7 +54,7 @@ def setUp(self): WORLD_SIZE = os.environ["WORLD_SIZE"] dist.init_process_group(init_method=INIT_METHOD, backend="mpi") - class TestMPI(DistributedTest._DistTestBase): + class TestMPIWithFork(TestCase, DistributedTest._DistTestBase): pass elif BACKEND == "test": diff --git a/test/run_test.py b/test/run_test.py index 070b6103ab54..3687459a4a70 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -415,7 +415,7 @@ def test_distributed(test_module, test_directory, options): init_str = "with {} init_method" with_init = init_str.format("file" if with_init_file else "env") print_to_stderr( - 'Running distributed tests for the {} backend{}'.format( + 'Running distributed tests for the {} backend {}'.format( backend, with_init)) os.environ['TEMP_DIR'] = tmp_dir os.environ['BACKEND'] = backend From c01e5b88275018c3f8c08ab76e6efd3560064e60 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 3 Dec 2020 07:28:11 -0800 Subject: [PATCH 012/132] Simplify CachingAllocator. (#48752) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48752 Reviewed By: linbinyu Differential Revision: D25285292 fbshipit-source-id: 17679ccda5279ab426e50e4266c50aac74f92a13 --- c10/mobile/CPUCachingAllocator.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/c10/mobile/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h index 6a748f4f1791..2f11e6ea8669 100644 --- a/c10/mobile/CPUCachingAllocator.h +++ b/c10/mobile/CPUCachingAllocator.h @@ -54,6 +54,9 @@ class C10_API CPUCachingAllocator { * No speculative allocation for any future allocations. */ private: + inline void* allocate_and_cache(const size_t bytes); + void free_cached(); + protected: // Invariants. // 1. If memory is ever allocated via this allocator then // the pointer will exist in allocation_map_, unless the allocator @@ -71,9 +74,6 @@ class C10_API CPUCachingAllocator { // As a result of above invariants, allocated memory ptr cannot be in // available_map_ unless it is in allocation_map_ as well. ska::flat_hash_map> available_map_; - inline void* allocate_and_cache(const size_t bytes); - void free_cached(); - protected: static ska::flat_hash_map allocation_map_; // Since allocation_map, which is a global instance, is mutated/read via // all public APIs we need a global mutex. From d6f9e8562ba63023dbd02ae29365c608e3dc95bf Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 3 Dec 2020 08:19:25 -0800 Subject: [PATCH 013/132] Generalize some TensorIterator consumers to take TensorIteratorBase (#48727) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48727 Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D25278033 Pulled By: ezyang fbshipit-source-id: 77f125ddb8446edf467a22130227d90583884bca --- aten/src/ATen/cuda/detail/OffsetCalculator.cuh | 2 +- aten/src/ATen/native/TensorIteratorDynamicCasting.h | 4 ++-- aten/src/ATen/native/cpu/Loops.h | 12 ++++++------ aten/src/ATen/native/cuda/CUDALoops.cuh | 2 +- aten/src/ATen/native/cuda/Loops.cuh | 12 ++++++------ aten/src/ATen/native/cuda/ROCmLoops.cuh | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index 190b6f378ea3..33b499f03b33 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -90,7 +90,7 @@ struct TrivialOffsetCalculator { }; template -static OffsetCalculator make_offset_calculator(const at::TensorIterator& iter) { +static OffsetCalculator make_offset_calculator(const at::TensorIteratorBase& iter) { AT_ASSERT(N <= iter.ntensors()); std::array strides; for (int i = 0; i < N; i++) { diff --git a/aten/src/ATen/native/TensorIteratorDynamicCasting.h b/aten/src/ATen/native/TensorIteratorDynamicCasting.h index 31b4461c67e7..8e3b6760091c 100644 --- a/aten/src/ATen/native/TensorIteratorDynamicCasting.h +++ b/aten/src/ATen/native/TensorIteratorDynamicCasting.h @@ -26,7 +26,7 @@ namespace at { namespace native { // (and returns) of func_t template::arity> struct needs_dynamic_casting { - static bool check(TensorIterator& iter) { + static bool check(TensorIteratorBase& iter) { using traits = function_traits; using cpp_type = typename traits::template arg::type; using cpp_map = c10::CppTypeToScalarType; @@ -40,7 +40,7 @@ struct needs_dynamic_casting { template struct needs_dynamic_casting { - static bool check(TensorIterator& iter) { + static bool check(TensorIteratorBase& iter) { using traits = function_traits; using cpp_type = typename traits::result_type; diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index f263ce897fbb..305c14eb9c5a 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -185,7 +185,7 @@ static inline void unroll_contiguous_scalar_checks( } template -void cpu_kernel(TensorIterator& iter, func_t&& op) { +void cpu_kernel(TensorIteratorBase& iter, func_t&& op) { using traits = function_traits; // this could be extended to work with void return types TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); @@ -207,7 +207,7 @@ void cpu_kernel(TensorIterator& iter, func_t&& op) { } template -void cpu_kernel_vec(TensorIterator& iter, func_t&& op, vec_func_t&& vop) { +void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) { using traits = function_traits; // this could be extended to work with void return types TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); @@ -236,7 +236,7 @@ void cpu_kernel_vec(TensorIterator& iter, func_t&& op, vec_func_t&& vop) { } template -void cpu_serial_kernel(TensorIterator& iter, func_t&& op, const Range& range) { +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) { using traits = function_traits; constexpr bool result_void = std::is_void::value; TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity && @@ -258,12 +258,12 @@ void cpu_serial_kernel(TensorIterator& iter, func_t&& op, const Range& range) { } template -void cpu_serial_kernel(TensorIterator& iter, func_t&& op) { +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& 
op) { cpu_serial_kernel(iter, op, {0, iter.numel()}); } template -void cpu_serial_kernel_vec(TensorIterator& iter, func_t&& op, vec_func_t&& vop, const Range& range) { +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) { using traits = function_traits; // this could be extended to work with void return types TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); @@ -289,7 +289,7 @@ void cpu_serial_kernel_vec(TensorIterator& iter, func_t&& op, vec_func_t&& vop, } template -void cpu_serial_kernel_vec(TensorIterator& iter, func_t&& op, vec_func_t&& vop) { +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) { cpu_serial_kernel_vec(iter, op, vop, {0, iter.numel()}); } diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 093ace17297c..91401e994ebd 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -131,7 +131,7 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da } template -void gpu_kernel_impl(TensorIterator& iter, const func_t& f) { +void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { using traits = function_traits; using arg0_t = typename traits::result_type; constexpr int ntensors = traits::arity + 1; diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index fb1f8e6720b4..82765b2aeddb 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -20,7 +20,7 @@ constexpr int block_work_size = BLOCK_WORK_SIZE; namespace at { namespace native { template -static OffsetCalculator make_input_offset_calculator(const TensorIterator& iter) { +static OffsetCalculator make_input_offset_calculator(const TensorIteratorBase& iter) { // array size can not be 0, this happens when N == 0 constexpr int array_size = std::max(N, 1); TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs()); @@ -34,7 +34,7 @@ static OffsetCalculator make_input_offset_calculator(const TensorIterator& it } template -static OffsetCalculator make_output_offset_calculator(const TensorIterator& iter) { +static OffsetCalculator make_output_offset_calculator(const TensorIteratorBase& iter) { TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs()); std::array strides; int64_t element_sizes[num_outputs]; @@ -88,7 +88,7 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { namespace at { namespace native { template -void gpu_kernel(TensorIterator& iter, const func_t& f) { +void gpu_kernel(TensorIteratorBase& iter, const func_t& f) { for (int arg = 0; arg < iter.ntensors(); arg++) { TORCH_INTERNAL_ASSERT(iter.device(arg).is_cuda()); @@ -139,7 +139,7 @@ struct BUnaryFunctor { }; template -void gpu_kernel_with_scalars(TensorIterator& iter, const func_t& f) { +void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); using traits = function_traits; @@ -187,7 +187,7 @@ static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const fun } template -void gpu_kernel_multiple_outputs_impl(TensorIterator& iter, const func_t& f) { +void gpu_kernel_multiple_outputs_impl(TensorIteratorBase& iter, const func_t& f) { using traits = function_traits; using output_t = typename traits::result_type; static_assert(is_tuple::value, "f's return type must be `thrust::tuple`"); @@ -218,7 +218,7 @@ void gpu_kernel_multiple_outputs_impl(TensorIterator& iter, const 
func_t& f) { } // namespace template -void gpu_kernel_multiple_outputs(TensorIterator& iter, const func_t& f) { +void gpu_kernel_multiple_outputs(TensorIteratorBase& iter, const func_t& f) { ASSERT_HOST_DEVICE_LAMBDA(func_t); for (int arg = 0; arg < iter.ntensors(); arg++) { diff --git a/aten/src/ATen/native/cuda/ROCmLoops.cuh b/aten/src/ATen/native/cuda/ROCmLoops.cuh index e0dc83556677..b5115c6dcdfb 100644 --- a/aten/src/ATen/native/cuda/ROCmLoops.cuh +++ b/aten/src/ATen/native/cuda/ROCmLoops.cuh @@ -306,7 +306,7 @@ static void launch_kernel(int64_t N, const func_t& f, array_t data) {} template -void gpu_kernel_impl(TensorIterator& iter, const func_t& f) { +void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { using traits = function_traits; using arg0_t = typename traits::result_type; constexpr int ntensors = traits::arity + 1; From f9a0abfc4330226a43765243e78b4ded1ca32c4f Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 3 Dec 2020 08:19:25 -0800 Subject: [PATCH 014/132] Fix code review from #48659 and #48116 (#48731) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48731 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D25278034 Pulled By: ezyang fbshipit-source-id: 73652311b48d8d80c06e9385b7ff18ef3a158ae8 --- aten/src/ATen/native/TensorIterator.cpp | 10 +++++++++- test/test_torch.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index 770774aaaeda..33461d63e0e1 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -1126,6 +1126,9 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { { for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; + if (!op.tensor.defined()) { + TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); + } set_output(i, shape_, {}, op.options().memory_format(MemoryFormat::Contiguous), names_); } break; @@ -1134,6 +1137,9 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { { for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; + if (!op.tensor.defined()) { + TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); + } set_output(i, shape_, {}, op.options().memory_format(MemoryFormat::ChannelsLast), names_); } break; @@ -1148,6 +1154,9 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs"); for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; + if (!op.tensor.defined()) { + TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); + } set_output(i, shape_, operands_[i_defined].tensor.strides(), op.options(), names_); } break; @@ -1275,7 +1284,6 @@ void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayR auto& op = operands_[output_idx]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); if (!op.tensor.defined()) { - TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", output_idx); if (strides.empty()) { op.tensor = at::empty(sizes, options); } else { diff --git a/test/test_torch.py b/test/test_torch.py index 9f21efb48b85..b5a87b4dd2ae 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2550,6 +2550,7 @@ def test_empty_meta(self): y = torch.empty_meta(2 ** 20) z = x + y self.assertEqual(z.size(), (2 ** 20, 2 ** 20)) + 
self.assertRaises(RuntimeError, lambda: z[0][0].item()) def test_upsample_nearest1d_meta(self): # TODO: this is not a sustainable way of testing meta functions, @@ -2560,12 +2561,14 @@ def test_upsample_nearest1d_meta(self): x = torch.empty_meta(2 * 10 ** 8, 3, 2 * 10 ** 8) z = torch.nn.functional.interpolate(x, scale_factor=2) self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) + self.assertRaises(RuntimeError, lambda: z[0][0][0].item()) # interpolate doesn't seem to support out= # (not sure why passing None here doesn't work? How strange...) z = torch.empty_meta(0) torch._C._nn.upsample_nearest1d(x, (4 * 10 ** 8,), 2, out=z) self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) + self.assertRaises(RuntimeError, lambda: z[0][0][0].item()) def test_normal_shape(self): warned = False From 93973ee6993c97c8b9d60c4f720423bc625073ea Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 3 Dec 2020 08:19:25 -0800 Subject: [PATCH 015/132] Header cleanup (#48728) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48728 Mostly removing unnecessary includes so that TensorIterator.h can be included from NativeFunctions.h without causing cycles. There some cases where I moved code around so that I didn't have to pull in other unnecessary stuff. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D25278030 Pulled By: ezyang fbshipit-source-id: 5f6b95a6bc734e452e9bd7bee8fe5278f5e45be2 --- aten/src/ATen/Parallel.h | 9 - aten/src/ATen/ParallelOpenMP.h | 1 - aten/src/ATen/{native => }/TensorIterator.cpp | 1 + aten/src/ATen/TensorIterator.h | 556 ++++++++++++++++++ aten/src/ATen/TensorMeta.cpp | 21 + aten/src/ATen/TensorMeta.h | 21 +- aten/src/ATen/native/TensorFactories.h | 2 + aten/src/ATen/native/TensorIterator.h | 549 +---------------- aten/src/ATen/native/TensorIteratorReduce.cpp | 2 + .../ATen/native/cpu/RangeFactoriesKernel.cpp | 1 + aten/src/ATen/templates/MetaFunctions.h | 2 +- aten/src/ATen/templates/NativeFunctions.h | 1 + c10/util/FunctionRef.h | 4 + 13 files changed, 595 insertions(+), 575 deletions(-) rename aten/src/ATen/{native => }/TensorIterator.cpp (99%) create mode 100644 aten/src/ATen/TensorIterator.h create mode 100644 aten/src/ATen/TensorMeta.cpp diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 9e2f9be3e66e..e21401c52bd5 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -1,18 +1,9 @@ #pragma once -#include #include #include #include namespace at { -namespace internal { -// This parameter is heuristically chosen to determine the minimum number of -// work that warrants parallelism. For example, when summing an array, it is -// deemed inefficient to parallelise over arrays shorter than 32768. Further, -// no parallel algorithm (such as parallel_reduce) should split work into -// smaller than GRAIN_SIZE chunks. 
-constexpr int64_t GRAIN_SIZE = 32768; -} // namespace internal inline int64_t divup(int64_t x, int64_t y) { return (x + y - 1) / y; diff --git a/aten/src/ATen/ParallelOpenMP.h b/aten/src/ATen/ParallelOpenMP.h index 5e01d1de9d18..bbb369ba3d50 100644 --- a/aten/src/ATen/ParallelOpenMP.h +++ b/aten/src/ATen/ParallelOpenMP.h @@ -1,5 +1,4 @@ #pragma once -#include #include #include diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp similarity index 99% rename from aten/src/ATen/native/TensorIterator.cpp rename to aten/src/ATen/TensorIterator.cpp index 33461d63e0e1..43acc9a070d5 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h new file mode 100644 index 000000000000..11dbda5c7959 --- /dev/null +++ b/aten/src/ATen/TensorIterator.h @@ -0,0 +1,556 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// TensorIterator is a helper class for element-wise operations, such as +// arithmetic, comparisons, and trigonometric functions. It handles +// broadcasting and type conversions of operands. +// +// This is inspired by NumPy's Array Iterator API (NpyIter). +// +// The files Loops.h and Loops.cuh provide functions to build kernels that +// use TensorIterator. +// +// Example: +// +// auto iter = TensorIteratorConfig() +// .add_output(output) +// .add_input(input) +// .build() +// +// [MyKernel.cpp / MyKernel.cu] +// cpu_kernel(iter, [](float a, float b) { +// return a + b; +// }); +// +// gpu_kernel(iter, []GPU_LAMBDA(float a, float b) -> float { +// return a + b; +// }); +// +// Note [Common Dtype Computation] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Some operations have a natural notion of a "common dtype" or +// "computation dtype" where all inputs are cast to one dtype, the +// operation is performed, and then the results are cast to all outputs. +// +// TensorIterator infers a common dtype if all inputs have the same dtype, +// and it computes one using type promotion rules on its inputs if +// promote_inputs_to_common_dtype_ is true. Attempting to query +// a common dtype otherwise will throw an exception. +// +// Note that the outputs are not considered when computing a common dtype. + +namespace at { + +namespace internal { +// This parameter is heuristically chosen to determine the minimum number of +// work that warrants parallelism. For example, when summing an array, it is +// deemed inefficient to parallelise over arrays shorter than 32768. Further, +// no parallel algorithm (such as parallel_reduce) should split work into +// smaller than GRAIN_SIZE chunks. +constexpr int64_t GRAIN_SIZE = 32768; +} // namespace internal + +struct DimCounter { + DimCounter(IntArrayRef shape, Range range); + + void increment(const std::array& step); + bool is_done() const; + std::array max_2d_step() const; + + IntArrayRef shape; + Range range; + DimVector values; + int64_t offset; +}; + +struct CAFFE2_API OperandInfo { + using StrideVector = SmallVector; + OperandInfo() {} + explicit OperandInfo(Tensor t) : tensor(std::move(t)) { + if (tensor.defined()) { + device = tensor.device(); + target_dtype = tensor.scalar_type(); + current_dtype = target_dtype; + } + validate(); + } + + /// Stride after broadcasting. The stride is in bytes, not number of elements. + StrideVector stride_bytes; + + /// The tensor operand. 
Note that the strides, data pointer, and + /// other attributes may differ due to dimension reordering and + /// coalescing. + Tensor tensor; + + // Save the original tensor operand in cases when an output is modified + // (e.g. if dtype is changed) + Tensor original_tensor; + + /// The desired device and type for the operand. For inputs, this specifies that + /// the input should be converted to this type if necessary. For outputs, this + /// specifies which type to allocate. target_dtype and device are initialized with the dtype and device of the tensor + /// but during type promotion target_dtype value can become different from tensor's dtype + /// also, during type promotion target_dtype and device can be set for an undefined tensor so that tensor can be properly + /// constructed later. + Device device = kCPU; + ScalarType target_dtype = ScalarType::Undefined; + // Caches dtype of the tensor, because scalar_type is an expensive operation + // If dtype of the tensor is changed (e.g. as a result of type promotion or in allocate_outputs), this + //value should be changed too. + ScalarType current_dtype = ScalarType::Undefined; + + bool is_type_defined() const { return target_dtype != ScalarType::Undefined; } + TensorOptions options() const { + return TensorOptions(target_dtype).device(device); + } + + /// The data pointer. This may be different from tensor.data_ptr() if the + /// iterator is split. + void* data = nullptr; + + bool is_output = false; + + bool will_resize = false; + + bool is_read_write = false; + + void validate() { + TORCH_CHECK( + !tensor.defined() || tensor.layout() == kStrided, + "unsupported tensor layout: ", tensor.layout()); + } +}; + +struct SplitUntil32Bit; + +enum class FastSetupType : uint8_t { + NONE, + CONTIGUOUS, + CHANNELS_LAST, + NON_OVERLAPPING_DENSE +}; + +class TensorIteratorConfig; +struct TensorIterator; + +struct CAFFE2_API TensorIteratorBase : public impl::MetaBase { + using DimMask = std::bitset<64>; + using PtrVector = SmallVector; + using StrideVector = SmallVector; + + TensorIteratorBase(); + void build(TensorIteratorConfig&); + + // The inner-loop function operates on the fastest moving dimension. It + // implements element-wise operations in terms of 1-d strided tensors. + // + // Arguments: + // data: data pointers for each operand (length `ntensors`) + // strides: stride for each operand (length `ntensors`) + // size: size of inner loop + // + // The `size` often matches shape[0], but may be smaller due to + // parallelization of the inner loop. + using loop_t = c10::function_ref; + using loop2d_t = c10::function_ref; + + using loop_subiter_t = c10::function_ref; + + void foreach_reduced_elt(loop_subiter_t loop, bool parallelize=true); + + int ndim() const { return shape_.size(); } + IntArrayRef shape() const { return shape_; } + int64_t numel() const; + int ntensors() const { return operands_.size(); } + int noutputs() const { return num_outputs_; } + int ninputs() const { return ntensors() - noutputs(); } + IntArrayRef view_offsets() const { return view_offsets_; } + + /// number of elements in the output operand. this is the same as numel() for + /// operations that are not reductions. 
+ int64_t num_output_elements() const; + + /// number of reduced dimensions in a reduction operation + int num_reduce_dims() const; + + /// 1-dimensional iteration and no buffering or type conversion + bool is_trivial_1d() const; + /// Reducible to 1-dimensional and all operands are contiguous + bool is_contiguous() const; + bool is_dim_reduced(int dim) const; + + /// Accessors for each operand + IntArrayRef strides(int arg) const { return operands_[arg].stride_bytes; } + void* data_ptr(int arg) const; + ScalarType dtype(int arg=0) const { return operands_[arg].current_dtype; } + ScalarType common_dtype() const { + TORCH_INTERNAL_ASSERT(common_dtype_ != ScalarType::Undefined, "Queried for invalid common dtype!"); + return common_dtype_; + } + ScalarType input_dtype(int arg=0) const { return operands_[num_outputs_ + arg].current_dtype; } + Device device(int arg=0) const { return operands_[arg].device; } + DeviceType device_type(int arg=0) const { return device(arg).type(); } + int64_t element_size(int arg) const { return elementSize(dtype(arg)); } + bool is_scalar(int arg) const; + bool is_cpu_scalar(int arg) const; + + const Tensor& tensor(int arg) const { return operands_[arg].tensor; } + Tensor& tensor(int arg) { return operands_[arg].tensor; } + + Tensor output(int arg=0) const { + AT_ASSERT(arg < num_outputs_); + return operands_[arg].tensor; + } + + // Copies from temporary outputs back to the original outputs + // NOTE: only used on CPU + void cast_outputs(); + + Tensor input(int arg=0) const { + AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); + return operands_[num_outputs_ + arg].tensor; + } + + /// Removes an operand from this iterator + void remove_operand(int arg); + /// Shrinks an iterated dimension + void narrow(int dim, int64_t start, int64_t size); + /// Narrows every dim after and including `start_dim` to size one. + void select_all_keeping_dim(int start_dim, IntArrayRef starts); + /// Replaces the data pointer for the operand at index `arg`. + /// The new pointer should have the same sizes, strides and dtype as the + /// original + void unsafe_replace_operand(int arg, void* data); + + /// Splits this TensorIterator into two iterators. Together they iterate over + /// the entire operation. Used by `with_32bit_indexing()`. + std::unique_ptr split(int dim); + + /// Returns the dimension with the largest extent: (size[dim]-1) * stride[dim] + int get_dim_to_split() const; + + template + T scalar_value(int arg) { + auto& op = operands_[arg]; + return c10::fetch_and_cast(op.tensor.scalar_type(), op.data); + } + + void for_each(loop_t loop, int64_t grain_size = at::internal::GRAIN_SIZE); + void for_each(loop2d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE); + + void parallel_reduce(loop2d_t loop); + + void serial_for_each(loop_t loop, Range range) const; + void serial_for_each(loop2d_t loop, Range range) const; + + /// Create a strides array for a Tensor with shape of this iterator. The + /// parameter `element_size` specifies the size of Tensor's data type in + /// bytes (e.g. `4` for `float`) + StrideVector compatible_stride(int element_size) const; + + /// Inverts the re-ordering done by reorder_dimensions. This can only be + /// called *before* coalesce_dimensions() is called. + DimVector invert_perm(IntArrayRef input) const; + + /// Reapply same re-ordering as it is done by reorder_dimensions. This can + /// only be called *before* coalesce_dimensions() is called. 
+ DimVector apply_perm_and_mul(IntArrayRef input, int mul) const; + + /// Helper functions for CPU iteration + StrideVector get_dim_strides(int dim) const; + StrideVector get_strides() const; + StrideVector get_inner_strides() const { return get_dim_strides(0); } + PtrVector get_data_ptrs(ArrayRef base, IntArrayRef counter) const; + PtrVector get_base_ptrs() const; + + /// true if the stride computation can use 32-bit arithmetic. Used by GPU kernels + bool can_use_32bit_indexing() const; + + /// An "iteratable" object that recursively splits this iterator into sub-iterators + /// that can use 32-bit indexing. + SplitUntil32Bit with_32bit_indexing() const; + + /// If the kernel should accumulate into the output. Only relevant for CUDA + /// reductions. + bool should_accumulate() const { return accumulate_; } + + /// Whether this iterator produces the actual output, + /// as opposed to something that will be accumulated further. Only relevant for + /// CUDA reductions. + bool is_final_output() const { return final_output_; } + + bool has_contiguous_first_dim() const { + int num_tensors = ntensors(); + for (int i = 0; i < num_tensors; i++) { + if (strides(i)[0] != element_size(i)) { + return false; + } + } + return true; + } + +protected: + // Mutable reference as it moves tensors out of TensorIteratorConfig + void populate_operands(TensorIteratorConfig&); + void mark_outputs(); + void mark_resize_outputs(const TensorIteratorConfig&); + void compute_mem_overlaps(const TensorIteratorConfig&); + void compute_shape(const TensorIteratorConfig&); + void compute_strides(const TensorIteratorConfig&); + void reorder_dimensions(); + void permute_dimensions(IntArrayRef perm); + void compute_types(const TensorIteratorConfig&); + ScalarType compute_common_dtype(); + void allocate_or_resize_outputs(); + bool fast_set_up(const TensorIteratorConfig&); + FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); + void compute_names(const TensorIteratorConfig&); + void propagate_names_to_outputs(); + void coalesce_dimensions(); + +protected: + + /// Records the "computation" shape of the output tensor. The computation + /// shape is different from the regular shape in a few ways: + /// + /// - The shape may be permuted (via permute_dimensions) so that we + /// process the dimensions in the most computationally efficient order + /// (rather than the logical order given to us by the users.) + /// - The shape may have adjacent dimensions collapsed (via + /// coalesce_dimensions) so that we minimize the number of + /// dimensions we have to explicitly iterate over. For example, + /// a pointwise operation on a contiguous tensor "computationally" + /// consists of only a single dimension. + /// + /// In other words, the computation shape is the output shape as it + /// actually matters for implementing the kernel, but not necessarily the + /// output shape that the user will see in the end. + /// + /// The lifecycle of mutations to shape_ in TensorIterator: + /// - declare_static_shape() sets an initial shape explicitly + /// provided by user, otherwise + /// - compute_shape() computes the true (non-computational) shape + /// specified by the user. + /// - reorder_dimensions() reorders dimensions to improve coalescing. + /// - coalesce_dimensions() then coalesces adjacent dimensions when + /// possible. + /// + /// The shape may also be further modified if we create sub-TensorIterators, + /// e.g., via narrow or select_all_keeping_dim. 
+  DimVector shape_;
+
+  /// Temporarily records the permutation computed by reorder_dimensions.
+  /// This permutation maps the computation output dimension (dim) to
+  /// the original true output dimension (perm_[dim]). It is used by
+  /// invert_perm to undo the permutation. After coalesce_dimensions is
+  /// called, the permutation is no longer valid (as, in general, there
+  /// is no permutation that will map computation dimensions to
+  /// output dimensions); methods that manipulate perm_ are obligated
+  /// to test that !has_coalesced_dimensions
+  DimVector perm_;
+
+  /// Has coalesce_dimensions() (or any moral equivalent, e.g., fast_build())
+  /// been called? This is SOLELY used to check validity of perm_.
+  bool has_coalesced_dimensions_ = false;
+
+  /// The index offsets into the original tensors for each dimension.
+  /// This is only non-zero when you narrow() a TensorIterator (e.g.,
+  /// when you make sub-TensorIterators).
+  DimVector view_offsets_;
+
+  /// The computed names of the output tensor. Computed by compute_names()
+  NameVector names_;
+
+  /// The operands of the TensorIterator: both the inputs and outputs. The
+  /// outputs MUST come first in the operands_ list. There is always an
+  /// operand for each output of the TensorIterator, even if TensorIterator
+  /// will ultimately be responsible for allocating the output; in those
+  /// cases, tensor is simply undefined (and will be populated later
+  /// during build()).
+  ///
+  /// This list is initially populated prior to build(), but build() mutates
+  /// OperandInfo to populate more information.
+  SmallVector<OperandInfo, 4> operands_;
+
+  /// Number of outputs in operands_ (the length of the outputs prefix
+  /// in operands_).
+  int num_outputs_ = 0;
+
+  /// Whether or not all operands have the same shape. Having all the same
+  /// shape affects whether or not the iterator is eligible for fast setup.
+  bool all_ops_same_shape_ = false;
+
+  /// The "computation" dtype of TensorIterator, specifying the dtype in which
+  /// the internal computation is performed. Typically,
+  /// this matches the dtype of the output tensors, but not always!
+ ScalarType common_dtype_ = ScalarType::Undefined; + + /// Set by split(), see should_accumulate() and is_final_output() + bool accumulate_ = false; + bool final_output_ = true; + + // From TensorIteratorConfig + bool is_reduction_ = false; +}; + +struct CAFFE2_API TensorIterator final : public TensorIteratorBase { + TensorIterator() : TensorIteratorBase() {} + // Slicing is OK, TensorIterator guaranteed NOT to have any fields + TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {} + + static TensorIterator binary_float_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator binary_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator comparison_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator unary_op(Tensor& out, const Tensor& a); + static TensorIterator unary_float_op(Tensor& out, const Tensor& a); + static TensorIterator nullary_op(Tensor& out); + static TensorIterator reduce_op(Tensor& out, const Tensor& a); + static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); + + void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; +}; + +class CAFFE2_API TensorIteratorConfig final { +public: + friend struct TensorIteratorBase; + friend struct TensorIterator; + + TensorIteratorConfig() {} + + C10_DISABLE_COPY_AND_ASSIGN(TensorIteratorConfig); + + /// Construction + TensorIteratorConfig& add_output(const Tensor& output); + TensorIteratorConfig& add_input(const Tensor& input); + + // Sets the check_mem_overlap_ flag, which is true by default. + // If true, inputs are checked for partial overlap with the outputs and + // outputs are checked for internal overlap (e.g. broadcasted views). An error + // is raised if unacceptable overlap is detected. + // If you're migrating an existing operator to using TensorIterator, please + // consider if the previous implementation checked memory overlap. If it did + // not, and if the operator is idempotent (for example, Tensor.fill_(0)), then + // checking memory overlap is BC-breaking. Please don't check memory overlap + // in that case. + TensorIteratorConfig& set_check_mem_overlap(bool check_mem_overlap); + + // Sets the check_all_same_dtype_ flag, which is true by default + // If true, checks that all inputs and defined outputs have the same dtype + // Setting either of promote_inputs_to_common_dtype_ + // or cast_common_dtype_to_outputs_ to true will set + // check_all_same_dtype_ to false. + TensorIteratorConfig& check_all_same_dtype(const bool _check_all_same_dtype); + + // Sets the check_all_same_device_ flag, which is true by default + // If true, all operands must be on the same device, with the possible + // exception of CPU scalars, which can be passed to some CUDA kernels + // as kernel arguments. + TensorIteratorConfig& check_all_same_device(const bool _check_all_same_device); + + // Sets the enforce_safe_casting_to_output_ flag, which is false by default + // If true, the iterator's "common dtype" must be computable + // (see the [Common Dtype Computation] note) and + // canCast(common dtype, output dtype) must be true for all outputs. 
+  TensorIteratorConfig& enforce_safe_casting_to_output(const bool _enforce_safe_casting_to_output);
+
+  // Sets the promote_inputs_to_common_dtype_ flag, which is false by default
+  // If true, the iterator's "common dtype" is always computed (see the
+  // [Common Dtype Computation] note) and, on the CPU, temporary copies of
+  // the inputs in the common dtype are passed as the actual inputs to
+  // the operation.
+  // Setting this flag to true sets check_all_same_dtype_ to false.
+  TensorIteratorConfig& promote_inputs_to_common_dtype(const bool _promote_inputs_to_common_dtype);
+
+  // Sets the promote_integer_inputs_to_float_ flag, which is false by default
+  // NOTE: If set to true, the promote_inputs_to_common_dtype_ must also be true.
+  // If true, if the iterator's "common dtype" is an integral type (including bool)
+  // then it is changed to the default float scalar type.
+  TensorIteratorConfig& promote_integer_inputs_to_float(const bool _promote_integer_inputs_to_float);
+  TensorIteratorConfig& is_reduction(const bool _is_reduction);
+  TensorIteratorConfig& allow_cpu_scalars(const bool _allow_cpu_scalars);
+
+  // Sets the cast_common_dtype_to_outputs_ flag, which is false by default
+  // If true, the iterator's "common dtype" must be computable
+  // (see the [Common Dtype Computation] note) and, on the CPU, temporary
+  // copies of the outputs are passed as the actual output to the operation.
+  // These temporaries are then copied to the original outputs after
+  // the operation is performed (see cast_outputs()).
+  // Setting this flag to true sets check_all_same_dtype_ to false.
+  TensorIteratorConfig& cast_common_dtype_to_outputs(const bool _cast_common_dtype_to_outputs);
+  TensorIteratorConfig& resize_outputs(bool resize_outputs);
+
+  // Bypass output dtype/device computation and fix the dtype/device as specified here.
+  TensorIteratorConfig& declare_static_dtype_and_device(ScalarType dtype, Device device);
+  TensorIteratorConfig& declare_static_shape(IntArrayRef shape);
+  TensorIteratorConfig& declare_static_shape(IntArrayRef shape, IntArrayRef squash_dims);
+
+  // It would be better if this was && qualified, but this would be at the cost
+  // of a lot of boilerplate above
+  TensorIterator build() {
+    TensorIterator iter;
+    iter.build(*this);
+    return iter;
+  }
+
+private:
+  SmallVector<Tensor, 4> tensors_;
+  int num_outputs_ = 0;
+  int num_inputs_ = 0;
+
+  c10::optional<DimVector> static_shape_ = c10::nullopt;
+  c10::optional<std::pair<ScalarType, Device>> static_dtype_and_device_ = c10::nullopt;
+  bool check_mem_overlap_ = true;
+  bool allow_cpu_scalars_ = false;
+  bool is_reduction_ = false;
+  bool resize_outputs_ = true;
+  bool check_all_same_dtype_ = true;
+  bool check_all_same_device_ = true;
+  bool enforce_safe_casting_to_output_ = false;
+  bool promote_inputs_to_common_dtype_ = false;
+  bool promote_integer_inputs_to_float_ = false;
+  bool cast_common_dtype_to_outputs_ = false;
+};
+
+
+
+/// A container-like struct that acts as if it contains splits of a
+/// TensorIterator that can use 32-bit indexing. Taken together the splits cover
+/// the original TensorIterator.
+struct CAFFE2_API SplitUntil32Bit {
+  struct CAFFE2_API iterator {
+    iterator() {};
+    iterator(const TensorIteratorBase& iter);
+    iterator(iterator&&) = default;
+
+    // Guaranteed to be a TensorIterator proper!
+ TensorIterator& operator*() const; + iterator& operator++(); + bool operator==(const iterator& other) const { + // two iterators are equal if they are the same object or they're both empty + return this == &other || (vec.empty() && other.vec.empty()); + } + // needed for C++11 range-based for loop + bool operator!=(const iterator& other) const { return !(*this == other); } + + /// stack of TensorIterators to be split + std::vector> vec; + }; + + SplitUntil32Bit(const TensorIteratorBase& iter) : iter(iter) {} + + iterator begin() const; + iterator end() const; + +private: + const TensorIteratorBase& iter; +}; + +} // namespace at diff --git a/aten/src/ATen/TensorMeta.cpp b/aten/src/ATen/TensorMeta.cpp new file mode 100644 index 000000000000..30dca8ccaf2e --- /dev/null +++ b/aten/src/ATen/TensorMeta.cpp @@ -0,0 +1,21 @@ +#include +#include + +namespace at { + +Tensor meta_tensor_from_meta(const TensorMeta& meta) { + // TODO: eliminate indirection + return at::empty_meta(meta.sizes, meta.options); +} + +Tensor tensor_from_meta(const TensorMeta& meta) { + // TODO: eliminate indirection + return at::empty(meta.sizes, meta.options); +} + +// Analogous to self.new_empty(sizes) +TensorMeta new_meta(const Tensor& self, IntArrayRef sizes) { + return TensorMeta(sizes, self.options()); +} + +} // namespace at diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h index 59a7dc740175..baa6e6112b34 100644 --- a/aten/src/ATen/TensorMeta.h +++ b/aten/src/ATen/TensorMeta.h @@ -1,14 +1,13 @@ #pragma once -#include // TODO: improve -// #include - #include #include #include namespace at { +class Tensor; + namespace impl { struct MetaBase { @@ -30,19 +29,9 @@ struct TensorMeta { : sizes(_sizes), options(_options) {} }; -inline Tensor meta_tensor_from_meta(const TensorMeta& meta) { - // TODO: eliminate indirection - return at::empty_meta(meta.sizes, meta.options); -} - -inline Tensor tensor_from_meta(const TensorMeta& meta) { - // TODO: eliminate indirection - return at::empty(meta.sizes, meta.options); -} - +CAFFE2_API Tensor meta_tensor_from_meta(const TensorMeta& meta); +CAFFE2_API Tensor tensor_from_meta(const TensorMeta& meta); // Analogous to self.new_empty(sizes) -inline TensorMeta new_meta(const Tensor& self, IntArrayRef sizes) { - return TensorMeta(sizes, self.options()); -} +CAFFE2_API TensorMeta new_meta(const Tensor& self, IntArrayRef sizes); } // namespace at diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index 579cfdb624e7..d5943ac55ae5 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -1,5 +1,7 @@ #pragma once +#include +#include #include #include #include diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 25b255d72fef..e55d2a58d709 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -1,549 +1,2 @@ #pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TensorIterator is a helper class for element-wise operations, such as -// arithmetic, comparisons, and trigonometric functions. It handles -// broadcasting and type conversions of operands. -// -// This is inspired by NumPy's Array Iterator API (NpyIter). -// -// The files Loops.h and Loops.cuh provide functions to build kernels that -// use TensorIterator. 
-// -// Example: -// -// auto iter = TensorIteratorConfig() -// .add_output(output) -// .add_input(input) -// .build() -// -// [MyKernel.cpp / MyKernel.cu] -// cpu_kernel(iter, [](float a, float b) { -// return a + b; -// }); -// -// gpu_kernel(iter, []GPU_LAMBDA(float a, float b) -> float { -// return a + b; -// }); -// -// Note [Common Dtype Computation] -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// Some operations have a natural notion of a "common dtype" or -// "computation dtype" where all inputs are cast to one dtype, the -// operation is performed, and then the results are cast to all outputs. -// -// TensorIterator infers a common dtype if all inputs have the same dtype, -// and it computes one using type promotion rules on its inputs if -// promote_inputs_to_common_dtype_ is true. Attempting to query -// a common dtype otherwise will throw an exception. -// -// Note that the outputs are not considered when computing a common dtype. - -namespace at { - -struct DimCounter { - DimCounter(IntArrayRef shape, Range range); - - void increment(const std::array& step); - bool is_done() const; - std::array max_2d_step() const; - - IntArrayRef shape; - Range range; - DimVector values; - int64_t offset; -}; - -struct CAFFE2_API OperandInfo { - using StrideVector = SmallVector; - OperandInfo() {} - explicit OperandInfo(Tensor t) : tensor(std::move(t)) { - if (tensor.defined()) { - device = tensor.device(); - target_dtype = tensor.scalar_type(); - current_dtype = target_dtype; - } - validate(); - } - - /// Stride after broadcasting. The stride is in bytes, not number of elements. - StrideVector stride_bytes; - - /// The tensor operand. Note that the strides, data pointer, and - /// other attributes may differ due to dimension reordering and - /// coalescing. - Tensor tensor; - - // Save the original tensor operand in cases when an output is modified - // (e.g. if dtype is changed) - Tensor original_tensor; - - /// The desired device and type for the operand. For inputs, this specifies that - /// the input should be converted to this type if necessary. For outputs, this - /// specifies which type to allocate. target_dtype and device are initialized with the dtype and device of the tensor - /// but during type promotion target_dtype value can become different from tensor's dtype - /// also, during type promotion target_dtype and device can be set for an undefined tensor so that tensor can be properly - /// constructed later. - Device device = kCPU; - ScalarType target_dtype = ScalarType::Undefined; - // Caches dtype of the tensor, because scalar_type is an expensive operation - // If dtype of the tensor is changed (e.g. as a result of type promotion or in allocate_outputs), this - //value should be changed too. - ScalarType current_dtype = ScalarType::Undefined; - - bool is_type_defined() const { return target_dtype != ScalarType::Undefined; } - TensorOptions options() const { - return TensorOptions(target_dtype).device(device); - } - - /// The data pointer. This may be different from tensor.data_ptr() if the - /// iterator is split. 
- void* data = nullptr; - - bool is_output = false; - - bool will_resize = false; - - bool is_read_write = false; - - void validate() { - TORCH_CHECK( - !tensor.defined() || tensor.layout() == kStrided, - "unsupported tensor layout: ", tensor.layout()); - } -}; - -struct SplitUntil32Bit; - -enum class FastSetupType : uint8_t { - NONE, - CONTIGUOUS, - CHANNELS_LAST, - NON_OVERLAPPING_DENSE -}; - -class TensorIteratorConfig; -struct TensorIterator; - -struct CAFFE2_API TensorIteratorBase : public impl::MetaBase { - using DimMask = std::bitset<64>; - using PtrVector = SmallVector; - using StrideVector = SmallVector; - - TensorIteratorBase(); - void build(TensorIteratorConfig&); - - // The inner-loop function operates on the fastest moving dimension. It - // implements element-wise operations in terms of 1-d strided tensors. - // - // Arguments: - // data: data pointers for each operand (length `ntensors`) - // strides: stride for each operand (length `ntensors`) - // size: size of inner loop - // - // The `size` often matches shape[0], but may be smaller due to - // parallelization of the inner loop. - using loop_t = c10::function_ref; - using loop2d_t = c10::function_ref; - - using loop_subiter_t = c10::function_ref; - - void foreach_reduced_elt(loop_subiter_t loop, bool parallelize=true); - - int ndim() const { return shape_.size(); } - IntArrayRef shape() const { return shape_; } - int64_t numel() const; - int ntensors() const { return operands_.size(); } - int noutputs() const { return num_outputs_; } - int ninputs() const { return ntensors() - noutputs(); } - IntArrayRef view_offsets() const { return view_offsets_; } - - /// number of elements in the output operand. this is the same as numel() for - /// operations that are not reductions. - int64_t num_output_elements() const; - - /// number of reduced dimensions in a reduction operation - int num_reduce_dims() const; - - /// 1-dimensional iteration and no buffering or type conversion - bool is_trivial_1d() const; - /// Reducible to 1-dimensional and all operands are contiguous - bool is_contiguous() const; - bool is_dim_reduced(int dim) const; - - /// Accessors for each operand - IntArrayRef strides(int arg) const { return operands_[arg].stride_bytes; } - void* data_ptr(int arg) const; - ScalarType dtype(int arg=0) const { return operands_[arg].current_dtype; } - ScalarType common_dtype() const { - TORCH_INTERNAL_ASSERT(common_dtype_ != ScalarType::Undefined, "Queried for invalid common dtype!"); - return common_dtype_; - } - ScalarType input_dtype(int arg=0) const { return operands_[num_outputs_ + arg].current_dtype; } - Device device(int arg=0) const { return operands_[arg].device; } - DeviceType device_type(int arg=0) const { return device(arg).type(); } - int64_t element_size(int arg) const { return elementSize(dtype(arg)); } - bool is_scalar(int arg) const; - bool is_cpu_scalar(int arg) const; - - const Tensor& tensor(int arg) const { return operands_[arg].tensor; } - Tensor& tensor(int arg) { return operands_[arg].tensor; } - - Tensor output(int arg=0) const { - AT_ASSERT(arg < num_outputs_); - return operands_[arg].tensor; - } - - // Copies from temporary outputs back to the original outputs - // NOTE: only used on CPU - void cast_outputs(); - - Tensor input(int arg=0) const { - AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); - return operands_[num_outputs_ + arg].tensor; - } - - /// Removes an operand from this iterator - void remove_operand(int arg); - /// Shrinks an iterated dimension - void narrow(int dim, int64_t 
start, int64_t size); - /// Narrows every dim after and including `start_dim` to size one. - void select_all_keeping_dim(int start_dim, IntArrayRef starts); - /// Replaces the data pointer for the operand at index `arg`. - /// The new pointer should have the same sizes, strides and dtype as the - /// original - void unsafe_replace_operand(int arg, void* data); - - /// Splits this TensorIterator into two iterators. Together they iterate over - /// the entire operation. Used by `with_32bit_indexing()`. - std::unique_ptr split(int dim); - - /// Returns the dimension with the largest extent: (size[dim]-1) * stride[dim] - int get_dim_to_split() const; - - template - T scalar_value(int arg) { - auto& op = operands_[arg]; - return c10::fetch_and_cast(op.tensor.scalar_type(), op.data); - } - - void for_each(loop_t loop, int64_t grain_size = at::internal::GRAIN_SIZE); - void for_each(loop2d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE); - - void parallel_reduce(loop2d_t loop); - - void serial_for_each(loop_t loop, Range range) const; - void serial_for_each(loop2d_t loop, Range range) const; - - /// Create a strides array for a Tensor with shape of this iterator. The - /// parameter `element_size` specifies the size of Tensor's data type in - /// bytes (e.g. `4` for `float`) - StrideVector compatible_stride(int element_size) const; - - /// Inverts the re-ordering done by reorder_dimensions. This can only be - /// called *before* coalesce_dimensions() is called. - DimVector invert_perm(IntArrayRef input) const; - - /// Reapply same re-ordering as it is done by reorder_dimensions. This can - /// only be called *before* coalesce_dimensions() is called. - DimVector apply_perm_and_mul(IntArrayRef input, int mul) const; - - /// Helper functions for CPU iteration - StrideVector get_dim_strides(int dim) const; - StrideVector get_strides() const; - StrideVector get_inner_strides() const { return get_dim_strides(0); } - PtrVector get_data_ptrs(ArrayRef base, IntArrayRef counter) const; - PtrVector get_base_ptrs() const; - - /// true if the stride computation can use 32-bit arithmetic. Used by GPU kernels - bool can_use_32bit_indexing() const; - - /// An "iteratable" object that recursively splits this iterator into sub-iterators - /// that can use 32-bit indexing. - SplitUntil32Bit with_32bit_indexing() const; - - /// If the kernel should accumulate into the output. Only relevant for CUDA - /// reductions. - bool should_accumulate() const { return accumulate_; } - - /// Whether this iterator produces the actual output, - /// as opposed to something that will be accumulated further. Only relevant for - /// CUDA reductions. 
- bool is_final_output() const { return final_output_; } - - bool has_contiguous_first_dim() const { - int num_tensors = ntensors(); - for (int i = 0; i < num_tensors; i++) { - if (strides(i)[0] != element_size(i)) { - return false; - } - } - return true; - } - -protected: - // Mutable reference as it moves tensors out of TensorIteratorConfig - void populate_operands(TensorIteratorConfig&); - void mark_outputs(); - void mark_resize_outputs(const TensorIteratorConfig&); - void compute_mem_overlaps(const TensorIteratorConfig&); - void compute_shape(const TensorIteratorConfig&); - void compute_strides(const TensorIteratorConfig&); - void reorder_dimensions(); - void permute_dimensions(IntArrayRef perm); - void compute_types(const TensorIteratorConfig&); - ScalarType compute_common_dtype(); - void allocate_or_resize_outputs(); - bool fast_set_up(const TensorIteratorConfig&); - FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); - void compute_names(const TensorIteratorConfig&); - void propagate_names_to_outputs(); - void coalesce_dimensions(); - -protected: - - /// Records the "computation" shape of the output tensor. The computation - /// shape is different from the regular shape in a few ways: - /// - /// - The shape may be permuted (via permute_dimensions) so that we - /// process the dimensions in the most computationally efficient order - /// (rather than the logical order given to us by the users.) - /// - The shape may have adjacent dimensions collapsed (via - /// coalesce_dimensions) so that we minimize the number of - /// dimensions we have to explicitly iterate over. For example, - /// a pointwise operation on a contiguous tensor "computationally" - /// consists of only a single dimension. - /// - /// In other words, the computation shape is the output shape as it - /// actually matters for implementing the kernel, but not necessarily the - /// output shape that the user will see in the end. - /// - /// The lifecycle of mutations to shape_ in TensorIterator: - /// - declare_static_shape() sets an initial shape explicitly - /// provided by user, otherwise - /// - compute_shape() computes the true (non-computational) shape - /// specified by the user. - /// - reorder_dimensions() reorders dimensions to improve coalescing. - /// - coalesce_dimensions() then coalesces adjacent dimensions when - /// possible. - /// - /// The shape may also be further modified if we create sub-TensorIterators, - /// e.g., via narrow or select_all_keeping_dim. - DimVector shape_; - - /// Temporarily records the permutation computed by reorder_dimensions. - /// This permutation maps the computation output dimension (dim) to - /// the original true output dimension (perm_[dim]). It is used by - /// invert_perm to undo the permutation. After coalesce_dimensions is - /// called, the permutation is no longer valid (as, in general, there - /// is no permutation that will make computation dimensions to - /// output dimensions); methods that manipulate perm_ are obligated - /// to test that !has_coalesced_dimensions - DimVector perm_; - - /// Has coalesce_dimensions() (or any moral equivalent, e.g., fast_build()) - /// been called? This is SOLELY used to check validity of perm_. - bool has_coalesced_dimensions_ = false; - - /// The index offsets into the original tensors for each dimension. - /// This is only non-zero when you narrow() a TensorIterator (e.g., - /// when you make sub-TensorIterators). - DimVector view_offsets_; - - /// The computed names of the output tensor. 
Computed by compute_names() - NameVector names_; - - /// The operands of the TensorIterator: both the inputs and outputs. The - /// outputs MUST come first in the operands_ list. There is always an - /// operand for each output of the TensorIterator, even if TensorIterator - /// will ultimately be responsible for allocating the output; in those - /// cases, tensor is simply undefined (and will be populated later - /// during build()). - /// - /// This list is initially populated prior to build(), but build() mutates - /// OperandInfo to populate more information. - SmallVector operands_; - - /// Number of outputs in operands_ (the length of the outputs prefix - /// in operands_). - int num_outputs_ = 0; - - /// Whether or not all operands have the same shape. Having all the same - /// shape affects whether or not the iterator is eligible for fast setup. - bool all_ops_same_shape_ = false; - - /// The "computation" dtype of TensorIterator, specifying what the dtype - /// we will do the internal computation in TensorIterator. Typically, - /// this matches the dtype of the output tensors, but not always! - ScalarType common_dtype_ = ScalarType::Undefined; - - /// Set by split(), see should_accumulate() and is_final_output() - bool accumulate_ = false; - bool final_output_ = true; - - // From TensorIteratorConfig - bool is_reduction_ = false; -}; - -struct CAFFE2_API TensorIterator final : public TensorIteratorBase { - TensorIterator() : TensorIteratorBase() {} - // Slicing is OK, TensorIterator guaranteed NOT to have any fields - TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {} - - static TensorIterator binary_float_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator binary_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator comparison_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator unary_op(Tensor& out, const Tensor& a); - static TensorIterator unary_float_op(Tensor& out, const Tensor& a); - static TensorIterator nullary_op(Tensor& out); - static TensorIterator reduce_op(Tensor& out, const Tensor& a); - static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); - - void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; -}; - -class CAFFE2_API TensorIteratorConfig final { -public: - friend struct TensorIteratorBase; - friend struct TensorIterator; - - TensorIteratorConfig() {} - - C10_DISABLE_COPY_AND_ASSIGN(TensorIteratorConfig); - - /// Construction - TensorIteratorConfig& add_output(const Tensor& output); - TensorIteratorConfig& add_input(const Tensor& input); - - // Sets the check_mem_overlap_ flag, which is true by default. - // If true, inputs are checked for partial overlap with the outputs and - // outputs are checked for internal overlap (e.g. broadcasted views). An error - // is raised if unacceptable overlap is detected. - // If you're migrating an existing operator to using TensorIterator, please - // consider if the previous implementation checked memory overlap. If it did - // not, and if the operator is idempotent (for example, Tensor.fill_(0)), then - // checking memory overlap is BC-breaking. Please don't check memory overlap - // in that case. 
- TensorIteratorConfig& set_check_mem_overlap(bool check_mem_overlap); - - // Sets the check_all_same_dtype_ flag, which is true by default - // If true, checks that all inputs and defined outputs have the same dtype - // Setting either of promote_inputs_to_common_dtype_ - // or cast_common_dtype_to_outputs_ to true will set - // check_all_same_dtype_ to false. - TensorIteratorConfig& check_all_same_dtype(const bool _check_all_same_dtype); - - // Sets the check_all_same_device_ flag, which is true by default - // If true, all operands must be on the same device, with the possible - // exception of CPU scalars, which can be passed to some CUDA kernels - // as kernel arguments. - TensorIteratorConfig& check_all_same_device(const bool _check_all_same_device); - - // Sets the enforce_safe_casting_to_output_ flag, which is false by default - // If true, the iterator's "common dtype" must be computable - // (see the [Common Dtype Computation] note) and - // canCast(common dtype, output dtype) must be true for all outputs. - TensorIteratorConfig& enforce_safe_casting_to_output(const bool _enforce_safe_casting_to_output); - - // Sets the promote_inputs_to_common_dtype_ flag, which is false by default - // If true, the iterator's "common dtype" is always computed (see the - // [Common Dtype Computation] note) and, on the CPU, temporary copies of - // the inputs in the common dtype are passed as the actual inputs to - // the operation. - // Setting this flag to true sets check_all_same_dtype_ to false. - TensorIteratorConfig& promote_inputs_to_common_dtype(const bool _promote_inputs_to_common_dtype); - - // Sets the promote_integer_inputs_to_float_ flag, which is false by default - // NOTE: If set to true, the promote_inputs_to_common_dtype_ must also be true. - // If true, if the iterator's "common dtype" is an integral type (including bool) - // then it is changed to the default float scalar type. - TensorIteratorConfig& promote_integer_inputs_to_float(const bool _promote_integer_inputs_to_float); - TensorIteratorConfig& is_reduction(const bool _is_reduction); - TensorIteratorConfig& allow_cpu_scalars(const bool _allow_cpu_scalars); - - // Sets the cast_common_dtype_to_outputs_ flag, which is false by default - // If true, the iterator's "common dtype" must be computatable - // (see the [Common Dtype Computation] note) and, on the CPU, temporary - // copies of the outputs are passed as the actual output to the operation. - // These temporaries are then copied to the original outputs after - // the operation is performed (see cast_outputs()). - // Setting this flag to true sets check_all_same_dtype_ to false. - TensorIteratorConfig& cast_common_dtype_to_outputs(const bool _cast_common_dtype_to_outputs); - TensorIteratorConfig& resize_outputs(bool resize_outputs); - - // Bypass output dtype/device computation and fix the dtype/device as specified here. 
- TensorIteratorConfig& declare_static_dtype_and_device(ScalarType dtype, Device device); - TensorIteratorConfig& declare_static_shape(IntArrayRef shape); - TensorIteratorConfig& declare_static_shape(IntArrayRef shape, IntArrayRef squash_dims); - - // It would be better if this was && qualified, but this would be at the cost - // of a lot of boilerplate above - TensorIterator build() { - TensorIterator iter; - iter.build(*this); - return iter; - } - -private: - SmallVector tensors_; - int num_outputs_ = 0; - int num_inputs_ = 0; - - c10::optional static_shape_ = c10::nullopt; - c10::optional> static_dtype_and_device_ = c10::nullopt; - bool check_mem_overlap_ = true; - bool allow_cpu_scalars_ = false; - bool is_reduction_ = false; - bool resize_outputs_ = true; - bool check_all_same_dtype_ = true; - bool check_all_same_device_ = true; - bool enforce_safe_casting_to_output_ = false; - bool promote_inputs_to_common_dtype_ = false; - bool promote_integer_inputs_to_float_ = false; - bool cast_common_dtype_to_outputs_ = false; -}; - - - -/// A container-like struct that acts as if it contains splits of a -/// TensorIterator that can use 32-bit indexing. Taken together the splits cover -/// the original TensorIterator. -struct CAFFE2_API SplitUntil32Bit { - struct CAFFE2_API iterator { - iterator() {}; - iterator(const TensorIteratorBase& iter); - iterator(iterator&&) = default; - - // Guaranteed to be a TensorIterator proper! - TensorIterator& operator*() const; - iterator& operator++(); - bool operator==(const iterator& other) const { - // two iterators are equal if they are the same object or they're both empty - return this == &other || (vec.empty() && other.vec.empty()); - } - // needed for C++11 range-based for loop - bool operator!=(const iterator& other) const { return !(*this == other); } - - /// stack of TensorIterators to be split - std::vector> vec; - }; - - SplitUntil32Bit(const TensorIteratorBase& iter) : iter(iter) {} - - iterator begin() const; - iterator end() const; - -private: - const TensorIteratorBase& iter; -}; - -} // namespace at +#include diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index 3837c7567e31..fcce06a6c936 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include /// Contains the implementation of parallel reductions in TensorIterator. 
diff --git a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp index 55ab614e42d1..ba7f1af7eabb 100644 --- a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp +++ b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include diff --git a/aten/src/ATen/templates/MetaFunctions.h b/aten/src/ATen/templates/MetaFunctions.h index 448fc9b6f4b2..d0489d1964f3 100644 --- a/aten/src/ATen/templates/MetaFunctions.h +++ b/aten/src/ATen/templates/MetaFunctions.h @@ -2,8 +2,8 @@ // ${generated_comment} -#include // TODO: improve #include +#include namespace at { namespace meta { diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index b4cb31f60ee8..2e35fde1b95e 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -3,6 +3,7 @@ // ${generated_comment} #include +#include #include #include #include diff --git a/c10/util/FunctionRef.h b/c10/util/FunctionRef.h index a3730476b734..b3b9930cbbb5 100644 --- a/c10/util/FunctionRef.h +++ b/c10/util/FunctionRef.h @@ -18,6 +18,10 @@ #pragma once +#include +#include +#include + namespace c10 { /// An efficient, type-erasing, non-owning reference to a callable. This is From 92f376147c174bd8c5f700648d4d645540deb4df Mon Sep 17 00:00:00 2001 From: Joe Zhu Date: Thu, 3 Dec 2020 08:30:43 -0800 Subject: [PATCH 016/132] Enable TCPStore on Windows (#47749) Summary: Enable TcpStore for DDP on Windows platform, in order to improve running DDP cross machines performance. Related RFC is https://github.com/pytorch/pytorch/issues/47659 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47749 Reviewed By: bdhirsh Differential Revision: D25220401 Pulled By: mrshenli fbshipit-source-id: da4b46b42296e666fa7d8ec8040093de7443a529 --- test/distributed/test_c10d.py | 10 +- torch/csrc/distributed/c10d/init.cpp | 6 +- torch/distributed/__init__.py | 2 +- torch/distributed/rendezvous.py | 11 +- torch/lib/c10d/CMakeLists.txt | 10 +- torch/lib/c10d/TCPStore.cpp | 302 +++++++++++++++++---------- torch/lib/c10d/TCPStore.hpp | 17 +- torch/lib/c10d/UnixSockUtils.hpp | 89 ++++++++ torch/lib/c10d/Utils.cpp | 78 ++----- torch/lib/c10d/Utils.hpp | 44 +++- torch/lib/c10d/WinSockUtils.hpp | 84 ++++++++ torch/lib/c10d/test/CMakeLists.txt | 2 +- torch/lib/c10d/test/TCPStoreTest.cpp | 2 +- 13 files changed, 453 insertions(+), 204 deletions(-) create mode 100644 torch/lib/c10d/UnixSockUtils.hpp create mode 100644 torch/lib/c10d/WinSockUtils.hpp diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 195fa57c278b..3b25be6e49c1 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -275,7 +275,6 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) -@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store("localhost") @@ -283,7 +282,11 @@ def _create_store(self): return store def test_address_already_in_use(self): - with self.assertRaisesRegex(RuntimeError, "^Address already in use$"): + if sys.platform == 'win32': + err_msg_reg = "Only one usage of each socket address*" + else: + err_msg_reg = "^Address already in use$" + with self.assertRaisesRegex(RuntimeError, err_msg_reg): addr = "localhost" port = common.find_free_port() @@ -321,8 +324,6 @@ def _test_numkeys_delkeys(self, fs): def test_numkeys_delkeys(self): 
self._test_numkeys_delkeys(self._create_store()) - -@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -379,7 +380,6 @@ def test_unknown_handler(self): c10d.rendezvous("invalid://") -@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures @requires_nccl() diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 8850c974c2a9..58556c8831ab 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -2,10 +2,10 @@ #include #include +#include #ifndef _WIN32 #include #include -#include #endif #include @@ -590,6 +590,7 @@ Example:: >>> store.set("first_key", "first_value") )") .def(py::init<>()); +#endif intrusive_ptr_class_<::c10d::TCPStore>( module, @@ -629,7 +630,6 @@ Example:: py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); -#endif intrusive_ptr_class_<::c10d::PrefixStore>( module, @@ -1235,7 +1235,6 @@ that adds a prefix to each key inserted to the store. static const auto StoreTorchBind = torch::class_<::c10d::Store>("dist_c10d", "Store"); -#ifndef _WIN32 static const auto TCPStoreTorchBind = torch::class_<::c10d::TCPStore>("dist_c10d", "TCPStore") .def(torch::init([](const std::string& host_name, @@ -1245,7 +1244,6 @@ static const auto TCPStoreTorchBind = return c10::make_intrusive<::c10d::TCPStore>( host_name, port, world_size, is_master); })); -#endif // Torchbind the ProcessGroup to make it available in TorchScript static const auto ProcessGroupWorkTorchBind = diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index bba20cbed570..1335fe9d1d6d 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -23,6 +23,7 @@ def is_available(): from torch._C._distributed_c10d import ( Store, FileStore, + TCPStore, ProcessGroup, Reducer, BuiltinCommHookType, @@ -36,7 +37,6 @@ def is_available(): ) if sys.platform != 'win32': from torch._C._distributed_c10d import ( - TCPStore, HashStore, _round_robin_process_groups, ) diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 55705f987a6e..b3bedfbc76ca 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -9,12 +9,9 @@ import sys from datetime import timedelta from typing import Optional, Dict, Union -from torch._C._distributed_c10d import FileStore +from torch._C._distributed_c10d import FileStore, TCPStore from .constants import default_pg_timeout -if sys.platform != 'win32': - from torch._C._distributed_c10d import TCPStore - _rendezvous_handlers = {} @@ -196,8 +193,6 @@ def _env_error(var): # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using env:// method") -if sys.platform != 'win32': - register_rendezvous_handler("tcp", _tcp_rendezvous_handler) - register_rendezvous_handler("env", _env_rendezvous_handler) - +register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) register_rendezvous_handler("file", _file_rendezvous_handler) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 20c12506f009..4e72e2e32fbf 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,14 +45,15 @@ endfunction() set(C10D_SRCS FileStore.cpp + PrefixStore.cpp ProcessGroup.cpp Store.cpp - PrefixStore.cpp + TCPStore.cpp Utils.cpp ) if(NOT WIN32) 
- list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp TCPStore.cpp) + list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp) endif() set(C10D_LIBS torch) @@ -123,6 +124,7 @@ copy_header(FileStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) +copy_header(TCPStore.hpp) copy_header(Types.hpp) copy_header(Utils.hpp) if(USE_GLOO) @@ -131,7 +133,9 @@ if(USE_GLOO) endif() if(NOT WIN32) copy_header(HashStore.hpp) - copy_header(TCPStore.hpp) + copy_header(UnixSockUtils.hpp) +else() + copy_header(WinSockUtils.hpp) endif() if(USE_C10D_NCCL) diff --git a/torch/lib/c10d/TCPStore.cpp b/torch/lib/c10d/TCPStore.cpp index 55705005aad0..4151448e677a 100644 --- a/torch/lib/c10d/TCPStore.cpp +++ b/torch/lib/c10d/TCPStore.cpp @@ -1,16 +1,22 @@ #include +#ifdef _WIN32 +#include +#include +#else #include - #include +#endif + #include +#include #include namespace c10d { namespace { -enum class QueryType : uint8_t { SET, GET, ADD, CHECK, WAIT, GETNUMKEYS, DELETE }; +enum class QueryType : uint8_t { SET, GET, ADD, CHECK, WAIT, GETNUMKEYS, DELETE_KEY }; enum class CheckResponseType : uint8_t { READY, NOT_READY }; @@ -23,11 +29,7 @@ enum class WaitResponseType : uint8_t { STOP_WAITING }; TCPStoreDaemon::TCPStoreDaemon(int storeListenSocket) : storeListenSocket_(storeListenSocket) { // Use control pipe to signal instance destruction to the daemon thread. - if (pipe(controlPipeFd_.data()) == -1) { - throw std::runtime_error( - "Failed to create the control pipe to start the " - "TCPStoreDaemon run"); - } + initStopSignal(); daemonThread_ = std::thread(&TCPStoreDaemon::run, this); } @@ -39,123 +41,69 @@ TCPStoreDaemon::~TCPStoreDaemon() { // Close unclosed sockets for (auto socket : sockets_) { if (socket != -1) { - ::close(socket); + tcputil::closeSocket(socket); } } // Now close the rest control pipe - for (auto fd : controlPipeFd_) { - if (fd != -1) { - ::close(fd); - } - } + closeStopSignal(); } void TCPStoreDaemon::join() { daemonThread_.join(); } -void TCPStoreDaemon::run() { - std::vector fds; - fds.push_back({.fd = storeListenSocket_, .events = POLLIN}); - // Push the read end of the pipe to signal the stopping of the daemon run - fds.push_back({.fd = controlPipeFd_[0], .events = POLLHUP}); - - // receive the queries - bool finished = false; - while (!finished) { - for (size_t i = 0; i < sockets_.size(); i++) { - fds[i].revents = 0; +void TCPStoreDaemon::queryFds(std::vector& fds) { + // Skipping the fds[0] and fds[1], + // fds[0] is master's listening socket + // fds[1] is control pipe's reading fd, it is not for Windows platform + for (size_t fdIdx = CONNECT_SOCKET_OFFSET; fdIdx < fds.size(); ++fdIdx) { + if (fds[fdIdx].revents == 0) { + continue; } - SYSCHECK_ERR_RETURN_NEG1(::poll(fds.data(), fds.size(), -1)); - - // TCPStore's listening socket has an event and it should now be able to - // accept new connections. 
- if (fds[0].revents != 0) { - if (fds[0].revents ^ POLLIN) { - throw std::system_error( - ECONNABORTED, - std::system_category(), - "Unexpected poll revent on the master's listening socket: " + - std::to_string(fds[0].revents)); - } - int sockFd = std::get<0>(tcputil::accept(storeListenSocket_)); - sockets_.push_back(sockFd); - fds.push_back({.fd = sockFd, .events = POLLIN}); - } - // The pipe receives an event which tells us to shutdown the daemon - if (fds[1].revents != 0) { - // Will be POLLUP when the pipe is closed - if (fds[1].revents ^ POLLHUP) { - throw std::system_error( - ECONNABORTED, - std::system_category(), - "Unexpected poll revent on the control pipe's reading fd: " + - std::to_string(fds[1].revents)); - } - finished = true; - break; - } - // Skipping the fds[0] and fds[1], - // fds[0] is master's listening socket - // fds[1] is control pipe's reading fd - for (size_t fdIdx = 2; fdIdx < fds.size(); ++fdIdx) { - if (fds[fdIdx].revents == 0) { - continue; - } - - // Now query the socket that has the event - try { - query(fds[fdIdx].fd); - } catch (...) { - // There was an error when processing query. Probably an exception - // occurred in recv/send what would indicate that socket on the other - // side has been closed. If the closing was due to normal exit, then - // the store should continue executing. Otherwise, if it was different - // exception, other connections will get an exception once they try to - // use the store. We will go ahead and close this connection whenever - // we hit an exception here. - ::close(fds[fdIdx].fd); - - // Remove all the tracking state of the close FD - for (auto it = waitingSockets_.begin(); it != waitingSockets_.end();) { - for (auto vecIt = it->second.begin(); vecIt != it->second.end();) { - if (*vecIt == fds[fdIdx].fd) { - vecIt = it->second.erase(vecIt); - } else { - ++vecIt; - } - } - if (it->second.size() == 0) { - it = waitingSockets_.erase(it); + // Now query the socket that has the event + try { + query(fds[fdIdx].fd); + } catch (...) { + // There was an error when processing query. Probably an exception + // occurred in recv/send what would indicate that socket on the other + // side has been closed. If the closing was due to normal exit, then + // the store should continue executing. Otherwise, if it was different + // exception, other connections will get an exception once they try to + // use the store. We will go ahead and close this connection whenever + // we hit an exception here. 
+ tcputil::closeSocket(fds[fdIdx].fd); + + // Remove all the tracking state of the close FD + for (auto it = waitingSockets_.begin(); it != waitingSockets_.end();) { + for (auto vecIt = it->second.begin(); vecIt != it->second.end();) { + if (*vecIt == fds[fdIdx].fd) { + vecIt = it->second.erase(vecIt); } else { - ++it; + ++vecIt; } } - for (auto it = keysAwaited_.begin(); it != keysAwaited_.end();) { - if (it->first == fds[fdIdx].fd) { - it = keysAwaited_.erase(it); - } else { - ++it; - } + if (it->second.size() == 0) { + it = waitingSockets_.erase(it); + } else { + ++it; + } + } + for (auto it = keysAwaited_.begin(); it != keysAwaited_.end();) { + if (it->first == fds[fdIdx].fd) { + it = keysAwaited_.erase(it); + } else { + ++it; } - fds.erase(fds.begin() + fdIdx); - sockets_.erase(sockets_.begin() + fdIdx - 2); - --fdIdx; - continue; } + fds.erase(fds.begin() + fdIdx); + sockets_.erase(sockets_.begin() + fdIdx - CONNECT_SOCKET_OFFSET); + --fdIdx; + continue; } } } -void TCPStoreDaemon::stop() { - if (controlPipeFd_[1] != -1) { - // close the write end of the pipe - ::close(controlPipeFd_[1]); - controlPipeFd_[1] = -1; - } -} - // query communicates with the worker. The format // of the query is as follows: // type of query | size of arg1 | arg1 | size of arg2 | arg2 | ... @@ -183,7 +131,7 @@ void TCPStoreDaemon::query(int socket) { } else if (qt == QueryType::GETNUMKEYS) { getNumKeysHandler(socket); - } else if (qt == QueryType::DELETE) { + } else if (qt == QueryType::DELETE_KEY) { deleteHandler(socket); } else { @@ -283,6 +231,137 @@ bool TCPStoreDaemon::checkKeys(const std::vector& keys) const { }); } +#ifdef _WIN32 +void TCPStoreDaemon::initStopSignal() { + ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL); + if (ghStopEvent_ == NULL) { + throw std::runtime_error( + "Failed to create the control pipe to start the " + "TCPStoreDaemon run"); + } +} + +void TCPStoreDaemon::closeStopSignal() { + CloseHandle(ghStopEvent_); +} + +void TCPStoreDaemon::stop() { + SetEvent(ghStopEvent_); +} + +void TCPStoreDaemon::run() { + std::vector fds; + tcputil::addPollfd(fds, storeListenSocket_, POLLIN); + + // receive the queries + bool finished = false; + while (!finished) { + for (size_t i = 0; i < sockets_.size(); i++) { + fds[i].revents = 0; + } + + int res; + SYSCHECK_ERR_RETURN_NEG1( + res = WSAPoll(fds.data(), fds.size(), checkTimeout_.count())) + if (res == 0) { + auto rv = WaitForSingleObject(ghStopEvent_, 0); + if (rv != WAIT_TIMEOUT) { + finished = true; + break; + } + continue; + } + + // TCPStore's listening socket has an event and it should now be able to + // accept new connections. 
+ if (fds[0].revents != 0) { + if (!(fds[0].revents & POLLIN)) { + throw std::system_error( + ECONNABORTED, + std::system_category(), + "Unexpected poll revent on the master's listening socket: " + + std::to_string(fds[0].revents)); + } + int sockFd = std::get<0>(tcputil::accept(storeListenSocket_)); + sockets_.push_back(sockFd); + tcputil::addPollfd(fds, sockFd, POLLIN); + } + queryFds(fds); + } +} +#else +void TCPStoreDaemon::initStopSignal() { + if (pipe(controlPipeFd_.data()) == -1) { + throw std::runtime_error( + "Failed to create the control pipe to start the " + "TCPStoreDaemon run"); + } +} + +void TCPStoreDaemon::closeStopSignal() { + for (auto fd : controlPipeFd_) { + if (fd != -1) { + ::close(fd); + } + } +} + +void TCPStoreDaemon::stop() { + if (controlPipeFd_[1] != -1) { + // close the write end of the pipe + ::close(controlPipeFd_[1]); + controlPipeFd_[1] = -1; + } +} + +void TCPStoreDaemon::run() { + std::vector fds; + tcputil::addPollfd(fds, storeListenSocket_, POLLIN); + // Push the read end of the pipe to signal the stopping of the daemon run + tcputil::addPollfd(fds, controlPipeFd_[0], POLLHUP); + + // receive the queries + bool finished = false; + while (!finished) { + for (size_t i = 0; i < sockets_.size(); i++) { + fds[i].revents = 0; + } + + SYSCHECK_ERR_RETURN_NEG1(::poll(fds.data(), fds.size(), -1)); + + // TCPStore's listening socket has an event and it should now be able to + // accept new connections. + if (fds[0].revents != 0) { + if (fds[0].revents ^ POLLIN) { + throw std::system_error( + ECONNABORTED, + std::system_category(), + "Unexpected poll revent on the master's listening socket: " + + std::to_string(fds[0].revents)); + } + int sockFd = std::get<0>(tcputil::accept(storeListenSocket_)); + sockets_.push_back(sockFd); + tcputil::addPollfd(fds, sockFd, POLLIN); + } + + // The pipe receives an event which tells us to shutdown the daemon + if (fds[1].revents != 0) { + // Will be POLLUP when the pipe is closed + if (fds[1].revents ^ POLLHUP) { + throw std::system_error( + ECONNABORTED, + std::system_category(), + "Unexpected poll revent on the control pipe's reading fd: " + + std::to_string(fds[1].revents)); + } + finished = true; + break; + } + queryFds(fds); + } +} +#endif + // TCPStore class methods TCPStore::TCPStore( const std::string& masterAddr, @@ -298,6 +377,7 @@ TCPStore::TCPStore( numWorkers_(numWorkers), initKey_("init/"), regularPrefix_("/") { + tcputil::socketInitialize(); if (isServer_) { // Opening up the listening socket std::tie(masterListenSocket_, tcpStorePort_) = tcputil::listen(masterPort); @@ -308,19 +388,18 @@ TCPStore::TCPStore( // Connect to the daemon storeSocket_ = tcputil::connect( tcpStoreAddr_, tcpStorePort_, /* wait= */ true, timeout_); - if (waitWorkers) { waitForWorkers(); } } TCPStore::~TCPStore() { - ::close(storeSocket_); + tcputil::closeSocket(storeSocket_); if (isServer_) { // Store daemon should end because of closed connection. 
// daemon destructor should join the thread tcpStoreDaemon_.reset(nullptr); - ::close(masterListenSocket_); + tcputil::closeSocket(masterListenSocket_); } } @@ -375,7 +454,7 @@ int64_t TCPStore::add(const std::string& key, int64_t value) { bool TCPStore::deleteKey(const std::string& key) { std::string regKey = regularPrefix_ + key; - tcputil::sendValue(storeSocket_, QueryType::DELETE); + tcputil::sendValue(storeSocket_, QueryType::DELETE_KEY); tcputil::sendString(storeSocket_, regKey, true); auto numDeleted = tcputil::recvValue(storeSocket_); return (numDeleted == 1); @@ -431,8 +510,13 @@ void TCPStore::waitHelper_( const std::chrono::milliseconds& timeout) { // Set the socket timeout if there is a wait timeout if (timeout != kNoTimeout) { +#ifdef _WIN32 + struct timeval timeoutTV = {timeout.count() / 1000, + (timeout.count() % 1000) * 1000}; +#else struct timeval timeoutTV = {.tv_sec = timeout.count() / 1000, .tv_usec = (timeout.count() % 1000) * 1000}; +#endif SYSCHECK_ERR_RETURN_NEG1(::setsockopt( storeSocket_, SOL_SOCKET, diff --git a/torch/lib/c10d/TCPStore.hpp b/torch/lib/c10d/TCPStore.hpp index d26df3e9e8ab..47c92b742520 100644 --- a/torch/lib/c10d/TCPStore.hpp +++ b/torch/lib/c10d/TCPStore.hpp @@ -5,7 +5,12 @@ #include #include -#include + +#ifdef _WIN32 +#include +#else +#include +#endif namespace c10d { @@ -20,6 +25,7 @@ class TCPStoreDaemon { void run(); void stop(); + void queryFds(std::vector& fds); void query(int socket); void setHandler(int socket); @@ -33,6 +39,9 @@ class TCPStoreDaemon { bool checkKeys(const std::vector& keys) const; void wakeupWaitingClients(const std::string& key); + void initStopSignal(); + void closeStopSignal(); + std::thread daemonThread_; std::unordered_map> tcpStore_; // From key -> the list of sockets waiting on it @@ -42,7 +51,13 @@ class TCPStoreDaemon { std::vector sockets_; int storeListenSocket_; +#ifdef _WIN32 + const std::chrono::milliseconds checkTimeout_ + = std::chrono::milliseconds(10); + HANDLE ghStopEvent_; +#else std::vector controlPipeFd_{-1, -1}; +#endif }; class TCPStore : public Store { diff --git a/torch/lib/c10d/UnixSockUtils.hpp b/torch/lib/c10d/UnixSockUtils.hpp new file mode 100644 index 000000000000..fa74be27f889 --- /dev/null +++ b/torch/lib/c10d/UnixSockUtils.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include + +namespace c10d { +namespace tcputil { + +#define AF_SELECTED AF_UNSPEC +#define CONNECT_SOCKET_OFFSET 2 + +inline void closeSocket(int socket) { ::close(socket); } + +inline int setSocketAddrReUse(int socket) { + int optval = 1; + return ::setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(int)); +} + +inline int poll(struct pollfd *fds, unsigned long nfds, int timeout) { + return ::poll(fds, nfds, timeout); +} + +inline void addPollfd(std::vector &fds, int socket, + short events) { + fds.push_back({.fd = socket, .events = events}); +} + +inline void waitSocketConnected( + int socket, + struct ::addrinfo *nextAddr, + std::chrono::milliseconds timeout, + std::chrono::time_point startTime) { + SYSCHECK_ERR_RETURN_NEG1(::fcntl(socket, F_SETFL, O_NONBLOCK)); + + int ret = ::connect(socket, nextAddr->ai_addr, nextAddr->ai_addrlen); + + if (ret != 0 && errno != EINPROGRESS) { + throw std::system_error(errno, std::system_category()); + } + + struct ::pollfd pfd; + pfd.fd = socket; + pfd.events = POLLOUT; + + int64_t pollTimeout = -1; + if (timeout != kNoTimeout) { + // calculate remaining time and use that as timeout for poll() + const auto elapsed = std::chrono::high_resolution_clock::now() - startTime; + const 
auto remaining = + std::chrono::duration_cast(timeout) - + std::chrono::duration_cast(elapsed); + pollTimeout = std::max(static_cast(0), + static_cast(remaining.count())); + } + int numReady = ::poll(&pfd, 1, pollTimeout); + if (numReady < 0) { + throw std::system_error(errno, std::system_category()); + } else if (numReady == 0) { + errno = 0; + throw std::runtime_error(kConnectTimeoutMsg); + } + + socklen_t errLen = sizeof(errno); + errno = 0; + ::getsockopt(socket, SOL_SOCKET, SO_ERROR, &errno, &errLen); + + // `errno` is set when: + // 1. `getsockopt` has failed + // 2. there is awaiting error in the socket + // (the error is saved to the `errno` variable) + if (errno != 0) { + throw std::system_error(errno, std::system_category()); + } + + // Disable non-blocking mode + int flags; + SYSCHECK_ERR_RETURN_NEG1(flags = ::fcntl(socket, F_GETFL)); + SYSCHECK_ERR_RETURN_NEG1(::fcntl(socket, F_SETFL, flags & (~O_NONBLOCK))); +} + +// Linux socket does not need init libs first +inline void socketInitialize() {} + +inline struct ::pollfd getPollfd(int socket, short events) { + struct ::pollfd res = {.fd = socket, .events = events}; + return res; +} + +} // namespace tcputil +} // namespace c10d diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index 6c6e941ef95d..62e1e195ca45 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -1,18 +1,18 @@ -#include - -#ifndef _WIN32 +#ifdef _WIN32 +#include +#else +#include #include #include - #include #include #include - -#include #include +#endif #include #include +#include #include #include #include @@ -23,7 +23,6 @@ namespace tcputil { namespace { constexpr int LISTEN_QUEUE_SIZE = 2048; -const std::string kConnectTimeoutMsg = "connect() timed out."; void setSocketNoDelay(int socket) { int flag = 1; @@ -82,7 +81,7 @@ std::pair listen(PortType port) { struct ::addrinfo hints, *res = NULL; std::memset(&hints, 0x00, sizeof(hints)); hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG; - hints.ai_family = AF_UNSPEC; // either IPv4 or IPv6 + hints.ai_family = AF_SELECTED; // IPv4 on Windows, IPv4/6 on Linux hints.ai_socktype = SOCK_STREAM; // TCP // `getaddrinfo` will sort addresses according to RFC 3484 and can be tweeked @@ -106,18 +105,14 @@ std::pair listen(PortType port) { nextAddr->ai_family, nextAddr->ai_socktype, nextAddr->ai_protocol)) - - int optval = 1; - SYSCHECK_ERR_RETURN_NEG1( - ::setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(int))) - + SYSCHECK_ERR_RETURN_NEG1(tcputil::setSocketAddrReUse(socket)) SYSCHECK_ERR_RETURN_NEG1( ::bind(socket, nextAddr->ai_addr, nextAddr->ai_addrlen)) SYSCHECK_ERR_RETURN_NEG1(::listen(socket, LISTEN_QUEUE_SIZE)) break; } catch (const std::system_error& e) { - ::close(socket); + tcputil::closeSocket(socket); nextAddr = nextAddr->ai_next; // we have tried all addresses but could not start @@ -203,7 +198,7 @@ int connect( struct ::addrinfo hints, *res = NULL; std::memset(&hints, 0x00, sizeof(hints)); hints.ai_flags = AI_NUMERICSERV; // specifies that port (service) is numeric - hints.ai_family = AF_UNSPEC; // either IPv4 or IPv6 + hints.ai_family = AF_SELECTED; // IPv4 on Windows, IPv4/6 on Linux hints.ai_socktype = SOCK_STREAM; // TCP // `getaddrinfo` will sort addresses according to RFC 3484 and can be tweeked @@ -236,55 +231,11 @@ int connect( nextAddr->ai_socktype, nextAddr->ai_protocol)) - ResourceGuard socketGuard([socket]() { ::close(socket); }); + ResourceGuard socketGuard([socket]() { tcputil::closeSocket(socket); }); // We need to connect in non-blocking mode, so we can 
use a timeout - SYSCHECK_ERR_RETURN_NEG1(::fcntl(socket, F_SETFL, O_NONBLOCK)); - - int ret = ::connect(socket, nextAddr->ai_addr, nextAddr->ai_addrlen); + waitSocketConnected(socket, nextAddr, timeout, start); - if (ret != 0 && errno != EINPROGRESS) { - throw std::system_error(errno, std::system_category()); - } - - struct ::pollfd pfd; - pfd.fd = socket; - pfd.events = POLLOUT; - - int64_t pollTimeout = -1; - if (timeout != kNoTimeout) { - // calculate remaining time and use that as timeout for poll() - const auto elapsed = std::chrono::high_resolution_clock::now() - start; - const auto remaining = - std::chrono::duration_cast(timeout) - - std::chrono::duration_cast(elapsed); - pollTimeout = std::max( - static_cast(0), static_cast(remaining.count())); - } - int numReady = ::poll(&pfd, 1, pollTimeout); - if (numReady < 0) { - throw std::system_error(errno, std::system_category()); - } else if (numReady == 0) { - errno = 0; - throw std::runtime_error(kConnectTimeoutMsg); - } - - socklen_t errLen = sizeof(errno); - errno = 0; - ::getsockopt(socket, SOL_SOCKET, SO_ERROR, &errno, &errLen); - - // `errno` is set when: - // 1. `getsockopt` has failed - // 2. there is awaiting error in the socket - // (the error is saved to the `errno` variable) - if (errno != 0) { - throw std::system_error(errno, std::system_category()); - } - - // Disable non-blocking mode - int flags; - SYSCHECK_ERR_RETURN_NEG1(flags = ::fcntl(socket, F_GETFL)); - SYSCHECK_ERR_RETURN_NEG1(::fcntl(socket, F_SETFL, flags & (~O_NONBLOCK))); socketGuard.release(); break; @@ -321,10 +272,10 @@ std::tuple accept( const std::chrono::milliseconds& timeout) { // poll on listen socket, it allows to make timeout std::unique_ptr events(new struct ::pollfd[1]); - events[0] = {.fd = listenSocket, .events = POLLIN}; + events[0] = tcputil::getPollfd(listenSocket, POLLIN); while (true) { - int res = ::poll(events.get(), 1, timeout.count()); + int res = tcputil::poll(events.get(), 1, timeout.count()); if (res == 0) { throw std::runtime_error( "waiting for processes to " @@ -357,4 +308,3 @@ std::tuple accept( } } // namespace tcputil } // namespace c10d -#endif diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index bc1b71a1947b..e7b0f1834441 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -1,8 +1,5 @@ #pragma once -#ifndef _WIN32 -#include -#endif #include #include @@ -19,6 +16,19 @@ #include +#ifdef _WIN32 +#include +#include +typedef SSIZE_T ssize_t; +#pragma comment(lib, "Ws2_32.lib") +#else +#include +#include +#include +#include +#include +#endif + namespace c10d { // Turns at::IntArrayRef into "(1, 2, 3, 4)". @@ -464,6 +474,25 @@ using SizeType = uint64_t; // `success_cond` is an expression used to check if an error has happend. So for // `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function output // is stored in variable `__output` and may be used in `success_cond`. 
+#ifdef _WIN32 +#define SYSCHECK(expr, success_cond) \ + while (true) { \ + auto __output = (expr); \ + auto errno_local = WSAGetLastError(); \ + (void)__output; \ + if (!(success_cond)) { \ + if (errno == EINTR) { \ + continue; \ + } else if (errno_local == WSAETIMEDOUT || errno_local == WSAEWOULDBLOCK ) { \ + throw std::runtime_error("Socket Timeout"); \ + } else { \ + throw std::system_error(errno_local, std::system_category()); \ + } \ + } else { \ + break; \ + } \ + } +#else #define SYSCHECK(expr, success_cond) \ while (true) { \ auto __output = (expr); \ @@ -480,9 +509,11 @@ using SizeType = uint64_t; break; \ } \ } +#endif // Most functions indicate error by returning `-1`. This is a helper macro for // this common case with `SYSCHECK`. +// Since SOCKET_ERROR = -1 in MSVC, so also leverage SYSCHECK_ERR_RETURN_NEG1 #define SYSCHECK_ERR_RETURN_NEG1(expr) SYSCHECK(expr, __output != -1) // Helper resource guard class @@ -506,10 +537,10 @@ class ResourceGuard { bool released_; }; -#ifndef _WIN32 namespace tcputil { constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds(-1); +const std::string kConnectTimeoutMsg = "connect() timed out."; // Send and receive template @@ -537,7 +568,7 @@ void sendBytes( while (bytesToSend > 0) { ssize_t bytesSent; SYSCHECK_ERR_RETURN_NEG1( - bytesSent = ::send(socket, currentBytes, bytesToSend, flags)) + bytesSent = ::send(socket, (const char*)currentBytes, bytesToSend, flags)) if (bytesSent == 0) { throw std::system_error(ECONNRESET, std::system_category()); } @@ -560,7 +591,7 @@ void recvBytes(int socket, T* buffer, size_t length) { while (bytesToReceive > 0) { ssize_t bytesReceived; SYSCHECK_ERR_RETURN_NEG1( - bytesReceived = ::recv(socket, currentBytes, bytesToReceive, 0)) + bytesReceived = recv(socket, (char*)currentBytes, bytesToReceive, 0)) if (bytesReceived == 0) { throw std::system_error(ECONNRESET, std::system_category()); } @@ -636,5 +667,4 @@ std::tuple accept( const std::chrono::milliseconds& timeout = kNoTimeout); } // namespace tcputil -#endif } // namespace c10d diff --git a/torch/lib/c10d/WinSockUtils.hpp b/torch/lib/c10d/WinSockUtils.hpp new file mode 100644 index 000000000000..cd37695845ab --- /dev/null +++ b/torch/lib/c10d/WinSockUtils.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include + +namespace c10d { +namespace tcputil { + +#define AF_SELECTED AF_INET +#define CONNECT_SOCKET_OFFSET 1 + +inline void closeSocket(int socket) { ::closesocket(socket); } + +inline int setSocketAddrReUse(int socket) { + bool optval = false; + return ::setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, (char *)&optval, + sizeof(bool)); +} + +inline int poll(struct pollfd *fdArray, unsigned long fds, int timeout) { + return WSAPoll(fdArray, fds, timeout); +} + +inline void addPollfd(std::vector &fds, int socket, + short events) { + fds.push_back({(SOCKET)socket, events}); +} + +inline void waitSocketConnected( + int socket, + struct ::addrinfo *nextAddr, + std::chrono::milliseconds timeout, + std::chrono::time_point startTime) { + unsigned long block_mode = 1; + SYSCHECK_ERR_RETURN_NEG1(ioctlsocket(socket, FIONBIO, &block_mode)); + + int ret; + do { + ret = connect(socket, nextAddr->ai_addr, nextAddr->ai_addrlen); + if (ret == SOCKET_ERROR) { + int err = WSAGetLastError(); + if (err == WSAEISCONN) { + break; + } else if (err == WSAEALREADY || err == WSAEWOULDBLOCK) { + if (timeout != kNoTimeout) { + const auto elapsed = + std::chrono::high_resolution_clock::now() - startTime; + if (elapsed > timeout) { + errno = 0; + throw 
std::runtime_error(kConnectTimeoutMsg); + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + throw std::system_error(err, std::system_category(), + "Socket connect failed"); + } + } while (ret == SOCKET_ERROR); + + block_mode = 0; + SYSCHECK_ERR_RETURN_NEG1(ioctlsocket(socket, FIONBIO, &block_mode)); +} + +// All processes (applications or DLLs) that call Winsock +// functions must initialize the use of the Windows Sockets +// DLL before making other Winsock function calls. +// This also makes certain that Winsock is supported on the system. +// Ref to +// https://docs.microsoft.com/en-us/windows/win32/winsock/initializing-winsock +inline void socketInitialize() { + static std::once_flag init_flag; + std::call_once(init_flag, []() { + WSADATA wsa_data; + SYSCHECK_ERR_RETURN_NEG1(WSAStartup(MAKEWORD(2, 2), &wsa_data)) + }); +} + +inline struct ::pollfd getPollfd(int socket, short events) { + struct ::pollfd res = {(SOCKET)socket, events}; + return res; +} + +} // namespace tcputil +} // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 003f56f30861..b74d4b65f70f 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -17,9 +17,9 @@ function(c10d_add_test test_src) endfunction() c10d_add_test(FileStoreTest.cpp c10d gtest_main) +c10d_add_test(TCPStoreTest.cpp c10d gtest_main) if(NOT WIN32) c10d_add_test(HashStoreTest.cpp c10d gtest_main) - c10d_add_test(TCPStoreTest.cpp c10d gtest_main) endif() if(USE_CUDA) diff --git a/torch/lib/c10d/test/TCPStoreTest.cpp b/torch/lib/c10d/test/TCPStoreTest.cpp index 8073ec0345e0..30a123dc163f 100644 --- a/torch/lib/c10d/test/TCPStoreTest.cpp +++ b/torch/lib/c10d/test/TCPStoreTest.cpp @@ -77,7 +77,7 @@ void testHelper(const std::string& prefix = "") { for (auto i = 0; i < numThreads; i++) { threads.push_back( - std::thread([&sem1, &sem2, &clientStores, i, &expectedCounterRes] { + std::thread([&sem1, &sem2, &clientStores, i, &expectedCounterRes, &numIterations, &numThreads] { for (auto j = 0; j < numIterations; j++) { clientStores[i]->add("counter", 1); } From befab0d9d48d154b8ec5eab0cc3092c6c8eee5a2 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 3 Dec 2020 09:33:09 -0800 Subject: [PATCH 017/132] [ONNX] Cast Gather index to Long if needed (#47653) Summary: Onnx op Gather index need be int32 or int64. However, we don't have this Cast in our converter. Therefore, it fails the following UT (for opset 11+) `seq_length.type().scalarType()` is None, so `_arange_cast_helper()` cannot treat it as all integral, then it will cast all to float. Then this float value will be used as Gather index, hence it throws error in ORT about float type index. The fix is that we need cast Gather index type to Long if it is not int/long. 
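In converter terms, the core of the change is a small dtype guard in the new `_select_helper`: if the index value's scalar type is unknown or not integral, insert an explicit ONNX `Cast` to Long before emitting `Gather`. A minimal sketch of that guard is below; `g` is the ONNX graph context and `index` the JIT value handed to the symbolic function, so this illustrates the logic rather than being a standalone script:

```python
# Sketch of the dtype guard added in _select_helper (torch/onnx/symbolic_helper.py).
# Assumes `g` (ONNX graph context), `index` (torch._C.Value) and
# `cast_pytorch_to_onnx` are supplied by the surrounding symbolic machinery.
def _cast_gather_index_to_long(g, index):
    scalar_type = index.type().scalarType()
    # ONNX Gather only accepts int32/int64 indices; an unknown or floating
    # scalar type would make ORT reject the index, so cast it to Long.
    if scalar_type is None or scalar_type not in ['Long', 'Int']:
        index = g.op("Cast", index, to_i=cast_pytorch_to_onnx["Long"])
    return index
```
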
Pull Request resolved: https://github.com/pytorch/pytorch/pull/47653 Reviewed By: heitorschueroff Differential Revision: D25298056 Pulled By: mruberry fbshipit-source-id: 05e3a70ccfd74612233c63ec5bb78e060b211909 --- torch/onnx/symbolic_helper.py | 17 +++++++++++++++++ torch/onnx/symbolic_opset9.py | 14 ++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 43c690087ab8..7e16d7a08c44 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -213,6 +213,23 @@ def _try_get_scalar_type(*args): return None +def _select_helper(g, self, dim, index, apply_reshape=True): + index_const = _maybe_get_scalar(index) + index_dim = index.type().dim() + if not _is_value(index_const): + # Index is a constant scalar. Make it a size 1 constant tensor. + index = g.op("Constant", value_t=torch.LongTensor([index_const])) + elif index_dim is not None and apply_reshape: + if index_dim == 0: + # Index is a scalar. Reshape it to a size 1 tensor. + index = g.op("Reshape", index, g.op("Constant", value_t=torch.LongTensor([1]))) + + index_scalar_type = index.type().scalarType() + if index_scalar_type is None or index_scalar_type not in ['Long', 'Int']: + index = g.op("Cast", index, to_i=cast_pytorch_to_onnx["Long"]) + return g.op("Gather", self, index, axis_i=dim) + + def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if _export_onnx_opset_version <= 9: from torch.onnx.symbolic_opset9 import _slice diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index d36a2a04eae8..1bb6fe19352f 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1332,17 +1332,7 @@ def index_select(g, self, dim, index): # In case of a scalar index, index_select returns a tensor with the same rank as the input. # To match this behavior in ONNX, we make index a 1D tensor so that the following gather # also produces a tensor with the same rank as the input. - - index_const = sym_help._maybe_get_scalar(index) - index_dim = index.type().dim() - if not sym_help._is_value(index_const): - # Index is a constant scalar. Make it a size 1 constant tensor. - index = g.op("Constant", value_t=torch.LongTensor([index_const])) - elif index_dim is not None: - if index_dim == 0: - # Index is a scalar. Reshape it to a size 1 tensor. - index = g.op("Reshape", index, g.op("Constant", value_t=torch.LongTensor([1]))) - return g.op("Gather", self, index, axis_i=dim) + return sym_help._select_helper(g, self, dim, index) def index_put(g, self, indices_list_value, values, accumulate): @@ -2455,7 +2445,7 @@ def try_mask_to_index(index): indices = [try_mask_to_index(idx) for idx in indices] if len(indices) == 1: - return index_select(g, self, 0, indices[0]) + return sym_help._select_helper(g, self, 0, indices[0], apply_reshape=False) else: # Multiple tensors as indices. Each tensor could either be # 1. 
prim::Constant() From 5c9cef9a6c2df8e82ff49624a8e69b5cfe25880d Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Thu, 3 Dec 2020 10:30:00 -0800 Subject: [PATCH 018/132] [numpy] Add `torch.moveaxis` (#48581) Summary: Reference: https://github.com/pytorch/pytorch/issues/38349 #36048 https://github.com/pytorch/pytorch/pull/41480#issuecomment-734398262 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48581 Reviewed By: bdhirsh Differential Revision: D25276307 Pulled By: mruberry fbshipit-source-id: 3e3e4df1343c5ce5b71457badc43f08c419ec5c3 --- aten/src/ATen/core/aten_interned_strings.h | 1 - aten/src/ATen/core/interned_strings.h | 2 + aten/src/ATen/native/TensorShape.cpp | 8 + aten/src/ATen/native/native_functions.yaml | 9 ++ docs/source/tensors.rst | 1 + docs/source/torch.rst | 1 + test/test_autograd.py | 15 +- test/test_op_aliases.py | 2 + test/test_shape_ops.py | 162 +++++++++++---------- test/test_view_ops.py | 9 +- torch/_tensor_docs.py | 6 + torch/_torch_docs.py | 37 +++++ torch/csrc/jit/passes/normalize_ops.cpp | 1 + torch/overrides.py | 1 + 14 files changed, 163 insertions(+), 92 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 56f1f0f60ddb..e3a855b825a0 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -546,7 +546,6 @@ _(aten, _euclidean_dist) \ _(aten, pdist) \ _(aten, cdist) \ _(aten, permute) \ -_(aten, movedim) \ _(aten, pin_memory) \ _(aten, pinverse) \ _(aten, pixel_shuffle) \ diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index eede5f455b05..72cf48330b3a 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -284,6 +284,8 @@ namespace c10 { _(aten, swapaxes_) \ _(aten, swapdims) \ _(aten, swapdims_) \ + _(aten, movedim) \ + _(aten, moveaxis) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ _(onnx, Concat) \ diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 11889da42651..eda688ad6e1d 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -2002,6 +2002,14 @@ Tensor movedim(const Tensor& self, int64_t src, int64_t dst) { return at::movedim(self, IntArrayRef{src}, IntArrayRef{dst}); } +Tensor moveaxis(const Tensor& self, IntArrayRef src, IntArrayRef dst) { + return at::movedim(self, src, dst); +} + +Tensor moveaxis(const Tensor& self, int64_t src, int64_t dst) { + return at::movedim(self, IntArrayRef{src}, IntArrayRef{dst}); +} + Tensor swapaxes(const Tensor& self, int64_t axis0, int64_t axis1) { return self.transpose(axis0, axis1); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 3cc6f3a93f5a..8f237f9e2058 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3057,6 +3057,15 @@ use_c10_dispatcher: full variants: function, method +# moveaxis, alias for movedim +- func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) + use_c10_dispatcher: full + variants: function, method + +- func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a) + use_c10_dispatcher: full + variants: function, method + # Only exposed from C++ -- in Python, # we expose it as an attribute `T`, not a function. 
# diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index d7b0af757d92..3f12004062cf 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -461,6 +461,7 @@ view of a storage and defines numeric operations on it. :noindex: .. automethod:: mode .. automethod:: movedim + .. automethod:: moveaxis .. automethod:: mul .. automethod:: mul_ .. automethod:: multiply diff --git a/docs/source/torch.rst b/docs/source/torch.rst index b16f14de9bf6..d7c80de22189 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -92,6 +92,7 @@ Indexing, Slicing, Joining, Mutating Ops index_select masked_select movedim + moveaxis narrow nonzero reshape diff --git a/test/test_autograd.py b/test/test_autograd.py index 5c8acd70a07a..125fa7a41ba9 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -7240,15 +7240,16 @@ def test_strided_leaf_grad_layout(self, device): self.assertEqual(c.grad.stride(), (2, 1)) def test_movedim(self, device): - x = torch.randn(4, 3, 2, 1, dtype=torch.double, device=device, requires_grad=True) + for fn in [torch.movedim, torch.moveaxis]: + x = torch.randn(4, 3, 2, 1, dtype=torch.double, device=device, requires_grad=True) - # Positive axis - gradcheck(lambda x: torch.movedim(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - gradgradcheck(lambda x: torch.movedim(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) + # Positive axis + gradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) + gradgradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - # Negative axis - gradcheck(lambda x: torch.movedim(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) - gradgradcheck(lambda x: torch.movedim(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) + # Negative axis + gradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) + gradgradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) def _test_atleast(self, device, torch_fn): # 0-dim diff --git a/test/test_op_aliases.py b/test/test_op_aliases.py index 7ba2f4863e75..8a9b7c3b490a 100644 --- a/test/test_op_aliases.py +++ b/test/test_op_aliases.py @@ -161,6 +161,8 @@ def __init__(self, lambda d: torch.randn(20, 3, 2, 1, device=d), get_args=lambda d: (3, 1)), AliasInfo('row_stack', torch.row_stack, 'vstack', torch.vstack, lambda d: ((torch.randn(20, device=d), torch.randn(20, device=d)))), + AliasInfo('moveaxis', torch.moveaxis, 'movedim', torch.movedim, + lambda d: torch.randn(20, 3, 2, 1, device=d), get_args=lambda d: (3, 1)), ) # Placeholder test class for validating that aliases are correctly diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 3a7afa790ff8..43321508e0e2 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -86,95 +86,97 @@ def test_movedim_invalid(self, device, dtype): shape = self._rand_shape(4, min_size=5, max_size=10) x = _generate_input(shape, dtype, device, False) - # Invalid `source` and `destination` dimension - with self.assertRaisesRegex(IndexError, "Dimension out of range"): - torch.movedim(x, 5, 0) + for fn in [torch.movedim, torch.moveaxis]: + # Invalid `source` and `destination` dimension + with self.assertRaisesRegex(IndexError, "Dimension out of range"): + fn(x, 5, 0) - with self.assertRaisesRegex(IndexError, "Dimension out of range"): - torch.movedim(x, 0, 5) + with self.assertRaisesRegex(IndexError, "Dimension out of range"): + fn(x, 0, 5) - # Mismatch in size of `source` and `destination` - with self.assertRaisesRegex(RuntimeError, "movedim: Invalid source or destination dims:"): - torch.movedim(x, (1, 0), (0, )) + # Mismatch in size of `source` and `destination` 
+ with self.assertRaisesRegex(RuntimeError, "movedim: Invalid source or destination dims:"): + fn(x, (1, 0), (0, )) - with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `source`"): - torch.movedim(x, (0, 0), (0, 1)) + with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `source`"): + fn(x, (0, 0), (0, 1)) - with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `source`"): - torch.movedim(x, (0, 1, 0), (0, 1, 2)) + with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `source`"): + fn(x, (0, 1, 0), (0, 1, 2)) - with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `destination`"): - torch.movedim(x, (0, 1), (1, 1)) + with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `destination`"): + fn(x, (0, 1), (1, 1)) - with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `destination`"): - torch.movedim(x, (0, 1, 2), (1, 0, 1)) + with self.assertRaisesRegex(RuntimeError, "movedim: repeated dim in `destination`"): + fn(x, (0, 1, 2), (1, 0, 1)) @dtypes(torch.int64, torch.float, torch.complex128) def test_movedim(self, device, dtype): - for nd in range(5): - shape = self._rand_shape(nd, min_size=5, max_size=10) - x = _generate_input(shape, dtype, device, with_extremal=False) - for random_negative in [True, False]: - for src_dim, dst_dim in permutations(range(nd), r=2): - random_prob = random.random() - - if random_negative and random_prob > 0.66: - src_dim = src_dim - nd - elif random_negative and random_prob > 0.33: - dst_dim = dst_dim - nd - elif random_negative: - src_dim = src_dim - nd - dst_dim = dst_dim - nd - - # Integer `source` and `destination` - torch_fn = partial(torch.movedim, source=src_dim, destination=dst_dim) - np_fn = partial(np.moveaxis, source=src_dim, destination=dst_dim) - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) - - if nd == 0: - continue - - def make_index_negative(sequence, idx): - sequence = list(sequence) - sequence[random_idx] = sequence[random_idx] - nd - return tuple(src_sequence) - - for src_sequence in permutations(range(nd), r=random.randint(1, nd)): - # Sequence `source` and `destination` - dst_sequence = tuple(random.sample(range(nd), len(src_sequence))) - - # Randomly change a dim to a negative dim representation of itself. 
- random_prob = random.random() - if random_negative and random_prob > 0.66: - random_idx = random.randint(0, len(src_sequence) - 1) - src_sequence = make_index_negative(src_sequence, random_idx) - elif random_negative and random_prob > 0.33: - random_idx = random.randint(0, len(src_sequence) - 1) - dst_sequence = make_index_negative(dst_sequence, random_idx) - elif random_negative: - random_idx = random.randint(0, len(src_sequence) - 1) - dst_sequence = make_index_negative(dst_sequence, random_idx) - random_idx = random.randint(0, len(src_sequence) - 1) - src_sequence = make_index_negative(src_sequence, random_idx) - - torch_fn = partial(torch.movedim, source=src_sequence, destination=dst_sequence) - np_fn = partial(np.moveaxis, source=src_sequence, destination=dst_sequence) - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) - - # Move dim to same position - x = torch.randn(2, 3, 5, 7, 11) - torch_fn = partial(torch.movedim, source=(0, 1), destination=(0, 1)) - np_fn = partial(np.moveaxis, source=(0, 1), destination=(0, 1)) - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) - - torch_fn = partial(torch.movedim, source=1, destination=1) - np_fn = partial(np.moveaxis, source=1, destination=1) - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) - - # Empty Sequence - torch_fn = partial(torch.movedim, source=(), destination=()) - np_fn = partial(np.moveaxis, source=(), destination=()) - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) + for fn in [torch.moveaxis, torch.movedim]: + for nd in range(5): + shape = self._rand_shape(nd, min_size=5, max_size=10) + x = _generate_input(shape, dtype, device, with_extremal=False) + for random_negative in [True, False]: + for src_dim, dst_dim in permutations(range(nd), r=2): + random_prob = random.random() + + if random_negative and random_prob > 0.66: + src_dim = src_dim - nd + elif random_negative and random_prob > 0.33: + dst_dim = dst_dim - nd + elif random_negative: + src_dim = src_dim - nd + dst_dim = dst_dim - nd + + # Integer `source` and `destination` + torch_fn = partial(fn, source=src_dim, destination=dst_dim) + np_fn = partial(np.moveaxis, source=src_dim, destination=dst_dim) + self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) + + if nd == 0: + continue + + def make_index_negative(sequence, idx): + sequence = list(sequence) + sequence[random_idx] = sequence[random_idx] - nd + return tuple(src_sequence) + + for src_sequence in permutations(range(nd), r=random.randint(1, nd)): + # Sequence `source` and `destination` + dst_sequence = tuple(random.sample(range(nd), len(src_sequence))) + + # Randomly change a dim to a negative dim representation of itself. 
+ random_prob = random.random() + if random_negative and random_prob > 0.66: + random_idx = random.randint(0, len(src_sequence) - 1) + src_sequence = make_index_negative(src_sequence, random_idx) + elif random_negative and random_prob > 0.33: + random_idx = random.randint(0, len(src_sequence) - 1) + dst_sequence = make_index_negative(dst_sequence, random_idx) + elif random_negative: + random_idx = random.randint(0, len(src_sequence) - 1) + dst_sequence = make_index_negative(dst_sequence, random_idx) + random_idx = random.randint(0, len(src_sequence) - 1) + src_sequence = make_index_negative(src_sequence, random_idx) + + torch_fn = partial(fn, source=src_sequence, destination=dst_sequence) + np_fn = partial(np.moveaxis, source=src_sequence, destination=dst_sequence) + self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) + + # Move dim to same position + x = torch.randn(2, 3, 5, 7, 11) + torch_fn = partial(fn, source=(0, 1), destination=(0, 1)) + np_fn = partial(np.moveaxis, source=(0, 1), destination=(0, 1)) + self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) + + torch_fn = partial(fn, source=1, destination=1) + np_fn = partial(np.moveaxis, source=1, destination=1) + self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) + + # Empty Sequence + torch_fn = partial(fn, source=(), destination=()) + np_fn = partial(np.moveaxis, source=(), destination=()) + self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) @dtypes(torch.float, torch.bool) def test_diag(self, device, dtype): diff --git a/test/test_view_ops.py b/test/test_view_ops.py index d4e59a3dbf23..15f1bcd8183f 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -535,11 +535,12 @@ def run_test(device, op): out[idx_1, idx_2] = random.random() self.assertEqual(t[idx_2, idx_1], out[idx_1, idx_2]) - op = partial(torch.movedim, source=(0, 1), destination=(1, 0)) - run_test(device, op) + for fn in [torch.movedim, torch.moveaxis]: + op = partial(fn, source=(0, 1), destination=(1, 0)) + run_test(device, op) - op = partial(torch.movedim, source=0, destination=1) - run_test(device, op) + op = partial(fn, source=0, destination=1) + run_test(device, op) class TestOldViewOps(TestCase): def test_ravel(self, device): diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 99f504a05c9c..87bbf38b0863 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -2403,6 +2403,12 @@ def callable(a, b) -> number See :func:`torch.movedim` """) +add_docstr_all('moveaxis', r""" +moveaxis(source, destination) -> Tensor + +See :func:`torch.moveaxis` +""") + add_docstr_all('mul', r""" mul(value) -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index dd4be74dde80..7852f3b759c7 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -5749,6 +5749,43 @@ def merge_dicts(*dicts): [[-0.8437, 0.1727, -0.1398]]]) """.format(**common_args)) +add_docstr(torch.moveaxis, r""" +moveaxis(input, source, destination) -> Tensor + +Alias for :func:`torch.movedim`. + +This function is equivalent to NumPy's moveaxis function. 
+ +Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) +""".format(**common_args)) + add_docstr(torch.swapdims, r""" swapdims(input, dim0, dim1) -> Tensor diff --git a/torch/csrc/jit/passes/normalize_ops.cpp b/torch/csrc/jit/passes/normalize_ops.cpp index 2f463f79fd83..e032eeb1a2d8 100644 --- a/torch/csrc/jit/passes/normalize_ops.cpp +++ b/torch/csrc/jit/passes/normalize_ops.cpp @@ -103,6 +103,7 @@ const std::unordered_map& getOperatorAliasMap() { {aten::swapdims_, aten::transpose_}, {aten::swapaxes, aten::transpose}, {aten::swapaxes_, aten::transpose_}, + {aten::moveaxis, aten::movedim}, }; return alias_map; } diff --git a/torch/overrides.py b/torch/overrides.py index f6a49376ab53..f7b9bedf9106 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -522,6 +522,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.mm: lambda input, mat2, out=None: -1, torch.mode: lambda input, dim=-1, keepdim=False, out=None: -1, torch.movedim: lambda input, source, destination: -1, + torch.moveaxis: lambda input, source, destination: -1, torch.mul: lambda input, other, out=None: -1, torch.multiply: lambda input, other, out=None: -1, torch.multinomial: lambda input, num_samples, replacement=False, out=None: -1, From 416dc683410710092475f7686ccec3cfb47bd68b Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Thu, 3 Dec 2020 10:42:48 -0800 Subject: [PATCH 019/132] [Pytorch][Annotation] Update inlined callstack with module instance info (#47416) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47416 Test Plan: Imported from OSS Reviewed By: kimishpatel Differential Revision: D24752846 Pulled By: cccclai fbshipit-source-id: 94d3c18c56161d1de3a16bb7c93502fedf71644c --- test/cpp/jit/test_fuser.cpp | 97 +++++++------ test/cpp/jit/test_lite_interpreter.cpp | 55 +++++++- test/cpp/jit/test_misc.cpp | 127 +++++++++--------- test/mobile/test_lite_script_module.py | 86 ++++++++++-- torch/csrc/jit/ir/ir.cpp | 28 +++- torch/csrc/jit/ir/scope.cpp | 5 +- torch/csrc/jit/ir/scope.h | 3 +- torch/csrc/jit/passes/reconstruct_scopes.cpp | 6 +- torch/csrc/jit/runtime/interpreter.cpp | 4 +- .../csrc/jit/serialization/export_module.cpp | 64 ++++++--- 10 files changed, 320 insertions(+), 155 deletions(-) diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index c4aaa15d47c4..bff3ef4a32cd 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,43 +1,40 @@ #include -#include -#include "ATen/core/interned_strings.h" -#include "torch/csrc/autograd/generated/variable_factories.h" -#include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/codegen/fuser/interface.h" -#include "torch/csrc/jit/frontend/tracer.h" -#include "torch/csrc/jit/ir/alias_analysis.h" -#include "torch/csrc/jit/ir/attributes.h" -#include "torch/csrc/jit/ir/irparser.h" -#include "torch/csrc/jit/passes/common_subexpression_elimination.h" -#include "torch/csrc/jit/passes/constant_propagation.h" -#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" -#include "torch/csrc/jit/passes/dead_code_elimination.h" -#include 
"torch/csrc/jit/passes/graph_fuser.h" -#include "torch/csrc/jit/passes/lower_grad_of.h" -#include "torch/csrc/jit/passes/lower_tuples.h" -#include "torch/csrc/jit/passes/requires_grad_analysis.h" -#include "torch/csrc/jit/passes/shape_analysis.h" -#include "torch/csrc/jit/passes/utils/subgraph_utils.h" -#include "torch/csrc/jit/runtime/argument_spec.h" -#include "torch/csrc/jit/runtime/autodiff.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/jit/runtime/interpreter.h" -#include "torch/csrc/jit/runtime/symbolic_script.h" -#include "torch/csrc/jit/serialization/import.h" - -#include "torch/csrc/autograd/engine.h" -#include "torch/csrc/autograd/variable.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "ATen/core/ivalue.h" -#include "torch/csrc/jit/api/module.h" -#include "torch/csrc/jit/frontend/ir_emitter.h" -#include "torch/csrc/jit/runtime/graph_executor.h" - -#include "onnx/onnx_pb.h" -#include +#include #include @@ -57,9 +54,9 @@ namespace torch { namespace jit { TEST(FuserTest, TestSimple_CUDA) { - #if defined(FBCODE_CAFFE2) - return; - #endif +#if defined(FBCODE_CAFFE2) + return; +#endif const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -80,9 +77,9 @@ TEST(FuserTest, TestSimple_CUDA) { } TEST(FuserTest, TestOne_CUDA) { - #if defined(FBCODE_CAFFE2) - return; - #endif +#if defined(FBCODE_CAFFE2) + return; +#endif auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -140,9 +137,9 @@ TEST(FuserTest, TestOne_CUDA) { } TEST(FuserTest, FusedConcat_CUDA) { - #if defined(FBCODE_CAFFE2) - return; - #endif +#if defined(FBCODE_CAFFE2) + return; +#endif const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -186,9 +183,9 @@ TEST(FuserTest, FusedConcat_CUDA) { } TEST(FuserTest, FusionAliasing) { - #if defined(FBCODE_CAFFE2) - return; - #endif +#if defined(FBCODE_CAFFE2) + return; +#endif const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -214,9 +211,9 @@ TEST(FuserTest, FusionAliasing) { } TEST(FuserTest, KernelCaching) { - #if defined(FBCODE_CAFFE2) - return; - #endif +#if defined(FBCODE_CAFFE2) + return; +#endif // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index b6cfc9e3c519..63198473460e 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -523,6 +523,29 @@ TEST(LiteInterpreterTest, SequentialModuleInfo) { } } + // class A(nn.Module): + // def __init__(self): + // super(A, self).__init__() + + // def forward(self, x): + // return x + 1 + + // class B(nn.Module): + // def __init__(self): + // super(B, self).__init__() + + // def forward(self, x): + // return x + 2 + + // class C(nn.Module): + // def __init__(self): + // super(C, self).__init__() + // self.A0 = A() + // self.B0 = B() + + // def forward(self, x): + // return self.A0.forward(self.B0.forward(x)) + std::unordered_set expected_result( {"top(C).A0(A).forward", "top(C).B0(B).forward"}); AT_ASSERT(module_debug_info_set == expected_result); @@ -568,9 +591,11 @@ TEST(LiteInterpreterTest, HierarchyModuleInfo) { // There are 3 module information strings here. 
// "top(C).forward": for the add operator in top. // "top(C).B0(B).forward": for the add operator in B0. - // "top(C).B0(B).A0(A).forward": for the add operator in A0. + // "top(C).B0(B).forward.A0(A).forward": for the add operator in A0. std::unordered_set expected_result( - {"top(C).forward", "top(C).B0(B).forward", "top(C).B0(B).A0(A).forward"}); + {"top(C).forward", + "top(C).B0(B).forward", + "top(C).B0(B).forward.A0(A).forward"}); AT_ASSERT(module_debug_info_set == expected_result); } @@ -606,11 +631,29 @@ TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) { } } - // The current approach is not able to distinguish between A0 and A1, - // which have the same class type. Hence, it only records module - // information for A1. + // class A(nn.Module): + // def __init__(self): + // super(A, self).__init__() + + // def forward(self, x): + // return x + 5 + + // class B(nn.Module): + // def __init__(self): + // super(B, self).__init__() + // self.A0 = A() + // self.A1 = A() + + // def forward(self, x): + // return self.A0.forward(x) + self.A1.forward(x) + + // There are 3 module information strings here. + // "top(B).forward": for the add operator in top. + // "top(B).A0(A).forward": for the add operator in A0. + // "top(B).A1(A).forward": for the add operator in A1. + std::unordered_set expected_result( - {"top(B).forward", "top(B).A1(A).forward"}); + {"top(B).forward", "top(B).A0(A).forward", "top(B).A1(A).forward"}); AT_ASSERT(module_debug_info_set == expected_result); } diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 85545a65cf03..ca4ba0fdb3da 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -5,57 +5,53 @@ #include #include -#include "test/cpp/jit/test_utils.h" - +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include #include -#include "torch/csrc/autograd/generated/variable_factories.h" -#include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/codegen/fuser/interface.h" -#include "torch/csrc/jit/frontend/code_template.h" -#include "torch/csrc/jit/frontend/tracer.h" -#include "torch/csrc/jit/ir/alias_analysis.h" -#include "torch/csrc/jit/ir/attributes.h" -#include "torch/csrc/jit/ir/irparser.h" -#include "torch/csrc/jit/ir/scope.h" -#include "torch/csrc/jit/jit_log.h" -#include "torch/csrc/jit/passes/bailout_graph.h" -#include "torch/csrc/jit/passes/common_subexpression_elimination.h" -#include "torch/csrc/jit/passes/constant_propagation.h" -#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" -#include "torch/csrc/jit/passes/dead_code_elimination.h" -#include "torch/csrc/jit/passes/graph_fuser.h" -#include "torch/csrc/jit/passes/guard_elimination.h" -#include "torch/csrc/jit/passes/inline_autodiff_subgraphs.h" -#include "torch/csrc/jit/passes/insert_guards.h" -#include "torch/csrc/jit/passes/liveness.h" -#include "torch/csrc/jit/passes/loop_unrolling.h" -#include "torch/csrc/jit/passes/lower_grad_of.h" -#include "torch/csrc/jit/passes/lower_tuples.h" -#include "torch/csrc/jit/passes/pass_manager.h" -#include "torch/csrc/jit/passes/requires_grad_analysis.h" -#include "torch/csrc/jit/passes/shape_analysis.h" -#include "torch/csrc/jit/passes/utils/subgraph_utils.h" -#include "torch/csrc/jit/runtime/argument_spec.h" -#include "torch/csrc/jit/runtime/autodiff.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/jit/runtime/interpreter.h" -#include 
"torch/csrc/jit/runtime/symbolic_script.h" -#include "torch/csrc/jit/serialization/import.h" - -#include "torch/csrc/autograd/engine.h" -#include "torch/csrc/autograd/variable.h" - +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include #include +#include #include -#include "torch/csrc/jit/api/module.h" -#include "torch/csrc/jit/frontend/ir_emitter.h" -#include "torch/csrc/jit/runtime/profiling_record.h" -#include "torch/jit.h" - -#include "onnx/onnx_pb.h" +#include #include #include @@ -368,9 +364,9 @@ TEST(ATenNativeBatchNormTest, Basic) { } TEST(CustomFusionTest, Basic) { - #if defined(FBCODE_CAFFE2) - return; - #endif +#if defined(FBCODE_CAFFE2) + return; +#endif auto graph_string = R"IR( graph(%0 : Float(2, 3, 4), @@ -406,9 +402,9 @@ TEST(CustomFusionTest, Basic) { } TEST(CustomFusionTest, NestedBlocks) { - #if defined(FBCODE_CAFFE2) - return; - #endif +#if defined(FBCODE_CAFFE2) + return; +#endif auto graph_string = R"IR( graph(%0 : Float(2, 3, 4), @@ -1133,16 +1129,17 @@ TEST(RecordFunctionTest, Basic) { TEST(RecordFunctionTest, OperatorNameOverload) { std::set operator_names; - at::addGlobalCallback( - at::RecordFunctionCallback([&operator_names]( - const at::RecordFunction& fn) { - c10::optional op_name = fn.operator_name(); - if (op_name.has_value()) { - operator_names.insert(c10::toString(*op_name)); - } else { - operator_names.insert("No Operator Name"); - } - }).scopes({at::RecordScope::FUNCTION})); + at::addGlobalCallback(at::RecordFunctionCallback( + [&operator_names](const at::RecordFunction& fn) { + c10::optional op_name = + fn.operator_name(); + if (op_name.has_value()) { + operator_names.insert(c10::toString(*op_name)); + } else { + operator_names.insert("No Operator Name"); + } + }) + .scopes({at::RecordScope::FUNCTION})); auto t = torch::randn({1, 2, 3}, at::kCPU); t.set_requires_grad(false); auto t2 = t.pow(2); @@ -1848,7 +1845,7 @@ def foo(x): ASSERT_TRUE(n->callstack()); auto callstack_vector = (*n->callstack())->vec(); ASSERT_EQ(callstack_vector.size(), 1); - ASSERT_EQ(callstack_vector[0].first, &cu->get_function("bar")); + ASSERT_EQ(std::get<0>(callstack_vector[0]), &cu->get_function("bar")); break; } case 7: { @@ -1858,8 +1855,8 @@ def foo(x): ASSERT_TRUE(n->callstack()); auto callstack_vector = (*n->callstack())->vec(); ASSERT_EQ(callstack_vector.size(), 2); - ASSERT_EQ(callstack_vector[0].first, &cu->get_function("baz")); - ASSERT_EQ(callstack_vector[1].first, &cu->get_function("ham")); + ASSERT_EQ(std::get<0>(callstack_vector[0]), &cu->get_function("baz")); + ASSERT_EQ(std::get<0>(callstack_vector[1]), &cu->get_function("ham")); break; } case 11: { @@ -1888,7 +1885,7 @@ def foo(x): ASSERT_TRUE(n->callstack()); auto callstack_vector = (*n->callstack())->vec(); ASSERT_EQ(callstack_vector.size(), 1); - ASSERT_EQ(callstack_vector[0].first, &cu->get_function("ham")); + ASSERT_EQ(std::get<0>(callstack_vector[0]), &cu->get_function("ham")); } } } diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py index ca67875b107f..3549582dcfac 100644 --- a/test/mobile/test_lite_script_module.py +++ b/test/mobile/test_lite_script_module.py @@ -1,7 +1,7 @@ import unittest import torch import torch.utils.bundled_inputs - +from torch.utils.mobile_optimizer import * import io from typing import NamedTuple from collections import namedtuple @@ -36,7 +36,7 @@ def 
forward(self, x): mobile_module_run_method_result = mobile_module.run_method("forward", input) torch.testing.assert_allclose(script_module_result, mobile_module_run_method_result) - def test_save_mobile_module_with_debug_info(self): + def test_save_mobile_module_with_debug_info_with_trace(self): class A(torch.nn.Module): def __init__(self): super(A, self).__init__() @@ -55,13 +55,83 @@ def forward(self, x): input = torch.tensor([5]) trace_module = torch.jit.trace(B(), input) - bytes = trace_module._save_to_buffer_for_lite_interpreter(_save_mobile_debug_info=True) + exported_module = trace_module._save_to_buffer_for_lite_interpreter(_save_mobile_debug_info=True) + + assert(b"mobile_debug.pkl" in exported_module) + assert(b"module_debug_info" in exported_module) + assert(b"top(B).forward" in exported_module) + assert(b"top(B).A0(A).forward" in exported_module) + assert(b"top(B).A1(A).forward" in exported_module) + + def test_save_mobile_module_with_debug_info_with_script_duplicate_class(self): + class A(torch.nn.Module): + def __init__(self): + super(A, self).__init__() + + def forward(self, x): + return x + 1 + + class B(torch.nn.Module): + def __init__(self): + super(B, self).__init__() + self.A0 = A() + self.A1 = A() + + def forward(self, x): + return self.A0(x) + self.A1(x) + + input_data = torch.tensor([5]) + scripted_module = torch.jit.script(B(), input_data) + exported_module = scripted_module._save_to_buffer_for_lite_interpreter(_save_mobile_debug_info=True) + + assert(b"mobile_debug.pkl" in exported_module) + assert(b"module_debug_info" in exported_module) + assert(b"top(B).forward" in exported_module) + assert(b"top(B).A0(A).forward" in exported_module) + assert(b"top(B).A1(A).forward" in exported_module) - assert(b"mobile_debug.pkl" in bytes) - assert(b"module_debug_info" in bytes) - assert(b"top(B).forward" in bytes) - assert(b"top(B).A0(A).forward" in bytes) - assert(b"top(B).A1(A).forward" in bytes) + def test_save_mobile_module_with_debug_info_with_script_nested_call(self): + class A(torch.nn.Module): + def __init__(self): + super(A, self).__init__() + + def forward(self, x): + return x + 1 + + class B(torch.nn.Module): + def __init__(self): + super(B, self).__init__() + + def forward(self, x): + return x + 2 + + class C(torch.nn.Module): + def __init__(self): + super(C, self).__init__() + self.A0 = A() + self.B0 = B() + + def forward(self, x): + return self.A0(self.B0(x)) + 1 + + input = torch.tensor([5]) + scripted_module = torch.jit.script(C(), input) + + optimized_scripted_module = optimize_for_mobile(scripted_module) + + exported_module = scripted_module._save_to_buffer_for_lite_interpreter(_save_mobile_debug_info=True) + optimized_exported_module = optimized_scripted_module._save_to_buffer_for_lite_interpreter(_save_mobile_debug_info=True) + assert(b"mobile_debug.pkl" in exported_module) + assert(b"module_debug_info" in exported_module) + assert(b"top(C).forward" in exported_module) + assert(b"top(C).A0(A).forward" in exported_module) + assert(b"top(C).B0(B).forward" in exported_module) + + assert(b"mobile_debug.pkl" in optimized_exported_module) + assert(b"module_debug_info" in optimized_exported_module) + assert(b"top(C).forward" in optimized_exported_module) + assert(b"top(C).A0(A).forward" in optimized_exported_module) + assert(b"top(C).B0(B).forward" in optimized_exported_module) def test_load_mobile_module_with_debug_info(self): class MyTestModule(torch.nn.Module): diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index eecc23915212..fe79091c946f 
100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -1915,6 +1915,33 @@ std::vector inlineCallTo( // are missing nodes without outputs (e.g. prim::Print). std::unordered_set updated_nodes; for (const auto& kv : value_map) { + /* Skip the old value if it is the graph input. + * The reason is that, value_map contains values not all for the nodes of + * the graph but primary inputs as well, and it will create duplicates when + * the first inlined graph is input to the next one. To avoid this issue, + * skip the old value when it is one of the + * callee->optimized_graph()->inputs() or callee->graph()->inputs(), depends + * on if it is inlined_optimized_graph + */ + + if (inline_optimized_graph) { + auto is_graph_input = std::find( + callee->optimized_graph()->inputs().begin(), + callee->optimized_graph()->inputs().end(), + kv.first); + if (is_graph_input != callee->optimized_graph()->inputs().end()) { + continue; + } + } else { + auto is_graph_input = std::find( + callee->graph()->inputs().begin(), + callee->graph()->inputs().end(), + kv.first); + if (is_graph_input != callee->graph()->inputs().end()) { + continue; + } + } + Node* new_node = kv.second->node(); if (!updated_nodes.insert(new_node).second) { continue; @@ -1941,7 +1968,6 @@ std::vector inlineCallTo( } new_node->setCallStack(new_callstack_entries.at(raw_callstack_ptr)); } - const auto& old_outputs = to_replace->outputs(); AT_ASSERT(new_outputs.size() == old_outputs.size()); diff --git a/torch/csrc/jit/ir/scope.cpp b/torch/csrc/jit/ir/scope.cpp index 3901ce1038bf..eba291bf0204 100644 --- a/torch/csrc/jit/ir/scope.cpp +++ b/torch/csrc/jit/ir/scope.cpp @@ -123,7 +123,10 @@ std::vector InlinedCallStack::vec() { std::vector r; c10::optional current = intrusive_from_this(); while (current) { - r.emplace_back(std::make_pair((*current)->fn_, (*current)->source_range_)); + r.emplace_back(std::make_tuple( + (*current)->fn_, + (*current)->source_range_, + (*current)->module_instance_info_)); current = (*current)->callee_; } return r; diff --git a/torch/csrc/jit/ir/scope.h b/torch/csrc/jit/ir/scope.h index 784c2942c263..e742caba495b 100644 --- a/torch/csrc/jit/ir/scope.h +++ b/torch/csrc/jit/ir/scope.h @@ -107,8 +107,7 @@ struct ModuleInstanceInfo { * [ham, source_range4] -- */ using InlinedCallStackPtr = c10::intrusive_ptr; -using InlinedCallStackEntry = std::pair; -using InlinedCallStackWithModuleInfo = +using InlinedCallStackEntry = std::tuple>; struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { diff --git a/torch/csrc/jit/passes/reconstruct_scopes.cpp b/torch/csrc/jit/passes/reconstruct_scopes.cpp index a4787cd84c05..15aa5863fbf1 100644 --- a/torch/csrc/jit/passes/reconstruct_scopes.cpp +++ b/torch/csrc/jit/passes/reconstruct_scopes.cpp @@ -108,7 +108,7 @@ void ReconstructScopesPass::constructRelativeNamesForModules( void ReconstructScopesPass::appendSourceRangeInfo( std::string& scopeString, const InlinedCallStackEntry& frame) const { - SourceRange r = frame.second; + SourceRange r = std::get<1>(frame); if (r.source()) { if (auto orig = r.source()->findSourceRangeThatGenerated(r)) { r = *orig; @@ -125,7 +125,7 @@ void ReconstructScopesPass::appendSourceRangeInfo( std::string ReconstructScopesPass::getScopeString( const InlinedCallStackEntry& frame) const { - Function* f = frame.first; + Function* f = std::get<0>(frame); if (!func_to_module_.count(f)) { return ""; } @@ -137,7 +137,7 @@ std::string ReconstructScopesPass::getScopeString( // When class types are not unique, the module information may 
be // incomplele. In this case, we add source range information, - // which can be helpful for deugging purposes. + // which can be helpful for debugging purposes. if (class_types_are_not_unique_) { appendSourceRangeInfo(scopeString, frame); } diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 4f6fc77da260..ef0f2dae9e0e 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1640,8 +1640,8 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { Node* node = frame.function->instructions_source_[pc]; if (node->callstack()) { for (const auto& p : (*node->callstack())->vec()) { - entries.emplace_back(StackEntry{previous_fn_name, p.second}); - previous_fn_name = p.first->name(); + entries.emplace_back(StackEntry{previous_fn_name, std::get<1>(p)}); + previous_fn_name = std::get<0>(p)->name(); } } entries.emplace_back(StackEntry{previous_fn_name, node->sourceRange()}); diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index 957c13ce065f..e9f9d27bf166 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -54,21 +54,53 @@ static IValue Table( return Tup(std::move(ivalue_entries)); } -std::string getModulePath(Node* node) { - std::string modulePath = node->scopeName(); - size_t end = modulePath.size(); - // Here we remove the source range information to make the - // module debugging information shorter and cleaner. - if (modulePath[end - 1] == '>') { - end = modulePath.rfind('<'); - if (end > 0 && modulePath[end - 1] == '<') { - --end; +std::string getModulePath(Node* node, const std::string& root_scope_string) { + constexpr size_t kFunction = 0; + constexpr size_t kModuleInstanceInfo = 2; + + if (!node->callstack()) { + return root_scope_string + ".forward"; + } else { + std::string module_info = root_scope_string; + auto callstack_ptr = *(node->callstack()); + const auto& vec = callstack_ptr->vec(); + + for (const auto& element : vec) { + const auto& opt_module_instance_info = + std::get(element); + if (opt_module_instance_info.has_value()) { + const auto& module_instance_info = opt_module_instance_info.value(); + if (module_instance_info.class_type()) { + const auto& class_type = module_instance_info.class_type(); + const auto& instance_name = module_instance_info.instance_name(); + auto type_name = class_type->name()->qualifiedName(); + type_name = type_name.substr(type_name.find_last_of('.') + 1); + module_info.append(".") + .append(instance_name) + .append("(") + .append(type_name) + .append(")") + .append(".") + .append(std::get(element)->name()); + } else { + module_info += ".(UNKNOWN_INSTANCE(UNKNOWN_TYPE)"; + } + } else { + module_info += ".(UNKNOWN_INSTANCE(UNKNOWN_TYPE)"; + } } + + return module_info; } - // We only keep the last function in a callstack. - size_t start = modulePath.rfind('/', end); - start = (start != std::string::npos) ? 
start + 1 : 0; - return modulePath.substr(start, end - start); +} + +std::string getModuleTypeName(const Module& module, const std::string& prefix) { + std::string moduleType = module.type()->str(); + size_t lastDotIndex = moduleType.rfind('.'); + if (lastDotIndex != std::string::npos) { + moduleType = moduleType.substr(lastDotIndex + 1); + } + return prefix + "(" + moduleType + ")"; } std::pair> getFunctionTuple( @@ -78,9 +110,6 @@ std::pair> getFunctionTuple( auto graph = func.graph()->copy(); Inline(*graph); - if (save_mobile_debug_info) { - ReconstructScopes(module, *graph, "top"); - } torch::jit::Code code(graph, func.name()); auto instructions_copy = code.instructions(); @@ -95,7 +124,8 @@ std::pair> getFunctionTuple( auto node = code.instructions_source()[i]; opnames.emplace_back(node->schema().operator_name()); if (save_mobile_debug_info) { - op_module_paths.emplace_back(getModulePath(node)); + std::string root_scope_string = getModuleTypeName(module, "top"); + op_module_paths.emplace_back(getModulePath(node, root_scope_string)); } } // CALL nodes at this point represent built-in (i.e. non-Graph) From c2ad3c4e6a810120bf7d36f0e509fe76e5d53ba0 Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 3 Dec 2020 10:54:09 -0800 Subject: [PATCH 020/132] Add scary comment in cpp_custom_type_hack.h (#48737) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48737 Test Plan: Imported from OSS Reviewed By: dzhulgakov Differential Revision: D25280542 Pulled By: jamesr66a fbshipit-source-id: 67c3b8c82def848ba3059dd6f6a23f9c5e329c0f --- aten/src/ATen/cpp_custom_type_hack.h | 63 ++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/cpp_custom_type_hack.h b/aten/src/ATen/cpp_custom_type_hack.h index 9f8f61f534ed..d690a00e0c2c 100644 --- a/aten/src/ATen/cpp_custom_type_hack.h +++ b/aten/src/ATen/cpp_custom_type_hack.h @@ -1,11 +1,52 @@ -// WARNING! WARNING! WARNING! -// This file is a temporary hack to enable development of pytorch quantization -// -// It's a stub for wrapping arbitrary cpp types in TorchScript. Proper -// implementation (under development) is to use TorchScript custom types. -// In the meantime, we abuse ByteTensor with custom deleter for this purpose. -// -// Template argument has to be registered with CAFFE_KNOWN_TYPE mechanism. 
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP + +// YOU ARE IN THE WRONG PLACE! TURN BACK NOW! + +// This code was a temporary hack to enable embedding arbitrary C++ structures +// into Tensors. THIS IS UNSAFE AND IS NOT SUPPORTED. IF YOU USE THIS CODE, +// IT __WILL__ BREAK. + +// This code has been superseded by custom classes: +// https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html + +// Please use custom classes and **DO NOT ADD MORE CALLSITES TO THINGS DEFINED +// IN THIS FILE**. 
+ +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP +// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP #include #include @@ -14,6 +55,8 @@ namespace at { namespace cpp_custom_type_hack { template +[[deprecated("Use custom classes instead: " + "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] bool isa(const Tensor& packed) { return (packed.scalar_type() == kByte) && (packed.storage().data_ptr().get_deleter() == @@ -21,6 +64,8 @@ bool isa(const Tensor& packed) { } template +[[deprecated("Use custom classes instead: " + "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] T& cast(const Tensor& packed) { TORCH_CHECK( packed.scalar_type() == kByte, "Expected temporary cpp type wrapper"); @@ -33,6 +78,8 @@ T& cast(const Tensor& packed) { } template +[[deprecated("Use custom classes instead: " + "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] Tensor create(std::unique_ptr ptr, TensorOptions options) { // None of this should trace, so turn off Tracer dispatching at::AutoNonVariableTypeMode guard; // TODO: remove From 2cb92041594661ceb32c7da5bd1967b53d3bb8c0 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Thu, 3 Dec 2020 10:55:52 -0800 Subject: [PATCH 021/132] Add nondeterministic alert to index_copy, median CUDA and kthvalue CUDA (#46942) Summary: Also fixes issue where skipped tests did not properly restore deterministic flag. 
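The user-visible effect of the new alerts is roughly the following (a sketch, assuming a CUDA build; with `torch.set_deterministic(True)` the flagged operations raise a `RuntimeError` naming the alert string added here):

```python
# Illustration only: exercises the alerts added for kthvalue CUDA and index_copy.
import torch

torch.set_deterministic(True)  # beta API; warns once per process
try:
    x = torch.randn(10, device='cuda')
    torch.kthvalue(x, 5)        # raises RuntimeError (alert string "kthvalue CUDA")
except RuntimeError as e:
    print(e)

try:
    t = torch.zeros(5)
    # index_copy is flagged on all devices, since duplicate indices would make
    # the result order-dependent.
    t.index_copy_(0, torch.tensor([0, 1]), torch.tensor([1.0, 2.0]))
except RuntimeError as e:
    print(e)
finally:
    torch.set_deterministic(False)
```
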
Fixes https://github.com/pytorch/pytorch/issues/46743 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46942 Reviewed By: heitorschueroff Differential Revision: D25298020 Pulled By: mruberry fbshipit-source-id: 14b1680e1fa536ec72018d0cdb0a3cf83b098767 --- .../ATen/native/TensorAdvancedIndexing.cpp | 3 + aten/src/ATen/native/cuda/Sorting.cu | 8 ++ test/test_torch.py | 87 +++++++++++++++-- torch/__init__.py | 3 + torch/_tensor_docs.py | 5 + torch/_torch_docs.py | 5 + torch/testing/_internal/common_device_type.py | 33 +++---- .../_internal/common_methods_invocations.py | 6 ++ torch/testing/_internal/common_utils.py | 94 ++++++++++++------- 9 files changed, 188 insertions(+), 56 deletions(-) diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 9abaedd9ff14..ddc3ca8c2b34 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -333,6 +333,9 @@ Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value, con } Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { + // See note [Writing Nondeterministic Operations] + // Nondeterministic when index contains duplicate entries + at::globalContext().alertNotDeterministic("index_copy"); dim = maybe_wrap_dim(dim, self.dim()); TORCH_CHECK_INDEX(index.dim() < 2, "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index 889ccf606152..59b07653593e 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -316,6 +316,10 @@ std::tuple median_with_indices_impl( int64_t dim, bool keepdim, bool ignore_nan) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of a median value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("median CUDA with indices output"); NoNamesGuard guard; dim = at::maybe_wrap_dim(dim, self.dim()); @@ -410,6 +414,10 @@ std::tuple kthvalue_out_cuda( int64_t k, int64_t dim, bool keepdim) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of the kth value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("kthvalue CUDA"); auto result = [&]() { NoNamesGuard guard; // `kthvalue_out_impl_cuda` expects contiguous in input `self`. 
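The comments added above describe the root cause; a short illustration of the ambiguity (hypothetical, not part of the patch's test suite):

```python
# Hypothetical example: with repeated values, several indices are equally
# valid answers, so the CUDA kernels flagged above may return any of them.
import torch

x = torch.tensor([1., 2., 2., 2., 3.], device='cuda')
values, indices = torch.median(x, dim=0)
# `values` is always tensor(2.), but `indices` may be 1, 2 or 3 depending on
# which duplicate the kernel happens to select.
```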
diff --git a/test/test_torch.py b/test/test_torch.py index b5a87b4dd2ae..fde60ca4174f 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -23,14 +23,15 @@ do_test_dtypes, IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, BytesIOContext, skipIfRocm, skipIfNoSciPy, - wrapDeterministicFlagAPITest) + wrapDeterministicFlagAPITest, DeterministicGuard) from multiprocessing.reduction import ForkingPickler from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, skipCUDAIfNoMagma, skipCUDAIfRocm, skipCUDAIfNotRocm, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, - PYTORCH_CUDA_MEMCHECK, largeTensorTest, onlyOnCPUAndCUDA) + PYTORCH_CUDA_MEMCHECK, largeTensorTest, onlyOnCPUAndCUDA, + expectedAlertNondeterministic) from typing import Dict, List import torch.backends.quantized import torch.testing._internal.data @@ -2823,8 +2824,7 @@ def _rand_shape(self, dim, min_size, max_size): @onlyCPU def test_set_deterministic_beta_warning(self, device): - det = torch.is_deterministic() - try: + with DeterministicGuard(torch.is_deterministic()): # Ensures setting to false does not throw a warning with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -2834,8 +2834,6 @@ def test_set_deterministic_beta_warning(self, device): # Setting set_deterministic(True) throws a warning once per process with self.maybeWarnsRegex(UserWarning, "torch.set_deterministic is in beta"): torch.set_deterministic(True) - finally: - torch.set_deterministic(det) @dtypes(torch.float32, torch.complex64) def test_storage(self, device, dtype): @@ -3483,6 +3481,29 @@ def _test_in_place_broadcastable(t0, t1, t2=None): _test_in_place_broadcastable(small2, small_expanded, large_expanded) _test_in_place_broadcastable(small2, small, large) + # Ensures that kthvalue throws nondeterministic alerts in the correct cases + @dtypes(torch.double) + def test_kthvalue_nondeterministic_alert(self, device, dtype): + @expectedAlertNondeterministic('kthvalue CUDA', 'cuda') + def test_func(slf, device, call_type): + S = 10 + k = 5 + a = torch.randn(S, device=device) + if call_type == 'function': + torch.kthvalue(a, k) + elif call_type == 'method': + a.kthvalue(k) + elif call_type == 'out': + values = torch.empty_like(a) + indices = torch.empty((), device=device, dtype=torch.long) + torch.kthvalue(a, k, out=(values, indices)) + else: + self.fail(f"'{call_type}' is not a valid call type") + + test_func(self, device, 'function') + test_func(self, device, 'method') + test_func(self, device, 'out') + def test_embedding_scalar_weight_error(self, device): indices = torch.rand(2, 2, device=device).long() weight = torch.tensor(1.0) @@ -3503,6 +3524,37 @@ def run_test(x, y): y[1] = 1. 
run_test(x, y) + # Ensures that median throws nondeterministic alerts in the correct cases + @dtypes(torch.double) + def test_median_nondeterministic_alert(self, device, dtype): + def test_func(slf, device, call_type): + S = 10 + a = torch.randn(S, device=device) + if call_type == 'function': + torch.median(a) + elif call_type == 'function with indices': + torch.median(a, 0) + elif call_type == 'method': + a.median() + elif call_type == 'method with indices': + a.median(0) + elif call_type == 'out with indices': + result = torch.empty_like(a) + indices = torch.empty((), dtype=torch.long, device=device) + torch.median(a, 0, out=(result, indices)) + else: + self.fail(f"'{call_type}' is not a valid call type") + + @expectedAlertNondeterministic('median CUDA with indices output', 'cuda') + def test_func_expect_error(slf, device, call_type): + test_func(slf, device, call_type) + + test_func(self, device, 'function') + test_func_expect_error(self, device, 'function with indices') + test_func(self, device, 'method') + test_func_expect_error(self, device, 'method with indices') + test_func_expect_error(self, device, 'out with indices') + @skipCUDANonDefaultStreamIf(True) def test_multinomial_alias(self, device): # Get probs vector to use in setup @@ -4289,6 +4341,29 @@ def test_index_copy(self, device): c = torch.zeros(3) self.assertRaises(IndexError, lambda: a.index_copy_(dim=1, index=torch.tensor([3]), source=c)) + # Ensures that index_copy throws nondeterministic alerts in the correct cases + @onlyOnCPUAndCUDA + @dtypes(torch.double) + def test_index_copy_nondeterministic_alert(self, device, dtype): + @expectedAlertNondeterministic('index_copy') + def test_func(slf, device, call_type): + S = 10 + a = torch.randn(S, device=device) + b = torch.randn(S, device=device) + index = torch.randint(S, (S,), device=device) + if call_type == 'function': + torch.index_copy(a, 0, index, b) + elif call_type == 'method': + a.index_copy(0, index, b) + elif call_type == 'method inplace': + a.index_copy_(0, index, b) + else: + self.fail(f"'{call_type}' is not a valid call type") + + test_func(self, device, 'function') + test_func(self, device, 'method') + test_func(self, device, 'method inplace') + def test_index_fill(self, device): for dt in torch.testing.get_all_dtypes(): if dt == torch.half or dt == torch.bfloat16 or dt.is_complex: diff --git a/torch/__init__.py b/torch/__init__.py index 73b15cebc687..49049583f5ba 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -374,10 +374,13 @@ def set_deterministic(d): * :class:`torch.nn.EmbeddingBag` when called on a CUDA tensor that requires grad * :func:`torch.scatter_add_` when called on a CUDA tensor * :func:`torch.index_add_` when called on a CUDA tensor + * :func:`torch.index_copy` * :func:`torch.index_select` when called on a CUDA tensor that requires grad * :func:`torch.repeat_interleave` when called on a CUDA tensor that requires grad * :func:`torch.histc` when called on a CUDA tensor * :func:`torch.bincount` when called on a CUDA tensor + * :func:`torch.kthvalue` with called on a CUDA tensor + * :func:`torch.median` with indices output when called on a CUDA tensor A handful of CUDA operations are nondeterministic if the CUDA version is 10.2 or greater, unless the environment variable `CUBLAS_WORKSPACE_CONFIG=:4096:8` diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 87bbf38b0863..a94291eba18c 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1768,6 +1768,11 @@ def add_docstr_all(method, docstr): length of :attr:`index` 
(which must be a vector), and all other dimensions must match :attr:`self`, or an error will be raised. +.. note:: + If :attr:`index` contains duplicate entries, multiple elements from + :attr:`tensor` will be copied to the same index of :attr:`self`. The result + is nondeterministic since it depends on which copy occurs last. + Args: dim (int): dimension along which to index index (LongTensor): indices of :attr:`tensor` to select from diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 7852f3b759c7..20bf9ca3ad84 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3894,6 +3894,11 @@ def merge_dicts(*dicts): (see :func:`torch.squeeze`), resulting in both the :attr:`values` and :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. +.. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + Args: {input} k (int): k for the k-th smallest element diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index ee9e5f2845f6..0126b1dd0a93 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -11,7 +11,7 @@ import torch from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \ - IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU + IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, DeterministicGuard from torch.testing._internal.common_cuda import _get_torch_cuda_version from torch.testing import \ (get_all_dtypes) @@ -790,24 +790,21 @@ def __call__(self, fn): @wraps(fn) def efail_fn(slf, device, *args, **kwargs): if self.device_type is None or self.device_type == slf.device_type: - deterministic_restore = torch.is_deterministic() - torch.set_deterministic(True) - try: - if self.fn_has_device_arg: - fn(slf, device, *args, **kwargs) + with DeterministicGuard(True): + try: + if self.fn_has_device_arg: + fn(slf, device, *args, **kwargs) + else: + fn(slf, *args, **kwargs) + except RuntimeError as e: + if self.error_message not in str(e): + slf.fail( + 'expected non-deterministic error message to start with "' + + self.error_message + + '" but got this instead: "' + str(e) + '"') + return else: - fn(slf, *args, **kwargs) - except RuntimeError as e: - torch.set_deterministic(deterministic_restore) - if self.error_message not in str(e): - slf.fail( - 'expected non-deterministic error message to start with "' - + self.error_message - + '" but got this instead: "' + str(e) + '"') - return - else: - torch.set_deterministic(deterministic_restore) - slf.fail('expected a non-deterministic error, but it was not raised') + slf.fail('expected a non-deterministic error, but it was not raised') if self.fn_has_device_arg: return fn(slf, device, *args, **kwargs) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 424d6d254470..116b47320a64 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1105,6 +1105,8 @@ def method_tests(): ('mean', (S, S, S), (), 'dtype', (True,), (), (), ident, {'dtype': torch.float64}), ('kthvalue', (S, S, S), (2,)), ('kthvalue', (S, S, S), (2, 1,), 'dim', (), [1]), + ('kthvalue', (S, S, S), (2, 1,), 'dim_alert_nondeterministic', (), [1], + 
[expectedAlertNondeterministic('kthvalue CUDA', 'cuda')]), ('kthvalue', (S, S, S), (2, 1, True,), 'keepdim_dim', (), [1]), ('kthvalue', (S,), (2, 0,), 'dim_1d', (), [1]), ('kthvalue', (S,), (2, 0, True,), 'keepdim_dim_1d', (), [1]), @@ -1123,6 +1125,8 @@ def method_tests(): ('nanquantile', (), (0.5,), 'scalar'), ('median', (S, S, S), NO_ARGS), ('median', (S, S, S), (1,), 'dim', (), [0]), + ('median', (S, S, S), (1,), 'dim_alert_nondeterministic', (), [0], + [expectedAlertNondeterministic('median CUDA with indices output', 'cuda')]), ('median', (S, S, S), (1, True,), 'keepdim_dim', (), [0]), ('median', (), NO_ARGS, 'scalar'), ('median', (), (0,), 'scalar_dim', (), [0]), @@ -1446,6 +1450,8 @@ def method_tests(): ('index_add', (S, S), (0, index_variable(2, S), (2, S)), 'alert_nondeterministic', (), [0], [expectedAlertNondeterministic('index_add_cuda_', 'cuda')]), ('index_copy', (S, S), (0, index_perm_variable(2, S), (2, S)), 'dim', (), [0]), + ('index_copy', (S, S), (0, index_perm_variable(2, S), (2, S)), 'dim_alert_nondeterministic', (), [0], + [expectedAlertNondeterministic('index_copy')]), ('index_copy', (), (0, torch.tensor([0], dtype=torch.int64), (1,)), 'scalar_input_dim', (), [0]), ('index_copy', (), (0, torch.tensor(0, dtype=torch.int64), ()), 'scalar_all_dim', (), [0]), ('index_fill', (S, S), (0, index_variable(2, S), 2), 'dim', (), [0]), diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 37465b956c7f..298727df152a 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -398,43 +398,73 @@ def wrapper(*args, **kwargs): fn(*args, **kwargs) return wrapper +# Context manager for setting deterministic flag and automatically +# resetting it to its original value +class DeterministicGuard: + def __init__(self, deterministic): + self.deterministic = deterministic + + def __enter__(self): + self.deterministic_restore = torch.is_deterministic() + torch.set_deterministic(self.deterministic) + + def __exit__(self, exception_type, exception_value, traceback): + torch.set_deterministic(self.deterministic_restore) + # This decorator can be used for API tests that call torch.set_deterministic(). # When the test is finished, it will restore the previous deterministic flag -# setting. Also, if CUDA >= 10.2, this will set the environment variable -# CUBLAS_WORKSPACE_CONFIG=:4096:8 so that the error associated with that setting -# is not thrown during the test unless the test changes that variable on purpose. -# The previous CUBLAS_WORKSPACE_CONFIG setting will also be restored once the -# test is finished. +# setting. +# +# If CUDA >= 10.2, this will set the environment variable +# CUBLAS_WORKSPACE_CONFIG=:4096:8 so that the error associated with that +# setting is not thrown during the test unless the test changes that variable +# on purpose. The previous CUBLAS_WORKSPACE_CONFIG setting will also be +# restored once the test is finished. +# +# Note that if a test requires CUDA to actually register the changed +# CUBLAS_WORKSPACE_CONFIG variable, a new subprocess must be created, because +# CUDA only checks the variable when the runtime initializes. 
Tests can be +# run inside a subprocess like so: +# +# import subprocess, sys, os +# script = ''' +# # Test code should go here +# ''' +# try: +# subprocess.check_output( +# [sys.executable, '-c', script], +# stderr=subprocess.STDOUT, +# cwd=os.path.dirname(os.path.realpath(__file__)), +# env=os.environ.copy()) +# except subprocess.CalledProcessError as e: +# error_message = e.output.decode('utf-8') +# # Handle exceptions raised by the subprocess here +# def wrapDeterministicFlagAPITest(fn): @wraps(fn) def wrapper(*args, **kwargs): - deterministic_restore = torch.is_deterministic() - - is_cuda10_2_or_higher = ( - (torch.version.cuda is not None) - and ([int(x) for x in torch.version.cuda.split(".")] >= [10, 2])) - - if is_cuda10_2_or_higher: - cublas_var_name = 'CUBLAS_WORKSPACE_CONFIG' - cublas_config_restore = os.environ.get(cublas_var_name) - os.environ[cublas_var_name] = ':4096:8' - - def restore(): - torch.set_deterministic(deterministic_restore) - if is_cuda10_2_or_higher: - cur_cublas_config = os.environ.get(cublas_var_name) - if cublas_config_restore is None: - if cur_cublas_config is not None: - del os.environ[cublas_var_name] - else: - os.environ[cublas_var_name] = cublas_config_restore - try: - fn(*args, **kwargs) - except RuntimeError: - restore() - raise - else: - restore() + with DeterministicGuard(torch.is_deterministic()): + class CuBLASConfigGuard: + cublas_var_name = 'CUBLAS_WORKSPACE_CONFIG' + + def __enter__(self): + self.is_cuda10_2_or_higher = ( + (torch.version.cuda is not None) + and ([int(x) for x in torch.version.cuda.split(".")] >= [10, 2])) + if self.is_cuda10_2_or_higher: + self.cublas_config_restore = os.environ.get(self.cublas_var_name) + os.environ[self.cublas_var_name] = ':4096:8' + + def __exit__(self, exception_type, exception_value, traceback): + if self.is_cuda10_2_or_higher: + cur_cublas_config = os.environ.get(self.cublas_var_name) + if self.cublas_config_restore is None: + if cur_cublas_config is not None: + del os.environ[self.cublas_var_name] + else: + os.environ[self.cublas_var_name] = self.cublas_config_restore + with CuBLASConfigGuard(): + fn(*args, **kwargs) return wrapper def skipIfCompiledWithoutNumpy(fn): From dabc286ab37d6a43029957af73ffc40d34161c4e Mon Sep 17 00:00:00 2001 From: jiej Date: Thu, 3 Dec 2020 11:12:53 -0800 Subject: [PATCH 022/132] Remove output used only by sizes (#448) (#47665) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47665 Re-enabled the pass to remove outputs from fusion that is only used by aten::size; Added size computation for reduction op via new operator prim::ReductionSizes; Test Plan: Imported from OSS Reviewed By: navahgar, jamesr66a Differential Revision: D25254675 Pulled By: Krovatkin fbshipit-source-id: e9a057b0287ed0ac93b415647fd8e5e836ba9856 --- aten/src/ATen/core/interned_strings.h | 1 + test/test_jit_cuda_fuser.py | 23 ++++++++++ torch/csrc/jit/codegen/cuda/graph_fuser.cpp | 42 ++++++++++++++++--- torch/csrc/jit/codegen/cuda/parser.cpp | 2 +- torch/csrc/jit/runtime/operator.cpp | 1 + .../jit/runtime/register_prim_ops_fulljit.cpp | 28 +++++++++++++ 6 files changed, 91 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 72cf48330b3a..5a0efffea261 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -27,6 +27,7 @@ namespace c10 { _(prim, Assign) \ _(prim, BroadcastingChunk) \ _(prim, BroadcastSizes) \ + _(prim, ReductionSizes) \ _(prim, Constant) \ _(prim, 
ChunkSizes) \ _(prim, Drop) \ diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 591b774d3334..9143f770d630 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -780,6 +780,29 @@ def repro(x: torch.Tensor, alpha: float): repro_jit = torch.jit.script(repro) self._run_helper(repro_jit, repro, x, 0.6) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_reduction_sizes_op(self): + dtype = torch.float + device = "cuda" + x = torch.randn(2, 3, 4, 5, dtype=dtype, device=device) + y = torch.randn(2, 3, 4, 5, dtype=dtype, device=device) + + def t(x: torch.Tensor, y: torch.Tensor): + o = x + y + o = torch.relu(o) + o = o.sum((1, 3)) + return o.size() + t_jit = torch.jit.script(t) + jit_o = t_jit(x, y) + jit_o = t_jit(x, y) + o = t(x, y) + self.assertEqual(o, jit_o) + # since the output value is not used at all, the fusion operator should + # have been optimized away + self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0) + class TestPassManagerCudaFuser(JitTestCase): @unittest.skipIf(not RUN_CUDA, "requires CUDA") diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp index f36dc51cb09d..e8299bd21450 100644 --- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp @@ -679,7 +679,6 @@ struct CudaGraphFuser { // Builds up expressions that compute shapes of all intermediates (and // outputs) of the fusion group, based on the sizes of inputs. You should run // DCE to remove those that you end up not using. - /* std::unordered_map buildShapeExpressions(Node* fusion_group) { WithInsertPoint insert_guard{fusion_group->next()}; std::unordered_map shape_of; @@ -738,6 +737,38 @@ struct CudaGraphFuser { shape_of.emplace(outputs.at(outputs.size() - 1), last_size); continue; } + // extended shape expression support to reduction operations + // TODO: `aten::sum` is too flexible, we should restrict for a better + // match + if (n->kind() == aten::sum) { + // TODO: expand support to wire non-constant inputs, this is currently + // blocked by profiling executor not capable of profiling scalar inputs. + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant && + n->input(2)->node()->kind() == prim::Constant, + "only supports reduction axes and keepdim being constant"); + + // hmmm, do I need to setInsertPoint... + Node* in1_const = + graph->createClone(n->input(1)->node(), [](Value*) -> Value* { + throw std::runtime_error("unexpected input"); + }); + graph->insertNode(in1_const); + Node* in2_const = + graph->createClone(n->input(2)->node(), [](Value*) -> Value* { + throw std::runtime_error("unexpected input"); + }); + graph->insertNode(in2_const); + + std::vector inputs = { + shape_of.at(n->input(0)), in1_const->output(), in2_const->output()}; + Node* size_node = + graph->insertNode(graph->create(prim::ReductionSizes, inputs, 1)); + Value* size = size_node->output(0); + size->setType(ListType::ofInts()); + shape_of.emplace(n->output(), size); + continue; + } auto tensor_inputs = filter(n->inputs(), [](Value* v) { return v->type()->isSubtypeOf(TensorType::get()); }); @@ -755,6 +786,8 @@ struct CudaGraphFuser { return; auto subgraph = fusion_group->g(attr::Subgraph); + // TODO: failure in buildShapeExpressions should not break fusion execution, + // we can add a try/catch here to bailout from removeOutputsUsedOnlyInSize. 
auto shape_of = buildShapeExpressions(fusion_group); auto outputs = fusion_group->outputs().vec(); auto soutputs = subgraph->outputs().vec(); @@ -776,7 +809,6 @@ struct CudaGraphFuser { } } } - */ void refreshAliasDb() { aliasDb_ = torch::make_unique(graph_); @@ -837,9 +869,9 @@ struct CudaGraphFuser { //} // Remove outputs that have been added only because we need their size - // for (Node* n : block_->nodes()) { - // removeOutputsUsedOnlyInSize(n); - //} + for (Node* n : block_->nodes()) { + removeOutputsUsedOnlyInSize(n); + } for (Node* node : block_->nodes()) { for (Block* sub_block : node->blocks()) { diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp index d68b900dfa45..df250e061a89 100644 --- a/torch/csrc/jit/codegen/cuda/parser.cpp +++ b/torch/csrc/jit/codegen/cuda/parser.cpp @@ -474,7 +474,7 @@ class IrParser { value_map.emplace(node->output()->unique(), out); }, [](const Node* node) -> bool { - // we don't support cast of output types yet; + // TODO: support cast of output types yet; if (!node->inputs()[3]->type()->isSubtypeOf( static_cast(NoneType::get()))) { // We can only handle output as half and float; diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp index dc1ff95cf735..0756d6b58e9f 100644 --- a/torch/csrc/jit/runtime/operator.cpp +++ b/torch/csrc/jit/runtime/operator.cpp @@ -230,6 +230,7 @@ bool printerHasSpecialCaseFor(Symbol sym) { prim::ConstantChunk, // optimization pass adds it prim::DifferentiableGraph, // optimization pass adds it, prim::FunctionalGraph, // optimization pass adds it, + prim::ReductionSizes, // optimization pass (fuser) adds it prim::BroadcastSizes, // optimization pass (fuser) adds it prim::ChunkSizes, // optimization pass (fuser) adds it prim::Drop, // used in interpreter only diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index 68b9b54dd42c..0be346246656 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -337,6 +337,34 @@ RegisterOperators reg( "prim::AutogradZero() -> Tensor", [](Stack* stack) { stack->emplace_back(at::Tensor()); }, aliasAnalysisSpecialCase()), + Operator( + "prim::ReductionSizes(int[] size, int[] red_axes, bool keepdim = False) -> int[]", + [](Stack* stack) { + bool keepdim = pop(stack).toBool(); + c10::List axes = pop(stack).toIntList(); + c10::List size = pop(stack).toIntList(); + if (keepdim) { + for (const auto& axis : axes) { + size.set(axis, 1); + } + } else { + int64_t index = 0; + auto iter = size.begin(); + std::sort(axes.begin(), axes.end()); + for (const auto& axis : axes) { + // move iter to the next axis + iter += axis - index; + + // input iter points to axis and is updated to axis + 1 + iter = size.erase(iter); + + // update current index for iter + index = axis + 1; + } + } + push(stack, IValue(std::move(size))); + }, + aliasAnalysisFromSchema()), Operator( "prim::BroadcastSizes(...) 
-> int[]", [](Stack* stack) { From b726a1bbf8a0295bc1abff0bf0a348d0dbf27c91 Mon Sep 17 00:00:00 2001 From: Hector Yuen Date: Thu, 3 Dec 2020 11:17:05 -0800 Subject: [PATCH 023/132] quantize bias of the quantization parameters (#48749) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48749 this change reverts D25179863 (https://github.com/pytorch/pytorch/commit/55e225a2dc1529d9c68d5f8b333b155bd5b5b334) because in 1.0.0.14 this behavior got reintroduced we believe this was already working pre 1.0.0.9, then intel regressed which is why we had to remove this quantization section, and in 1.0.0.14 they fixed it Test Plan: we tested ctr_instagram_5x which now passes with bitwise matching hl475 will test the top6 models and if they match, we will use this point to lock any further changes in the future Reviewed By: venkatacrc Differential Revision: D25283605 fbshipit-source-id: 33aa9af008c113d4d61e3461a44932b502bf42ea --- caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py | 7 ++++++- caffe2/quantization/server/fbgemm_pack_op.cc | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py index 5a91a00706ff..7f51523cb616 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py @@ -8,7 +8,12 @@ from caffe2.python.fakelowp.test_utils import print_test_debug_info import caffe2.python.serialized_test.serialized_test_util as serial -core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"]) +core.GlobalInit(["caffe2", + "--caffe2_log_level=-3", + "--glow_global_fp16=1", + "--glow_clip_quant_range_to_fp16=1", + "--glow_global_fp16_constants=1" + ]) class Int8OpsTest(serial.SerializedTestCase): diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc index bcd5dde63cf9..25e396cf8919 100644 --- a/caffe2/quantization/server/fbgemm_pack_op.cc +++ b/caffe2/quantization/server/fbgemm_pack_op.cc @@ -207,6 +207,13 @@ void QuantizeConvBias( bias.data(), bias.data() + bias.numel()); } else { const float* bdata = bias.data(); + vector bdata_local; + if (use_fp16) { + bdata_local.resize(bias.numel()); + fbgemm::RoundToFloat16( + bdata, bdata_local.data(), bias.numel(), false /* FLAGS_caffe2_fbgemm_fake_fp16_clamp */); + bdata = bdata_local.data(); + } b_quantized.resize(bias.numel()); for (int g = 0; g < filter_qparams.size(); ++g) { int i_begin = g * (M / filter_qparams.size()); From c134f3283515ce135c71049adfe923e3fb5581fa Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Thu, 3 Dec 2020 11:36:04 -0800 Subject: [PATCH 024/132] Implemented torch.inner (#46716) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46716 Implemented torch.inner similar to [numpy.inner](https://numpy.org/doc/stable/reference/generated/numpy.inner.html). For now it's implemented as a composite op. 
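Because it is composite, the semantics can be sketched in terms of existing ops (illustrative only; it mirrors the checks in the C++ implementation below and assumes a build that already contains this change):

```python
# Rough sketch of torch.inner in terms of existing ops.
import torch

a = torch.randn(2, 3)
b = torch.randn(4, 5, 3)

out = torch.inner(a, b)                       # contracts the last dimensions
ref = torch.tensordot(a, b, dims=([1], [2]))  # same contraction, shape (2, 4, 5)
assert torch.allclose(out, ref)

# Scalars fall back to a plain elementwise multiply.
assert torch.inner(torch.tensor(2.0), torch.tensor(3.0)) == 6.0
```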
TODO - [x] Add documentation Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D24860351 Pulled By: heitorschueroff fbshipit-source-id: de5c82f285893495491fdba73b35634f4d00bac8 --- aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/native/LinearAlgebra.cpp | 40 +++++++++++++ aten/src/ATen/native/native_functions.yaml | 6 ++ docs/source/tensors.rst | 1 + docs/source/torch.rst | 1 + test/test_linalg.py | 42 ++++++++++++++ torch/_tensor_docs.py | 6 ++ torch/_torch_docs.py | 58 +++++++++++++++++++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 2 + 10 files changed, 158 insertions(+) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index e3a855b825a0..817ccb210692 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -383,6 +383,7 @@ _(aten, index_fill) \ _(aten, index_put) \ _(aten, index_select) \ _(aten, indices) \ +_(aten, inner) \ _(aten, instance_norm) \ _(aten, inverse) \ _(aten, irfft) \ diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index afd4ec15d25f..11050a7303d8 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -310,6 +310,46 @@ Tensor ger(const Tensor& self, const Tensor& vec2) { return self.outer(vec2); } +Tensor& inner_out(Tensor& out, const Tensor& self, const Tensor& other) { + checkDeviceType("inner()", {out, self, other}, self.device().type()); + + // If either self or other is a scalar just multiply them + if (self.dim() == 0 || other.dim() == 0) { + at::mul_out(out, self, other); + return out; + } + + // Last dimension should match (tensordot does not enforce this) + TORCH_CHECK( + self.size(-1) == other.size(-1), + "inner() the last dimension must match on both input tensors but got shapes ", + self.sizes(), + " and ", + other.sizes()); + + at::tensordot_out(out, self, other, -1, -1); + return out; +} + +Tensor inner(const Tensor& self, const Tensor& other) { + checkDeviceType("inner()", {self, other}, self.device().type()); + + // If either self or other is a scalar just multiply them + if (self.dim() == 0 || other.dim() == 0) { + return self * other; + } + + // Last dimension should match (tensordot does not enforce this) + TORCH_CHECK( + self.size(-1) == other.size(-1), + "inner() the last dimension must match on both input tensors but got shapes ", + self.sizes(), + " and ", + other.sizes()); + + return at::tensordot(self, other, -1, -1); +} + Tensor& outer_out(Tensor &result, const Tensor& self, const Tensor& vec2) { check_1d(self, "self", "outer"); check_1d(vec2, "vec2", "outer"); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8f237f9e2058..790fc0ea01f8 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9455,6 +9455,12 @@ dispatch: DefaultBackend: linalg_eigvalsh_out +- func: inner(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + # torch.outer, alias for torch.ger - func: outer(Tensor self, Tensor vec2) -> Tensor use_c10_dispatcher: full diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 3f12004062cf..578f6a8b8a0e 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -371,6 +371,7 @@ view of a storage and defines numeric operations on it. 
.. automethod:: index_select .. automethod:: indices :noindex: + .. automethod:: inner .. automethod:: int .. automethod:: int_repr .. automethod:: inverse diff --git a/docs/source/torch.rst b/docs/source/torch.rst index d7c80de22189..ca288ff0ef6a 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -511,6 +511,7 @@ BLAS and LAPACK Operations eig geqrf ger + inner inverse det logdet diff --git a/test/test_linalg.py b/test/test_linalg.py index 71c3cf654c1b..944513b82e0a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -41,6 +41,48 @@ class TestLinalg(TestCase): exact_dtype = True + @dtypes(torch.float, torch.cfloat) + @precisionOverride({torch.float: 1e-06, torch.cfloat: 1e-06}) + def test_inner(self, device, dtype): + def check(a_sizes_, b_sizes_): + for a_sizes, b_sizes in ((a_sizes_, b_sizes_), (b_sizes_, a_sizes_)): + a = torch.randn(a_sizes, dtype=dtype, device=device) + b = torch.randn(b_sizes, dtype=dtype, device=device) + res = torch.inner(a, b) + ref = np.inner(a.cpu().numpy(), b.cpu().numpy()) + self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) + out = torch.zeros_like(res) + torch.inner(a, b, out=out) + self.assertEqual(res, out) + + check([], []) # scalar x scalar + check([], [0]) # scalar x empty + check([], [3]) # scalar x 1D + check([], [2, 3, 4]) # scalar x 3D + + check([0], [0]) # empty x empty + check([0], [2, 0]) # empty x 2D + + check([2], [2]) # 1D x 1D + check([2], [3, 1, 2]) # 1D x 3D + check([2], [3, 0, 2]) # 1D x 3D empty + + check([1, 2], [3, 2]) # 2D x 2D + check([1, 2], [3, 4, 2]) # 2D x 3D + check([2, 1, 3, 2], [1, 3, 2, 2]) # 4D x 4D + + # Test discontiguous input + a = torch.randn(3, 2, device=device, dtype=dtype).transpose_(0, 1) + b = torch.randn(4, 3, device=device, dtype=dtype)[::2, :] + self.assertFalse(a.is_contiguous() or b.is_contiguous()) + self.assertEqual(a.inner(b).cpu().numpy(), np.inner(a.cpu().numpy(), b.cpu().numpy())) + + # Test error message + with self.assertRaisesRegex(RuntimeError, + r"inner\(\) the last dimension must match on both " + r"input tensors but got shapes \[2, 3\] and \[2, 2\]"): + torch.randn(2, 3, device=device, dtype=dtype).inner(torch.randn(2, 2, device=device, dtype=dtype)) + # Tests torch.outer, and its alias, torch.ger, vs. NumPy @precisionOverride({torch.bfloat16: 1e-1}) @dtypes(*(torch.testing.get_all_dtypes())) diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index a94291eba18c..9a4b90efb5a9 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1557,6 +1557,12 @@ def add_docstr_all(method, docstr): See :func:`torch.ger` """) +add_docstr_all('inner', r""" +inner(other) -> Tensor + +See :func:`torch.inner`. +""") + add_docstr_all('outer', r""" outer(vec2) -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 20bf9ca3ad84..022170efcd63 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3193,6 +3193,64 @@ def merge_dicts(*dicts): """) +add_docstr(torch.inner, r""" +inner(input, other, *, out=None) -> Tensor + +Computes the dot product for 1D tensors. For higher dimensions, sums the product +of elements from :attr:`input` and :attr:`other` along their last dimension. + +.. note:: + + If either :attr:`input` or :attr:`other` is a scalar, the result is equivalent + to `torch.mul(input, other)`. 
+ + If both :attr:`input` and :attr:`other` are non-scalars, the size of their last + dimension must match and the result is equivalent to `torch.tensordot(input, + other, dims=([-1], [-1]))` + +Args: + input (Tensor): First input tensor + other (Tensor): Second input tensor + +Keyword args: + out (Tensor, optional): Optional output tensor to write result into. The output + shape is `input.shape[:-1] + other.shape[:-1]`. + +Example:: + + # Dot product + >>> torch.inner(torch.tensor([1, 2, 3]), torch.tensor([0, 2, 1])) + tensor(7) + + # Multidimensional input tensors + >>> a = torch.randn(2, 3) + >>> a + tensor([[0.8173, 1.0874, 1.1784], + [0.3279, 0.1234, 2.7894]]) + >>> b = torch.randn(2, 4, 3) + >>> b + tensor([[[-0.4682, -0.7159, 0.1506], + [ 0.4034, -0.3657, 1.0387], + [ 0.9892, -0.6684, 0.1774], + [ 0.9482, 1.3261, 0.3917]], + + [[ 0.4537, 0.7493, 1.1724], + [ 0.2291, 0.5749, -0.2267], + [-0.7920, 0.3607, -0.3701], + [ 1.3666, -0.5850, -1.7242]]]) + >>> torch.inner(a, b) + tensor([[[-0.9837, 1.1560, 0.2907, 2.6785], + [ 2.5671, 0.5452, -0.6912, -1.5509]], + + [[ 0.1782, 2.9843, 0.7366, 1.5672], + [ 3.5115, -0.4864, -1.2476, -4.4337]]]) + + # Scalar input + >>> torch.inner(a, torch.tensor(2)) + tensor([[1.6347, 2.1748, 2.3567], + [0.6558, 0.2469, 5.5787]]) +""") + add_docstr(torch.outer, r""" outer(input, vec2, *, out=None) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index f7b9bedf9106..495f1435abee 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -408,6 +408,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.greater_equal: lambda input, other, out=None: -1, torch.geqrf: lambda input, out=None: -1, torch.i0: lambda input, out=None: -1, + torch.inner: lambda input, other, out=None: -1, torch.outer: lambda input, vec2, out=None: -1, # alias for torch.ger torch.ger: lambda input, vec2, out=None: -1, torch.grid_sampler: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 116b47320a64..0b245ec108ae 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1291,6 +1291,8 @@ def method_tests(): ('bmm', (M, S, M), ((M, M, S),), '', (True,)), ('mv', (S, M), ((M,),), '', (True,)), ('ger', (S,), ((M,),)), + ('inner', (S,), ((S,),), "1d_1d", (False,)), + ('inner', (), ((S, S),), "scalar_2d", (False,)), ('matmul', (L,), ((L,),), '', (True,)), ('matmul', (S, M), ((M,),), "2d_1d", (True,)), ('matmul', (M,), ((M, S),), "1d_2d", (True,)), From 47aa2536328afc51876b2e04384c0cfe71ee1f06 Mon Sep 17 00:00:00 2001 From: x00480351 Date: Thu, 3 Dec 2020 11:36:51 -0800 Subject: [PATCH 025/132] [Feature] Allow user to specify a fraction of the GPU memory. (#48172) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Add a new function, torch.cuda.set_per_process_memory_fraction(fraction, device), to torch.cuda. Related: https://github.com/pytorch/pytorch/issues/18626 The fraction (float type, from 0 to 1) is used to limit memory of cashing allocator on GPU device . One can set it on any visible GPU. The allowed memory equals total memory * fraction. It will raise an OOM error when try to apply GPU memory more than the allowed value. This function is similar to Tensorflow's per_process_gpu_memory_fraction Note, this setting is just limit the cashing allocator in one process. 
If you are using multiprocess, you need to put this setting in to the subprocess to limit its GPU memory, because subprocess could have its own allocator. ## usage In some cases, one needs to split a GPU device as two parts. Can set limitation before GPU memory using. Eg. device: 0, each part takes half memory, the code as follows: ``` torch.cuda.set_per_process_memory_fraction(0.5, 0) ``` There is an example to show what it is. ```python import torch torch.cuda.set_per_process_memory_fraction(0.5, 0) torch.cuda.empty_cache() total_memory = torch.cuda.get_device_properties(0).total_memory # less than 0.5 will be ok: tmp_tensor = torch.empty(int(total_memory * 0.499), dtype=torch.int8, device='cuda') del tmp_tensordel tmp_tensor torch.cuda.empty_cache() # this allocation will raise a OOM: torch.empty(total_memory // 2, dtype=torch.int8, device='cuda') """ It raises an error as follows: RuntimeError: CUDA out of memory. Tried to allocate 5.59 GiB (GPU 0; 11.17 GiB total capacity; 0 bytes already allocated; 10.91 GiB free; 5.59 GiB allowed; 0 bytes reserved in total by PyTorch) """ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/48172 Reviewed By: bdhirsh Differential Revision: D25275381 Pulled By: VitalyFedyunin fbshipit-source-id: d8e7af31902c2eb795d416b57011cc8a22891b8f --- c10/cuda/CUDACachingAllocator.cpp | 54 ++++++++++++++++++++++++++++++- c10/cuda/CUDACachingAllocator.h | 1 + test/test_cuda.py | 29 +++++++++++++++++ torch/_C/__init__.pyi.in | 1 + torch/csrc/cuda/Module.cpp | 23 +++++++++++++ torch/cuda/memory.py | 27 ++++++++++++++++ 6 files changed, 134 insertions(+), 1 deletion(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 70c6ac9f3cd4..0b5d2992538c 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -202,6 +202,13 @@ class DeviceCachingAllocator { // outstanding cuda events std::deque> cuda_events; + // record used memory. + size_t total_allocated_memory = 0; + + size_t allowed_memory_maximum = 0; + + bool set_fraction = false; + public: DeviceCachingAllocator() : @@ -241,10 +248,16 @@ class DeviceCachingAllocator { size_t device_free; size_t device_total; C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + std::string allowed_info; + + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } stats.num_ooms += 1; // "total capacity": total global memory on GPU + // "allowed": memory is allowed to use, which set by fraction. 
// "already allocated": memory allocated by the program using the // caching allocator // "free": free memory as reported by the CUDA API @@ -268,6 +281,7 @@ class DeviceCachingAllocator { format_size(stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current), " already allocated; ", format_size(device_free), " free; ", + allowed_info, format_size(stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current), " reserved in total by PyTorch)"); } else { @@ -373,6 +387,15 @@ class DeviceCachingAllocator { block->stream_uses.insert(stream); } + /** set memory fraction to limit maximum allocated memory **/ + void setMemoryFraction(double fraction) { + size_t device_free; + size_t device_total; + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); + allowed_memory_maximum = static_cast(fraction * device_total); + set_fraction = true; + } + /** returns cached blocks to the system allocator **/ void emptyCache() { std::lock_guard lock(mutex); @@ -630,14 +653,19 @@ class DeviceCachingAllocator { if (isRetry) { stats.num_alloc_retries += 1; } + if (set_fraction && total_allocated_memory + size > allowed_memory_maximum) { + p.err = cudaErrorMemoryAllocation; + } else { + p.err = cudaMalloc(&ptr, size); + } - p.err = cudaMalloc(&ptr, size); if (p.err != cudaSuccess) { if (!isRetry || p.err == cudaErrorMemoryAllocation) cudaGetLastError(); // clear CUDA error return false; } + total_allocated_memory += size; p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); update_stat_array(stats.segment, 1, p.stat_types); update_stat_array(stats.reserved_bytes, size, p.stat_types); @@ -665,6 +693,7 @@ class DeviceCachingAllocator { Block* block = *it; if (!block->prev && !block->next) { C10_CUDA_CHECK(cudaFree((void*)block->ptr)); + total_allocated_memory -= block->size; StatTypes stat_types; stat_types[static_cast(StatType::AGGREGATE)] = true; @@ -846,6 +875,25 @@ class THCCachingAllocator { device_allocator[block->device]->free(block); } + void setMemoryFraction(double fraction, int device) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_allocator.size(), + "Allocator not initialized for device ", + device, + ": did you call init?"); + TORCH_INTERNAL_ASSERT( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". 
Please set within (0, 1)."); + int activated_device; + cudaGetDevice (&activated_device); + if (activated_device != device) { + cudaSetDevice(device); + } + device_allocator[device]->setMemoryFraction(fraction); + } + void emptyCache() { int count = device_allocator.size(); for (int i = 0; i < count; i++) @@ -942,6 +990,10 @@ void init(int device_count) { caching_allocator.init(device_count); } +void setMemoryFraction(double fraction, int device) { + caching_allocator.setMemoryFraction(fraction, device); +} + void emptyCache(void) { caching_allocator.emptyCache(); } diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index 7b9ce4b3211f..8af8ec5073fe 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -111,6 +111,7 @@ C10_CUDA_API void raw_delete(void* ptr); C10_CUDA_API Allocator* get(); C10_CUDA_API void init(int device_count); +C10_CUDA_API void setMemoryFraction(double fraction, int device); C10_CUDA_API void emptyCache(); C10_CUDA_API void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock); C10_CUDA_API void* getBaseAllocation(void *ptr, size_t *size); diff --git a/test/test_cuda.py b/test/test_cuda.py index 17e3942c001d..2a5754523876 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -391,6 +391,35 @@ def test_out_of_memory(self): tensor.fill_(1) self.assertTrue((tensor == 1).all()) + def test_set_per_process_memory_fraction(self): + # test invalid fraction value. + with self.assertRaisesRegex(TypeError, "Invalid type"): + torch.cuda.set_per_process_memory_fraction(int(1)) + with self.assertRaisesRegex(ValueError, "Invalid fraction value"): + torch.cuda.set_per_process_memory_fraction(-0.1) + with self.assertRaisesRegex(ValueError, "Invalid fraction value"): + torch.cuda.set_per_process_memory_fraction(2.0) + + tensor = torch.zeros(1024, device='cuda') + torch.cuda.empty_cache() + total_memory = torch.cuda.get_device_properties(0).total_memory + torch.cuda.set_per_process_memory_fraction(0.5, 0) + + # test 0.499 allocation is ok. + application = int(total_memory * 0.499) - torch.cuda.max_memory_reserved() + tmp_tensor = torch.empty(application, dtype=torch.int8, device='cuda') + del tmp_tensor + torch.cuda.empty_cache() + + application = int(total_memory * 0.5) + # it will get OOM when try to allocate more than half memory. + with self.assertRaisesRegex(RuntimeError, "out of memory"): + torch.empty(application, dtype=torch.int8, device='cuda') + + # ensure out of memory error doesn't disturb subsequent kernel + tensor.fill_(1) + self.assertTrue((tensor == 1).all()) + @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") def test_autogpu(self): x = torch.randn(5, 5).cuda() diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 31c38c5ef53d..cbb5b2452e21 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -580,6 +580,7 @@ def _cuda_getCompiledVersion() -> _int: ... def _cuda_cudaHostAllocator() -> _int: ... def _cuda_cudaCachingAllocator_raw_alloc(size: _int, cuda_stream: _int) -> _int: ... def _cuda_cudaCachingAllocator_raw_delete(ptr: _int) -> None: ... +def _cuda_setMemoryFraction(fraction: _float, device: _int) -> None: ... def _cuda_emptyCache() -> None: ... def _cuda_memoryStats(device: _int) -> Dict[str, Any]: ... def _cuda_resetAccumulatedMemoryStats(device: _int) -> None: ... 
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 5c28773b7447..f9a02c730141 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -263,6 +263,28 @@ PyObject * THCPModule_hasPrimaryContext(PyObject *_unused, PyObject *arg) END_HANDLE_TH_ERRORS } +PyObject * THCPModule_setMemoryFraction(PyObject *_unused, PyObject *args) +{ + HANDLE_TH_ERRORS + PyObject* fraction_o = nullptr; + PyObject* device_o = nullptr; + if(!PyArg_ParseTuple(args, "OO", &fraction_o, &device_o)) { + THPUtils_invalidArguments( + args, + nullptr, + "set_memory_fraction", + 1, + "(double fraction, int device);"); + return nullptr; + } + double fraction = PyFloat_AsDouble(fraction_o); + int64_t device = PyLong_AsLongLong(device_o); + + c10::cuda::CUDACachingAllocator::setMemoryFraction(fraction, device); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + PyObject * THCPModule_emptyCache(PyObject *_unused, PyObject *noargs) { HANDLE_TH_ERRORS @@ -498,6 +520,7 @@ static struct PyMethodDef _THCPModule_methods[] = { {"_cuda_setStream", THCPModule_setStream_wrap, METH_O, nullptr}, {"_cuda_getCompiledVersion", THCPModule_getCompiledVersion, METH_NOARGS, nullptr}, {"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr}, + {"_cuda_setMemoryFraction", THCPModule_setMemoryFraction, METH_VARARGS, nullptr}, {"_cuda_emptyCache", THCPModule_emptyCache, METH_NOARGS, nullptr}, {"_cuda_memoryStats", THCPModule_memoryStats, METH_O, nullptr}, {"_cuda_resetAccumulatedMemoryStats", THCPModule_resetAccumulatedMemoryStats, METH_O, nullptr}, diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index 6c2b1b867862..c67742820076 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -72,6 +72,33 @@ def caching_allocator_delete(mem_ptr): torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) +def set_per_process_memory_fraction(fraction, device: Union[Device, int] = None) -> None: + r"""Set memory fraction for a process. + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Arguments: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError('Invalid type for fraction argument, must be `float`') + if fraction < 0 or fraction > 1: + raise ValueError('Invalid fraction value: {}. ' + 'Allowed range: 0~1'.format(fraction)) + + torch._C._cuda_setMemoryFraction(fraction, device) + + def empty_cache() -> None: r"""Releases all unoccupied cached memory currently held by the caching allocator so that those can be used in other GPU application and visible in From 1eed54d17a8ae686c79ce716db35bc6ead97c4cd Mon Sep 17 00:00:00 2001 From: pinzhenx Date: Thu, 3 Dec 2020 11:40:09 -0800 Subject: [PATCH 026/132] Upgrade oneDNN (mkl-dnn) to v1.7 (#47853) Summary: Bump oneDNN (mkl-dnn) to 1.7 for bug fixes and performance optimizations - Fixes https://github.com/pytorch/pytorch/issues/42115. 
Fixed build issue on Windows for the case when oneDNN is built as submodule - Fixes https://github.com/pytorch/pytorch/issues/45746. Fixed segmentation fault for convolution weight gradient on systems with Intel AVX512 support This PR also contains a few changes in ideep for follow-up update (not enabled in current PR yet): - Performance improvements for the CPU path of Convolution - Channel-last support Pull Request resolved: https://github.com/pytorch/pytorch/pull/47853 Reviewed By: bdhirsh Differential Revision: D25275268 Pulled By: VitalyFedyunin fbshipit-source-id: 75a589d57e3d19a7f23272a67045ad7494f1bdbe --- .../installation-helpers/install_mkl.bat | 2 +- aten/src/ATen/native/CompositeRandomAccessorCommon.h | 2 +- caffe2/ideep/ideep_utils.h | 2 +- third_party/ideep | 2 +- third_party/mkl-dnn.BUILD | 9 ++++++--- torch/csrc/python_headers.h | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat index 1fa84920cd70..656a5494ea3f 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat @@ -7,4 +7,4 @@ if "%REBUILD%"=="" ( 7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl ) set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include -set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB +set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB% diff --git a/aten/src/ATen/native/CompositeRandomAccessorCommon.h b/aten/src/ATen/native/CompositeRandomAccessorCommon.h index 256ae5b0d98f..683af2b4d426 100644 --- a/aten/src/ATen/native/CompositeRandomAccessorCommon.h +++ b/aten/src/ATen/native/CompositeRandomAccessorCommon.h @@ -129,7 +129,7 @@ class CompositeRandomAccessor { // Pointer-like operations { C10_HOST_DEVICE - reference operator*() { + reference operator*() const { return TupleInfo::tie(*keys, *values); } diff --git a/caffe2/ideep/ideep_utils.h b/caffe2/ideep/ideep_utils.h index 947d1b337ab3..b1b3aae3a8ee 100644 --- a/caffe2/ideep/ideep_utils.h +++ b/caffe2/ideep/ideep_utils.h @@ -1,7 +1,7 @@ #pragma once #include // For caffe2 macros. 
- +#include // All caffe2 ideep related headers #include #include diff --git a/third_party/ideep b/third_party/ideep index ba885200dbbc..f0280bb805c2 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit ba885200dbbc1f144c7b58eba487378eb324f281 +Subproject commit f0280bb805c2dedd4bb5dd4765cda7dfcd30989f diff --git a/third_party/mkl-dnn.BUILD b/third_party/mkl-dnn.BUILD index c4491b10e111..9af253fde189 100644 --- a/third_party/mkl-dnn.BUILD +++ b/third_party/mkl-dnn.BUILD @@ -7,9 +7,9 @@ template_rule( out = "include/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "1", - "@DNNL_VERSION_MINOR@": "5", + "@DNNL_VERSION_MINOR@": "7", "@DNNL_VERSION_PATCH@": "0", - "@DNNL_VERSION_HASH@": "e2ac1fac44c5078ca927cb9b90e1b3066a0b2ed0", + "@DNNL_VERSION_HASH@": "2e4732679f0211bb311780d0f383cf2dce9baca7", }, ) @@ -30,6 +30,8 @@ cc_library( srcs = glob([ "src/common/*.cpp", "src/cpu/**/*.cpp", + ], exclude=[ + "src/cpu/aarch64/*.cpp", ]), hdrs = glob([ "include/*.h", @@ -38,7 +40,8 @@ cc_library( "src/cpu/**/*.hpp", "src/cpu/**/*.h", "src/common/*.hpp", - "src/cpu/rnn/*.hpp", + ], exclude=[ + "src/cpu/aarch64/*.hpp", ]) + [ "include/dnnl_version.h", "include/dnnl_config.h", diff --git a/torch/csrc/python_headers.h b/torch/csrc/python_headers.h index 9c670b5202b0..2a64bdd5c6ee 100644 --- a/torch/csrc/python_headers.h +++ b/torch/csrc/python_headers.h @@ -1,5 +1,5 @@ #pragma once - +#include // workaround for Python 2 issue: https://bugs.python.org/issue17120 // NOTE: It looks like this affects Python 3 as well. #pragma push_macro("_XOPEN_SOURCE") From e7038a77256c9b479d9e82a3552c78b26130978d Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 3 Dec 2020 12:36:55 -0800 Subject: [PATCH 027/132] Improve an autograd warning (#48765) Summary: Fixes https://github.com/pytorch/pytorch/issues/48764 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48765 Reviewed By: heitorschueroff Differential Revision: D25304145 Pulled By: albanD fbshipit-source-id: e818413bf92ad0aa382eda77448183b9fd7d5e77 --- torch/autograd/gradcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 9b1ad2675fe5..fe4e6ec42703 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -297,7 +297,7 @@ def fail_test(msg): if is_tensor_like(inp) and inp.requires_grad: if not (inp.dtype == torch.float64 or inp.dtype == torch.complex128): warnings.warn( - 'The {}th input requires gradient and ' + f'Input #{idx} requires gradient and ' 'is not a double precision floating point or complex. ' 'This check will likely fail if all the inputs are ' 'not of double precision floating point or complex. ') From b3ac628081bed09542d4e3bbe5f428027a36e5ae Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Thu, 3 Dec 2020 14:39:18 -0800 Subject: [PATCH 028/132] [JIT] Fix bug in get_annotation_str for ast.Subscript (#48741) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48741 **Summary** This commit fixes a bug in the handling of `ast.Subscript` inside `get_annotation_str`. `annotation.value` (which contains the AST node representing the container name) should also be processed using `get_annotation_str`. **Test Plan** This commit adds a unit test to `TestClassType` based on the test case from the issue that reported this bug. **Fixes** This commit fixes #47570. 
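As background, a rough illustration of the AST shape involved (hypothetical snippet, not taken from the patch):

```python
# Hypothetical illustration: the container of a subscripted annotation is
# itself an AST node and must be rendered recursively.
import ast

node = ast.parse("x: List[torch.device]").body[0].annotation
print(type(node).__name__)        # Subscript
print(type(node.value).__name__)  # Name -- interpolating it directly produced
                                  # text like "<_ast.Name object at 0x...>[...]"
# With this fix, get_annotation_str(node) is expected to return
# "List[torch.device]".
```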
Test Plan: Imported from OSS Reviewed By: ppwwyyxx Differential Revision: D25286013 Pulled By: SplitInfinity fbshipit-source-id: 61a9e5dc16d9f87b80578f78d537f91332093e52 --- test/jit/test_class_type.py | 24 ++++++++++++++++++++++-- torch/_jit_internal.py | 2 +- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index 3cf60c38f343..b4075dba14c8 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -1350,11 +1350,31 @@ def call_i(): a = A() return a.i([3]) - def call_o(): + def call_j(): a = A() return a.j([torch.device("cpu"), torch.device("cpu")]) - for fn in [call_f, call_g, call_i, call_o]: + for fn in [call_f, call_g, call_i, call_j]: self.checkScript(fn, ()) s = self.getExportImportCopy(torch.jit.script(fn)) self.assertEqual(s(), fn()) + + def test_recursive_script_module_builtin_type_resolution(self): + """ + Test resolution of built-in torch types(e.g. torch.Tensor, torch.device) when a class is recursively compiled + when compiling a module. + """ + class Wrapper(): + def __init__(self, t): + self.t = t + + def to(self, l: List[torch.device], device: Optional[torch.device] = None): + return self.t.to(device=device) + + + class A(nn.Module): + def forward(self): + return Wrapper(torch.rand(4, 4)) + + scripted = torch.jit.script(A()) + self.getExportImportCopy(scripted) diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index b0529cf0e923..906439320a2c 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -238,7 +238,7 @@ def get_annotation_str(annotation): elif isinstance(annotation, ast.Attribute): return '.'.join([get_annotation_str(annotation.value), annotation.attr]) elif isinstance(annotation, ast.Subscript): - return f"{annotation.value}[{get_annotation_str(annotation.slice.value)}]" # type: ignore + return f"{get_annotation_str(annotation.value)}[{get_annotation_str(annotation.slice.value)}]" # type: ignore elif isinstance(annotation, ast.Tuple): return ','.join([get_annotation_str(elt) for elt in annotation.elts]) elif isinstance(annotation, ast.Constant) or isinstance(annotation, ast.NameConstant): From cc1c3063c57964ec275292951733e4457535ba44 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 3 Dec 2020 15:27:13 -0800 Subject: [PATCH 029/132] Add test binary to compare torch model outputs (#47933) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47933 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D25309199 Pulled By: SS-JIA fbshipit-source-id: adc3fc7db33c251f6b661916265b86b7b8c68fc2 --- binaries/CMakeLists.txt | 2 + binaries/compare_models_torch.cc | 262 +++++++++++++++++++++++++++++++ 2 files changed, 264 insertions(+) create mode 100644 binaries/compare_models_torch.cc diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt index 075bc05b5ecf..74df5089e4e3 100644 --- a/binaries/CMakeLists.txt +++ b/binaries/CMakeLists.txt @@ -4,6 +4,7 @@ if(INTERN_BUILD_MOBILE) caffe2_binary_target("speed_benchmark.cc") else() caffe2_binary_target("speed_benchmark_torch.cc") + caffe2_binary_target("compare_models_torch.cc") endif() return() endif() @@ -33,6 +34,7 @@ caffe2_binary_target("print_registered_core_operators.cc") caffe2_binary_target("run_plan.cc") caffe2_binary_target("speed_benchmark.cc") caffe2_binary_target("speed_benchmark_torch.cc") +caffe2_binary_target("compare_models_torch.cc") caffe2_binary_target("split_db.cc") caffe2_binary_target("db_throughput.cc") diff --git 
a/binaries/compare_models_torch.cc b/binaries/compare_models_torch.cc new file mode 100644 index 000000000000..6275087fd4fa --- /dev/null +++ b/binaries/compare_models_torch.cc @@ -0,0 +1,262 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +C10_DEFINE_string( + refmodel, + "", + "The reference torch script model to compare against."); +C10_DEFINE_string( + model, + "", + "The torch script model to compare to the reference model."); +C10_DEFINE_string( + input_dims, + "", + "Alternate to input_files, if all inputs are simple " + "float TensorCPUs, specify the dimension using comma " + "separated numbers. If multiple input needed, use " + "semicolon to separate the dimension of different " + "tensors."); +C10_DEFINE_string(input_type, "", "Input type (uint8_t/float)"); +C10_DEFINE_string( + input_memory_format, + "contiguous_format", + "Input memory format (contiguous_format/channels_last)"); +C10_DEFINE_bool( + no_inputs, + false, + "Whether the model has any input. Will ignore other input arugments if true"); +C10_DEFINE_bool( + use_caching_allocator, + false, + "Whether to cache allocations between inference iterations"); +C10_DEFINE_bool( + print_output, + false, + "Whether to print output with all one input tensor."); +C10_DEFINE_int(iter, 10, "The number of iterations to run."); +C10_DEFINE_int(pytext_len, 0, "Length of input sequence."); +C10_DEFINE_string( + backend, + "cpu", + "what backend to use for model (vulkan, cpu, metal) (default=cpu)"); +C10_DEFINE_string( + refbackend, + "cpu", + "what backend to use for model (vulkan, cpu, metal) (default=cpu)"); +C10_DEFINE_string(tolerance, "1e-5", "tolerance to use for comparison"); + +bool checkRtol( + const at::Tensor& diff, + const std::vector& inputs, + float tolerance) { + float maxValue = 0.0f; + + for (const auto& tensor : inputs) { + maxValue = fmax(tensor.abs().max().item(), maxValue); + } + float maxDiff = diff.abs().max().item(); + + return maxDiff < (tolerance * maxValue); +} + +bool almostEqual(const at::Tensor& a, const at::Tensor& b, float tolerance) { + return checkRtol(a - b, {a, b}, tolerance); +} + +std::vector split( + char separator, + const std::string& string, + bool ignore_empty = true) { + std::vector pieces; + std::stringstream ss(string); + std::string item; + while (getline(ss, item, separator)) { + if (!ignore_empty || !item.empty()) { + pieces.push_back(std::move(item)); + } + } + return pieces; +} + +std::vector create_inputs( + std::vector& refinputs, + std::vector& inputs, + std::string& refbackend, + std::string& backend) { + if (FLAGS_no_inputs) { + return {}; + } + + CAFFE_ENFORCE_GE(FLAGS_input_dims.size(), 0, "Input dims must be specified."); + CAFFE_ENFORCE_GE(FLAGS_input_type.size(), 0, "Input type must be specified."); + + std::vector input_dims_list = split(';', FLAGS_input_dims); + std::vector input_type_list = split(';', 
FLAGS_input_type); + std::vector input_memory_format_list = + split(';', FLAGS_input_memory_format); + + CAFFE_ENFORCE_GE( + input_dims_list.size(), 0, "Input dims not specified correctly."); + CAFFE_ENFORCE_GE( + input_type_list.size(), 0, "Input type not specified correctly."); + CAFFE_ENFORCE_GE( + input_memory_format_list.size(), + 0, + "Input format list not specified correctly."); + + CAFFE_ENFORCE_EQ( + input_dims_list.size(), + input_type_list.size(), + "Input dims and type should have the same number of items."); + CAFFE_ENFORCE_EQ( + input_dims_list.size(), + input_memory_format_list.size(), + "Input dims and format should have the same number of items."); + + for (size_t i = 0; i < input_dims_list.size(); ++i) { + auto input_dims_str = split(',', input_dims_list[i]); + std::vector input_dims; + input_dims.reserve(input_dims_str.size()); + for (const auto& s : input_dims_str) { + input_dims.push_back(c10::stoi(s)); + } + + at::ScalarType input_type; + if (input_type_list[i] == "float") { + input_type = at::ScalarType::Float; + } else if (input_type_list[i] == "uint8_t") { + input_type = at::ScalarType::Byte; + } else if (input_type_list[i] == "int64") { + input_type = at::ScalarType::Long; + } else { + CAFFE_THROW("Unsupported input type: ", input_type_list[i]); + } + + at::MemoryFormat input_memory_format; + if (input_memory_format_list[i] == "channels_last") { + if (input_dims.size() != 4u) { + CAFFE_THROW( + "channels_last memory format only available on 4D tensors!"); + } + input_memory_format = at::MemoryFormat::ChannelsLast; + } else if (input_memory_format_list[i] == "contiguous_format") { + input_memory_format = at::MemoryFormat::Contiguous; + } else { + CAFFE_THROW( + "Unsupported input memory format: ", input_memory_format_list[i]); + } + + const auto input_tensor = torch::rand( + input_dims, + at::TensorOptions(input_type).memory_format(input_memory_format)); + + if (refbackend == "vulkan") { + refinputs.emplace_back(input_tensor.vulkan()); + } else { + refinputs.emplace_back(input_tensor); + } + + if (backend == "vulkan") { + inputs.emplace_back(input_tensor.vulkan()); + } else { + inputs.emplace_back(input_tensor); + } + } + + if (FLAGS_pytext_len > 0) { + auto stensor = FLAGS_pytext_len * at::ones({1}, torch::kI64); + if (refbackend == "vulkan") { + refinputs.emplace_back(stensor.vulkan()); + } else { + refinputs.emplace_back(stensor); + } + + if (backend == "vulkan") { + inputs.emplace_back(stensor.vulkan()); + } else { + inputs.emplace_back(stensor); + } + } + + return inputs; +} + +int main(int argc, char** argv) { + c10::SetUsageMessage( + "Run accuracy comparison to a reference model for a pytorch model.\n" + "Example usage:\n" + "./compare_models_torch" + " --refmodel=" + " --model=" + " --iter=20"); + if (!c10::ParseCommandLineFlags(&argc, &argv)) { + std::cerr << "Failed to parse command line flags!" << std::endl; + return 1; + } + + std::stringstream ss(FLAGS_tolerance); + float tolerance = 0; + ss >> tolerance; + + torch::autograd::AutoGradMode guard(false); + torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard(false); + auto module = torch::jit::load(FLAGS_model); + auto refmodule = torch::jit::load(FLAGS_refmodel); + + module.eval(); + refmodule.eval(); + + c10::CPUCachingAllocator caching_allocator; + c10::optional caching_allocator_guard; + if (FLAGS_use_caching_allocator) { + caching_allocator_guard.emplace(&caching_allocator); + } + std::cout << "Running modules." 
<< std::endl; + + int passed = 0; + for (int i = 0; i < FLAGS_iter; ++i) { + std::vector refinputs; + std::vector inputs; + create_inputs(refinputs, inputs, FLAGS_refbackend, FLAGS_backend); + + const auto refoutput = refmodule.forward(refinputs).toTensor().cpu(); + const auto output = module.forward(inputs).toTensor().cpu(); + + bool check = almostEqual(refoutput, output, tolerance); + if (check) { + passed += 1; + } + } + std::cout << "Output was equal within tolerance " << passed << "/" + << FLAGS_iter + << " times. Pass rate: " << (float)passed / (float)FLAGS_iter * 100 + << std::setprecision(2) << "%" << std::endl; + + return 0; +} From cf1e5d7d2b72d0a9ee69e70683ffcd213b12b99c Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Thu, 3 Dec 2020 16:10:05 -0800 Subject: [PATCH 030/132] Ignore MSVC's pdb file (#47963) Summary: These files are generated by MSVC when building with debug symbols `REL_WITH_DEB_INFO=1`: ``` PS C:\Users\Xiang Gao\source\repos\pytorch> git status On branch master Your branch is up to date with 'origin/master'. Untracked files: (use "git add ..." to include in what will be committed) torch/lib/asmjit.pdb torch/lib/c10.pdb torch/lib/c10_cuda.pdb torch/lib/caffe2_detectron_ops_gpu.pdb torch/lib/caffe2_module_test_dynamic.pdb torch/lib/caffe2_observers.pdb torch/lib/fbgemm.pdb torch/lib/shm.pdb torch/lib/torch_cpu.pdb torch/lib/torch_cuda.pdb nothing added to commit but untracked files present (use "git add" to track) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/47963 Reviewed By: heitorschueroff Differential Revision: D25311564 Pulled By: malfet fbshipit-source-id: 1a7125f3c6ff296b4bb0975ee97b59c23586b1cb --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 10994ba7a64b..5de02417430e 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,7 @@ torch/lib/*.exe* torch/lib/*.dylib* torch/lib/*.h torch/lib/*.lib +torch/lib/*.pdb torch/lib/*.so* torch/lib/protobuf*.pc torch/lib/build From 16fd1c32c5cbf63d87a104396acacbb72b15f419 Mon Sep 17 00:00:00 2001 From: shubhambhokare1 Date: Thu, 3 Dec 2020 17:28:46 -0800 Subject: [PATCH 031/132] [ONNX] Update batch_norm symbolic to handle track_running_stats=False (#47903) Summary: Fixes https://github.com/pytorch/pytorch/issues/45333 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47903 Reviewed By: ejguan Differential Revision: D25097509 Pulled By: bzinodev fbshipit-source-id: 5584dac1150b13d4e0a6e0c39ac2f2caf41d3b38 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 13 +++++++++++++ torch/onnx/symbolic_opset9.py | 10 +++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index abaa078bb353..16f50cffd00b 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2243,6 +2243,14 @@ def test_batchnorm1d_noaffine(self): x = torch.randn(10, 10, 128) self.run_test(model, x) + def test_batchnorm1d_norunningstats(self): + x = torch.randn(10, 10) + model = torch.nn.BatchNorm1d(10, track_running_stats=False) + self.run_test(model, x) + + x = torch.randn(10, 10, 128) + self.run_test(model, x) + def test_batchnorm2d(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.BatchNorm2d(3, affine=True) @@ -2253,6 +2261,11 @@ def test_batchnorm2d_noaffine(self): model = torch.nn.BatchNorm2d(3, affine=False) self.run_test(model, x) + def test_batchnorm2d_norunningstats(self): + x = torch.randn(10, 3, 128, 128) + model 
= torch.nn.BatchNorm2d(3, track_running_stats=False) + self.run_test(model, x) + def test_batchnorm3d(self): x = torch.randn(10, 3, 128, 128, 128) model = torch.nn.BatchNorm3d(3, affine=True) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 1bb6fe19352f..68da423280ac 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1235,7 +1235,15 @@ def batch_norm(g, input, weight, bias, running_mean, running_var, training, mome bias_value = torch.tensor([0.] * input_sizes[1]).type( 'torch.' + input.type().scalarType() + 'Tensor') bias = g.op("Constant", value_t=bias_value) - + # If track_running_stats is set to False batch statistics are instead used during evaluation time + if running_mean is None or sym_help._is_none(running_mean) or running_var is None or sym_help._is_none(running_var): + assert len(input_sizes) > 1 + reshape_in = g.op("Reshape", input, + g.op("Constant", value_t=torch.tensor([input_sizes[0], input_sizes[1], -1], dtype=torch.int64))) + trans_in = g.op('Transpose', reshape_in, perm_i=[0, 2, 1]) + running_var, running_mean = _var_mean(g, trans_in, + g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)), + False, False) out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var, epsilon_f=eps, momentum_f=1 - momentum, From 536352e86fd22a167e8bee7b05d6ccc5dc40ea55 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 3 Dec 2020 19:21:33 -0800 Subject: [PATCH 032/132] fx quant: clean up functions in _generate_qconfig_map (#48772) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48772 Makes util functions in `_generate_qconfig_map` have no side effects, all dependencies are now in arguments. Test Plan: ``` python test/test_quantization.py TestQuantizeFx ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25295837 fbshipit-source-id: 49399abef626234e34bb5ec8c6d870da3c1760e7 --- torch/quantization/fx/quantize.py | 92 +++++++++++++++---------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 68f560c95096..5bfc6dbb9123 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -208,6 +208,44 @@ def _convert_to_ordered_dict(key, qconfig_dict): _convert_to_ordered_dict('module_name_regex', qconfig_dict) _convert_to_ordered_dict('module_name', qconfig_dict) +def get_module_type_qconfig(qconfig_dict, module_type, fallback_qconfig): + return qconfig_dict['object_type'].get( + module_type, fallback_qconfig) + +def get_function_qconfig(qconfig_dict, function, fallback_qconfig): + return qconfig_dict['object_type'].get(function, fallback_qconfig) + +def get_module_name_regex_qconfig(qconfig_dict, module_name, fallback_qconfig): + for regex_pattern, qconfig in \ + qconfig_dict['module_name_regex'].items(): + if re.match(regex_pattern, module_name): + # first match wins + return qconfig + return fallback_qconfig + +def get_module_name_qconfig(qconfig_dict, module_name, fallback_qconfig): + if module_name == '': + # module name qconfig not found + return fallback_qconfig + if module_name in qconfig_dict['module_name']: + return qconfig_dict['module_name'][module_name] + else: + parent, _ = _parent_name(module_name) + return get_module_name_qconfig(qconfig_dict, parent, fallback_qconfig) + +# get qconfig for module_name, +# fallback to module_name_regex_qconfig, module_type_qconfig, +# global_qconfig if necessary +def get_qconfig(modules, 
qconfig_dict, module_name, global_qconfig): + assert modules is not None + module_type_qconfig = get_module_type_qconfig( + qconfig_dict, type(modules[module_name]), global_qconfig) + module_name_regex_qconfig = get_module_name_regex_qconfig( + qconfig_dict, module_name, module_type_qconfig) + module_name_qconfig = get_module_name_qconfig( + qconfig_dict, module_name, module_name_regex_qconfig) + return module_name_qconfig + # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { torch.nn.functional.conv2d : [1], @@ -262,58 +300,18 @@ def _generate_qconfig_map(self, qconfig_dict): global_qconfig = qconfig_dict.get('', None) - def get_module_type_qconfig( - module_type, fallback_qconfig=global_qconfig): - return qconfig_dict['object_type'].get( - module_type, fallback_qconfig) - - def get_function_qconfig( - function, fallback_qconfig=global_qconfig): - return qconfig_dict['object_type'].get(function, fallback_qconfig) - - def get_module_name_regex_qconfig( - module_name, fallback_qconfig=global_qconfig): - for regex_pattern, qconfig in \ - qconfig_dict['module_name_regex'].items(): - if re.match(regex_pattern, module_name): - # first match wins - return qconfig - return fallback_qconfig - - def get_module_name_qconfig( - module_name, fallback_qconfig=global_qconfig): - if module_name == '': - # module name qconfig not found - return fallback_qconfig - if module_name in qconfig_dict['module_name']: - return qconfig_dict['module_name'][module_name] - else: - parent, _ = _parent_name(module_name) - return get_module_name_qconfig(parent, fallback_qconfig) - - # get qconfig for module_name, - # fallback to module_name_regex_qconfig, module_type_qconfig, - # global_qconfig if necessary - def get_qconfig(module_name): - assert self.modules is not None - module_type_qconfig = \ - get_module_type_qconfig(type(self.modules[module_name])) - module_name_regex_qconfig = \ - get_module_name_regex_qconfig(module_name, module_type_qconfig) - module_name_qconfig = \ - get_module_name_qconfig(module_name, module_name_regex_qconfig) - return module_name_qconfig - self.qconfig_map = dict() for node in input_graph.nodes: if node.op == 'get_attr': module_name, _ = _parent_name(node.target) - self.qconfig_map[node.name] = get_qconfig(module_name) + self.qconfig_map[node.name] = get_qconfig( + self.modules, qconfig_dict, module_name, global_qconfig) elif node.op == 'call_function': # precedence: [TODO] module_name_qconfig (need scope support # from fx) # > function_qconfig > global_qconfig - function_qconfig = get_function_qconfig(node.target) + function_qconfig = get_function_qconfig( + qconfig_dict, node.target, global_qconfig) self.qconfig_map[node.name] = function_qconfig elif node.op == 'call_method': self_obj = node.args[0] @@ -326,10 +324,12 @@ def get_qconfig(module_name): warnings.warn( "Scope info is not yet supported, taking default " + "qconfig for value {}".format(node.name)) - qconfig = get_qconfig('') + qconfig = get_qconfig( + self.modules, qconfig_dict, '', global_qconfig) self.qconfig_map[node.name] = qconfig elif node.op == 'call_module': - module_qconfig = get_qconfig(node.target) + module_qconfig = get_qconfig( + self.modules, qconfig_dict, node.target, global_qconfig) # regex is not supported eager mode propagate_qconfig_, we'll # need to set the qconfig explicitly here in case regex # is used From c98c617b44678ad8267a830e8d0ed2f7d89c160f Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 3 Dec 2020 19:21:33 -0800 Subject: [PATCH 033/132] fx quant: clean up 
functions in _prepare (#48773) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48773 Makes util functions in `_prepare` have no side effects, all dependencies are now in arguments. Note: arg names are added in order as they appeared in function code. It's not the most readable, but the lowest risk. This can be cleaned up in future PRs if needed. ``` python test/test_quantization.py TestQuantizeFx ``` Test Plan: Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25295839 fbshipit-source-id: 60c687f6b64924473f969541c8703118e4f7d16e --- torch/quantization/fx/quantize.py | 308 ++++++++++++++++-------------- 1 file changed, 169 insertions(+), 139 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 5bfc6dbb9123..d473caee8944 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -57,7 +57,7 @@ import warnings import re -from typing import Optional, Dict, Any, List, Union, Tuple +from typing import Optional, Dict, Any, List, Union, Tuple, Set # Define helper types @@ -246,6 +246,162 @@ def get_qconfig(modules, qconfig_dict, module_name, global_qconfig): qconfig_dict, module_name, module_name_regex_qconfig) return module_name_qconfig +def insert_observer( + node, observer, model_device, model, + activation_post_process_map, env, observed_graph, load_arg, + observed_node_names_set): + """Insert observer for node by modifying the observed_graph and + attach observer module to the model + Args: + node: Node + observer: observer/fake_quantize module instance + """ + # respect device affinity when adding observers + if model_device: + observer.to(model_device) + # add observer module as attribute + prefix = node.name + '_activation_post_process_' + get_new_observer_name = get_new_attr_name_with_prefix(prefix) + observer_name = get_new_observer_name(model) + setattr(model, observer_name, observer) + # put observer instance activation_post_process map + assert activation_post_process_map is not None + activation_post_process_map[node.name] = observer + # insert observer call + env[node.name] = observed_graph.create_node( + 'call_module', observer_name, (load_arg(node),), {}) + observed_node_names_set.add(node.name) + +def insert_observer_for_special_module( + quantize_handler, modules, prepare_custom_config_dict, qconfig, + node): + """ Insert observer for custom module and standalone module + Returns: standalone_module_input_idxs: the indexs for inputs that + needs to be observed by parent module + """ + assert modules is not None + if isinstance(quantize_handler, CustomModuleQuantizeHandler): + custom_module = modules[node.target] + custom_module_class_mapping = prepare_custom_config_dict.get( + "float_to_observed_custom_module_class", {}) + observed_custom_module_class = \ + get_swapped_custom_module_class( + custom_module, custom_module_class_mapping, qconfig) + observed_custom_module = \ + observed_custom_module_class.from_float(custom_module) + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, observed_custom_module) + elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): + # observe standalone module + standalone_module = modules[node.target] + prepare = \ + torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore + observed_standalone_module = \ + prepare(standalone_module, {"": qconfig}) + observed_standalone_module.qconfig = qconfig + observed_standalone_module = mark_observed_standalone_module( + 
observed_standalone_module) + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, + observed_standalone_module) + modules[node.target] = observed_standalone_module + +def insert_observer_for_output_of_the_node( + node, + quantize_handler, + qconfig, + modules, + model, + pattern, + model_device, + activation_post_process_map, + env, + observed_graph, + load_arg, + observed_node_names_set, + matched_nodes): + """ Insert observer/fake_quantize module for output of the observed + module if needed + """ + # don't need to insert observer for output if activation does not + # need to be statically quantized + assert modules is not None + if activation_is_statically_quantized(qconfig): + if isinstance(quantize_handler, FixedQParamsOpQuantizeHandler) \ + and model.training: + # we only insert fake quantize module in qat + assert pattern is not None + activation_post_process_ctr = \ + get_default_output_activation_post_process_map().get( + pattern, None) + assert activation_post_process_ctr is not None, \ + "activation_post_process constructor not provided " + \ + "for pattern:" + str(pattern) + insert_observer( + node, activation_post_process_ctr(), model_device, + model, activation_post_process_map, env, observed_graph, + load_arg, observed_node_names_set) + elif (isinstance(quantize_handler, + FixedQParamsOpQuantizeHandler) and + not model.training) or \ + isinstance(quantize_handler, CopyNode): + # inserting observers for output of observed module, or + # mark the output as observed + assert node.op in [ + 'call_module', + 'call_function', + 'call_method'], \ + 'CopyNode of type ' + node.op + ' is not handled' + + def is_observed(input_arg): + if isinstance(input_arg, Node): + return input_arg.name in observed_node_names_set + elif isinstance(input_arg, list): + return all(map(is_observed, input_arg)) + # propagate observed property from input + if is_observed(node.args[0]): + observed_node_names_set.add(node.name) + elif ((isinstance(quantize_handler, Add) or + isinstance(quantize_handler, Mul)) and + quantize_handler.num_node_args == 1): + assert matched_nodes is not None + input_node = matched_nodes[-1] # first node in the sequence + + def input_is_observed(arg): + return (isinstance(arg, Node) and + arg.name in observed_node_names_set) + # This is checking if one of the argument of add/mul + # is an observed node + # If both of the inputs are number, + # we will not consider the output to be observed + if (input_is_observed(input_node.args[0]) or + input_is_observed(input_node.args[1])): + observed_node_names_set.add(node.name) + elif isinstance(quantize_handler, + StandaloneModuleQuantizeHandler): + # output is observed in the standalone module + return + elif (quantize_handler.all_node_args and + input_output_observed(quantize_handler)): + # observer for outputs + new_observer = qconfig.activation() + insert_observer( + node, new_observer, model_device, model, + activation_post_process_map, env, observed_graph, + load_arg, observed_node_names_set) + +def insert_observer_for_input_arg_of_observed_node( + node, observed_node_names_set, quants, + model_device, model, activation_post_process_map, env, observed_graph, + load_arg): + if node.name not in observed_node_names_set and node.name in quants: + _, activation_post_process_ctr = quants[node.name] + if activation_post_process_ctr is not None: + insert_observer( + node, activation_post_process_ctr(), + model_device, model, activation_post_process_map, + env, observed_graph, load_arg, observed_node_names_set) 
+ # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { torch.nn.functional.conv2d : [1], @@ -388,7 +544,7 @@ def _prepare(self, model, qconfig_dict, prepare_custom_config_dict, self.activation_post_process_map = dict() env: Dict[Any, Any] = {} observed_graph = Graph() - observed_node_names_set = set() + observed_node_names_set: Set[str] = set() def load_arg(a): return map_arg(a, lambda node: env[node.name]) @@ -404,140 +560,6 @@ def load_arg(a): 'activation_post_process_') model_device = assert_and_get_unique_device(model) - def insert_observer(node, observer): - """Insert observer for node by modifying the observed_graph and - attach observer module to the model - Args: - node: Node - observer: observer/fake_quantize module instance - """ - # respect device affinity when adding observers - if model_device: - observer.to(model_device) - # add observer module as attribute - prefix = node.name + '_activation_post_process_' - get_new_observer_name = get_new_attr_name_with_prefix(prefix) - observer_name = get_new_observer_name(model) - setattr(model, observer_name, observer) - # put observer instance activation_post_process map - assert self.activation_post_process_map is not None - self.activation_post_process_map[node.name] = observer - # insert observer call - env[node.name] = observed_graph.create_node( - 'call_module', observer_name, (load_arg(node),), {}) - observed_node_names_set.add(node.name) - - def insert_observer_for_special_module(quantize_handler): - """ Insert observer for custom module and standalone module - Returns: standalone_module_input_idxs: the indexs for inputs that - needs to be observed by parent module - """ - assert self.modules is not None - if isinstance(quantize_handler, CustomModuleQuantizeHandler): - custom_module = self.modules[node.target] - custom_module_class_mapping = prepare_custom_config_dict.get( - "float_to_observed_custom_module_class", {}) - observed_custom_module_class = \ - get_swapped_custom_module_class( - custom_module, custom_module_class_mapping, qconfig) - observed_custom_module = \ - observed_custom_module_class.from_float(custom_module) - parent_name, name = _parent_name(node.target) - setattr(self.modules[parent_name], name, observed_custom_module) - elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - # observe standalone module - standalone_module = self.modules[node.target] - prepare = \ - torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore - observed_standalone_module = \ - prepare(standalone_module, {"": qconfig}) - observed_standalone_module.qconfig = qconfig - observed_standalone_module = mark_observed_standalone_module( - observed_standalone_module) - parent_name, name = _parent_name(node.target) - setattr(self.modules[parent_name], name, - observed_standalone_module) - self.modules[node.target] = observed_standalone_module - - def insert_observer_for_output_of_the_node( - node, - quantize_handler, - qconfig): - """ Insert observer/fake_quantize module for output of the observed - module if needed - """ - # don't need to insert observer for output if activation does not - # need to be statically quantized - assert self.modules is not None - if activation_is_statically_quantized(qconfig): - if isinstance(quantize_handler, FixedQParamsOpQuantizeHandler) \ - and model.training: - # we only insert fake quantize module in qat - assert pattern is not None - activation_post_process_ctr = \ - get_default_output_activation_post_process_map().get( - pattern, None) - assert 
activation_post_process_ctr is not None, \ - "activation_post_process constructor not provided " + \ - "for pattern:" + str(pattern) - insert_observer(node, activation_post_process_ctr()) - elif (isinstance(quantize_handler, - FixedQParamsOpQuantizeHandler) and - not model.training) or \ - isinstance(quantize_handler, CopyNode): - # inserting observers for output of observed module, or - # mark the output as observed - assert node.op in [ - 'call_module', - 'call_function', - 'call_method'], \ - 'CopyNode of type ' + node.op + ' is not handled' - - def is_observed(input_arg): - if isinstance(input_arg, Node): - return input_arg.name in observed_node_names_set - elif isinstance(input_arg, list): - return all(map(is_observed, input_arg)) - # propagate observed property from input - if is_observed(node.args[0]): - observed_node_names_set.add(node.name) - elif ((isinstance(quantize_handler, Add) or - isinstance(quantize_handler, Mul)) and - quantize_handler.num_node_args == 1): - assert matched_nodes is not None - input_node = matched_nodes[-1] # first node in the sequence - - def input_is_observed(arg): - return (isinstance(arg, Node) and - arg.name in observed_node_names_set) - # This is checking if one of the argument of add/mul - # is an observed node - # If both of the inputs are number, - # we will not consider the output to be observed - if (input_is_observed(input_node.args[0]) or - input_is_observed(input_node.args[1])): - observed_node_names_set.add(node.name) - elif isinstance(quantize_handler, - StandaloneModuleQuantizeHandler): - # output is observed in the standalone module - return - elif (quantize_handler.all_node_args and - input_output_observed(quantize_handler)): - # observer for outputs - new_observer = qconfig.activation() - insert_observer(node, new_observer) - - def insert_observer_for_input_arg_of_observed_node(arg): - """ - Input: - arg: input arg node for another observed node, e.g. - input activaiton for functional linear node - """ - if node.name not in observed_node_names_set and node.name in quants: - _, activation_post_process_ctr = quants[node.name] - if activation_post_process_ctr is not None: - insert_observer(node, activation_post_process_ctr()) - result_node : Optional[Node] = None for node in model.graph.nodes: if node.op == 'output': @@ -556,12 +578,20 @@ def insert_observer_for_input_arg_of_observed_node(arg): # index for input of custom module that needs to be observed in # parent if qconfig is not None: - insert_observer_for_special_module(obj) + insert_observer_for_special_module( + obj, self.modules, prepare_custom_config_dict, qconfig, + node) insert_observer_for_output_of_the_node( - node, obj, qconfig) + node, obj, qconfig, self.modules, model, pattern, + model_device, self.activation_post_process_map, env, + observed_graph, load_arg, observed_node_names_set, + matched_nodes) else: env[node.name] = observed_graph.node_copy(node, load_arg) - insert_observer_for_input_arg_of_observed_node(node) + insert_observer_for_input_arg_of_observed_node( + node, observed_node_names_set, quants, + model_device, model, self.activation_post_process_map, env, + observed_graph, load_arg) model = GraphModule(model, observed_graph) From f5bcf45e3bd6186452971c0de99ac78d50a3a5b3 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 3 Dec 2020 19:21:33 -0800 Subject: [PATCH 034/132] fx quant: add more typehints (#48774) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48774 Adds some more typehints throughout `quantization/fx/quantize.py`. 
More are needed, ran out of time for now, we can continue in future PRs. Test Plan: ``` mypy torch/quantization/fx/quantize.py ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25295836 fbshipit-source-id: 4029aa8ea5b07ce9a57e4be6a66314d7a4e19585 --- torch/quantization/fx/quantize.py | 84 ++++++++++++++++--------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index d473caee8944..8c776f515a3a 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -57,12 +57,12 @@ import warnings import re -from typing import Optional, Dict, Any, List, Union, Tuple, Set +from typing import Optional, Dict, Any, List, Union, Tuple, Set, Callable # Define helper types QConfigAny = Union[torch.quantization.QConfig, - torch.quantization.QConfigDynamic] + torch.quantization.QConfigDynamic, None] MatchResult = Tuple[Node, List[Node], Optional[Pattern], QuantizeHandler, QConfigAny] @@ -75,9 +75,9 @@ # >> get_new_observer_name = get_new_attr_name_with_prefix('_observer') # >> new_name = get_new_observer_name(module) # new_name will be an unused attribute name on module, e.g. `_observer_1` -def get_new_attr_name_with_prefix(prefix): - def get_new_attr_name(module): - def get_attr_name(i): +def get_new_attr_name_with_prefix(prefix: str) -> Callable: + def get_new_attr_name(module: torch.nn.Module): + def get_attr_name(i: int): return prefix + str(i) i = 0 attr_name = get_attr_name(i) @@ -87,7 +87,7 @@ def get_attr_name(i): return attr_name return get_new_attr_name -def collect_producer_nodes(node): +def collect_producer_nodes(node: Node) -> Optional[List[Node]]: r''' Starting from a target node, trace back until we hit inpu or getattr node. This is used to extract the chain of operators starting from getattr to the target node, for example @@ -114,7 +114,8 @@ def forward(self, x): frontier.append(arg) return nodes -def graph_module_from_producer_nodes(root, producer_nodes): +def graph_module_from_producer_nodes( + root: GraphModule, producer_nodes: List[Node]) -> GraphModule: r''' Construct a graph module from extracted producer nodes from `collect_producer_nodes` function Args: @@ -137,7 +138,7 @@ def load_arg(a): graph_module = GraphModule(root, graph) return graph_module -def assert_and_get_unique_device(module): +def assert_and_get_unique_device(module: torch.nn.Module) -> Any: """ Returns the unique device for a module, or None if no device is found. Throws an error if multiple devices are detected. 
@@ -151,13 +152,10 @@ def assert_and_get_unique_device(module): device = next(iter(devices)) if len(devices) > 0 else None return device -def is_submodule_of_fake_quant(name, module, named_modules): - parent_name, _ = _parent_name(name) - return is_activation_post_process(named_modules[parent_name]) - -def is_observed_standalone_module_node(node, modules): +def is_observed_standalone_module_node( + node: Node, modules: Dict[str, torch.nn.Module]) -> bool: return node.op == 'call_module' and \ - is_observed_standalone_module(modules[node.target]) + is_observed_standalone_module(modules[node.target]) # type: ignore def get_flattened_qconfig_dict(qconfig_dict): @@ -247,9 +245,11 @@ def get_qconfig(modules, qconfig_dict, module_name, global_qconfig): return module_name_qconfig def insert_observer( - node, observer, model_device, model, - activation_post_process_map, env, observed_graph, load_arg, - observed_node_names_set): + node: Node, observer: torch.quantization.ObserverBase, + model_device: Any, model: torch.nn.Module, + activation_post_process_map: Dict[str, torch.quantization.ObserverBase], + env: Dict[Any, Any], observed_graph: Graph, load_arg: Callable, + observed_node_names_set: Set[str]): """Insert observer for node by modifying the observed_graph and attach observer module to the model Args: @@ -273,15 +273,15 @@ def insert_observer( observed_node_names_set.add(node.name) def insert_observer_for_special_module( - quantize_handler, modules, prepare_custom_config_dict, qconfig, - node): + quantize_handler: QuantizeHandler, modules: Dict[str, torch.nn.Module], + prepare_custom_config_dict: Any, qconfig: Any, node: Node): """ Insert observer for custom module and standalone module Returns: standalone_module_input_idxs: the indexs for inputs that needs to be observed by parent module """ assert modules is not None if isinstance(quantize_handler, CustomModuleQuantizeHandler): - custom_module = modules[node.target] + custom_module = modules[node.target] # type: ignore custom_module_class_mapping = prepare_custom_config_dict.get( "float_to_observed_custom_module_class", {}) observed_custom_module_class = \ @@ -293,7 +293,7 @@ def insert_observer_for_special_module( setattr(modules[parent_name], name, observed_custom_module) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): # observe standalone module - standalone_module = modules[node.target] + standalone_module = modules[node.target] # type: ignore prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ @@ -304,22 +304,22 @@ def insert_observer_for_special_module( parent_name, name = _parent_name(node.target) setattr(modules[parent_name], name, observed_standalone_module) - modules[node.target] = observed_standalone_module + modules[node.target] = observed_standalone_module # type: ignore def insert_observer_for_output_of_the_node( - node, - quantize_handler, - qconfig, - modules, - model, - pattern, - model_device, - activation_post_process_map, - env, - observed_graph, - load_arg, - observed_node_names_set, - matched_nodes): + node: Node, + quantize_handler: QuantizeHandler, + qconfig: Any, + modules: Dict[str, torch.nn.Module], + model: torch.nn.Module, + pattern: Any, + model_device: Any, + activation_post_process_map: Dict[str, torch.quantization.ObserverBase], + env: Dict[Any, Any], + observed_graph: Graph, + load_arg: Callable, + observed_node_names_set: Set[str], + matched_nodes: Optional[List[Node]]): """ Insert observer/fake_quantize module for 
output of the observed module if needed """ @@ -391,9 +391,11 @@ def input_is_observed(arg): load_arg, observed_node_names_set) def insert_observer_for_input_arg_of_observed_node( - node, observed_node_names_set, quants, - model_device, model, activation_post_process_map, env, observed_graph, - load_arg): + node: Node, observed_node_names_set: Set[str], quants: Dict[str, Any], + model_device: Any, model: torch.nn.Module, + activation_post_process_map: Dict[str, torch.quantization.ObserverBase], + env: Dict[str, str], observed_graph: Graph, + load_arg: Callable): if node.name not in observed_node_names_set and node.name in quants: _, activation_post_process_ctr = quants[node.name] if activation_post_process_ctr is not None: @@ -578,6 +580,7 @@ def load_arg(a): # index for input of custom module that needs to be observed in # parent if qconfig is not None: + assert obj is not None insert_observer_for_special_module( obj, self.modules, prepare_custom_config_dict, qconfig, node) @@ -1049,7 +1052,8 @@ def is_standalone_module(node_target): return match_map - def _find_quants(self, graph, matches): + def _find_quants(self, graph: Graph, matches: Dict[str, MatchResult], + ) -> Dict[str, Any]: """ Takes the nodes in the input graph and pending matches, and finds and returns the input and output nodes which need to be quantized. @@ -1062,7 +1066,7 @@ def _find_quants(self, graph, matches): node_name -> (QuantizeHandler instance (always DefaultQuantizeHandler), activation_post_process (observer/fake_quantize module) constructor) """ - quants: Dict[Any, Any] = {} + quants: Dict[str, Any] = {} def visit(node, matched_pattern, qconfig): def visit_arg(arg): From 54da2dadd80b14f3b322ca54c81eaa54b1f2db86 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 3 Dec 2020 19:21:33 -0800 Subject: [PATCH 035/132] fx quant: more typehints, part 2 (#48792) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48792 Adds some more typehints throughout quantization/fx/quantize.py, to help with readability. 
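As a readability illustration only (the helper below is hypothetical and not taken from this patch), spelling out argument and return types on the fx pass helpers lets `mypy` flag call-site mistakes statically instead of leaving them to surface at runtime:

```python
from typing import Any, Dict, Optional

QConfigAny = Optional[Any]  # stand-in for the QConfig union used by the passes

def lookup_qconfig(qconfig_map: Dict[str, QConfigAny],
                   node_name: str) -> QConfigAny:
    # Typed signature: callers must pass a str-keyed map and a str node name.
    return qconfig_map.get(node_name)

ok = lookup_qconfig({"conv1": None}, "conv1")
bad = lookup_qconfig({"conv1": None}, 0)  # mypy reports an incompatible argument type
```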
Test Plan: ``` mypy torch/quantization/fx/quantize.py ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25306683 fbshipit-source-id: fc38b885a2cb5bf2c6d23b6305658704c6eb7811 --- torch/quantization/fx/quantize.py | 78 ++++++++++++++++++------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 8c776f515a3a..fe7dc53a8019 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -447,15 +447,18 @@ def __init__(self): self.patterns: Optional[Dict[Pattern, QuantizeHandler]] = None - def _qat_swap_modules(self, root, additional_qat_module_mapping): + def _qat_swap_modules( + self, root: torch.nn.Module, + additional_qat_module_mapping: Dict[Callable, Callable]) -> None: all_mappings = get_combined_dict( get_default_qat_module_mappings(), additional_qat_module_mapping) convert(root, mapping=all_mappings, inplace=True, remove_qconfig=False) - def _generate_qconfig_map(self, - root, - input_graph, - qconfig_dict): + def _generate_qconfig_map( + self, + root: torch.nn.Module, + input_graph: Graph, + qconfig_dict: Any) -> None: global_qconfig = qconfig_dict.get('', None) self.qconfig_map = dict() @@ -495,8 +498,9 @@ def _generate_qconfig_map(self, self.modules[node.target].qconfig = module_qconfig self.qconfig_map[node.name] = module_qconfig - def _prepare(self, model, qconfig_dict, prepare_custom_config_dict, - is_standalone_module): + def _prepare(self, model: GraphModule, qconfig_dict: Any, + prepare_custom_config_dict: Optional[Dict[str, Any]], + is_standalone_module: bool) -> GraphModule: """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
@@ -534,6 +538,7 @@ def _prepare(self, model, qconfig_dict, prepare_custom_config_dict, "standalone_module_class", None) custom_module_classes = get_custom_module_class_keys( prepare_custom_config_dict, "float_to_observed_custom_module_class") + assert self.patterns is not None matches = self._find_matches( model.graph, self.modules, self.patterns, standalone_module_names, standalone_module_classes, custom_module_classes) @@ -552,7 +557,7 @@ def load_arg(a): return map_arg(a, lambda node: env[node.name]) # indexes for the inputs that needs to be observed - standalone_module_observed_input_idxs = [] + standalone_module_observed_input_idxs: List[int] = [] graph_inputs = [] for node in model.graph.nodes: if node.op == 'placeholder': @@ -602,25 +607,28 @@ def load_arg(a): model = mark_observed_module(model) return model - def save_state(self, observed): - observed._activation_post_process_map = self.activation_post_process_map - observed._patterns = self.patterns - observed._qconfig_map = self.qconfig_map + def save_state(self, observed: GraphModule) -> None: + observed._activation_post_process_map = \ + self.activation_post_process_map # type: ignore + observed._patterns = self.patterns # type: ignore + observed._qconfig_map = self.qconfig_map # type: ignore - def restore_state(self, observed): + def restore_state(self, observed: GraphModule) -> None: assert is_observed_module(observed), \ 'incoming model must be produced by prepare_fx' - self.activation_post_process_map = observed._activation_post_process_map - self.patterns = observed._patterns - self.qconfig_map = observed._qconfig_map - - def prepare(self, model, qconfig_dict, prepare_custom_config_dict=None, - is_standalone_module=False): + self.activation_post_process_map = \ + observed._activation_post_process_map # type: ignore + self.patterns = observed._patterns # type: ignore + self.qconfig_map = observed._qconfig_map # type: ignore + + def prepare(self, model: GraphModule, qconfig_dict: Any, + prepare_custom_config_dict: Dict[str, Any] = None, + is_standalone_module: bool = False) -> GraphModule: return self._prepare( model, qconfig_dict, prepare_custom_config_dict, is_standalone_module) - def _run_weight_observers(self, observed): + def _run_weight_observers(self, observed: GraphModule) -> None: r''' Extract the subgraph that produces the weight for dynamic quant or weight only quant node and run the subgraph to observe the weight. Note that the observers of dynamic quant or weight only quant ops are @@ -640,8 +648,9 @@ def _run_weight_observers(self, observed): weight_observer_module() return - def _convert(self, model, debug=False, convert_custom_config_dict=None, - is_standalone_module=False): + def _convert(self, model: GraphModule, debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None, + is_standalone_module: bool = False) -> GraphModule: """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
@@ -662,6 +671,7 @@ def _convert(self, model, debug=False, convert_custom_config_dict=None, custom_module_classes = get_custom_module_class_keys( convert_custom_config_dict, "observed_to_quantized_custom_module_class") + assert self.patterns is not None matches = self._find_matches( model.graph, self.modules, self.patterns, custom_module_classes=custom_module_classes) @@ -905,7 +915,7 @@ def load_arg(a): # type: ignore # Trace back from the weight node util we hit getattr, reconstruct the # graph module with the traced nodes and run the graph module to pack the # weight. then replace the original chain of ops with the packed weight. - def _fold_weight(self, quantized): + def _fold_weight(self, quantized: GraphModule) -> GraphModule: packed_weights = dict() # map from folded node name to the prepacked weight name folded_nodes = dict() @@ -951,8 +961,9 @@ def load_arg(a): quantized = GraphModule(quantized_root, folded_graph) return quantized - def convert(self, model, debug=False, convert_custom_config_dict=None, - is_standalone_module=False): + def convert(self, model: GraphModule, debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None, + is_standalone_module: bool = False) -> GraphModule: quantized = self._convert( model, debug, convert_custom_config_dict, is_standalone_module) if not debug: @@ -960,10 +971,11 @@ def convert(self, model, debug=False, convert_custom_config_dict=None, return quantized def _find_matches( - self, graph, modules, patterns, - standalone_module_names=None, - standalone_module_classes=None, - custom_module_classes=None) -> Dict[str, MatchResult]: + self, graph: Graph, modules: Dict[str, torch.nn.Module], + patterns: Dict[Pattern, QuantizeHandler], + standalone_module_names: List[str] = None, + standalone_module_classes: List[Callable] = None, + custom_module_classes: List[Any] = None) -> Dict[str, MatchResult]: """ Matches the nodes in the input graph to quantization patterns, and outputs the information needed to quantize them in future steps. @@ -1017,7 +1029,7 @@ def record_match(pattern, node, matched): record_match(pattern, node, matched) for n in matched: match_map[n.name] = ( - node, matched, pattern, value(self, node), + node, matched, pattern, value(self, node), # type: ignore self.qconfig_map[n.name]) all_matched.add(n.name) # break after finding the first match @@ -1035,8 +1047,10 @@ def record_match(pattern, node, matched): def is_standalone_module(node_target): assert self.modules is not None - return node_target in standalone_module_names or \ - type(self.modules[node_target]) in standalone_module_classes + return ( + node_target in standalone_module_names or # type: ignore + type(self.modules[node_target]) in standalone_module_classes # type: ignore + ) # add standalone modules to the match for node in graph.nodes: From f5d94244b2bc4e73d905c75d65db41e488d7e339 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 3 Dec 2020 19:21:33 -0800 Subject: [PATCH 036/132] fx quant: more typehints, part 3 (#48794) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48794 Adds typehints to function I/O in `torch/quantization/quantize_fx.py`, for readability. 
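For context, a minimal end-to-end use of the `quantize_fx` entry points whose signatures are annotated here. This is a sketch, assuming an x86 build with fbgemm available and the API as it stands at this point in the series (a single global qconfig, no custom config dicts):

```python
import torch
from torch.quantization import get_default_qconfig
from torch.quantization.quantize_fx import prepare_fx, convert_fx

float_model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
qconfig_dict = {"": get_default_qconfig("fbgemm")}  # '' key sets the global qconfig

prepared = prepare_fx(float_model, qconfig_dict)    # GraphModule with observers inserted
prepared(torch.randn(4, 8))                         # calibration pass
quantized = convert_fx(prepared)                    # quantized GraphModule
print(quantized(torch.randn(4, 8)).shape)
```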
Test Plan: ``` mypy torch/quantization/ ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25307084 fbshipit-source-id: 67bdf95b78836dcabc7d829e1854ca5b8ceb8346 --- torch/quantization/quantize_fx.py | 44 ++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 7d12a7316896..ba1f58af402e 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -6,15 +6,16 @@ from .fx.utils import graph_pretty_str # noqa: F401 from .fx.utils import get_custom_module_class_keys # noqa: F401 from torch.nn.intrinsic import _FusedModule +from typing import Dict, Any, List, Callable -def _check_is_graph_module(model): +def _check_is_graph_module(model: torch.nn.Module) -> None: if not isinstance(model, GraphModule): raise ValueError( 'input model must be a GraphModule, ' + 'Got type:' + str(type(model)) + ' Please make ' + 'sure to follow the tutorials.') -def _swap_ff_with_fxff(model): +def _swap_ff_with_fxff(model: torch.nn.Module) -> None: r""" Swap FloatFunctional with FXFloatFunctional """ modules_to_swap = [] @@ -28,7 +29,9 @@ def _swap_ff_with_fxff(model): del model._modules[name] model._modules[name] = torch.nn.quantized.FXFloatFunctional() -def _fuse_fx(graph_module, fuse_custom_config_dict=None): +def _fuse_fx( + graph_module: GraphModule, + fuse_custom_config_dict: Dict[str, Any] = None) -> GraphModule: r""" Internal helper function to fuse modules in preparation for quantization Args: @@ -39,7 +42,8 @@ def _fuse_fx(graph_module, fuse_custom_config_dict=None): return fuser.fuse(graph_module, fuse_custom_config_dict) class CustomTracer(Tracer): - def __init__(self, skipped_module_names, skipped_module_classes): + def __init__(self, skipped_module_names: List[str], + skipped_module_classes: List[Callable]): super().__init__() self.skipped_module_names = skipped_module_names self.skipped_module_classes = skipped_module_classes @@ -52,7 +56,9 @@ def is_leaf_module(self, m, module_qualified_name): isinstance(m, _FusedModule) -def _prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None, is_standalone_module=False): +def _prepare_fx(model: torch.nn.Module, qconfig_dict: Any, + prepare_custom_config_dict: Dict[str, Any] = None, + is_standalone_module: bool = False) -> GraphModule: r""" Internal helper function for prepare_fx Args: `model`, `qconfig_dict`, `prepare_custom_config_dict`: see docs for :func:`~torch.quantization.prepare_fx` @@ -93,7 +99,9 @@ def _prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None, is_standal prepare_custom_config_dict=prepare_custom_config_dict, is_standalone_module=is_standalone_module) -def _prepare_standalone_module_fx(model, qconfig_dict, prepare_custom_config_dict=None): +def _prepare_standalone_module_fx( + model: torch.nn.Module, qconfig_dict: Any, + prepare_custom_config_dict: Dict[str, Any] = None) -> GraphModule: r""" [Internal use only] Prepare a standalone module, so that it can be used when quantizing the parent module. 
standalone_module means it a submodule that is not inlined in parent module, @@ -104,7 +112,8 @@ def _prepare_standalone_module_fx(model, qconfig_dict, prepare_custom_config_dic """ return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) -def fuse_fx(model, fuse_custom_config_dict=None): +def fuse_fx(model: torch.nn.Module, + fuse_custom_config_dict: Dict[str, Any] = None) -> GraphModule: r""" Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode. Fusion rules are defined in torch.quantization.fx.fusion_pattern.py Args: @@ -128,7 +137,9 @@ def fuse_fx(model, fuse_custom_config_dict=None): graph_module = torch.fx.symbolic_trace(model) # type: ignore return _fuse_fx(graph_module, fuse_custom_config_dict) -def prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None): +def prepare_fx( + model: torch.nn.Module, qconfig_dict: Any, + prepare_custom_config_dict: Dict[str, Any] = None) -> GraphModule: r""" Prepare a model for post training static quantization Args: @@ -247,7 +258,9 @@ def calibrate(model, data_loader): 'eval mode' return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) -def prepare_qat_fx(model, qconfig_dict, prepare_custom_config_dict=None): +def prepare_qat_fx( + model: torch.nn.Module, qconfig_dict: Any, + prepare_custom_config_dict: Dict[str, Any] = None) -> GraphModule: r""" Prepare a model for quantization aware training Args: `model`: torch.nn.Module model, must be in train mode @@ -282,14 +295,19 @@ def train_loop(model, train_data): 'train mode' return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) -def _convert_fx(graph_module, debug, convert_custom_config_dict=None, is_standalone_module=False): +def _convert_fx( + graph_module: GraphModule, debug: bool, + convert_custom_config_dict: Dict[str, Any] = None, + is_standalone_module: bool = False) -> GraphModule: """ `is_standalone_module`: see docs in :func:`~torch.quantization.prepare_standalone_module_fx` """ _check_is_graph_module(graph_module) quantizer = Quantizer() return quantizer.convert(graph_module, debug, convert_custom_config_dict, is_standalone_module) -def convert_fx(graph_module, debug=False, convert_custom_config_dict=None): +def convert_fx( + graph_module: GraphModule, debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> GraphModule: r""" Convert a calibrated or trained model to a quantized model Args: `graph_module`: A prepared and calibrated/trained model (GraphModule) @@ -346,7 +364,9 @@ def convert_fx(graph_module, debug=False, convert_custom_config_dict=None): torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_fx") return _convert_fx(graph_module, debug, convert_custom_config_dict) -def _convert_standalone_module_fx(graph_module, debug=False, convert_custom_config_dict=None): +def _convert_standalone_module_fx( + graph_module: GraphModule, debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> GraphModule: r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model From c55d45f04be16ff16a9089a8d22d3a2f5a8672a7 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 3 Dec 2020 19:44:38 -0800 Subject: [PATCH 037/132] [qnnpack] Fix unused var warning when building for different archs. (#48730) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48730 . 
Test Plan: CI Reviewed By: kimishpatel Differential Revision: D25273068 fbshipit-source-id: 3a0cea633bf1c02fa3176b3b3f43db46d2beb861 --- .../native/quantized/cpu/qnnpack/src/q8avgpool/mp8x9p8q-neon.c | 3 +++ .../native/quantized/cpu/qnnpack/src/q8avgpool/up8x9-neon.c | 3 +++ .../cpu/qnnpack/src/q8dwconv/mp8x25-neon-per-channel.c | 3 +++ .../native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon.c | 3 +++ .../cpu/qnnpack/src/q8dwconv/up8x9-neon-per-channel.c | 3 +++ .../native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon.c | 3 +++ .../quantized/cpu/qnnpack/src/q8gavgpool/mp8x7p7q-neon.c | 3 +++ .../native/quantized/cpu/qnnpack/src/q8gavgpool/up8x7-neon.c | 3 +++ 8 files changed, 24 insertions(+) diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/mp8x9p8q-neon.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/mp8x9p8q-neon.c index 3145442299dc..fa1fdebdd4d4 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/mp8x9p8q-neon.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/mp8x9p8q-neon.c @@ -30,16 +30,19 @@ void pytorch_q8avgpool_ukernel_mp8x9p8q__neon( const int32x4_t vbias = vld1q_dup_s32(&quantization_params->neon.bias); const float32x4_t vscale = vdupq_n_f32(quantization_params->neon.scale); +#if defined(__aarch64__) const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif do { { diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/up8x9-neon.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/up8x9-neon.c index 453cf80fa08b..dc7209cd5f32 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/up8x9-neon.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/up8x9-neon.c @@ -30,16 +30,19 @@ void pytorch_q8avgpool_ukernel_up8x9__neon( const int32x4_t vbias = vld1q_dup_s32(&quantization_params->neon.bias); const float32x4_t vscale = vdupq_n_f32(quantization_params->neon.scale); +#if defined(__aarch64__) const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif do { const uint8_t* i0 = input[0]; diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon-per-channel.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon-per-channel.c index 940cd2847833..3e2d11408eac 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon-per-channel.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon-per-channel.c @@ -23,14 +23,17 @@ void pytorch_q8dwconv_ukernel_mp8x25_per_channel__neon( 
quantization_params[restrict static 1]) { const uint8x8_t vinput_zero_point = vld1_dup_u8((const uint8_t*)&quantization_params->neon.input_zero_point); +#ifdef __aarch64__ const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif do { uint8_t* output_start = output; diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon.c index e338f6d9673a..25c7957714d6 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-neon.c @@ -27,14 +27,17 @@ void pytorch_q8dwconv_ukernel_mp8x25__neon( vdup_n_u8(quantization_params->neon.kernel_zero_points[0]); const float32x4_t requantization_scale_v = vdupq_n_f32(quantization_params->neon.requantization_scales[0]); +#ifdef __aarch64__ const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif do { uint8_t* output_start = output; diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon-per-channel.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon-per-channel.c index c8a102aaaa71..68ff1a3b41b1 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon-per-channel.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon-per-channel.c @@ -23,16 +23,19 @@ void pytorch_q8dwconv_ukernel_up8x9_per_channel__neon( quantization_params[restrict static 1]) { const uint8x8_t va_zero_point = vld1_dup_u8((const uint8_t*)&quantization_params->neon.input_zero_point); +#ifdef __aarch64__ const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif #ifdef __aarch64__ /* Larger number of registers on AArch64 make it possible to process few diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon.c index b6dd3b7a4455..9f442938f7dc 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon.c 
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/up8x9-neon.c @@ -27,16 +27,19 @@ void pytorch_q8dwconv_ukernel_up8x9__neon( vdup_n_u8(quantization_params->neon.kernel_zero_points[0]); const float32x4_t requantization_scale_v = vdupq_n_f32(quantization_params->neon.requantization_scales[0]); +#ifdef __aarch64__ const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif #ifdef __aarch64__ /* Larger number of registers on AArch64 make it possible to process few diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/mp8x7p7q-neon.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/mp8x7p7q-neon.c index 88a59311bc89..27040ef67280 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/mp8x7p7q-neon.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/mp8x7p7q-neon.c @@ -119,16 +119,19 @@ void pytorch_q8gavgpool_ukernel_mp8x7p7q__neon( const float32x4_t vscale = vdupq_n_f32(quantization_params->neon.scale); +#if defined(__aarch64__) const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif i0 = (const uint8_t*)((uintptr_t)i0 + input_increment); i1 = (const uint8_t*)((uintptr_t)i1 + input_increment); diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/up8x7-neon.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/up8x7-neon.c index 36359286bd06..3d69ef13a604 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/up8x7-neon.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gavgpool/up8x7-neon.c @@ -52,16 +52,19 @@ void pytorch_q8gavgpool_ukernel_up8x7__neon( } const int32x4_t vbias = vld1q_dup_s32(&quantization_params->neon.bias); const float32x4_t vscale = vdupq_n_f32(quantization_params->neon.scale); +#if defined(__aarch64__) const int16x8_t voutput_zero_point = vld1q_dup_s16(&quantization_params->neon.output_zero_point); const uint8x8_t voutput_min = vld1_dup_u8(&quantization_params->neon.output_min); const uint8x8_t voutput_max = vld1_dup_u8(&quantization_params->neon.output_max); +#else const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin); const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax); const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic); const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic); +#endif do { const uint8x8_t vi0 = vld1_u8(i0); From 86540dbf41ff251f58d5a9b312618f98d45c374c Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Thu, 3 Dec 2020 20:44:52 -0800 Subject: [PATCH 
038/132] Fix jit doc model loading example (#48104) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48104 Reviewed By: jamesr66a Differential Revision: D25028353 Pulled By: suo fbshipit-source-id: aaf74a40e7150a278d100e129740cfe1cef99af2 --- docs/source/jit.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 710f2f928c5f..ccd37738277f 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -547,10 +547,10 @@ best practices? cpu_model = gpu_model.cpu() sample_input_cpu = sample_input_gpu.cpu() - traced_cpu = torch.jit.trace(traced_cpu, sample_input_cpu) + traced_cpu = torch.jit.trace(cpu_model, sample_input_cpu) torch.jit.save(traced_cpu, "cpu.pth") - traced_gpu = torch.jit.trace(traced_gpu, sample_input_gpu) + traced_gpu = torch.jit.trace(gpu_model, sample_input_gpu) torch.jit.save(traced_gpu, "gpu.pth") # ... later, when using the model: From f06508756731088e373f188902934509f46bdc81 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 3 Dec 2020 21:57:40 -0800 Subject: [PATCH 039/132] [ONNX] Handle dynamic input axes for prim_ConstantChunk (#48176) Summary: When converting a model that uses `torch.chunk`, it does not work when we have a dynamic input axes, because `Split` split attr is static for opset 11. Therefore, we convert it using `Slice` (support opset 11+). This PR also handles the cases that the input axes cannot be divided by the number of outputs. Pytorch works a way that fit the first (n-1) outputs for the same dim, and remaining for the last one. Added UT for it. The existing code on `sequence` `split` cannot be leveraged here, because `start`, `end` of `Slice` are static there, but dynamic here. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48176 Reviewed By: bdhirsh Differential Revision: D25274862 Pulled By: bzinodev fbshipit-source-id: 7d213a7605ad128aca133c057d6dd86c65cc6de9 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 19 +++++++++++++++++++ torch/onnx/symbolic_opset11.py | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 16f50cffd00b..eee095d22314 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -3263,6 +3263,25 @@ def forward(self, input): x = torch.randn(5, 4, 3) self.run_test(SplitModel2(), x) + @skipIfUnsupportedMinOpsetVersion(11) + def test_chunk(self): + class ChunkModel(torch.nn.Module): + def __init__(self): + super(ChunkModel, self).__init__() + + def forward(self, x): + return torch.chunk(x, 3, dim=1) + + model = ChunkModel() + model.eval() + x = torch.randn(1, 18) + + for dim_size_ in range(13, 16): + y = torch.randn(1, dim_size_) + self.run_test(model, x, test_with_inputs=[y], + input_names=['x'], + dynamic_axes={'x': {0: 'batch_size', 1: 'dims'}}) + def test_concat(self): class ConcatModel(torch.nn.Module): def forward(self, x, y, z): diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index dd9060036b04..de2acf6085a0 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -944,3 +944,22 @@ def embedding_bag(g, # aten::embedding_bag returns a tuple of 4 elements: output, offset2bag, bag_size, max_indices. # But the last three outputs are not used in torch.nn.EmbeddingBag or torch.nn.functional.embedding_bag. 
return loop.node().output(), None, None, None + + +def prim_ConstantChunk(g, self, chunks, dim): + input_shape = g.op("Shape", self) + axis = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long)) + axis_next = g.op("Constant", value_t=torch.tensor([dim + 1], dtype=torch.long)) + input_shape_dim = g.op("Slice", input_shape, axis, axis_next) + start = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) + chunk_size = g.op("Constant", value_t=torch.tensor([chunks], dtype=torch.long)) + chunk_size_minus_1 = g.op("Constant", value_t=torch.tensor([chunks - 1], dtype=torch.long)) + input_shape_dim_shift = g.op("Add", input_shape_dim, chunk_size_minus_1) + chunk_dim = g.op("Div", input_shape_dim_shift, chunk_size) + res = [] + for i in range(chunks): + index = g.op("Constant", value_t=torch.tensor([i + 1], dtype=torch.long)) + end = g.op("Mul", chunk_dim, index) + res.append(g.op("Slice", self, start, end, axis)) + start = end + return res From 15bc21c2807ced8aa6ca72130e16e1c81883c9b4 Mon Sep 17 00:00:00 2001 From: neginraoof Date: Thu, 3 Dec 2020 23:05:43 -0800 Subject: [PATCH 040/132] [ONNX] Track and list model params for scripting (#47348) Summary: List model parameters as inputs following freezing script module. Pull Request resolved: https://github.com/pytorch/pytorch/pull/47348 Reviewed By: heitorschueroff Differential Revision: D25309756 Pulled By: bzinodev fbshipit-source-id: cbe679ece934d5e6c418a22f08c1662256914c4c --- test/onnx/test_pytorch_onnx_onnxruntime.py | 72 ++++--- test/onnx/test_utility_funs.py | 25 +++ tools/build_variables.bzl | 1 + torch/csrc/jit/passes/freeze_module.cpp | 4 +- .../jit/passes/onnx/list_model_parameters.cpp | 186 ++++++++++++++++++ .../jit/passes/onnx/list_model_parameters.h | 13 ++ torch/csrc/jit/python/init.cpp | 4 + torch/onnx/utils.py | 10 +- 8 files changed, 284 insertions(+), 31 deletions(-) create mode 100644 torch/csrc/jit/passes/onnx/list_model_parameters.cpp create mode 100644 torch/csrc/jit/passes/onnx/list_model_parameters.h diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index eee095d22314..63f5c729a75f 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -472,6 +472,7 @@ def forward(self, x_in): x = {"test_key_in": torch.randn(1, 2, 3)} self.run_test(MyModel(), (x,)) + @disableScriptTest() def test_none_as_input(self): class Model(torch.nn.Module): def forward(self, x, y): @@ -482,6 +483,7 @@ def forward(self, x, y): x = torch.randn(2, 3) self.run_test(Model(), (x, None)) + @disableScriptTest() def test_none_as_tuple_input(self): class Model(torch.nn.Module): def forward(self, x, y): @@ -495,6 +497,7 @@ def forward(self, x, y): y = torch.randn(2, 3) self.run_test(Model(), (x, (None, y))) + @disableScriptTest() def test_none_as_named_input(self): class Model(torch.nn.Module): def forward(self, x, y=None, z=None): @@ -678,23 +681,11 @@ def __init__(self): def forward(self, input1, input2, input3): return self.conv1(input1), self.conv2(input2), self.conv3(input3) - class ScriptModel(torch.jit.ScriptModule): - def __init__(self): - super(ScriptModel, self).__init__() - self.conv1 = torch.nn.Conv1d(16, 33, 3, stride=2) - self.conv2 = torch.nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) - self.conv3 = torch.nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) - - @torch.jit.script_method - def forward(self, input1, input2, input3): - return self.conv1(input1), self.conv2(input2), 
self.conv3(input3) - x1 = torch.randn(20, 16, 50) x2 = torch.randn(20, 16, 50, 100) x3 = torch.randn(20, 16, 10, 50, 100) self.run_test(TraceModel(), (x1, x2, x3), atol=10e-5) - self.run_test(ScriptModel(), (x1, x2, x3), atol=10e-5) def test_conv_shape_inference(self): class Model(torch.nn.Module): @@ -721,23 +712,11 @@ def __init__(self): def forward(self, input1, input2, input3): return self.conv1(input1), self.conv2(input2), self.conv3(input3) - class ScriptModel(torch.jit.ScriptModule): - def __init__(self): - super(ScriptModel, self).__init__() - self.conv1 = torch.nn.ConvTranspose1d(16, 33, 3, stride=2) - self.conv2 = torch.nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) - self.conv3 = torch.nn.ConvTranspose3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) - - @torch.jit.script_method - def forward(self, input1, input2, input3): - return self.conv1(input1), self.conv2(input2), self.conv3(input3) - x1 = torch.randn(20, 16, 50) x2 = torch.randn(20, 16, 50, 100) x3 = torch.randn(20, 16, 10, 50, 100) self.run_test(TraceModel(), (x1, x2, x3), atol=10e-5) - self.run_test(ScriptModel(), (x1, x2, x3), atol=10e-5) # Conversion of Transpose depends on input shape to be known. # The following test only works when onnx shape inference is enabled. @@ -5053,9 +5032,17 @@ def forward(self, x): ort_sess = convert_to_onnx(model_export, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) - [np.testing.assert_allclose(p_out, ort_out, atol=10e-3, rtol=10e-3) for p_out, ort_out in zip(pytorch_out, ort_outs)] + model_export = torch.jit.script(MyModule()) + ort_sess = convert_to_onnx(model_export, input=(x,), opset_version=self.opset_version, + example_outputs=out, + training=torch.onnx.TrainingMode.TRAINING, + use_new_jit_passes=True, onnx_shape_inference=True) + ort_outs = run_ort(ort_sess, input=(x,)) + [np.testing.assert_allclose(p_out, ort_out, atol=10e-3, rtol=10e-3) for p_out, ort_out in + zip(pytorch_out, ort_outs)] + @skipIfUnsupportedMinOpsetVersion(12) def test_dropout_training(self): class MyModule(torch.nn.Module): @@ -5077,6 +5064,14 @@ def forward(self, x): ort_outs = run_ort(ort_sess, input=(x,)) assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) + script_model = torch.jit.script(model) + output = model(x) + ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, + example_outputs=output, use_new_jit_passes=True, + training=torch.onnx.TrainingMode.TRAINING) + ort_outs = run_ort(ort_sess, input=(x,)) + assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) + @skipIfUnsupportedMinOpsetVersion(12) def test_dropout_training_zero(self): class MyModule(torch.nn.Module): @@ -5105,7 +5100,21 @@ def forward(self, x): y = model(input) output = y.cpu().numpy() + ort_mask = np.where(ort_outs[0] != 0, 1, 0) + pyt_mask = np.where(output != 0, 1, 0) + + ratio_pytorch = np.sum(pyt_mask) / nb_elements + ratio_ort = np.sum(ort_mask) / nb_elements + + np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01) + script_model = torch.jit.script(model) + y = model(input) + output = y.cpu().numpy() + ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, + example_outputs=y, use_new_jit_passes=True, + training=torch.onnx.TrainingMode.TRAINING) + ort_outs = run_ort(ort_sess, input=(x,)) ort_mask = np.where(ort_outs[0] != 0, 1, 0) pyt_mask = np.where(output != 0, 1, 0) @@ -5137,6 +5146,19 @@ 
def forward(self, x): [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in zip(ort_outs1, ort_outs2)] + script_model = torch.jit.script(model) + outputs = model(x) + ort_sess1 = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, + example_outputs=outputs, use_new_jit_passes=True, + training=torch.onnx.TrainingMode.TRAINING) + ort_outs1 = run_ort(ort_sess1, input=(x,)) + ort_sess2 = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, + example_outputs=outputs, use_new_jit_passes=True, + training=torch.onnx.TrainingMode.EVAL) + ort_outs2 = run_ort(ort_sess2, input=(x,)) + [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in + zip(ort_outs1, ort_outs2)] + def test_multiple_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 88daef3d5fb0..5c1bfe8b5515 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -678,6 +678,31 @@ def forward(self, x): assert len(params_dict) == 2 + def test_scripting_param(self): + class MyModule(torch.nn.Module): + def __init__(self): + super(MyModule, self).__init__() + self.conv = torch.nn.Conv2d(3, 16, kernel_size=1, stride=2, padding=3, bias=True) + self.bn = torch.nn.BatchNorm2d(16, affine=True) + + def forward(self, x): + x = self.conv(x) + bn = self.bn(x) + return bn + + model = torch.jit.script(MyModule()) + x = torch.randn(10, 3, 128, 128) + example_outputs = model(x) + f = io.BytesIO() + _set_opset_version(self.opset_version) + _set_operator_export_type(OperatorExportTypes.ONNX) + graph, _, __ = utils._model_to_graph(model, (x,), do_constant_folding=True, example_outputs=example_outputs, + operator_export_type=OperatorExportTypes.ONNX) + + graph_input_params = [param.debugName() for param in graph.inputs()] + assert all(item in graph_input_params for item in dict(model.named_parameters())), \ + "Graph parameter names does not match model parameters." 
+ def test_modifying_params(self): class MyModel(torch.nn.Module): def __init__(self): diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index accd810b7085..7e5a5e4e7f8a 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -520,6 +520,7 @@ libtorch_python_core_sources = [ "torch/csrc/jit/passes/onnx/constant_fold.cpp", "torch/csrc/jit/passes/onnx/eliminate_unused_items.cpp", "torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp", + "torch/csrc/jit/passes/onnx/list_model_parameters.cpp", "torch/csrc/jit/passes/onnx/function_substitution.cpp", "torch/csrc/jit/passes/onnx/helper.cpp", "torch/csrc/jit/passes/onnx/peephole.cpp", diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 35be415b1a1e..2778c7712f23 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -442,7 +442,9 @@ class AttributePropagator { if (!isEval || preserveParameters_) { auto type = attrModule.type(); auto slot = *type->findAttributeSlot(name); - if (type->is_parameter(slot) || type->is_buffer(slot)) { + if (type->is_parameter(slot) || type->is_buffer(slot) || + (attr.isObject() && + !attr.toObjectRef().type()->is_module())) { continue; } else { attr = overrideGradient(attr); diff --git a/torch/csrc/jit/passes/onnx/list_model_parameters.cpp b/torch/csrc/jit/passes/onnx/list_model_parameters.cpp new file mode 100644 index 000000000000..4f0b4e2b1437 --- /dev/null +++ b/torch/csrc/jit/passes/onnx/list_model_parameters.cpp @@ -0,0 +1,186 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { + +// findSubModuleAttr function chases getAttr chains backwards to locate the +// submodules. For example: module M { +// attributes { +// A = +// } +// ... +// %A = prim::GetAttr[name="A"](%self) +// ... +// %B = prim::GetAttr[name="B"](%A) +// ... +// %weight = prim::GetAttr[name="scale"](%B) +// ... + +std::deque findSubModuleAttr( + Value* input, + std::string& name, + Module& attrModule, + std::shared_ptr& graph) { + Node* node = input->node(); + std::deque moduleNames; + + // Loop starts from inner submodule and follows the chain until reaches the + // top module. + while (node->outputs().at(0)->type() != graph->inputs().at(0)->type()) { + if (node->kind() == prim::GetAttr) { + moduleNames.push_front(node->s(attr::name)); + node = node->inputs()[0]->node(); + } + } + + // Assign the inner module to attrModule. 
+ for (auto& moduleName : moduleNames) { + attrModule = attrModule.attr(moduleName).toModule(); + } + return moduleNames; +} + +Value* addParamAsArgument(Function* function, std::string& name, IValue& attr) { + auto schema = function->getSchema(); + auto args = schema.arguments(); + args.emplace_back(Argument(name, nullptr, c10::nullopt, attr)); + auto new_schema = FunctionSchema( + schema.name(), + schema.overload_name(), + args, + schema.returns(), + schema.is_vararg(), + schema.is_varret()); + function->setSchema(new_schema); + return function->graph()->addInput(name)->setType(attr.type()); +} + +std::vector getParamAttributes( + std::shared_ptr& graph, + const Module& module_, + Function* function_) { + std::vector attrValues; + auto isEval = !module_.hasattr("training") || !module_.is_training(); + auto block = graph->block(); + std::vector blocks({block}); + + Node* m = *block->nodes().begin(); + WithInsertPoint guard(m); + + while (!blocks.empty()) { + Block* block = blocks.back(); + blocks.pop_back(); + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + Node* n = *it; + it++; // node n can be destroyed + + for (Block* sub_block : n->blocks()) { + blocks.emplace_back(sub_block); + } + if (n->kind() == prim::SetAttr && + n->s(attr::name) == "num_batches_tracked") { + n->destroy(); + } else if (n->kind() == prim::GetAttr) { + for (auto use : n->output()->uses()) { + if (use.user->kind() == prim::PythonOp) + throw ErrorReport(n->sourceRange()) + << "Couldn't export Python method."; + } + + auto name = n->s(attr::name); + auto attrModule = module_; + auto input = n->inputs()[0]; + + auto moduleNames = findSubModuleAttr(input, name, attrModule, graph); + if (!attrModule.hasattr(name)) { + continue; + } + Value* paramConst = nullptr; + + auto attr = attrModule.attr(name); + + std::string fullName(""); + for (auto& name : moduleNames) { + fullName += name + '.'; + } + fullName += name; + + auto type = attrModule.type(); + auto slot = *type->findAttributeSlot(name); + + if (type->is_parameter(slot) || type->is_buffer(slot) || + (attr.isObject() && !attr.toObjectRef().type()->is_module()) || + name == "training") { + if (attr.isTensor()) { + TORCH_INTERNAL_ASSERT(attr.isTensor()); + auto tensor_ = attr.toTensor(); + if (isEval && tensor_.requires_grad()) { + tensor_ = tensor_.detach(); + tensor_.set_requires_grad(false); + attr = IValue(tensor_); + } + attrValues.emplace_back(attr.toTensor()); + paramConst = addParamAsArgument(function_, fullName, attr); + } else if ( + attr.isObject() && !attr.toObjectRef().type()->is_module()) { + // Only below registered torch classes are supported. + auto type = attr.type(); + TORCH_CHECK( + (type == + getCustomClass( + "__torch__.torch.classes.quantized.Conv2dPackedParamsBase")) || + (type == + getCustomClass( + "__torch__.torch.classes.quantized.Conv3dPackedParamsBase")) || + (type == + getCustomClass( + "__torch__.torch.classes.quantized.LinearPackedParamsBase")), + "Unknown type ", + type->repr_str(), + " encountered in handling model params. 
This type is not supported in ONNX export."); + attrValues.emplace_back( + script::Object(attr.toObject()).run_method("__getstate__")); + paramConst = addParamAsArgument(function_, fullName, attr); + } else if (attr.isNone() || name == "training") { + auto attrVal = tryInsertConstant(*graph, attr); + paramConst = *attrVal; + } + n->output()->replaceAllUsesWith(paramConst); + n->removeAllInputs(); + + GRAPH_UPDATE("Folding GetAttr %", n->outputs()[0]->debugName()); + } + } + } + } + return attrValues; +} + +std::pair> list_module_parameters( + const Module& module) { + Module moduleClone = module.clone(true); + Method method = moduleClone.get_method("forward"); + auto function = &method.function(); + std::vector modelParams; + + GRAPH_DEBUG("List attributes for function: " + function->name()); + auto graph = function->graph(); + // Add model_parameters and model_buffers as model inputs. Order is based on + // the appearance in the graph. + auto attributes = getParamAttributes(graph, moduleClone, function); + + modelParams.reserve(attributes.size()); + for (auto& attr_ : attributes) { + modelParams.push_back(attr_); + } + GRAPH_DEBUG("Cleaning up module"); + EliminateDeadCode(graph->block()); + + return std::make_pair(moduleClone, modelParams); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/onnx/list_model_parameters.h b/torch/csrc/jit/passes/onnx/list_model_parameters.h new file mode 100644 index 000000000000..50d1cea2b8fe --- /dev/null +++ b/torch/csrc/jit/passes/onnx/list_model_parameters.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include + +namespace torch { +namespace jit { + +TORCH_API std::pair> list_module_parameters( + const Module& module); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 254ad141e589..5f88a8a6c79d 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -279,6 +280,9 @@ void initJITBindings(PyObject* module) { "_jit_pass_quant_fusion", [](std::shared_ptr& g) { return QuantFusion(g); }) .def("_jit_pass_fold_convbn", &FoldConvBatchNorm) + .def( + "_jit_onnx_list_model_parameters", + [](Module& module) { return list_module_parameters(module); }) .def( "_freeze_module", [](Module& module, diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 044e61b42973..5c41306b9ee2 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -121,7 +121,7 @@ def _split_tensor_list_constants(g, block): def _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=False, fixed_batch_size=False, - params_dict=None, use_new_jit_passes=False, dynamic_axes=None, input_names=None): + params_dict=None, use_new_jit_passes=True, dynamic_axes=None, input_names=None): # Inline everything torch._C._jit_pass_inline(graph) @@ -358,10 +358,10 @@ def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): if not use_new_jit_passes: method_graph, params = torch._C._jit_pass_lower_graph(graph, model._c) else: - freezed_m = torch._C._freeze_module(model._c) + freezed_m = torch._C._freeze_module(model._c, preserveParameters=True) + freezed_m, params = torch._C._jit_onnx_list_model_parameters(freezed_m) method_graph = freezed_m._get_method('forward').graph method_graph.eraseInput(0) # Remove 'self' from model inputs - params = [] in_vars, in_desc = torch.jit._flatten(tuple(args) + tuple(params)) graph = _propagate_and_assign_input_shapes( @@ 
-396,7 +396,7 @@ def _model_to_graph(model, args, verbose=False, example_outputs=None, _retain_param_name=False, do_constant_folding=True, _disable_torch_constant_prop=False, fixed_batch_size=False, - training=None, use_new_jit_passes=False, + training=None, use_new_jit_passes=True, dynamic_axes=None): from torch.onnx.symbolic_helper import _export_onnx_opset_version # Special case for common case of passing a single Tensor @@ -586,7 +586,7 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, strip_doc_string=True, dynamic_axes=None, keep_initializers_as_inputs=None, fixed_batch_size=False, custom_opsets=None, add_node_names=True, enable_onnx_checker=True, use_external_data_format=False, - onnx_shape_inference=True, use_new_jit_passes=False): + onnx_shape_inference=True, use_new_jit_passes=True): if isinstance(model, torch.nn.DataParallel): raise ValueError('torch.nn.DataParallel is not supported by ONNX ' From 5fd61de99e9a2b91bf8e90ed91bd073785ce9c84 Mon Sep 17 00:00:00 2001 From: shubhambhokare1 Date: Thu, 3 Dec 2020 23:20:34 -0800 Subject: [PATCH 041/132] [ONNX] Added hardswish symbolic in opset 9 (#48423) Summary: Adds support for torch.nn.Hardswish operator in Export Fixes https://github.com/pytorch/pytorch/issues/43665 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48423 Reviewed By: heitorschueroff Differential Revision: D25309868 Pulled By: bzinodev fbshipit-source-id: f5583eb01b1b0e8f0bc95d5054941dd29605d6a5 --- docs/source/onnx.rst | 1 + test/onnx/test_pytorch_onnx_onnxruntime.py | 21 +++++++++++++++++++++ torch/onnx/symbolic_helper.py | 7 +++++++ torch/onnx/symbolic_opset9.py | 9 +++++++++ 4 files changed, 38 insertions(+) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index cdda93c60d3f..9dc107d86267 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -510,6 +510,7 @@ The following operators are supported: * glu * group_norm * gt +* hardswish * hardtanh * im2col * index_copy diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 63f5c729a75f..e2e12af88c1e 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -589,6 +589,27 @@ def forward(self, x): x = torch.arange(-5, 5).to(dtype=torch.float32) self.run_test(MyModel(), x) + def test_hardswish(self): + model = torch.nn.Hardswish() + + x = torch.rand(3, 3).to(dtype=torch.float32) + self.run_test(model, x) + + # Testing edge cases + x = torch.tensor(3).to(dtype=torch.float32) + self.run_test(model, x) + x = torch.tensor(-3).to(dtype=torch.float32) + self.run_test(model, x) + + def test_hardswish_script(self): + class MyModel(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + return torch.nn.functional.hardswish(x) + + x = torch.rand(3, 3).to(dtype=torch.float32) + self.run_test(MyModel(), x) + def test_clamp(self): class ClampModel(torch.nn.Module): def forward(self, x): diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 7e16d7a08c44..5e9430f995f8 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -238,6 +238,13 @@ def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False) from torch.onnx.symbolic_opset10 import _slice return _slice(g, input, axes, starts, ends, steps, dynamic_slice) +def _hardtanh_helper(g, input, min_val, max_val): + if _export_onnx_opset_version <= 10: + from torch.onnx.symbolic_opset9 import hardtanh + return hardtanh(g, input, 
min_val, max_val) + else: + from torch.onnx.symbolic_opset11 import hardtanh + return hardtanh(g, input, min_val, max_val) def _is_fp(value): if value: diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 68da423280ac..e395ce5c703f 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1740,6 +1740,15 @@ def hardtanh(g, self, min_val, max_val): return g.op("Clip", self, min_f=min_val, max_f=max_val) +@parse_args('v') +def hardswish(g, self): + input = g.op("Add", self, g.op('Constant', value_t=torch.tensor(3, dtype=torch.float))) + hardtanh_ = sym_help._hardtanh_helper(g, input, + g.op('Constant', value_t=torch.tensor(0, dtype=torch.float)), + g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) + hardtanh_ = g.op("Div", hardtanh_, g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) + return g.op("Mul", self, hardtanh_) + def alias(g, self): return self From 2181ff89bb6126b1382f35d7cfca23ee45f40fc9 Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Thu, 3 Dec 2020 23:43:19 -0800 Subject: [PATCH 042/132] [vulkan][test] Not use non 1 dilation for conv2d (#48800) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48800 Test Plan: Imported from OSS Reviewed By: SS-JIA Differential Revision: D25312276 Pulled By: IvanKobzarev fbshipit-source-id: edb36c284ddb79969cbc4e774f11d85f14b39343 --- aten/src/ATen/test/vulkan_api_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 73b221c81b9d..cbd65fd9b68f 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -269,7 +269,8 @@ TEST(VulkanAPITest, conv2d) { constexpr int64_t groups = 1; constexpr std::array stride{1, 2}; constexpr std::array padding{3, 0}; - constexpr std::array dilation{1, 3}; + //TODO: Support conv2d with dilation != 1 + constexpr std::array dilation{1, 1}; constexpr struct { uint32_t batches; From 4cc163f8ec5884a60df20871fb5c9acaa6d6fb4e Mon Sep 17 00:00:00 2001 From: Huamin Li Date: Fri, 4 Dec 2020 00:44:21 -0800 Subject: [PATCH 043/132] Add deadline to fakelowp tests (#48823) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48823 deadline=None is not good because Sandcastle tests will return success for tests timeout (default flag), and we cannot efficiently detect broken tests if there is any. In addition, the return signal for timeout is 64, which is same as skip test. 
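For reference, a minimal sketch of the pattern this PR applies (illustrative only: the test class, strategy, and bound below are placeholders rather than code from the affected suites; the concrete deadlines used in the files range from 10 to 50 seconds). Replacing the open-ended `deadline=None` with an explicit per-example deadline makes overly slow examples surface as `DeadlineExceeded` failures instead of being hidden behind the external timeout:

```python
# Hypothetical example of the settings change; names are placeholders.
import datetime
import unittest

import hypothesis.strategies as st
from hypothesis import given, settings


class ExampleTest(unittest.TestCase):
    @given(seed=st.integers(0, 65534))
    @settings(deadline=datetime.timedelta(seconds=10))  # was: @settings(deadline=None)
    def test_example(self, seed):
        # Hypothesis raises DeadlineExceeded when a single example runs past
        # the deadline, so an unexpectedly slow test becomes an explicit failure.
        self.assertGreaterEqual(seed, 0)


if __name__ == "__main__":
    unittest.main()
```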
Test Plan: Sandcastle, and run tests on card Reviewed By: hyuen Differential Revision: D25318184 fbshipit-source-id: de1b55a259edb2452fb51ba4c598ab8cca9e76b7 --- .../test/test_batchmatmul_nnpi_fp16.py | 8 ++----- .../fakelowp/test/test_batchnorm_nnpi_fp16.py | 8 ++----- .../test/test_deq_swish_quant_nnpi.py | 5 ++-- .../fakelowp/test/test_fc_nnpi_fp16.py | 14 ++++------- caffe2/contrib/fakelowp/test/test_fusions.py | 5 ++-- .../fakelowp/test/test_int8_ops_nnpi.py | 9 ++++--- .../contrib/fakelowp/test/test_int8_quant.py | 6 +++-- .../fakelowp/test/test_layernorm_nnpi_fp16.py | 5 ++-- .../fakelowp/test/test_op_nnpi_fp16.py | 24 ++++++++----------- .../fakelowp/test/test_sls_4bit_nnpi_fp16.py | 10 +++----- .../fakelowp/test/test_sls_8bit_nnpi_fp16.py | 9 ++++--- .../fakelowp/test/test_sls_8bit_nnpi_fp32.py | 7 +++--- 12 files changed, 45 insertions(+), 65 deletions(-) diff --git a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py index 94a76fed85f5..d6e5c5db6d2a 100644 --- a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py @@ -1,8 +1,3 @@ - - - - - import numpy as np import unittest import caffe2.python.fakelowp.init_shared_libs # noqa @@ -11,6 +6,7 @@ from caffe2.python import core, workspace from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net from caffe2.python.fakelowp.test_utils import print_test_debug_info +import datetime from hypothesis import given, settings import hypothesis.strategies as st import caffe2.python.serialized_test.serialized_test_util as serial @@ -29,7 +25,7 @@ class TestBatchMatMul(serial.SerializedTestCase): trans_b=st.booleans(), run_ints=st.booleans() ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_batch_matmul(self, M, K, N, C, rand_seed, trans_a, trans_b, run_ints): np.random.seed(rand_seed) workspace.ResetWorkspace() diff --git a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py index 7b1b5f070171..56ac6733f13d 100644 --- a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py @@ -1,8 +1,3 @@ - - - - - import numpy as np import unittest @@ -15,6 +10,7 @@ from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net from caffe2.python.fakelowp.test_utils import print_test_debug_info import caffe2.python.serialized_test.serialized_test_util as serial +import datetime core.GlobalInit(["caffe2", "--glow_global_fp16=1", "--glow_global_fused_scale_offset_fp16=1", @@ -46,7 +42,7 @@ class BatchnormTest(serial.SerializedTestCase): size=st.integers(2, 30), input_channels=st.integers(2, 40), batch_size=st.integers(2, 20)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_bn(self, seed, size, input_channels, batch_size): workspace.ResetWorkspace() np.random.seed(seed) diff --git a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py index b7a9fc810cfc..7ee160e19602 100644 --- a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py @@ -1,11 +1,11 @@ - - import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa from caffe2.python import core, workspace from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net from caffe2.python.fakelowp.test_utils import print_test_debug_info 
import caffe2.python.serialized_test.serialized_test_util as serial +import datetime +from hypothesis import settings core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"]) @@ -24,6 +24,7 @@ def _sigmoid(self, x): def _swish(self, x): return np.float32(x) * self._sigmoid(x) + @settings(deadline=datetime.timedelta(seconds=10)) def test_swish_int8(self): np.random.seed(0) workspace.ResetWorkspace() diff --git a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py index 682f1685c082..d9c2bd37daeb 100644 --- a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py @@ -1,8 +1,3 @@ - - - - - import numpy as np import unittest @@ -14,6 +9,7 @@ from caffe2.python import workspace from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net from caffe2.python.fakelowp.test_utils import print_test_debug_info +import datetime import caffe2.python.serialized_test.serialized_test_util as serial core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"]) @@ -23,7 +19,7 @@ class FCTest(serial.SerializedTestCase): @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_clip(self, seed): np.random.seed(seed) m, n, k = 8, 8, 8 @@ -82,7 +78,7 @@ def test_clip(self, seed): n=st.integers(4, 50), seed=st.integers(0, 65534) ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_fc_exercise(self, m, k, n, seed): """ Test that the matmul engine is working, this doesn't test precision @@ -147,7 +143,7 @@ def test_fc_exercise(self, m, k, n, seed): assert(0) @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_fc_numeric_cases(self, seed): """ Test numerics, use examples found from the unit test. Use Fp16FCAcc16NNPI as a reference. @@ -272,7 +268,7 @@ def test_fc_numeric_cases(self, seed): seed=st.integers(0, 65534), use_packed=st.integers(0, 2) ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_fc_num0(self, seed, m, k, n, use_packed): """ Test numerics, fix a dimension and determine the ranges of error. Use Fp16FCAcc16 as a reference. 
diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 45757badba43..335159c8318e 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -1,7 +1,6 @@ - - # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime import numpy as np from hypothesis import given, settings from hypothesis import strategies as st @@ -27,7 +26,7 @@ class Fusions(serial.SerializedTestCase): size=st.integers(1, 100000), rand_seed=st.integers(0, 65534), ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def Skip_test_tanhquantize(self, scale, zp, size, rand_seed): np.random.seed(rand_seed) diff --git a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py index 7f51523cb616..1507f41a4861 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py @@ -1,5 +1,3 @@ - - import caffe2.python.fakelowp.init_shared_libs # noqa import numpy as np from caffe2.python import core, workspace @@ -7,6 +5,7 @@ from hypothesis import given, strategies as st, settings from caffe2.python.fakelowp.test_utils import print_test_debug_info import caffe2.python.serialized_test.serialized_test_util as serial +import datetime core.GlobalInit(["caffe2", "--caffe2_log_level=-3", @@ -32,7 +31,7 @@ def _get_scale_zp(self, tensor): rand_seed=st.integers(0, 65534), non_zero_offset=st.booleans() ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=50)) def test_int8_quantize(self, n, rand_seed, non_zero_offset): print("n={}, rand_seed={}".format(n, rand_seed)) np.random.seed(rand_seed) @@ -133,7 +132,7 @@ def test_int8_quantize(self, n, rand_seed, non_zero_offset): rand_seed=st.integers(0, 65534), quantize_bias=st.sampled_from([False]), ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=50)) def test_int8_fc( self, n, m, k, rand_seed, quantize_bias, f ): @@ -234,7 +233,7 @@ def test_int8_fc( n=st.integers(1, 4), rand_seed=st.integers(0, 65534) ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_int8_small_input(self, n, rand_seed): print("n={}, rand_seed={}".format(n, rand_seed)) np.random.seed(rand_seed) diff --git a/caffe2/contrib/fakelowp/test/test_int8_quant.py b/caffe2/contrib/fakelowp/test/test_int8_quant.py index 02095286e1ee..2770dc7bef04 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_quant.py +++ b/caffe2/contrib/fakelowp/test/test_int8_quant.py @@ -1,12 +1,12 @@ - - # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime import numpy as np from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import settings workspace.GlobalInit( [ @@ -18,6 +18,7 @@ ) class QuantTest(serial.SerializedTestCase): + @settings(deadline=datetime.timedelta(seconds=10)) def test_dequantize(self): pred_net = caffe2_pb2.NetDef() pred_net.name = "pred" @@ -60,6 +61,7 @@ def test_dequantize(self): Y_glow = workspace.FetchBlob("Y") np.testing.assert_equal(Y_ref, Y_glow) + @settings(deadline=datetime.timedelta(seconds=20)) def test_quantize(self): pred_net = caffe2_pb2.NetDef() pred_net.name = "pred" diff --git 
a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py index 5129a38c5241..36d6ba73e0c3 100644 --- a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py @@ -8,6 +8,7 @@ from hypothesis import given, settings from hypothesis import strategies as st import caffe2.python.serialized_test.serialized_test_util as serial +import datetime core.GlobalInit(["caffe2", "--glow_global_fp16=1", @@ -25,7 +26,7 @@ class LayerNorm(serial.SerializedTestCase): size=st.integers(min_value=2, max_value=128), epsilon=st.floats(min_value=1e-4, max_value=1e-3), elementwise_affine=st.booleans()) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def Skip_test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine): np.random.seed(seed) # Reset the workspace @@ -139,7 +140,7 @@ def _layernorm_transform(self, X): size=st.integers(min_value=2, max_value=128), epsilon=st.floats(min_value=1e-4, max_value=1e-3), elementwise_affine=st.booleans()) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) # re-enable when T74553975 gets fixed def Skip_test_fused_ln_quantize(self, seed, batch_size, size, epsilon, elementwise_affine): np.random.seed(seed) diff --git a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py index e8512b4dcd74..8a5a2aaeaae7 100644 --- a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py @@ -1,11 +1,7 @@ - - - - - import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime from hypothesis import given, settings from hypothesis import strategies as st from caffe2.proto import caffe2_pb2 @@ -103,22 +99,22 @@ def _test_binary_op_graph(self, name, seed): assert(0) @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_add_graph(self, seed): self._test_binary_op_graph("Add", seed) @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_sub_graph(self, seed): self._test_binary_op_graph("Sub", seed) @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_mul_graph(self, seed): self._test_binary_op_graph("Mul", seed) @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_div_graph(self, seed): self._test_binary_op_graph("Div", seed) @@ -199,7 +195,7 @@ def _test_op_w_ulp_error(self, seed, opname, regions, atol=0, err_threshold=2): # Once hypothesis.testing version is updated, we can re-enable # testing with different hypothesis examples. @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_sigmoid(self, seed): np.random.seed(seed) opname = "Sigmoid" @@ -213,7 +209,7 @@ def test_sigmoid(self, seed): # Once hypothesis.testing version is updated, we can re-enable # testing with different hypothesis examples. @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_tanh(self, seed): np.random.seed(seed) opname = "Tanh" @@ -230,7 +226,7 @@ def test_tanh(self, seed): # testing with different hypothesis examples. 
# TODO: move atol to 1e-8 once we get a non-lowered swish implementation @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_swish(self, seed): np.random.seed(seed) opname = "Swish" @@ -243,7 +239,7 @@ def test_swish(self, seed): # Once hypothesis.testing version is updated, we can re-enable # testing with different hypothesis examples. @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_logit(self, seed): np.random.seed(seed) workspace.ResetWorkspace() @@ -309,7 +305,7 @@ def test_logit(self, seed): class ReluTest(serial.SerializedTestCase): @given(seed=st.integers(0, 65534)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def relu_test(self, inputs, gc, dc, seed): np.random.seed(seed) inputs = np.random.rand(1).astype(np.float32) diff --git a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py index a8d6640fa58e..489bfbc37f4f 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py @@ -1,8 +1,3 @@ - - - - - import numpy as np import unittest @@ -16,6 +11,7 @@ from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net from caffe2.python.fakelowp.test_utils import print_test_debug_info import caffe2.python.serialized_test.serialized_test_util as serial +import datetime workspace.GlobalInit(["caffe2", "--glow_global_fp16=1", "--glow_global_fused_scale_offset_fp16=1", @@ -24,7 +20,7 @@ class SparseLengthsSum4BitFakeNNPIFp16Test(serial.SerializedTestCase): @given(seed=st.integers(0, 65535)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_slws_fused_4bit_rowwise_all_same(self, seed): np.random.seed(seed) workspace.ResetWorkspace() @@ -118,7 +114,7 @@ def test_slws_fused_4bit_rowwise_all_same(self, seed): batch_size=st.integers(1, 32), max_weight=st.integers(0, 1), ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim, batch_size, max_weight): workspace.ResetWorkspace() np.random.seed(seed) diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py index f8fd03cbfb73..c5aea77d7199 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py @@ -1,9 +1,8 @@ - - import unittest # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime import numpy as np from hypothesis import given, settings from hypothesis import strategies as st @@ -99,7 +98,7 @@ def Skip_test_SLS_NonQuantized_fp16(self): assert 0 @given(seed=st.integers(0, 65535)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_slws_fused_8bit_rowwise_all_same(self, seed): # Comment out for predictable debugging np.random.seed(seed) @@ -207,7 +206,7 @@ def test_slws_fused_8bit_rowwise_all_same(self, seed): batch_size=st.integers(1, 5), max_weight=st.integers(0, 100), ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_slws_fused_8bit_rowwise(self, seed, num_rows, embedding_dim, batch_size, max_weight): np.random.seed(seed) workspace.ResetWorkspace() @@ -315,7 +314,7 @@ def test_slws_fused_8bit_rowwise(self, seed, num_rows, 
embedding_dim, batch_size # Simple test to aid debugging order of operations # Minimize the case to an SLS that adds two rows @given(seed=st.integers(0, 65535)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_small_sls(self, seed): np.random.seed(seed) workspace.ResetWorkspace() diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py index 207403f1bd0d..971bf8412f4c 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py @@ -1,9 +1,8 @@ - - import unittest # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime import numpy as np from hypothesis import given, settings from hypothesis import strategies as st @@ -32,7 +31,7 @@ class SparseLengthsSum8BitFakeNNPIFp32Test(serial.SerializedTestCase): batch_size=st.integers(1, 5), max_weight=st.integers(0, 100), ) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_slws_fused_8bit_rowwise_acc32_nnpi( self, seed, num_rows, embedding_dim, batch_size, max_weight ): @@ -148,7 +147,7 @@ def test_slws_fused_8bit_rowwise_acc32_nnpi( @given(seed=st.integers(0, 65535)) - @settings(deadline=None) + @settings(deadline=datetime.timedelta(seconds=10)) def test_small_sls_acc32(self, seed): workspace.GlobalInit( [ From cb285080b0f82e8133a2e8a35c9ecdb182458d09 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Fri, 4 Dec 2020 02:21:55 -0800 Subject: [PATCH 044/132] Added computing matrix condition numbers (linalg.cond) (#45832) Summary: This PR adds `torch.linalg.cond` for NumPy compatibility. Ref https://github.com/pytorch/pytorch/issues/42666. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45832 Reviewed By: ngimel Differential Revision: D25183690 Pulled By: mruberry fbshipit-source-id: a727959bfec2bc2dc36df59d9ef79c0534b68194 --- aten/src/ATen/native/LinearAlgebra.cpp | 156 +++++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 24 ++++ test/test_jit.py | 2 +- test/test_linalg.py | 116 +++++++++++++++ torch/linalg/__init__.py | 83 +++++++++++ torch/overrides.py | 1 + 6 files changed, 381 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 11050a7303d8..bbc8d29dfab7 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -18,6 +18,8 @@ #include #include +#include + namespace at { namespace native { @@ -1769,6 +1771,160 @@ Tensor& linalg_norm_out(Tensor& result, const Tensor& self, std::string ord, opt return linalg_norm_out_impl(result, self, c10::nullopt, ord, opt_dim, keepdim, opt_dtype); } +Tensor _linalg_cond_exception_helper(const Tensor& self) { + // For batched input if at least one matrix in the batch is not invertible, + // we can't get the result for all other (possibly) invertible matrices in the batch without an explicit for loop. + // This should change when at::inverse works with silent errors + if (self.dim() > 2) { + TORCH_CHECK(false, + "One or more matrices in the batch was not invertible! 
" + "linalg_cond does not support yet this case."); + } + auto result_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend()-2); + Tensor result = at::full(result_shape, INFINITY, self.options()); + return result; +} + +// This function helps to dispatch norm computations depending on 'ord' of variant type +Tensor _linalg_cond_helper(const Tensor& self, c10::variant ord_variant) { + // Ignore errors if not invertible, result is INFINITY in this case + // Currently checking for error in at::inverse causes cross-device data movement + // For batched input if at least one matrix in the batch is not invertible, + // then the result for all other (possibly) invertible matrices will be infinity as well + // since there is currently no way to use at::inverse with silent errors + Tensor self_inverse; + try { + self_inverse = at::inverse(self); + } catch (const std::exception& e) { + if (strstr(e.what(), "singular")) { + return _linalg_cond_exception_helper(self); + } else { + TORCH_CHECK(false, "linalg_cond got an unexpected error:\n", e.what()); + } + } + std::array dim_arr = {-2, -1}; + optional dim = IntArrayRef(dim_arr); + + return c10::visit([&](auto&& ord) { + Tensor norm_self = at::linalg_norm(self, ord, dim); + Tensor norm_inverse = at::linalg_norm(self_inverse, ord, dim); + Tensor result = norm_self * norm_inverse; + return result; + }, ord_variant); +} + +// Return zero for each matrix in the batch +Tensor _linalg_cond_empty_matrix(const Tensor& self, c10::ScalarType dtype) { + auto result_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend()-2); + return at::zeros(result_shape, self.options().dtype(dtype)); +} + +void _linalg_cond_check_ord(c10::variant ord_variant) { + if (ord_variant.index() == 0) { + Scalar* ord = c10::get_if(&ord_variant); + double abs_ord = std::abs(ord->toDouble()); + TORCH_CHECK(abs_ord == 2.0 || abs_ord == 1.0 || abs_ord == INFINITY, + "linalg_cond got an invalid norm type: ", ord->toDouble()); + } else if (ord_variant.index() == 1) { + std::string* ord = c10::get_if(&ord_variant); + TORCH_CHECK(*ord == "fro" || *ord == "nuc", + "linalg_cond got an invalid norm type: ", *ord); + } else { + TORCH_CHECK(false, + "linalg_cond: something went wrong while checking the norm type"); + } +} + +// Numerical or None norms +Tensor linalg_cond(const Tensor& self, optional opt_ord) { + TORCH_CHECK(self.dim() >= 2, "linalg_cond only supports matrices or batches of matrices, but got a tensor with ", + self.dim(), " dimensions."); + + // The default case is using 2-norm + Scalar ord = opt_ord.has_value() ? opt_ord.value() : 2; + + c10::variant ord_variant = ord; + _linalg_cond_check_ord(ord_variant); + + // NumPy doesn't define the condition number for 0x0 matrices, we return 0.0 for such input + if (self.numel() == 0) { + auto real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + auto expected_dtype = std::abs(ord.toDouble()) == 2.0 ? 
real_dtype : self.scalar_type(); + return _linalg_cond_empty_matrix(self, expected_dtype); + } + + // If ord == None or ord == ±2 + if (std::abs(ord.toDouble()) == 2.0) { + auto singular_values = std::get<1>(at::svd(self)); + // singular values are sorted in descending order + auto s_max = at::narrow(singular_values, /*dim=*/-1, /*start=*/0, /*length=*/1); + auto s_min = at::narrow(singular_values, /*dim=*/-1, /*start=*/-1, /*length=*/1); + Tensor result; + if (ord.toDouble() == -2.0) { + result = s_min / s_max; + } else { + result = s_max / s_min; + } + return result; + } + + // ord == ±1 ord == ±inf + // since at::inverse is used in the implementation, self has to be a tensor consisting of square matrices + // the same check as squareCheckInputs(self) but with a slightly more informative error message + TORCH_CHECK(self.size(-1) == self.size(-2), + "linalg_cond with ±1 or ±inf norm types only supports square matrices or batches of square matrices " + "but got ", self.size(-1), " by ", self.size(-2), " matrices"); + + return _linalg_cond_helper(self, ord_variant); +} + +Tensor& linalg_cond_out(Tensor& result, const Tensor& self, optional opt_ord) { + // If ord == None or ord == ±2 then SVD is used to compute the condition number + // the result is always real-valued, for other cases it is complex-valued for the complex-valued input. + ScalarType real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + Scalar ord = opt_ord.has_value() ? opt_ord.value() : 2; + auto expected_dtype = std::abs(ord.toDouble()) == 2.0 ? real_dtype : self.scalar_type(); + + TORCH_CHECK(result.scalar_type() == expected_dtype, + "result dtype ", result.scalar_type(), " does not match the expected dtype ", expected_dtype); + + Tensor result_tmp = at::linalg_cond(self, opt_ord); + at::native::resize_output(result, result_tmp.sizes()); + result.copy_(result_tmp); + return result; +} + +// Frobenius or nuclear norms +Tensor linalg_cond(const Tensor& self, std::string ord) { + // the same checks as squareCheckInputs(self) but with a slightly more informative error message + TORCH_CHECK(self.dim() >= 2, "linalg_cond only supports matrices or batches of matrices, but got a tensor with ", + self.dim(), " dimensions."); + TORCH_CHECK(self.size(-1) == self.size(-2), + "linalg_cond with frobenius or nuclear norm types only supports square matrices or batches of square matrices " + "but got ", self.size(-1), " by ", self.size(-2), " matrices"); + + c10::variant ord_variant = ord; + _linalg_cond_check_ord(ord_variant); + + // NumPy doesn't define the condition number for 0x0 matrices, we return 0.0 for such input + if (self.numel() == 0) { + return _linalg_cond_empty_matrix(self, self.scalar_type()); + } + + return _linalg_cond_helper(self, ord_variant); +} + +// TODO: implement _out variant avoiding copy and using already allocated storage directly +Tensor& linalg_cond_out(Tensor& result, const Tensor& self, std::string ord) { + TORCH_CHECK(result.scalar_type() == self.scalar_type(), + "result dtype ", result.scalar_type(), " does not match the expected dtype ", self.scalar_type()); + + Tensor result_tmp = at::linalg_cond(self, ord); + at::native::resize_output(result, result_tmp.sizes()); + result.copy_(result_tmp); + return result; +} + Tensor linalg_tensorinv(const Tensor& self, int64_t ind) { /* The idea is to reduce the problem to 2D square matrix inversion. 
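As a quick illustration of the dispatch implemented above, the following Python sketch mirrors the two code paths of `linalg_cond`: the SVD path for `p` in `{None, 2, -2}` and the norm-times-inverse-norm path for the remaining norm types. It is only a sketch built from pre-existing ops (`torch.svd`, `torch.inverse`, `torch.linalg.norm`); the helper name `cond_sketch` is invented and is not part of this patch.

```python
import torch

def cond_sketch(A, p=None):
    # Illustrative reimplementation of the dispatch in linalg_cond above.
    if p in (None, 2, -2):
        # SVD path: ratio of extreme singular values (also works for non-square input).
        s = torch.svd(A).S                       # singular values, descending order
        s_max, s_min = s[..., 0], s[..., -1]
        return s_min / s_max if p == -2 else s_max / s_min
    # Norm path: ||A|| * ||A^-1||; requires square matrices, as checked above.
    dims = (-2, -1)
    return torch.linalg.norm(A, p, dim=dims) * torch.linalg.norm(
        torch.inverse(A), p, dim=dims)

A = torch.tensor([[1., 0., -1.], [0., 1., 0.], [1., 0., 1.]])
for p in (None, 2, -2, 1, -1, float('inf'), 'fro', 'nuc'):
    assert torch.allclose(cond_sketch(A, p), torch.linalg.cond(A, p))
```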
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 790fc0ea01f8..46b7173197a7 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9494,6 +9494,30 @@ python_module: linalg variants: function +- func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor + python_module: linalg + variants: function + dispatch: + Math: linalg_cond + +- func: linalg_cond.out(Tensor self, Scalar? p=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + dispatch: + Math: linalg_cond_out + +- func: linalg_cond.p_str(Tensor self, str p) -> Tensor + python_module: linalg + variants: function + dispatch: + Math: linalg_cond + +- func: linalg_cond.p_str_out(Tensor self, str p, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + dispatch: + Math: linalg_cond_out + - func: linalg_tensorinv(Tensor self, int ind=2) -> Tensor python_module: linalg variants: function diff --git a/test/test_jit.py b/test/test_jit.py index c2a0804103a6..c85fcbd19747 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -15732,7 +15732,7 @@ def fn(*inputs, **kwargs): check_types=check_types) # alias annotation testing - if not is_magic_method and test_name not in EXCLUDE_SCRIPT: + if not is_magic_method and test_name not in EXCLUDE_SCRIPT and not exclude_tensor_method(name, test_name): check_alias_annotation(name, (self_variable,) + args_variable, kwargs_variable) check(name) diff --git a/test/test_linalg.py b/test/test_linalg.py index 944513b82e0a..114ee1842c42 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -987,6 +987,122 @@ def run_test_case(input, p, dim, keepdim): for ord in ord_settings: run_test_case(input, ord, dim, keepdim) + @skipCPUIfNoLapack + @skipCUDAIfNoMagma + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3}) + def test_cond(self, device, dtype): + def run_test_case(input, p): + result = torch.linalg.cond(input, p) + result_numpy = np.linalg.cond(input.cpu().numpy(), p) + self.assertEqual(result, result_numpy, rtol=1e-2, atol=self.precision) + + # test out= variant + out = torch.empty_like(result) + ans = torch.linalg.cond(input, p, out=out) + self.assertEqual(ans, out) + self.assertEqual(ans, result) + + norm_types = [1, -1, 2, -2, inf, -inf, 'fro', 'nuc', None] + input_sizes = [(32, 32), (2, 3, 3, 3)] + for input_size in input_sizes: + input = torch.randn(*input_size, dtype=dtype, device=device) + for p in norm_types: + # frobenius norm not supported for complex tensors + if dtype.is_complex and p == 'fro': + with self.assertRaisesRegex(RuntimeError, "frobenius norm not supported for complex tensors"): + torch.linalg.cond(input, p) + continue + run_test_case(input, p) + + # test empty batch sizes + input_sizes = [(0, 3, 3), (0, 2, 5, 5)] + for input_size in input_sizes: + input = torch.randn(*input_size, dtype=dtype, device=device) + for p in norm_types: + run_test_case(input, p) + + # test non-square input + input_sizes = [(16, 32), (32, 16), (2, 3, 5, 3), (2, 3, 3, 5)] + for input_size in input_sizes: + input = torch.randn(*input_size, dtype=dtype, device=device) + for p in [2, -2, None]: + run_test_case(input, p) + + # test for singular input + a = torch.eye(3, dtype=dtype, device=device) + a[-1, -1] = 0 # make 'a' singular + for p in norm_types: + run_test_case(a, p) + + # test for 0x0 matrices. 
NumPy doesn't work for such input, we return 0 + input_sizes = [(0, 0), (2, 5, 0, 0)] + for input_size in input_sizes: + input = torch.randn(*input_size, dtype=dtype, device=device) + for p in ['fro', 2]: + expected_dtype = a.real.dtype if dtype.is_complex and p == 2 else dtype + expected = torch.zeros(input_size[:-2], dtype=expected_dtype, device=device) + actual = torch.linalg.cond(input, p) + self.assertEqual(actual, expected) + + @skipCPUIfNoLapack + @skipCUDAIfNoMagma + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3}) + def test_cond_errors_and_warnings(self, device, dtype): + norm_types = [1, -1, 2, -2, inf, -inf, 'fro', 'nuc', None] + + # cond expects the input to be at least 2-dimensional + a = torch.ones(3, dtype=dtype, device=device) + for p in norm_types: + with self.assertRaisesRegex(RuntimeError, r'supports matrices or batches of matrices'): + torch.linalg.cond(a, p) + + # for some norm types cond expects the input to be square + a = torch.ones(3, 2, dtype=dtype, device=device) + norm_types = [1, -1, inf, -inf, 'fro', 'nuc'] + for p in norm_types: + with self.assertRaisesRegex(RuntimeError, r'supports square matrices or batches of square matrices'): + torch.linalg.cond(a, p) + + # if non-empty out tensor with wrong shape is passed a warning is given + a = torch.ones((2, 2), dtype=dtype, device=device) + for p in ['fro', 2]: + real_dtype = a.real.dtype if dtype.is_complex and p == 2 else dtype + out = torch.empty(a.shape, dtype=real_dtype, device=device) + with warnings.catch_warnings(record=True) as w: + # Trigger warning + torch.linalg.cond(a, p, out=out) + # Check warning occurs + self.assertEqual(len(w), 1) + self.assertTrue("An output with one or more elements was resized" in str(w[-1].message)) + + # dtypes should match + out = torch.empty_like(a).to(torch.int) + for p in ['fro', 2]: + with self.assertRaisesRegex(RuntimeError, "result dtype Int does not match"): + torch.linalg.cond(a, p, out=out) + + # for batched input if at least one matrix in the batch is not invertible, + # we can't get the result for all other (possibly) invertible matrices in the batch without an explicit for loop. + # this should change when at::inverse works with silent errors + # NumPy works fine in this case because it's possible to silence the error and get the inverse matrix results + # possibly filled with NANs + batch_dim = 3 + a = torch.eye(3, 3, dtype=dtype, device=device) + a = a.reshape((1, 3, 3)) + a = a.repeat(batch_dim, 1, 1) + a[0, -1, -1] = 0 # now a[0] is singular + for p in [1, -1, inf, -inf, 'fro', 'nuc']: + with self.assertRaisesRegex(RuntimeError, "linalg_cond does not support yet"): + torch.linalg.cond(a, p) + + # check invalid norm type + a = torch.ones(3, 3, dtype=dtype, device=device) + for p in ['wrong_norm', 5]: + with self.assertRaisesRegex(RuntimeError, f"linalg_cond got an invalid norm type: {p}"): + torch.linalg.cond(a, p) + # Test autograd and jit functionality for linalg functions. # TODO: Once support for linalg functions is added to method_tests in common_methods_invocations.py, # the `test_cases` entries below should be moved there. 
These entries are in a similar format, diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index d2cc7e1df9d0..85b2e0754b05 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -400,6 +400,89 @@ (tensor(3.7417), tensor(11.2250)) """) +cond = _add_docstr(_linalg.linalg_cond, r""" +linalg.cond(input, p=None, *, out=None) -> Tensor + +Computes the condition number of a matrix :attr:`input`, +or of each matrix in a batched :attr:`input`, using the matrix norm defined by :attr:`p`. +For norms ``p = {'fro', 'nuc', inf, -inf, 1, -1}`` this is defined as the matrix norm of :attr:`input` +times the matrix norm of the inverse of :attr:`input`. And for norms ``p = {None, 2, -2}`` this is defined as +the ratio between the largest and smallest singular values. + +This function supports ``float``, ``double``, and only on CPU, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`. + +.. note:: For ``p = {None, 2, -2}`` the condition number is computed as the ratio between the largest + and smallest singular values computed using :func:`torch.linalg.svd`. + For these norms :attr:`input` may be a non-square matrix or batch of non-square matrices. + For other norms, however, :attr:`input` must be a square matrix or a batch of square matrices, + and if this requirement is not satisfied a RuntimeError will be thrown. + +.. note:: For ``p = {'fro', 'nuc', inf, -inf, 1, -1}`` if :attr:`input` is a non-invertible matrix then + a tensor containing infinity will be returned. If :attr:`input` is a batch of matrices and one + or more of them is not invertible then a RuntimeError will be thrown. + +.. note:: When given inputs on a CUDA device, this function synchronizes that device with the CPU. + +Args: + input (Tensor): the input matrix of size :math:`(m, n)` or the batch of matrices of size :math:`(*, m, n)` + where `*` is one or more batch dimensions. + + p (int, float, inf, -inf, 'fro', 'nuc', optional): the type of the matrix norm to use in the computations. + The following norms are supported: + + ===== ============================ + p norm for matrices + ===== ============================ + None ratio of the largest singular value to the smallest singular value + 'fro' Frobenius norm + 'nuc' nuclear norm + inf max(sum(abs(x), dim=1)) + -inf min(sum(abs(x), dim=1)) + 1 max(sum(abs(x), dim=0)) + -1 min(sum(abs(x), dim=0)) + 2 ratio of the largest singular value to the smallest singular value + -2 ratio of the smallest singular value to the largest singular value + ===== ============================ + + Default: ``None`` + +Keyword args: + out (Tensor, optional): The output tensor. Ignored if ``None``. Default: ``None`` + +Examples:: + + >>> from torch import linalg as LA + >>> a = torch.tensor([[1., 0, -1], [0, 1, 0], [1, 0, 1]]) + >>> LA.cond(a) + tensor(1.4142) + >>> LA.cond(a, 'fro') + tensor(3.1623) + >>> LA.cond(a, 'nuc') + tensor(9.2426) + >>> LA.cond(a, np.inf) + tensor(2.) + >>> LA.cond(a, -np.inf) + tensor(1.) + >>> LA.cond(a, 1) + tensor(2.) + >>> LA.cond(a, -1) + tensor(1.) 
+ >>> LA.cond(a, 2) + tensor(1.4142) + >>> LA.cond(a, -2) + tensor(0.7071) + + >>> a = torch.randn(3, 4, 4) + >>> LA.cond(a) + tensor([ 4.4739, 76.5234, 10.8409]) + + >>> a = torch.randn(3, 4, 4, dtype=torch.complex64) + >>> LA.cond(a) + tensor([ 5.9175, 48.4590, 5.6443]) + >>> LA.cond(a, 1) + >>> tensor([ 11.6734+0.j, 105.1037+0.j, 10.1978+0.j]) +""") + tensorinv = _add_docstr(_linalg.linalg_tensorinv, r""" linalg.tensorinv(input, ind=2, *, out=None) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index 495f1435abee..590bc80b720e 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -318,6 +318,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.complex: lambda real, imag: -1, torch.copysign: lambda input, other, out=None: -1, torch.polar: lambda abs, ang: -1, + torch.linalg.cond: lambda input, ord=None: -1, torch.conj: lambda input, out=None: -1, torch.constant_pad_nd: lambda input, pad, value=0: -1, torch.conv1d: lambda input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1: -1, From 6ab84ca0f3458924e79f0279521a8b5720d3a9e3 Mon Sep 17 00:00:00 2001 From: kiyosora Date: Fri, 4 Dec 2020 04:30:23 -0800 Subject: [PATCH 045/132] Implement NumPy-like function torch.msort() (#48440) Summary: - Related with https://github.com/pytorch/pytorch/issues/38349 - Implementing the NumPy-like function `torch.msort()` . Pull Request resolved: https://github.com/pytorch/pytorch/pull/48440 Reviewed By: bdhirsh Differential Revision: D25265753 Pulled By: mruberry fbshipit-source-id: 7709ac5e5667e7541a3dc9048b9c9896b1a6dfa1 --- aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/native/Sorting.cpp | 10 +++++++ aten/src/ATen/native/native_functions.yaml | 10 +++++++ docs/source/tensors.rst | 1 + docs/source/torch.rst | 1 + test/test_sort_and_select.py | 30 ++++++++++++++++++- torch/_tensor_docs.py | 7 +++++ torch/_torch_docs.py | 29 ++++++++++++++++++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 1 + 10 files changed, 90 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index 817ccb210692..92952799ec49 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -497,6 +497,7 @@ _(aten, mode) \ _(aten, mse_loss) \ _(aten, mse_loss_backward) \ _(aten, mse_loss_forward) \ +_(aten, msort) \ _(aten, multi_margin_loss) \ _(aten, multi_margin_loss_backward) \ _(aten, multi_margin_loss_forward) \ diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index c5768321b521..e365d48fdffe 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -708,5 +708,15 @@ std::tuple sort_cpu( return sort_out_cpu(values, indices, self, dim, descending); } +Tensor& msort_out(Tensor& values, const Tensor& self) { + Tensor indices = at::empty({0}, self.options().dtype(kLong)); + at::sort_out(values, indices, self, 0, false); + return values; +} + +Tensor msort(const Tensor& self) { + return std::get<0>(at::sort(self, 0, false)); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 46b7173197a7..c7c1dc33d112 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6779,6 +6779,16 @@ - func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) variants: method, function +- func: 
msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + Math: msort_out + +- func: msort(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + Math: msort + - func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor use_c10_dispatcher: full variants: method, function diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 578f6a8b8a0e..f523d604ade7 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -463,6 +463,7 @@ view of a storage and defines numeric operations on it. .. automethod:: mode .. automethod:: movedim .. automethod:: moveaxis + .. automethod:: msort .. automethod:: mul .. automethod:: mul_ .. automethod:: multiply diff --git a/docs/source/torch.rst b/docs/source/torch.rst index ca288ff0ef6a..98934e5e9849 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -414,6 +414,7 @@ Comparison Ops not_equal sort topk + msort Spectral Ops diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index 494541a8ea67..ebd7c058f73f 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -1,11 +1,12 @@ import torch +import numpy as np import random from torch._six import nan from itertools import product from torch.testing._internal.common_utils import \ - (TestCase, run_tests) + (TestCase, run_tests, make_tensor) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyOnCPUAndCUDA, skipCUDAIfRocm, onlyCUDA, dtypesIfCUDA) @@ -112,6 +113,33 @@ def test_sort(self, device): self.assertIsOrdered('descending', x, res2val, res2ind, 'random with NaNs') + @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + def test_msort(self, device, dtype): + def test(shape): + tensor = make_tensor(shape, device, dtype, low=-9, high=9) + if tensor.size() != torch.Size([]): + expected = torch.from_numpy(np.msort(tensor.cpu().numpy())) + else: + expected = tensor # numpy.msort() does not support empty shapes tensor + + result = torch.msort(tensor) + self.assertEqual(result, expected) + + out = torch.empty_like(result) + torch.msort(tensor, out=out) + self.assertEqual(out, expected) + + shapes = ( + [], + [0, ], + [20, ], + [1, 20], + [30, 30], + [10, 20, 30] + ) + for shape in shapes: + test(shape) + def test_topk(self, device): def topKViaSort(t, k, dim, dir): sorted, indices = t.sort(dim, dir) diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 9a4b90efb5a9..ef7d71586d32 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3382,6 +3382,13 @@ def callable(a, b) -> number See :func:`torch.sort` """) +add_docstr_all('msort', + r""" +msort() -> Tensor + +See :func:`torch.msort` +""") + add_docstr_all('argsort', r""" argsort(dim=-1, descending=False) -> LongTensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 022170efcd63..5cc796dfaf7d 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -7556,6 +7556,35 @@ def merge_dicts(*dicts): [3, 2, 1, 0]]) """.format(**common_args)) +add_docstr(torch.msort, + r""" +msort(input, *, out=None) -> Tensor + +Sorts the elements of the :attr:`input` tensor along its first dimension +in ascending order by value. + +.. note:: `torch.msort(t)` is equivalent to `torch.sort(t, dim=0)[0]`. + See also :func:`torch.sort`. 
+ +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> t = torch.randn(3, 4) + >>> t + tensor([[-0.1321, 0.4370, -1.2631, -1.1289], + [-2.0527, -1.1250, 0.2275, 0.3077], + [-0.0881, -0.1259, -0.5495, 1.0284]]) + >>> torch.msort(t) + tensor([[-2.0527, -1.1250, -1.2631, -1.1289], + [-0.1321, -0.1259, -0.5495, 0.3077], + [-0.0881, 0.4370, 0.2275, 1.0284]]) +""".format(**common_args)) + add_docstr(torch.sparse_coo_tensor, r""" sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor diff --git a/torch/overrides.py b/torch/overrides.py index 590bc80b720e..f96549612812 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -525,6 +525,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.mode: lambda input, dim=-1, keepdim=False, out=None: -1, torch.movedim: lambda input, source, destination: -1, torch.moveaxis: lambda input, source, destination: -1, + torch.msort: lambda input, descending=False, out=None: -1, torch.mul: lambda input, other, out=None: -1, torch.multiply: lambda input, other, out=None: -1, torch.multinomial: lambda input, num_samples, replacement=False, out=None: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 0b245ec108ae..226dbdd6ad5d 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1721,6 +1721,7 @@ def method_tests(): ('sort', (), NO_ARGS, 'scalar'), ('sort', (), (0,), 'dim_scalar'), ('sort', (), (0, True), 'dim_desc_scalar'), + ('msort', (S, M, S), NO_ARGS), ('topk', (S, M, S), (3,)), ('topk', (S, M, S), (3, 1), 'dim', (), [1]), ('topk', (S, M, S), (3, 1, True), 'dim_desc', (), [1]), From eb43e12ee4b59b8a99785507c343d9de5f9ff9ca Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 4 Dec 2020 07:07:01 -0800 Subject: [PATCH 046/132] Revert D25277886: [pytorch][PR] Replace constexpr with CONSTEXPR_EXCEPT_WIN_CUDA Test Plan: revert-hammer Differential Revision: D25277886 (https://github.com/pytorch/pytorch/commit/0484b048d050ea5b10cb0efd147148aa893f2a4b) Original commit changeset: eb845db35d31 fbshipit-source-id: 133b938ff8ae1aa54878a03ea5a7e732c6bd5901 --- torch/csrc/jit/ir/ir.h | 4 ++-- torch/csrc/jit/serialization/pickler.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 37cb31f0967d..9db2dbdf2516 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -1326,7 +1326,7 @@ inline const Graph* Value::owningGraph() const { /************* All nodes not required to be defined before Graph **************/ struct ProfileOp : public Node { - static CONSTEXPR_EXCEPT_WIN_CUDA Symbol Kind = ::c10::prim::profile; + static constexpr Symbol Kind = ::c10::prim::profile; ProfileOp(Graph* graph, std::function&)> callback) : Node(graph, ::c10::prim::profile), callback_(std::move(callback)) {} @@ -1346,7 +1346,7 @@ struct ProfileOp : public Node { }; struct TORCH_API ProfileOptionalOp : public Node { - static CONSTEXPR_EXCEPT_WIN_CUDA Symbol Kind = ::c10::prim::profile_optional; + static constexpr Symbol Kind = ::c10::prim::profile_optional; ProfileOptionalOp( Graph* graph, std::function&)> callback) diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 6a557e6e53f3..4473b0cb50dd 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -209,7 +209,7 @@ class TORCH_API Pickler { // 
the left of a '::', its type cannot be deduced by the compiler so one must // explicitly instantiate the template, i.e. push(int) works, push(int) // does not) - static CONSTEXPR_EXCEPT_WIN_CUDA size_t kBufferSize = 256; + static constexpr size_t kBufferSize = 256; template void push(typename std::common_type::type value) { const char* begin = reinterpret_cast(&value); From 3a0d4240c32b568ebe74a0f6901fc744d5179b3b Mon Sep 17 00:00:00 2001 From: Emile van Krieken Date: Fri, 4 Dec 2020 07:31:03 -0800 Subject: [PATCH 047/132] Fix broadcast_all crashing on Tensor-likes (#48169) Summary: This ensures Tensor-likes that implement `__torch_function__` are properly handled by `torch.distributions.utils.broadcast_all`. See Issue https://github.com/pytorch/pytorch/issues/37141 . In this implementation, Number's will not be cast to the dtype of Tensor-likes. Fixes https://github.com/pytorch/pytorch/issues/37141 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48169 Reviewed By: izdeby Differential Revision: D25091414 Pulled By: walterddr fbshipit-source-id: c5c99374b02409393a68dcb85e2f8feab154318f --- test/test_overrides.py | 23 +++++++++++++++++++++++ torch/distributions/utils.py | 18 ++++++++++-------- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/test/test_overrides.py b/test/test_overrides.py index 4e794db1a0ce..95f94504d84e 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -838,6 +838,29 @@ def test_newones(self): n = t.new_ones((1, 2)) self.assertEqual(type(n), SubTensor2) + +class TestBroadcastAllOverride(TestCase): + """ test for gh-37141 """ + def test_broadcast_all(self): + from torch.distributions.utils import broadcast_all + a = torch.tensor([1.2, 3.4, 5.6]) + a_w = Wrapper(a) + b = torch.tensor(5.0) + b_w = Wrapper(b) + c = torch.tensor([5.0, 5.0, 5.0]) + + o_1 = broadcast_all(a_w, b_w) + self.assertTrue(isinstance(o_1[0], Wrapper)) + self.assertTrue(isinstance(o_1[1], Wrapper)) + self.assertEqual(o_1[0]._data, a) + self.assertEqual(o_1[1]._data, c) + + o_2 = broadcast_all(a_w, b) + self.assertTrue(isinstance(o_2[0], Wrapper)) + self.assertTrue(isinstance(o_2[1], Wrapper)) + self.assertEqual(o_2[0]._data, a) + self.assertEqual(o_2[1]._data, c) + class TestWrapTorchFunction(TestCase): def test_wrap_torch_function(self): class A: diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 05500f22c344..380b98785f6c 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -3,7 +3,7 @@ import torch import torch.nn.functional as F from typing import Dict, Any - +from torch.overrides import has_torch_function euler_constant = 0.57721566490153286060 # Euler Mascheroni Constant @@ -18,21 +18,23 @@ def broadcast_all(*values): values are scalars, then they are upcasted to scalar Tensors. 
Args: - values (list of `numbers.Number` or `torch.*Tensor`) + values (list of `numbers.Number`, `torch.*Tensor` or objects implementing __torch_function__) Raises: - ValueError: if any of the values is not a `numbers.Number` or - `torch.*Tensor` instance + ValueError: if any of the values is not a `numbers.Number` instance, + a `torch.*Tensor` instance, or an instance implementing __torch_function__ """ - if not all(isinstance(v, torch.Tensor) or isinstance(v, Number) for v in values): - raise ValueError('Input arguments must all be instances of numbers.Number or torch.tensor.') - if not all([isinstance(v, torch.Tensor) for v in values]): + if not all(isinstance(v, torch.Tensor) or has_torch_function((v,)) or isinstance(v, Number) + for v in values): + raise ValueError('Input arguments must all be instances of numbers.Number, ' + 'torch.Tensor or objects implementing __torch_function__.') + if not all([isinstance(v, torch.Tensor) or has_torch_function((v,)) for v in values]): options: Dict[str, Any] = dict(dtype=torch.get_default_dtype()) for value in values: if isinstance(value, torch.Tensor): options = dict(dtype=value.dtype, device=value.device) break - new_values = [v if isinstance(v, torch.Tensor) else torch.tensor(v, **options) + new_values = [v if isinstance(v, torch.Tensor) or has_torch_function((v,)) else torch.tensor(v, **options) for v in values] return torch.broadcast_tensors(*new_values) return torch.broadcast_tensors(*values) From bc2352e8c3b1120dc04decc5a2122eb76f3a7986 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Fri, 4 Dec 2020 08:08:27 -0800 Subject: [PATCH 048/132] [NNC] Complete SimpleIREvaluator support for bitwise ops (#48053) (#48179) Summary: Add missing types for bitwise_ops in `SimpleIREvaluator` This is the first part of fixes for issue https://github.com/pytorch/pytorch/issues/48053. 
- Original implementation of bitwise_ops supports only int operands, the fix adds support for all integral types supported by the IR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/48179
Test Plan: `python test/test_jit_fuser_te.py TestTEFuser.test_bitwise_ops`
Reviewed By: ZolotukhinM
Differential Revision: D25126944
Pulled By: penguinwu
fbshipit-source-id: 04dc7fc00c93b2bf1bd9f9cd09f7252357840b85
---
 c10/core/ScalarType.h            |  7 +++++
 test/test_jit_fuser_te.py        |  1 -
 torch/csrc/jit/tensorexpr/eval.h | 54 ++++++++++++++++++++++++++++----
 3 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h
index 6903cf9f77ce..29fa2020f684 100644
--- a/c10/core/ScalarType.h
+++ b/c10/core/ScalarType.h
@@ -115,6 +115,13 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
 #undef SPECIALIZE_CppTypeToScalarType
 
+#define AT_FORALL_INT_TYPES(_) \
+  _(uint8_t, Byte)             \
+  _(int8_t, Char)              \
+  _(int16_t, Short)            \
+  _(int, Int)                  \
+  _(int64_t, Long)
+
 #define AT_FORALL_SCALAR_TYPES(_) \
   _(uint8_t, Byte)                \
   _(int8_t, Char)                 \
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index b2d271909ec7..79a1a664e843 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -457,7 +457,6 @@ def f(x, y, z):
         ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False)
         self.assertAllFused(ge.graph_for(x, y, z))
 
-    @unittest.skipIf(not LLVM_ENABLED, "TODO: bugs in ir eval")
     def test_bitwise_ops(self):
         def apply(fn):
             return lambda x, y, z: fn(fn(x, y), z)
diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h
index c63d46e88797..7b8a4c194782 100644
--- a/torch/csrc/jit/tensorexpr/eval.h
+++ b/torch/csrc/jit/tensorexpr/eval.h
@@ -291,13 +291,14 @@ class SimpleIREvaluator : public CodeGen, public IRVisitor {
     return Value(result_v);
   }
 
+  template <typename T>
   Value bitwise_binary_op(
       const Value& lhs,
       const Value& rhs,
       IRNodeType op_type) {
-    std::vector<int> lhs_v = lhs.as_vec<int>();
-    std::vector<int> rhs_v = rhs.as_vec<int>();
-    std::vector<int> result_v(lhs_v.size());
+    std::vector<T> lhs_v = lhs.as_vec<T>();
+    std::vector<T> rhs_v = rhs.as_vec<T>();
+    std::vector<T> result_v(lhs_v.size());
     for (size_t i = 0; i < lhs_v.size(); i++) {
       switch (op_type) {
         case IRNodeType::kAnd:
@@ -309,6 +310,24 @@ class SimpleIREvaluator : public CodeGen, public IRVisitor {
         case IRNodeType::kXor:
           result_v[i] = lhs_v[i] ^ rhs_v[i];
           break;
+        default:
+          // TODO: change to a proper error report
+          throw std::runtime_error("invalid operator type");
+      }
+    }
+    return Value(result_v);
+  }
+
+  template <typename T>
+  Value shift_binary_op(
+      const Value& lhs,
+      const Value& rhs,
+      IRNodeType op_type) {
+    std::vector<T> lhs_v = lhs.as_vec<T>();
+    std::vector<T> rhs_v = rhs.as_vec<T>();
+    std::vector<T> result_v(lhs_v.size());
+    for (size_t i = 0; i < lhs_v.size(); i++) {
+      switch (op_type) {
         case IRNodeType::kLshift:
           result_v[i] = lhs_v[i] << rhs_v[i];
           break;
@@ -372,11 +391,34 @@ class SimpleIREvaluator : public CodeGen, public IRVisitor {
     if (lhs_v.dtype() != rhs_v.dtype()) {
       throw malformed_input("bad dtype in binary op", v);
     }
+
     IRNodeType expr_type = v->expr_type();
     if (expr_type == IRNodeType::kAnd || expr_type == IRNodeType::kOr ||
-        expr_type == IRNodeType::kXor || expr_type == IRNodeType::kLshift ||
-        expr_type == IRNodeType::kRshift) {
-      value_ = bitwise_binary_op(lhs_v, rhs_v, expr_type);
+        expr_type == IRNodeType::kXor) {
+      switch (lhs_v.dtype().scalar_type()) {
+#define TYPE_CASE(Type, Name)                                  \
+  case ScalarType::Name:                                       \
+    value_ = bitwise_binary_op<Type>(lhs_v, rhs_v, expr_type); \
+    
break; + AT_FORALL_INT_TYPES(TYPE_CASE); +#undef TYPE_CASE + case ScalarType::Bool: + value_ = bitwise_binary_op(lhs_v, rhs_v, expr_type); + break; + default: + throw unsupported_dtype(); + } + return; + } + + if (expr_type == IRNodeType::kLshift || expr_type == IRNodeType::kRshift) { + switch (lhs_v.dtype().scalar_type()) { + case ScalarType::Int: + value_ = shift_binary_op(lhs_v, rhs_v, expr_type); + break; + default: + throw unsupported_dtype(); + } return; } From 07d185ef051ec1c99d6ffe7856b80478703b7fe4 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Fri, 4 Dec 2020 08:35:57 -0800 Subject: [PATCH 049/132] [ROCm] add 3.10 docker image (#48791) Summary: Add a ROCm 3.10 docker image for CI. Keep the 3.9 image and remove the 3.8 image. Plan is to keep two ROCm versions at a time. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48791 Reviewed By: janeyx99 Differential Revision: D25307102 Pulled By: walterddr fbshipit-source-id: 88371aafd07db7c5d0dd210759bb7c3aac1f0187 --- .circleci/cimodel/data/simple/docker_definitions.py | 2 +- .circleci/config.yml | 6 +++--- .circleci/docker/build.sh | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index 9ba9fb0a8c0c..960bd2fbff85 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -29,8 +29,8 @@ "pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds "pytorch-linux-xenial-py3.6-gcc7.2", "pytorch-linux-xenial-py3.6-gcc7", - "pytorch-linux-bionic-rocm3.8-py3.6", "pytorch-linux-bionic-rocm3.9-py3.6", + "pytorch-linux-bionic-rocm3.10-py3.6", ] diff --git a/.circleci/config.yml b/.circleci/config.yml index e988ed7b53bf..b8e8aed96ae7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7272,12 +7272,12 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7" image_name: "pytorch-linux-xenial-py3.6-gcc7" - - docker_build_job: - name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" - image_name: "pytorch-linux-bionic-rocm3.8-py3.6" - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.9-py3.6" image_name: "pytorch-linux-bionic-rocm3.9-py3.6" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.10-py3.6" + image_name: "pytorch-linux-bionic-rocm3.10-py3.6" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc5_4_build requires: diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 150e2bb9f380..e01ca37d471d 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -274,19 +274,19 @@ case "$image" in VISION=yes KATEX=yes ;; - pytorch-linux-bionic-rocm3.8-py3.6) + pytorch-linux-bionic-rocm3.9-py3.6) ANACONDA_PYTHON_VERSION=3.6 PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=3.8 + ROCM_VERSION=3.9 ;; - pytorch-linux-bionic-rocm3.9-py3.6) + pytorch-linux-bionic-rocm3.10-py3.6) ANACONDA_PYTHON_VERSION=3.6 PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=3.9 + ROCM_VERSION=3.10 ;; *) # Catch-all for builds that are not hardcoded. 
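Going back to the `SimpleIREvaluator` change in the bitwise-ops patch above (#48179), here is a small TorchScript sketch of the kind of chained integer expression the evaluator can now handle for every integral dtype and for bool, not just `int`. It is illustrative only: whether the tensor-expression fuser or a fallback path actually executes the scripted function depends on the fuser configuration, but either way the result should match eager mode.

```python
import torch

@torch.jit.script
def chained_bitwise(x, y, z):
    # and/or/xor chained so the tensor-expression fuser can form one fused kernel
    return ((x & y) | z) ^ y

for dtype in (torch.bool, torch.uint8, torch.int8,
              torch.int16, torch.int32, torch.int64):
    hi = 2 if dtype == torch.bool else 8
    x, y, z = (torch.randint(0, hi, (64,)).to(dtype) for _ in range(3))
    eager = ((x & y) | z) ^ y
    for _ in range(3):          # warm-up iterations so profiling/fusion can kick in
        out = chained_bitwise(x, y, z)
    assert torch.equal(out, eager)
```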
From fadec77c301d21e04cbc1d9652f2cf29dc4e7572 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Fri, 4 Dec 2020 10:11:15 -0800 Subject: [PATCH 050/132] [quant][fx][graphmode] Renable torchvision test (#48602) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48602 Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D25224917 fbshipit-source-id: efc73f425253c4eb7ae51064b6760416097f0437 --- test/quantization/test_quantize_fx.py | 18 ++++++------------ torch/quantization/quantize_fx.py | 4 ++-- torch/testing/_internal/common_distributed.py | 18 ++++++++++-------- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 27064c41805a..7e4048b98cbf 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -2103,15 +2103,14 @@ def _test_model_impl( original_out = model(input_value) is_not_tuple_out = not isinstance(original_out, tuple) script_out = script(input_value) - self.assertEqual( - (original_out - script_out).abs().max(), 0, - 'Reslut of original graph module and script module does not match') # set to train just before quantization + prepare_fx_fn = prepare_fx if mode != 'static': model.train() + prepare_fx_fn = prepare_qat_fx - prepared = prepare_fx(model, qconfig_dict) + prepared = prepare_fx_fn(model, qconfig_dict) if mode == 'ddp': mp.spawn(run_ddp, @@ -2207,15 +2206,11 @@ def get_available_classification_models(models): quantized_model_list = set(quantized_model_list) - no_pretrained_model # test eager and graph consistency model_list = quantized_model_list - # slice need to be fixed in symbolic tracing(https://github.com/pytorch/pytorch/issues/43511) - model_list = set(model_list) - {'googlenet', 'inception_v3'} - # getattr should not be used as node name(https://github.com/pytorch/pytorch/issues/43522) - model_list -= {'shufflenet_v2_x1_0', 'mobilenet_v2'} - + # inception_v3 is not symbolically traceable: https://github.com/pytorch/pytorch/issues/48813 + model_list = set(model_list) - {'inception_v3'} # mobilenet: dropout error RuntimeError: "bernoulli_scalar_cpu_" not implemented for 'QUInt8' # incpetion_v3: looks like there is some problem with AuxLogits - quantized_not_working = [('qat', 'mobilenet_v2'), - ('qat', 'inception_v3'), + quantized_not_working = [('qat', 'inception_v3'), ('static', 'inception_v3')] fx_eager_not_matching = ['googlenet', # because _transform_input is not quantized in eager @@ -2257,7 +2252,6 @@ def print_diffs(diffs): @skip_if_no_torchvision @skip_if_not_multigpu @skipIfNoFBGEMM - @unittest.skip('TODO: not working yet due to https://github.com/pytorch/pytorch/issues/43513') def test_resnet18_ddp(self): from torchvision import models from torchvision.models import quantization as quantized_models diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index ba1f58af402e..77f598e1acf9 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -254,7 +254,7 @@ def calibrate(model, data_loader): ``` """ torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx") - assert not model.training, 'prepare_fx only works for models in' + \ + assert not model.training, 'prepare_fx only works for models in ' + \ 'eval mode' return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) @@ -291,7 +291,7 @@ def train_loop(model, train_data): ``` """ torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx") - 
assert model.training, 'prepare_qat_fx only works for models in ' + \ + assert model.training, 'prepare_qat_fx only works for models in ' + \ 'train mode' return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict) diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index 2a1e2b61b3eb..e44d5df09258 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -63,15 +63,17 @@ def wrapper(*args, **kwargs): def skip_if_not_multigpu(func): """Multi-GPU tests requires at least 2 GPUS. Skip if this is not met.""" - @wraps(func) - def wrapper(*args, **kwargs): - if torch.cuda.is_available() and torch.cuda.device_count() >= 2: - return func(*args, **kwargs) - message = "Need at least {} CUDA devices".format(2) - TEST_SKIPS["multi-gpu"] = TestSkip(75, message) - sys.exit(TEST_SKIPS['multi-gpu'].exit_code) + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + if torch.cuda.is_available() and torch.cuda.device_count() >= 2: + return func(*args, **kwargs) + message = "Need at least {} CUDA devices".format(2) + TEST_SKIPS["multi-gpu"] = TestSkip(75, message) + sys.exit(TEST_SKIPS['multi-gpu'].exit_code) + return wrapper - return wrapper + return decorator def require_n_gpus_for_nccl_backend(n, backend): def decorator(func): From a5fb12d168125fd228c4b36ef08a3ad4904ac457 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Fri, 4 Dec 2020 11:31:36 -0800 Subject: [PATCH 051/132] RRef proxy support for ScriptModule methods (#48339) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48339 Closes https://github.com/pytorch/pytorch/issues/48294 https://github.com/pytorch/pytorch/pull/48293 added creation and transfer of ScriptModule over RPC in python, but it did not work with ScriptModule. This PR makes the above work with ScriptModule as per a discussion with mrshenli: 1) We remove the `hasattr()` check and just let Python throw the exception as it would when accessing the py function with `getattr` 2) We condition on `issubclass(type, ScriptModule)` when checking if it is wrapped with async_function, because `ScriptModule` does not have getattr implemented (this is because ScriptModule forward/function is not a python function, it is a torchscript specific function): ``` torch/jit/_script.py", line 229, in __get__ return self.__getattr__("forward") # type: ignore AttributeError: '_CachedForward' object has no attribute '__getattr__' ``` ghstack-source-id: 117631795 Test Plan: Modified ut Reviewed By: wanchaol Differential Revision: D25134423 fbshipit-source-id: 918ca88891c7b0531325f046b61f28947575cff0 --- torch/distributed/rpc/rref_proxy.py | 12 +++++------ .../_internal/distributed/rpc/jit/rpc_test.py | 20 +++++++++---------- .../_internal/distributed/rpc/rpc_test.py | 8 ++++---- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/torch/distributed/rpc/rref_proxy.py b/torch/distributed/rpc/rref_proxy.py index 17ce9da643b9..f087514d92a8 100644 --- a/torch/distributed/rpc/rref_proxy.py +++ b/torch/distributed/rpc/rref_proxy.py @@ -15,13 +15,11 @@ def _invoke_rpc(rref, rpc_api, func_name, *args, **kwargs): rref_type = rref._get_type() _invoke_func = _local_invoke - if rref_type is not torch._C.ScriptModule: - if not hasattr(rref_type, func_name): - raise ValueError( - f"Function {func_name} is not an attribute of type {rref_type} " - f"referenced by RRef {rref}." - ) - + # Bypass ScriptModules when checking for async function attribute. 
+ bypass_type = issubclass(rref_type, torch.jit.ScriptModule) or issubclass( + rref_type, torch._C.ScriptModule + ) + if not bypass_type: func = getattr(rref_type, func_name) if hasattr(func, "_wrapped_async_rpc_function"): _invoke_func = _local_invoke_async_execution diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py index 95d2ca860afd..2a0b114f2b8a 100644 --- a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py @@ -21,9 +21,6 @@ RpcAgentTestFixture, ) -def run(rref, func_name, args, kwargs): - return getattr(rref.local_value(), func_name)(*args, **kwargs) - def rref_isinstance(rref, cls_to_check): return isinstance(rref.local_value(), cls_to_check) @@ -362,6 +359,10 @@ def __init__(self, rank): def forward(self) -> Tensor: return self.a + @torch.jit.script_method + def custom_func(self) -> Tensor: + return self.a + def owner_create_rref_my_script_class(a): return rpc.RRef(MyScriptClass(a)) @@ -973,20 +974,19 @@ def test_create_script_module_on_remote(self): ) self.assertTrue(remote_end_is_script) # Run forward pass remotely. - # TODO: make RRef helper work with ScriptModule. - remote_forward_output = rpc.rpc_sync( - remote_script_module.owner(), - run, - args=(remote_script_module, "forward", (), {}), - ) + remote_forward_output = remote_script_module.rpc_sync().forward() self.assertEqual(remote_forward_output, torch.ones(self.rank)) + # Run function defined on ScriptModule remotely. + remote_func_output = remote_script_module.rpc_sync().custom_func() + self.assertEqual(remote_func_output, torch.ones(self.rank)) # Ensure we can transfer ScriptModule RRef to this rank and run # forward pass. local_script_module = remote_script_module.to_here() self.assertTrue(isinstance(local_script_module, torch.jit.ScriptModule)) rank_ones_tensor = local_script_module() self.assertEqual(rank_ones_tensor, torch.ones(self.rank)) - + local_script_func_output = local_script_module.custom_func() + self.assertEqual(local_script_func_output, torch.ones(self.rank)) @dist_init def test_load_script_module_with_pickled_rref(self): diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index ba7f0d650b22..46dbacc3c2eb 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -606,14 +606,14 @@ def test_self_remote_rref_as_remote_arg(self): def test_rref_proxy_non_exist(self): dst = worker_name((self.rank + 1) % self.world_size) rref = rpc.remote(dst, my_function, args=(torch.ones(2, 2), 1, 3)) - msg = "non_exist is not an attribute of type" - with self.assertRaisesRegex(ValueError, msg): + msg = "has no attribute \'non_exist\'" + with self.assertRaisesRegex(AttributeError, msg): rref.rpc_sync().non_exist() - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(AttributeError, msg): rref.rpc_async().non_exist() - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(AttributeError, msg): rref.remote().non_exist() def _test_rref_proxy_tensor(self, dst): From 9af627fda10e4332bd7bd41385f7d0165d0e59d0 Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Fri, 4 Dec 2020 12:16:29 -0800 Subject: [PATCH 052/132] fix some typos in the fx ir test_fx_experiemntal (#48847) Summary: fix some typos in test_fx_experimental.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/48847 Reviewed By: malfet, gcatron 
Differential Revision: D25339391 Pulled By: scottxu0730 fbshipit-source-id: 388d9da94259d2b306d59f3f4a167e486ac06d60 --- test/test_fx_experimental.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 2e3205578320..201680cec4bd 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -131,7 +131,7 @@ def forward(self, a, b): devices = [ Device("dev_0", 125, 0), Device("dev_1", 125, 1), - Device("dev_2", 125, 2), + Device("dev_2", 125, 2) ] partitioner_config = PartitionerConfig(devices) ret = partitioner.partition_graph(traced, m, partitioner_config) @@ -172,12 +172,12 @@ class TestModule(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(4, 4) + self.c = torch.rand(4) def forward(self, a, b): add_1 = a + b linear = self.linear(add_1) - e = torch.rand(4) - add_2 = linear + e + add_2 = linear + self.c return add_2 m = TestModule() @@ -189,9 +189,9 @@ def forward(self, a, b): devices = [ Device("dev_0", 125, 0), Device("dev_1", 125, 1), - Device("dev_2", 125, 2), + Device("dev_2", 125, 2) ] - partitioner_config = PartitionerConfig(devices) + partitioner_config = PartitionerConfig(devices, PartitionMode.size_based) ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules dag = ret.dag @@ -219,7 +219,7 @@ def forward(self, a): graph_manipulation.get_size_of_all_nodes(traced, [a]) partitioner = Partitioner() devices = [Device("dev_0", 120, 0), Device("dev_1", 160, 1)] - partitioner_config = PartitionerConfig(devices) + partitioner_config = PartitionerConfig(devices, PartitionMode.size_based) ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules dag = ret.dag @@ -282,7 +282,7 @@ def forward(self, a, b, offset): devices = [ Device("dev_0", 33000000, 0), Device("dev_1", 33000000, 1), - Device("dev_2", 33000000, 2), + Device("dev_2", 33000000, 2) ] partitioner_config = PartitionerConfig(devices, PartitionMode.sparse_nn) partitioner = Partitioner() From 31808dcdd8ec2c10699e6df7fd305f724c9ece8b Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 4 Dec 2020 12:33:13 -0800 Subject: [PATCH 053/132] [RELAND] [CUDA graphs] Make CUDAGeneratorImpl capturable (ci-all edition) (#48694) Summary: Resubmission of https://github.com/pytorch/pytorch/pull/47989 with attempted fix for the unexpected context creation that caused revert (https://github.com/pytorch/pytorch/pull/47989#issuecomment-736689145). Submitting from a ci-all branch because the failing test isn't public. Diffs relative to master should be the same as https://github.com/pytorch/pytorch/pull/47989 's approved diffs, aside from the fix itself https://github.com/pytorch/pytorch/pull/48688/commits/a5c80f63d3aae66d691bbafc726615e9be8e68be. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48694 Reviewed By: mruberry Differential Revision: D25291431 Pulled By: ngimel fbshipit-source-id: 8c27f85c64eecaf1f5cb925020fa6d38a07ff095 --- BUILD.bazel | 1 + aten/src/ATen/CUDAGeneratorImpl.h | 121 ++++++++++++++++++ aten/src/ATen/core/Generator.h | 6 + aten/src/ATen/cuda/CUDAApplyUtils.cuh | 21 ++- aten/src/ATen/cuda/CUDAGeneratorImpl.cpp | 103 +++++++++++++-- aten/src/ATen/cuda/CUDAGraphsUtils.cuh | 97 ++++++++++++++ .../ATen/native/cuda/DistributionTemplates.h | 66 +++++----- aten/src/ATen/native/cuda/Distributions.cu | 66 +++++----- aten/src/ATen/native/cuda/Dropout.cu | 56 ++++---- .../src/ATen/native/cuda/MultinomialKernel.cu | 16 ++- aten/src/THCUNN/RReLU.cu | 10 +- aten/src/THCUNN/generic/RReLU.cu | 4 +- 12 files changed, 450 insertions(+), 117 deletions(-) create mode 100644 aten/src/ATen/cuda/CUDAGraphsUtils.cuh diff --git a/BUILD.bazel b/BUILD.bazel index 621494b3dc7b..76afe6aec1ea 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -458,6 +458,7 @@ filegroup( name = "aten_srcs_cu", srcs = [ "aten/src/ATen/cuda/detail/IndexUtils.cu.cc", + "aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc", "aten/src/ATen/native/cuda/Activation.cu.cc", "aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc", "aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc", diff --git a/aten/src/ATen/CUDAGeneratorImpl.h b/aten/src/ATen/CUDAGeneratorImpl.h index 57ace5f63bcc..ec83128c7013 100644 --- a/aten/src/ATen/CUDAGeneratorImpl.h +++ b/aten/src/ATen/CUDAGeneratorImpl.h @@ -2,10 +2,122 @@ #include #include +#include +#include +#include // TODO: this file should be in ATen/cuda, not top level namespace at { +/** + * Note [CUDA Graph-safe RNG states] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Strategy: + * ~~~~~~~~~ + * A CUDA graph containing multiple RNG ops behaves like a + * single giant kernel from the perspective of ops external + * to the graph. During graph capture, logic below records + * the total of all offset increments that occur in the graphed + * region, and records the final total as the offset for the + * entire graph. + * + * When the graph reruns, the logic that reruns it + * increments this device's CUDA generator's offset + * by that total. + * + * Meanwhile, within the graph, at capture time, instead of + * populating PhiloxCudaStates with the uint64_t offset pulled + * directly from the global state, PhiloxCudaState instead + * holds a pointer to one-element stream-local int64_t device tensor + * holding an initial offset value, and a uint64_t holding an + * intra-graph offset. (The intra-graph offset starts from zero + * when capture begins.) In each consumer kernel, + * at::cuda::philox::unpack computes the offset to use for this kernel + * as intra-graph offset + *initial offset. + * + * When the graph reruns, the logic that reruns it first + * fill_s the initial offset tensor with this device's + * CUDA generator's current offset. + * + * The control flow above ensures graphed execution is bitwise + * identical to eager execution as long as RNG ops are enqueued + * from a single thread, even if RNG ops and graphs containing + * RNG ops are enqueued and run simultaneously on multiple streams. + * + * Usage: + * ~~~~~~ + * PhiloxCudaState in this file, and unpack() in + * cuda/CUDAGraphsUtils.cuh allow non-divergent use of + * CUDAGeneratorImpl whether graph capture is underway or not. + * + * Each PhiloxCudaState instance should be used for one and only one + * consumer kernel. + * + * Example (see e.g. 
native/cuda/Dropout.cu): + * + * #include + * #include + * + * __global__ void kernel(..., PhiloxCudaState philox_args) { + * auto seeds = at::cuda::philox::unpack(philox_args); + * IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + * curandStatePhilox4_32_10_t state; + * curand_init(std::get<0>(seeds), // seed + * idx, // per-thread subsequence + * std::get<1>(seeds), // offset in subsequence + * &state); + * ... + * } + * + * host_caller(...) { + * PhiloxCudaState rng_engine_inputs; + * { + * // See Note [Acquire lock when using random generators] + * std::lock_guard lock(gen->mutex_); + * + * // gen could be HostState or DevState here! No divergent code needed! + * rng_engine_inputs = gen->philox_cuda_state(offset_increment); + * } + * kernel<<<...>>>(..., rng_engine_inputs); + * } + * + */ + + +// Stores state values. Passed as a kernel argument. See "Usage:" above. +struct PhiloxCudaState { + PhiloxCudaState() = default; + PhiloxCudaState(const PhiloxCudaState&) = default; + // Called if graph capture is not underway + PhiloxCudaState(uint64_t seed, + uint64_t offset) { + seed_ = seed; + offset_.val = offset; + } + // Called if graph capture is underway + PhiloxCudaState(uint64_t seed, + int64_t* offset_extragraph, + uint32_t offset_intragraph) { + seed_ = seed; + offset_.ptr = offset_extragraph; + offset_intragraph_ = offset_intragraph; + captured_ = true; + } + + // Public members, directly accessible by at::cuda::philox::unpack. + // If we made them private with getters/setters, the getters/setters + // would have to be __device__, and we can't declare __device__ in ATen. + union Payload { + uint64_t val; + int64_t* ptr; + }; + + uint64_t seed_; + Payload offset_; + uint32_t offset_intragraph_; + bool captured_ = false; +}; struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl { // Constructors @@ -19,13 +131,22 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl { uint64_t seed() override; void set_philox_offset_per_thread(uint64_t offset); uint64_t philox_offset_per_thread(); + void graph_prologue(int64_t* offset_extragraph); + uint64_t graph_epilogue(); + PhiloxCudaState philox_cuda_state(uint64_t increment); + + // Temporarily accommodates call sites that use philox_engine_inputs. + // Allows incremental refactor of call sites to use philox_cuda_state. std::pair philox_engine_inputs(uint64_t increment); + static DeviceType device_type(); private: CUDAGeneratorImpl* clone_impl() const override; uint64_t seed_ = default_rng_seed_val; uint64_t philox_offset_per_thread_ = 0; + int64_t* offset_extragraph_; + uint32_t offset_intragraph_ = 0; }; namespace cuda { diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 06e22d1a6b1f..62f43fd2fef7 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -13,6 +13,12 @@ #include #include #include + +// For the record I don't think this is a correct pimpl idiom. +// Including Impl header in interface header defeats the purpose +// because you can't change Impl private members without forcing +// everything that included the interface to rebuild. +// Impl should be forward-declared in the interface header instead. 
#include /** diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 3e4ea5a2b3c2..6810b51d3f70 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -356,9 +356,11 @@ template + int step, + int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK, + int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM> #if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) #endif __global__ void kernelPointwiseApply2(detail::TensorInfo a, @@ -400,7 +402,9 @@ inline dim3 getApplyBlock() { return dim3(AT_APPLY_THREADS_PER_BLOCK); } -template +template inline bool CUDA_tensor_apply2(at::Tensor a, at::Tensor b, const Op op, @@ -463,7 +467,9 @@ inline bool CUDA_tensor_apply2(at::Tensor a, kernelPointwiseApply2 \ + TYPE, A, B, step, \ + max_threads_per_block, \ + min_blocks_per_sm> \ <<>>( \ aInfo, bInfo, static_cast(totalElements), op); @@ -549,13 +555,16 @@ inline bool CUDA_tensor_apply2(at::Tensor a, } /* Provides default step = 1 to CUDA_tensor_apply2. */ -template +template inline bool CUDA_tensor_apply2(at::Tensor a, at::Tensor b, const Op op, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly) { - return CUDA_tensor_apply2(a, b, op, aType, bType); + return CUDA_tensor_apply2(a, b, op, aType, bType); } } // cuda diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index ea7c015499ea..f0db9014163a 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -1,10 +1,15 @@ +#include #include +#include +#include #include #include namespace at { +namespace cuda { +namespace detail { -namespace cuda { namespace detail { +namespace { // Ensures we only call cudaGetDeviceCount only once. static std::once_flag num_gpu_init_flag; @@ -18,7 +23,7 @@ static std::deque cuda_gens_init_flag; // Default, global CUDA generators, one per GPU. static std::vector default_gens_cuda; -/* +/* * Populates the global variables related to CUDA generators * Warning: this function must only be called once! */ @@ -28,6 +33,8 @@ static void initCUDAGenVector(){ default_gens_cuda.resize(num_gpus); } +} // anonymous namespace + /** * PyTorch maintains a collection of default generators that get * initialized once. The purpose of these default generators is to @@ -71,39 +78,55 @@ Generator createCUDAGenerator(DeviceIndex device_index) { } // namespace detail } // namespace cuda + /** * CUDAGeneratorImpl class implementation */ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index) : c10::GeneratorImpl{Device(DeviceType::CUDA, device_index), - DispatchKeySet(c10::DispatchKey::CUDA)} { } + DispatchKeySet(c10::DispatchKey::CUDA)} { + at::cuda::assertNotCapturing("Cannot construct a new CUDAGeneratorImpl"); +} /** * Sets the seed to be used by curandStatePhilox4_32_10 * Resets the philox_offset_per_thread_ to 0 - * + * * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_current_seed"); seed_ = seed; philox_offset_per_thread_ = 0; } +#define CAPTURE_DEFAULT_GENS_MSG \ +"Non-default (user-constructed) CUDA RNG generators cannot be used " \ +"in regions captured by CUDA graphs. " \ +"If you need a non-default CUDA generator in a captured region, " \ +"please file an issue." 
+ /** * Gets the current seed of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::current_seed() const { + TORCH_CHECK((at::cuda::currentStreamCaptureStatus() == + at::cuda::CaptureStatus::None) || + ((void*)this == + (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index())), + CAPTURE_DEFAULT_GENS_MSG); return seed_; } /** * Gets a nondeterministic random number from /dev/urandom or time, * seeds the CPUGeneratorImpl with it and then returns that number. - * + * * FIXME: You can move this function to Generator.cpp if the algorithm * in getNonDeterministicRandom is unified for both CPU and CUDA */ uint64_t CUDAGeneratorImpl::seed() { + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::seed"); auto random = c10::detail::getNonDeterministicRandom(true); this->set_current_seed(random); return random; @@ -111,10 +134,11 @@ uint64_t CUDAGeneratorImpl::seed() { /** * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10 - * + * * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_philox_offset_per_thread"); philox_offset_per_thread_ = offset; } @@ -122,28 +146,80 @@ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() { + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread"); return philox_offset_per_thread_; } +/** + * Prepares this instance for a cuda graph capture region. + * offset_extragraph is the initial offset at the start of the graphed region. + * offset_intragraph tracks the offset in the graphed region. + */ +void CUDAGeneratorImpl::graph_prologue(int64_t* offset_extragraph) { + TORCH_CHECK((void*)this == + (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index()), + CAPTURE_DEFAULT_GENS_MSG); + offset_extragraph_ = offset_extragraph; + offset_intragraph_ = 0; +} + +/** + * Finalizes a cuda graph capture region for this instance. + */ +uint64_t CUDAGeneratorImpl::graph_epilogue() { + TORCH_CHECK((void*)this == + (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index()), + CAPTURE_DEFAULT_GENS_MSG); + return offset_intragraph_; +} + /** * Gets the seed and philox offset value to be used in - * curandStatePhilox4_32_10 - * + * curandStatePhilox4_32_10, in an opaque PhiloxCudaState that's safe + * and can be used non-divergently in callers whether CUDA graph + * capture is underway or not. See + * Note [CUDA Graph-safe RNG states] + * * Each kernel using philox has to sensibly increment offset * for future users of philox. So it gets the "old" value for * itself (before add), and tells subsequent users which offset * they should use, since only the kernel knows how many randoms - * it intends to generate. - * + * it intends to generate. + * * Increment should be at least the number of curand() random numbers used in * each thread. It is the user's responsibility to make sure that the increment * for philox is never smaller than the number of curand() calls. Increment * value > the number of curand() calls won't harm but anything less would mean * that you would be reusing random values from previous calls. 
- * + * * See Note [Acquire lock when using random generators] */ +PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { + if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { + TORCH_CHECK((void*)this == + (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index()), + CAPTURE_DEFAULT_GENS_MSG); + uint32_t offset = this->offset_intragraph_; + TORCH_INTERNAL_ASSERT(this->offset_intragraph_ <= + std::numeric_limits::max() - increment); + this->offset_intragraph_ += increment; + return PhiloxCudaState(this->seed_, + this->offset_extragraph_, + offset); + } else { + uint64_t offset = this->philox_offset_per_thread_; + this->philox_offset_per_thread_ += increment; + return PhiloxCudaState(this->seed_, offset); + } +} + +/** + * Temporarily accommodates call sites that use philox_engine_inputs. + * Allows incremental refactor of call sites to use philox_cuda_state. + */ std::pair CUDAGeneratorImpl::philox_engine_inputs(uint64_t increment) { + at::cuda::assertNotCapturing("Refactor this op to use CUDAGeneratorImpl::philox_cuda_state. " + "Cannot call CUDAGeneratorImpl::philox_engine_inputs"); uint64_t offset = this->philox_offset_per_thread_; this->philox_offset_per_thread_ += increment; return std::make_pair(this->seed_, offset); @@ -159,7 +235,7 @@ DeviceType CUDAGeneratorImpl::device_type() { /** * Public clone method implementation - * + * * See Note [Acquire lock when using random generators] */ std::shared_ptr CUDAGeneratorImpl::clone() const { @@ -168,10 +244,11 @@ std::shared_ptr CUDAGeneratorImpl::clone() const { /** * Private clone method implementation - * + * * See Note [Acquire lock when using random generators] */ CUDAGeneratorImpl* CUDAGeneratorImpl::clone_impl() const { + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::clone_impl"); auto gen = new CUDAGeneratorImpl(this->device().index()); gen->set_current_seed(this->seed_); gen->set_philox_offset_per_thread(this->philox_offset_per_thread_); diff --git a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh new file mode 100644 index 000000000000..4b2d09ad74d4 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh @@ -0,0 +1,97 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { +namespace cuda { +namespace philox { + +// We can't write a __device__ function in CUDAGeneratorImpl.h, because it's in ATen. +// Also, whatever call unpacks PhiloxCudaState in consumer kernels must be inlineable. +// Easiest thing that comes to mind is, define a free function here, in ATen/cuda. +// Any cuda consumer can include this header. +__device__ __forceinline__ std::tuple +unpack(at::PhiloxCudaState arg) { + if (arg.captured_) { + return std::make_tuple(arg.seed_, *(arg.offset_.ptr) + arg.offset_intragraph_); + } else { + return std::make_tuple(arg.seed_, arg.offset_.val); + } +} + +} // namespace philox + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +// Protects against enum cudaStreamCaptureStatus implementation changes. +// Some compilers seem not to like static_assert without the messages. 
+static_assert(int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) == 0, + "unexpected int(cudaStreamCaptureStatusNone) value"); +static_assert(int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive) == 1, + "unexpected int(cudaStreamCaptureStatusActive) value"); +static_assert(int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) == 2, + "unexpected int(cudaStreamCaptureStatusInvalidated) value"); +#endif + +enum class CaptureStatus: int { + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone), + Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive), + Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) + #else + None = 0 + #endif +}; + +inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) { + switch(status) { + case CaptureStatus::None: + os << "cudaStreamCaptureStatusNone"; + break; + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + case CaptureStatus::Active: + os << "cudaStreamCaptureStatusActive"; + break; + case CaptureStatus::Invalidated: + os << "cudaStreamCaptureStatusInvalidated"; + break; + #endif + default: + TORCH_INTERNAL_ASSERT(false, + "Unknown CUDA graph CaptureStatus", + int(status)); + } + return os; +} + +inline CaptureStatus currentStreamCaptureStatus() { + #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + // don't create a context if we don't have to + if (at::detail::getCUDAHooks().hasPrimaryContext(c10::cuda::current_device())) { + cudaStreamCaptureStatus is_capturing; + AT_CUDA_CHECK(cudaStreamIsCapturing(at::cuda::getCurrentCUDAStream(), + &is_capturing)); + return CaptureStatus(is_capturing); + } else { + return CaptureStatus::None; + } + #else + return CaptureStatus::None; + #endif +} + +inline void assertNotCapturing(std::string attempt) { + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + attempt, + " during CUDA graph capture. If you need this call to be captured, " + "please file an issue. 
" + "Current cudaStreamCaptureStatus: ", + status); +} + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 1cf107c171f4..8cfc6c10f1ba 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -62,16 +63,17 @@ std::tuple calc_execution_policy(int64_t total_elements) { template C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound) __global__ void distribution_elementwise_grid_stride_kernel(int numel, - std::pair seeds, + PhiloxCudaState philox_args, const dist_t dist_func, const transform_t transform_func) { + auto seeds = at::cuda::philox::unpack(philox_args); int idx = blockIdx.x * blockDim.x + threadIdx.x; curandStatePhilox4_32_10_t state; - curand_init( - seeds.first, - idx, - seeds.second, - &state); + curand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); + int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) * blockDim.x * gridDim.x * unroll_factor; for(int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) { @@ -123,11 +125,11 @@ void distribution_nullary_kernel(at::TensorIterator& iter, auto counter_offset = std::get<0>(execution_policy); auto grid = std::get<1>(execution_policy); auto block = std::get<2>(execution_policy); - std::pair rng_engine_inputs; + PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(counter_offset); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); } if (!iter.can_use_32bit_indexing()) { @@ -174,12 +176,14 @@ template seeds, + PhiloxCudaState philox_args, typename function_traits::result_type *output_data, const typename function_traits::template arg<1>::type *input_data_1, const typename function_traits::template arg<2>::type *input_data_2, inp_offset_calc_t inp_calc, out_offset_calc_t out_calc) { + auto seeds = at::cuda::philox::unpack(philox_args); + using input_t_1 = typename function_traits::template arg<1>::type; using input_t_2 = typename function_traits::template arg<2>::type; @@ -190,7 +194,10 @@ __global__ void distribution_binary_elementwise_kernel( int remaining = std::min(numel - base_index, BLOCK_WORK_SIZE); curandStatePhilox4_32_10_t state; - curand_init(seeds.first, blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state); + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); // load data into registers int thread_idx = threadIdx.x; @@ -222,7 +229,7 @@ __global__ void distribution_binary_elementwise_kernel( } template -void distribution_binary_kernel(TensorIterator &iter, std::pair seeds, const func_t &f) { +void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_args, const func_t &f) { static_assert(std::is_same::template arg<0>::type, curandStatePhilox4_32_10_t&>::value, "the first argument of functor must be curandStatePhilox4_32_10_t"); using input_t_1 = typename function_traits::template arg<1>::type; using input_t_2 = typename function_traits::template arg<2>::type; @@ -230,7 +237,7 @@ void distribution_binary_kernel(TensorIterator &iter, std::pair>>( - numel, f, seeds, output_data, input_data_1, input_data_2, + numel, f, philox_args, output_data, input_data_1, input_data_2, 
TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); } else { distribution_binary_elementwise_kernel<<>>( - numel, f, seeds, output_data, input_data_1, input_data_2, + numel, f, philox_args, output_data, input_data_1, input_data_2, make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); } } @@ -570,20 +577,17 @@ struct CauchyKernel { template void bernoulli_tensor_cuda_kernel( at::Tensor& ret, const at::Tensor& p, - std::pair seeds) { - // The template argument `4` below indicates that we want to operate on four - // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. - at::cuda::CUDA_tensor_apply2( - ret, p, - [seeds] __device__( + PhiloxCudaState philox_args) { + auto functor = [philox_args] __device__( int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) { + auto seeds = at::cuda::philox::unpack(philox_args); curandStatePhilox4_32_10_t state; - curand_init( - seeds.first, - blockIdx.x * blockDim.x + threadIdx.x, - seeds.second, - &state); + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + // See Note [Register spilling in curand call for CUDA < 10] float4 rand = curand_uniform4(&state); switch (n) { @@ -607,17 +611,21 @@ void bernoulli_tensor_cuda_kernel( v1 = static_cast(rand.x <= p1); } } - } - ); + }; + // The template argument `4` below indicates that we want to operate on four + // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. + at::cuda::CUDA_tensor_apply2(ret, p, functor); } template void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { - std::pair rng_engine_inputs; + PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(10); + rng_engine_inputs = gen->philox_cuda_state(10); } auto p = std::get<0>(expand_inplace(self, p_.to(kCUDA))); AT_DISPATCH_ALL_TYPES_AND3( diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index 90b7644abfe3..dd09efc9e719 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -49,20 +49,20 @@ template void poisson_cuda_kernel( at::Tensor& ret, const at::Tensor& lambda, - std::pair seeds) { - at::cuda::CUDA_tensor_apply2( - ret, - lambda, - [seeds] __device__( + at::PhiloxCudaState philox_args) { + auto functor = [philox_args] __device__( scalar_t & ret_val, const scalar_t& lambda) { + auto seeds = at::cuda::philox::unpack(philox_args); curandStatePhilox4_32_10_t state; - curand_init( - seeds.first, - blockIdx.x * blockDim.x + threadIdx.x, - seeds.second, - &state); + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); ret_val = static_cast(curand_poisson(&state, lambda)); - }); + }; + at::cuda::CUDA_tensor_apply2(ret, lambda, functor); } struct curand_uniform_wrapper { @@ -82,7 +82,7 @@ void binomial_cuda_kernel( at::Tensor& ret, const at::Tensor& count, const at::Tensor& prob, - std::pair seeds) { + at::PhiloxCudaState philox_args) { using accscalar_t = at::acc_type; at::TensorIterator iter = at::TensorIteratorConfig() .add_output(ret) @@ -90,8 +90,8 @@ void binomial_cuda_kernel( .add_input(prob) .build(); - at::native::distribution_binary_kernel(iter, seeds, - [seeds] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) { + 
at::native::distribution_binary_kernel(iter, philox_args, + [philox_args] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) { #if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) auto uniform_lambda = curand_uniform_wrapper(state); BaseSampler standard_uniform(uniform_lambda); @@ -108,19 +108,16 @@ template void gamma_cuda_kernel( at::Tensor& ret, const at::Tensor& alpha, - std::pair seeds) { + at::PhiloxCudaState philox_args) { using accscalar_t = at::acc_type; - at::cuda::CUDA_tensor_apply2( - ret, - alpha, - [seeds] __device__( + auto functor = [philox_args] __device__( scalar_t & ret_val, const scalar_t& alpha) { + auto seeds = at::cuda::philox::unpack(philox_args); curandStatePhilox4_32_10_t state; - curand_init( - seeds.first, - blockIdx.x * blockDim.x + threadIdx.x, - seeds.second, - &state); + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); auto uniform_lambda = [&state] __device__ () { return curand_uniform(&state); @@ -134,7 +131,10 @@ void gamma_cuda_kernel( auto sample = sample_gamma(alpha, standard_uniform, standard_normal); auto min_value = std::numeric_limits::min(); ret_val = (min_value > sample) ? min_value : sample; - }); + }; + at::cuda::CUDA_tensor_apply2(ret, alpha, functor); } template @@ -164,11 +164,11 @@ namespace at { namespace native { Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); - std::pair rng_engine_inputs; + PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(20); + rng_engine_inputs = gen->philox_cuda_state(20); } Tensor ret = at::empty(lambda.sizes(), lambda.options()); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "poisson_cuda", [&] { @@ -179,11 +179,11 @@ Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); - std::pair rng_engine_inputs; + PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(42); + rng_engine_inputs = gen->philox_cuda_state(42); } Tensor ret = at::empty(count.sizes(), count.options()); AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "binomial_cuda", [&] { @@ -194,11 +194,11 @@ Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); - std::pair rng_engine_inputs; + PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(10); + rng_engine_inputs = gen->philox_cuda_state(10); } Tensor ret = at::empty(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "gamma_cuda", [&] { @@ -209,11 +209,11 @@ Tensor _s_gamma_cuda(const Tensor& alpha, c10::optional gen_) { Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); - std::pair rng_engine_inputs; + PhiloxCudaState 
rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(10); + rng_engine_inputs = gen->philox_cuda_state(10); } Tensor ret = at::empty(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "dirichlet", [&] { diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index d41ecd119067..67adbaabbb84 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -30,31 +31,31 @@ template < int ADims, int VEC> #if __CUDA_ARCH__ >= 350 -C10_LAUNCH_BOUNDS_2(256, 8) +C10_LAUNCH_BOUNDS_2(256, 4) #elif defined (__HIP_PLATFORM_HCC__) C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, - at::cuda::detail::TensorInfo b, - at::cuda::detail::TensorInfo c, - IndexType totalElements, accscalar_t p, std::pair seeds - ) { - + at::cuda::detail::TensorInfo b, + at::cuda::detail::TensorInfo c, + IndexType totalElements, accscalar_t p, + PhiloxCudaState philox_args) { // make sure we don't break assumption that we can't have > 4 elements / thread static_assert(VEC <= 4, "Value of VEC must be in [2, 4]"); using LoadT = memory::aligned_vector; using MaskLoadT = memory::aligned_vector; - accscalar_t pinv = accscalar_t(1)/p; + auto seeds = at::cuda::philox::unpack(philox_args); IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; curandStatePhilox4_32_10_t state; - curand_init( - seeds.first, - idx, - seeds.second, - &state); + curand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); + + accscalar_t pinv = accscalar_t(1)/p; // Note: Vectorized loads means we'll stride each thread by an additional VEC factor, as we'll load VEC elements at a time for (IndexType linearIndex = idx * VEC; @@ -105,25 +106,26 @@ template < int ADims, int BDims=ADims> #if __CUDA_ARCH__ >= 350 -C10_LAUNCH_BOUNDS_2(256, 8) +C10_LAUNCH_BOUNDS_2(256, 4) #elif defined (__HIP_PLATFORM_HCC__) C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void fused_dropout_kernel(cuda::detail::TensorInfo a, - cuda::detail::TensorInfo b, - cuda::detail::TensorInfo c, - IndexType totalElements, accscalar_t p, std::pair seeds - ) { - - accscalar_t pinv = accscalar_t(1)/p; + cuda::detail::TensorInfo b, + cuda::detail::TensorInfo c, + IndexType totalElements, accscalar_t p, + PhiloxCudaState philox_args) { + auto seeds = at::cuda::philox::unpack(philox_args); IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; curandStatePhilox4_32_10_t state; - curand_init( - seeds.first, - idx, - seeds.second, - &state); + curand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); + + accscalar_t pinv = accscalar_t(1)/p; + IndexType rounded_size = ((totalElements - 1)/(blockDim.x * gridDim.x * UNROLL)+1) * blockDim.x * gridDim.x * UNROLL; for (IndexType linearIndex = idx; @@ -201,7 +203,7 @@ inline void launcher( Tensor& mask, double p, const int64_t nelem, - const std::pair rng_engine_inputs, + const PhiloxCudaState rng_engine_inputs, dim3 grid, dim3 dim_block) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -324,11 +326,11 @@ fused_dropout_cuda(const Tensor& self, double p, c10::optional gen_){ grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x); //number of times random will be generated per thread, to offset philox counter in 
thc random state int64_t counter_offset = ((nelem - 1)/(block_size*grid.x*UNROLL)+1)*UNROLL; - std::pair rng_engine_inputs; + PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(counter_offset); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); } if (cuda::detail::canUse32BitIndexMath(self)){ launcher( diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index b828e47e8461..a8779d3d97af 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -113,7 +114,7 @@ __device__ int binarySearchForMultinomial(scalar_t* cumdist, template __global__ void -sampleMultinomialWithReplacement(std::pair seeds, +sampleMultinomialWithReplacement(PhiloxCudaState philox_args, int totalSamples, int64_t* dest, int64_t distributions, @@ -124,11 +125,16 @@ sampleMultinomialWithReplacement(std::pair seeds, // search due to divergence. It seems possible to compute multiple // values and limit divergence though later on. + auto seeds = at::cuda::philox::unpack(philox_args); + // global index formula for 2D grid of 1D blocks int idx = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; curandStatePhilox4_32_10_t state; - curand_init(seeds.first, idx, seeds.second, &state); + curand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); // The block determines the distribution for which we generate a point for (int64_t curDist = blockIdx.y; @@ -361,7 +367,7 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n // Prefix sum along rows at::_cumsum_out(prefixSum, normDist, 1); - std::pair rng_engine_inputs; + PhiloxCudaState rng_engine_inputs; if (with_replacement) { // Binary search is warp divergent (so effectively we're running @@ -381,13 +387,13 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n // curand_uniform4 (See Note [Register spilling in curand call for CUDA < 10]), // offset is 4 times that. 
auto offset = ((numDist-1)/grid.y+1)*4; - rng_engine_inputs = gen->philox_engine_inputs(offset); + rng_engine_inputs = gen->philox_cuda_state(offset); } // Sample with replacement sampleMultinomialWithReplacement <<>>( - rng_engine_inputs, + rng_engine_inputs, n_sample, result.data_ptr(), numDist, numCategories, diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu index 7a5c1811f252..048f5f7294b2 100644 --- a/aten/src/THCUNN/RReLU.cu +++ b/aten/src/THCUNN/RReLU.cu @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -39,12 +40,17 @@ inline double __device__ curand_uniform_type(curandStatePhilox4_32_10_t } template -__global__ void rreluUpdateOutputTrain(int n, std::pair seeds, +__global__ void rreluUpdateOutputTrain(int n, at::PhiloxCudaState philox_args, T *input, T* noise, T *output, double a, double b) { + auto seeds = at::cuda::philox::unpack(philox_args); int idx = blockIdx.x * blockDim.x + threadIdx.x; curandStatePhilox4_32_10_t state; - curand_init(seeds.first, idx, seeds.second, &state); + curand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); + CUDA_KERNEL_LOOP(i, n) { if (input[i] <= 0) diff --git a/aten/src/THCUNN/generic/RReLU.cu b/aten/src/THCUNN/generic/RReLU.cu index fd4e6ff0cf4c..9d664220e5a3 100644 --- a/aten/src/THCUNN/generic/RReLU.cu +++ b/aten/src/THCUNN/generic/RReLU.cu @@ -31,11 +31,11 @@ void THNN_(RReLU_updateOutput)( const uint32_t curand4_engine_calls = 4; dim3 grid = NUM_BLOCKS(n); uint64_t counter_offset = ((n - 1) / (BLOCK_SIZE * grid.x) + 1) * curand4_engine_calls; - std::pair rng_engine_inputs; + at::PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_engine_inputs(counter_offset); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); } if (inplace) { From 0a42003f8f16e043c636dd2e2078a5c82bf21d7b Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Fri, 4 Dec 2020 12:46:03 -0800 Subject: [PATCH 054/132] [TensorExpr Fuser] Handle fusing values with un-profiled uses (#48689) Summary: Copying myself from the code comments: A value can be profiled with differently typed uses. This can occur from: - having a use which is not executed, so the type will be TensorType::get() - control-flow that depends on tensor type: if x.size() == 2 op(x) else op(x) - mutation of the value on a field represented in the tensor type op(x); x.resize_([...]); op(x) The most common case today with num_profiles = 1 is from the first case. Here we can just ignore non-profiled uses, and choose any of the profiled uses. Because we guard all tensor types in the runtime, even if we set a Value to have a profiled type from one use and then execute a use with a different profiled type, we will still be correct. In the future we could consider unifying the types of uses, or adding a type refinement node so uses can have the correct corresponding type. Fix for https://github.com/pytorch/pytorch/issues/48043 I think there's probably too much context required for that to be a good bootcamp task... There was an observed missed fusion opportunity in detectron2 because of this issue. 
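For illustration (not part of the change itself), a condensed Python version of the first scenario above, where a use of `t1` sits in a branch that is never taken and therefore never gets profiled, is shown below; it parallels the `test_use_not_profiled` test added in this patch:

```python
import torch

def foo(t1, t2, flag: float):
    h = t1 + t2            # profiled use of t1: a concrete tensor type is recorded
    if flag > 0.5:         # branch never taken during the profiling runs below
        return t1 + 1      # use of t1 that is never profiled
    return h

foo_script = torch.jit.script(foo)
t = torch.rand(8, dtype=torch.float)
for _ in range(torch._C._jit_get_num_profiled_runs() + 1):
    foo_script(t, t, 0.1)  # warm up the profiling executor

# With this change the adds above can still be fused even though t1 also has
# an un-profiled use inside the dead branch.
print(torch.jit.last_executed_optimized_graph())
```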
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48689 Reviewed By: ngimel Differential Revision: D25278791 Pulled By: eellison fbshipit-source-id: 443e5e1254446a31cc895a275b5f1ac3798c327f --- test/jit/test_profiler.py | 19 +++++++++ torch/csrc/jit/passes/tensorexpr_fuser.cpp | 45 +++++++++++----------- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index 55604f5ff6bf..e42f8225a3d6 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -137,6 +137,25 @@ def foo(a, b): self.assertEqual(len(list(g.findAllNodes("prim::TypeCheck"))), 2) FileCheck().check("TensorExpr").check("aten::add_").check("TensorExpr").run(g) + def test_use_not_profiled(self): + def foo(t1, t2, t3, t4, t: float): + h = t1 + t2 + t3 + t4 + if t > 0.5: + # Putting a use of t1 in a never-executed conditional prevents + return t1 + 1 + return h + + t = torch.rand(8, dtype=torch.float) + + foo_script = torch.jit.script(foo) + for _ in range(torch._C._jit_get_num_profiled_runs() + 1): + foo_script(t, t, t, t, 0.1) + + self.assertEqual(foo(t, t, t, t, 0.1), foo_script(t, t, t, t, 0.1)) + g = torch.jit.last_executed_optimized_graph() + # all adds fused + FileCheck().check("graph").check_not("aten::add").check("prim::If").run(g) + def test_not_fusing_scalar_ops(self): @torch.jit.script def foo(x: int, y: int): diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 6bf2dd727c95..31f9cce08481 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -234,36 +234,35 @@ bool texprReductionsEnabled() { return texpr_reductions_enabled; } -// TODO: if a value has differently typed uses, temporarily insert a node -// specializing the type for each use and later remove, instead of bailing -bool profiledWithDifferentTypes(Value* v) { - std::vector types; - for (const auto& use : v->uses()) { - if (use.user->kind() == prim::profile) { - types.push_back(use.user->ty(attr::profiled_type)); - } - } - for (size_t i = 1; i < types.size(); ++i) { - if (types.at(i - 1) != types.at(i)) { - return true; - } - } - return false; -} - void removeProfileNodesAndSpecializeTypes(Block* b) { for (auto it = b->nodes().begin(); it != b->nodes().end(); it++) { if (it->kind() == prim::profile) { GRAPH_DEBUG("Removing prim::profile: %", it->output()->debugName()); it->output()->replaceAllUsesWith(it->input()); - if (!profiledWithDifferentTypes(it->input())) { - it->input()->setType(it->ty(attr::profiled_type)); - } else { - GRAPH_DEBUG( - "Ignoring value with differently typed profiles :%", - it->output()->debugName()); + auto profiled_type = it->ty(attr::profiled_type)->expect(); + + // A value can be profiled with differently typed uses. + // This can occur from: + // - having a use which is not executed, so the type will be + // TensorType::get() + // - control-flow that depends on tensor type: + // if x.size() == 2 op(x) else op(x) + // - mutation of the value on a field represented in the tensor type + // op(x); x.resize_([...]); op(x) + + // The most common case today with num_profiles = 1 is from the first + // case. Here we can just ignore non-profiled uses, and choose any of the + // profiled uses. Because we guard all tensor types in the runtime, even + // if we set a Value to have a profiled type from one use and then execute + // a use with a different profiled type, we will still be correct. 
+ // In the future we could consider unifying the types of uses, or adding a + // type refinement node so uses can have the correct corresponding type. + if (profiled_type == TensorType::get()) { + continue; } + it->input()->setType(it->ty(attr::profiled_type)); it.destroyCurrent(); + } else { for (Block* ib : it->blocks()) { removeProfileNodesAndSpecializeTypes(ib); From ba3962f5f0f22bd6086ec7bc179c26f5643fdb65 Mon Sep 17 00:00:00 2001 From: Oleg Khabinov Date: Fri, 4 Dec 2020 12:52:50 -0800 Subject: [PATCH 055/132] [Onnxifi] Warmup cache of output shapes (#48346) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48346 Onnxifi now accepts output shape info for all possible batch sizes. This is used to avoid doing shape inference inside `OnnxifiOp::extractOutputBatchSizes()`. FB: In this diff we try to pre-calculate output shapes for all possible batch sizes inside `PredictorContainer` where we supposedly have enough data to do so. This data is then passed down to OnnxifiOp. Here is the dependency graph that I built manually trying to understand the entire flow. https://pxl.cl/1rQRv Test Plan: Strobelight data https://fburl.com/strobelight/jlhhgt21 shows that `OnnxifiOp::RunOnDevice()` now takes only 2.17% of CPU instead of ~20% CPU with the current implementation. Also, the current implementation takes dozens of milliseconds according to ipiszy: > After adding more logs, I found each shapeinference call actually takes 40~50ms. I also added added time measurements temporarily for `OnnxifiOp::extractOutputBatchSizes()`. New impenentation typically consumes 1 to 4 microseconds, and, when data for current bs is not present yet in `output_reshape_info_`, it takes 20-40 microseconds which is still much better than the current implementation. 
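To make the approach concrete, here is a self-contained Python sketch of the idea only (hypothetical names, not the actual caffe2 C++ API): output shapes are computed once per batch size up front, and the per-call path becomes a plain lookup.

```python
MAX_BATCH_SIZE = 32

def infer_output_shapes(batch_size):
    # Hypothetical stand-in for the expensive bound-shape-inference pass,
    # now run only once per batch size when the net is transformed
    # (cf. the "output_shapes_bs_<N>" arguments emitted by buildOnnxifiOp).
    return {"out_0": [batch_size, 16], "out_1": [batch_size, 1]}

output_shapes_per_bs = {bs: infer_output_shapes(bs) for bs in range(1, MAX_BATCH_SIZE)}

def extract_output_batch_info(current_bs):
    # Per-call fast path (cf. OnnxifiOp::extractOutputBatchSizes when
    # use_passed_output_shapes is set): a dictionary lookup instead of a
    # 40-50 ms shape inference call.
    return output_shapes_per_bs[current_bs]

print(extract_output_batch_info(8))
```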
AF canary https://www.internalfb.com/intern/ads/canary/431357944274985799 AI canary https://www.internalfb.com/intern/ads/canary/431365503038313840 Verifying using test tier https://pxl.cl/1sZ4S Reviewed By: yinghai, ipiszy Differential Revision: D25047110 fbshipit-source-id: 872dc1578a1e8e7c3ade5f5e2711e77ba290a671 --- caffe2/opt/glow_net_transform.cc | 8 +- caffe2/opt/glow_net_transform.h | 5 +- caffe2/opt/onnxifi_op.cc | 152 +++++++++++++++++------------- caffe2/opt/onnxifi_op.h | 47 ++++++++- caffe2/opt/onnxifi_transformer.cc | 100 +++++++++++++------- caffe2/opt/onnxifi_transformer.h | 18 +++- 6 files changed, 216 insertions(+), 114 deletions(-) diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index ece62abea258..ee3ce1b27e2c 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -108,12 +108,13 @@ void onnxifi( const std::vector& output_names, const std::vector& weight_names, const std::unordered_set& blacklist, - const ShapeInfoMap& shape_hints, + const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size, size_t max_seq_size, bool load_model_by_blob, - bool predictor_net_ssa_rewritten) { + bool predictor_net_ssa_rewritten, + const std::unordered_map &shape_hints_per_bs) { // Split SparseLengthsSumSparse so that we can lower the SparseLengthsSum part splitSparseLengthsSumSparse(net, *ws); @@ -143,8 +144,9 @@ void onnxifi( opts.merge_fp32_inputs_into_fp16 = FLAGS_merge_fp32_inputs_into_fp16; opts.predictor_net_ssa_rewritten = predictor_net_ssa_rewritten; opts.timeout = FLAGS_onnxifi_timeout_ms; + opts.shape_hints_per_bs = shape_hints_per_bs; - ShapeInfoMap more_shape_hints = shape_hints; + ShapeInfoMap more_shape_hints = shape_hints_max_bs; if (!FLAGS_onnxifi_shape_hints.empty()) { parseShapeInfoMapFromString(FLAGS_onnxifi_shape_hints, more_shape_hints); } diff --git a/caffe2/opt/glow_net_transform.h b/caffe2/opt/glow_net_transform.h index 7e2eedec90aa..e8d1c9b9054f 100644 --- a/caffe2/opt/glow_net_transform.h +++ b/caffe2/opt/glow_net_transform.h @@ -26,12 +26,13 @@ void onnxifi( const std::vector& output_names, const std::vector& weight_names, const std::unordered_set& blacklist, - const ShapeInfoMap& shape_hints, + const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size = 0, size_t max_seq_size = 0, bool load_model_by_blob = false, - bool predictor_net_ssa_rewritten = false); + bool predictor_net_ssa_rewritten = false, + const std::unordered_map &shape_hints_per_bs = {}); std::unordered_set ParseNetPositionList(const std::string& str); std::unordered_set ParseBlackListOps(const std::string& str); diff --git a/caffe2/opt/onnxifi_op.cc b/caffe2/opt/onnxifi_op.cc index 158f9b7a7ed8..e22a297b0dd4 100644 --- a/caffe2/opt/onnxifi_op.cc +++ b/caffe2/opt/onnxifi_op.cc @@ -300,6 +300,46 @@ details::OutputReshapeInfo OnnxifiOp::initOutputReshapeInfo() return output_reshape_info; } +template <> +template +void OnnxifiOp::fillOutputReshapeInfo( + const DimContainer& real_shape, + c10::ArrayRef max_shape, + details::OutputReshapeInfo &output_reshape_info, + int currentIndex) { + CAFFE_ENFORCE_EQ(real_shape.size(), max_shape.size()); + const auto dim_size = real_shape.size(); + auto& begin = output_reshape_info.begins[currentIndex]; + begin.Resize(dim_size); + int32_t* begin_ptr = begin.template mutable_data(); + auto& end = output_reshape_info.ends[currentIndex]; + end.Resize(dim_size); + int32_t* end_ptr = end.template mutable_data(); + int32_t mismatch = 0; + for (int j = 0; j < dim_size; 
++j) { + CAFFE_ENFORCE_GE( + max_shape[j], + real_shape[j], + "It is weird that max shape of ", + output_names_[currentIndex], + " is smaller than real shape at dim ", + j, + " (", + max_shape[j], + " vs ", + real_shape[j], + ")"); + begin_ptr[j] = 0; + if (max_shape[j] >= real_shape[j]) { + end_ptr[j] = real_shape[j]; + mismatch += j; + } else { + end_ptr[j] = -1; + } + } + output_reshape_info.fast_path[currentIndex] = !mismatch; +} + template <> int OnnxifiOp::extractOutputBatchSizes() { if (use_onnx_ || !adjust_output_batch_) { @@ -337,77 +377,55 @@ int OnnxifiOp::extractOutputBatchSizes() { return current_batch_size; } - auto it = - output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo()); - auto& output_reshape_info = it.first->second; - BoundShapeSpec spec(dims[0], max_seq_size_); - auto bound_shape_inferencer = - BoundShapeInferencerRegistry()->Create("C10", spec); - for (int i = 0; i < InputSize(); ++i) { - at::IntArrayRef dim0; - bool quantized = false; - if (this->template InputIsType(i)) { - const auto& input_tensor_int8 = - this->template Input(i); - const auto& t0 = input_tensor_int8.t; - dim0 = t0.sizes(); - quantized = true; - } else { - const auto& t0 = Input(i); - dim0 = t0.sizes(); - } - TensorShape shape; - for (const auto d : dim0) { - shape.add_dims(d); - } - std::vector dim_type( - shape.dims_size(), TensorBoundShape_DimType_CONSTANT); - if (dim_type.size()) { - dim_type[0] = TensorBoundShape_DimType_BATCH; + auto& output_reshape_info = output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo()).first->second; + + if (use_passed_output_shapes_) { + auto shape_info_it = output_shapes_per_bs_.find(current_batch_size); + CAFFE_ENFORCE(shape_info_it != output_shapes_per_bs_.end(), "Unable to find outputs shapes for bs=", current_batch_size); + CAFFE_ENFORCE_EQ(shape_info_it->second.size(), OutputSize()); + + for (int i = 0; i < OutputSize(); ++i) { + fillOutputReshapeInfo(shape_info_it->second[i], output_shapes_max_bs_[i], output_reshape_info, i); } - input_shape_info_[input_names_[i]] = - ShapeInfo(dim_type, std::move(shape), quantized); - } - bound_shape_inferencer->InferBoundShapeAndType( - netdef_, input_shape_info_, nullptr, false); - const auto& shape_info = bound_shape_inferencer->shape_info(); - for (int i = 0; i < OutputSize(); ++i) { - const auto it = shape_info.find(output_names_[i]); - CAFFE_ENFORCE(it != shape_info.end()); - const auto& real_shape = it->second.shape; - const auto& max_shape = output_shapes_[i]; - CAFFE_ENFORCE_EQ(real_shape.dims_size(), max_shape.size()); - const auto dim_size = real_shape.dims_size(); - auto& begin = output_reshape_info.begins[i]; - begin.Resize(dim_size); - int32_t* begin_ptr = begin.template mutable_data(); - auto& end = output_reshape_info.ends[i]; - end.Resize(dim_size); - int32_t* end_ptr = end.template mutable_data(); - int32_t mismatch = 0; - for (int j = 0; j < dim_size; ++j) { - CAFFE_ENFORCE_GE( - max_shape[j], - real_shape.dims(j), - "It is weird that max shape of ", - output_names_[i], - " is smaller than real shape at dim ", - j, - " (", - max_shape[j], - " vs ", - real_shape.dims(j), - ")"); - begin_ptr[j] = 0; - if (max_shape[j] >= real_shape.dims(j)) { - end_ptr[j] = real_shape.dims(j); - mismatch += j; + } else { + BoundShapeSpec spec(dims[0], max_seq_size_); + auto bound_shape_inferencer = + BoundShapeInferencerRegistry()->Create("C10", spec); + for (int i = 0; i < InputSize(); ++i) { + at::IntArrayRef dim0; + bool quantized = false; + if (this->template InputIsType(i)) { + 
const auto& input_tensor_int8 = + this->template Input(i); + const auto& t0 = input_tensor_int8.t; + dim0 = t0.sizes(); + quantized = true; } else { - end_ptr[j] = -1; + const auto& t0 = Input(i); + dim0 = t0.sizes(); + } + TensorShape shape; + for (const auto d : dim0) { + shape.add_dims(d); + } + std::vector dim_type( + shape.dims_size(), TensorBoundShape_DimType_CONSTANT); + if (dim_type.size()) { + dim_type[0] = TensorBoundShape_DimType_BATCH; } + input_shape_info_[input_names_[i]] = + ShapeInfo(dim_type, std::move(shape), quantized); + } + bound_shape_inferencer->InferBoundShapeAndType( + netdef_, input_shape_info_, nullptr, false); + const auto& shape_info = bound_shape_inferencer->shape_info(); + for (int i = 0; i < OutputSize(); ++i) { + const auto find_res = shape_info.find(output_names_[i]); + CAFFE_ENFORCE(find_res != shape_info.end()); + fillOutputReshapeInfo(find_res->second.shape.dims(), output_shapes_max_bs_[i], output_reshape_info, i); } - output_reshape_info.fast_path[i] = !mismatch; } + return current_batch_size; } @@ -458,7 +476,7 @@ void OnnxifiOp::setOutputShapeAndType(int output_idx) { tensor_descriptor.dimensions = tensor_dims.size(); CAFFE_ENFORCE( tensor_descriptor.dimensions != 0, tensor_descriptor.name, " has 0 dim"); - auto& output_shape = output_shapes_[output_idx]; + auto& output_shape = output_shapes_max_bs_[output_idx]; output_shape.clear(); output_shape.insert( output_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend()); diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index 865fdf301ca1..eeb93c51e6f8 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -19,7 +19,7 @@ namespace caffe2 { namespace details { /// Provides slicing info for the outputs. All the vector members should be of -/// the same size as number of outpus of the Onnxifi op. +/// the same size as number of outputs of the Onnxifi op. 
struct OutputReshapeInfo { std::vector begins; std::vector ends; @@ -55,6 +55,7 @@ class OnnxifiOp final : public Operator { timeout_(this->template GetSingleArgument("timeout", 0)), nominal_batch_idx_( this->template GetSingleArgument("nominal_batch_idx", 0)), + use_passed_output_shapes_(this->template GetSingleArgument("use_passed_output_shapes", 0)), adjust_quantized_offset_(this->template GetSingleArgument( "adjust_quantized_offset", 128)) { @@ -86,7 +87,7 @@ class OnnxifiOp final : public Operator { all_offsets_.reserve(ws->Blobs().size()); all_scales_.reserve(ws->Blobs().size()); input_shapes_.resize(input_names_.size()); - output_shapes_.resize(output_names_.size()); + output_shapes_max_bs_.resize(output_names_.size()); quantized_outputs_.resize(output_names_.size(), false); int output_idx = 0; ArgumentHelper helper(operator_def); @@ -127,6 +128,30 @@ class OnnxifiOp final : public Operator { adjust_quantized_offset_ = 0; } + if (use_passed_output_shapes_) { + // Populate output_shapes_per_bs_ + for (int bs = 1; bs < max_batch_size_; ++bs) { + auto output_shapes_tp = helper.GetRepeatedArgument("output_shapes_bs_" + caffe2::to_string(bs)); + auto output_qshapes_tp = helper.GetRepeatedArgument("output_qshapes_bs_" + caffe2::to_string(bs)); + CAFFE_ENFORCE_EQ(output_names_.size(), output_shapes_tp.size() + output_qshapes_tp.size()); + + std::unordered_map name_to_shape; + for (const auto& output_shape_tp : output_shapes_tp) { + name_to_shape.emplace(output_shape_tp.name(), details::TensorInfo{output_shape_tp}); + } + for (const auto& output_qshape_tp : output_qshapes_tp) { + name_to_shape.emplace(output_qshape_tp.name(), details::TensorInfo{output_qshape_tp}); + } + + for (output_idx = 0; output_idx < output_names_.size(); ++output_idx) { + auto it = name_to_shape.find(output_names_[output_idx]); + output_shapes_per_bs_[bs].push_back({}); + auto &output_shapes = output_shapes_per_bs_[bs].back(); + std::copy(it->second.dims.cbegin(), it->second.dims.cend(), std::back_inserter(output_shapes)); + } + } + } + // Get output resizing hints adjust_output_batch_ = this->template GetSingleArgument("adjust_output_batch", 0); @@ -333,6 +358,14 @@ class OnnxifiOp final : public Operator { #endif } + /// Helper method for extractOutputBatchSizes(), used to deduplicate code of populating output reshape infos + template + void fillOutputReshapeInfo( + const DimContainer& real_shape, + c10::ArrayRef max_shape, + details::OutputReshapeInfo &output_reshape_info, + int index); + /// Extract output batch size. If the output batch size is going to be at /// max_batch_size_, return true indicating that no output shape adjustment is /// needed. Otherwise, return false. @@ -418,7 +451,7 @@ class OnnxifiOp final : public Operator { int nominal_batch_idx_{0}; // We bind the op input/output by position while ONNXIFI binds input/output by - // names. In addition, op input/output names can be writtten by, for example, + // names. In addition, op input/output names can be written by, for example, // memonger. We cache the original input/output name of ONNX object here and // bind them by position. 
std::vector input_names_; @@ -428,7 +461,10 @@ class OnnxifiOp final : public Operator { NetDef netdef_; std::vector> input_shapes_; - std::vector> output_shapes_; + std::vector> output_shapes_max_bs_; + + // Mapping of batch sizes to output shapes + std::unordered_map>> output_shapes_per_bs_; // Indicate if i-th output is a quantized tensor std::vector quantized_outputs_; @@ -449,6 +485,9 @@ class OnnxifiOp final : public Operator { // max_batch_size std::unordered_map input_shape_info_; + // Whether we should use passed output shape hints or do shape inference + bool use_passed_output_shapes_{false}; + // Whether we need to resize outputs or not bool adjust_output_batch_{false}; diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index c77101984790..9ccc662d99a9 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -511,7 +511,8 @@ OperatorDef OnnxifiTransformer::buildOnnxifiOp( const std::unordered_set& initialization_list, const std::vector& external_inputs, const std::vector& external_outputs, - const std::unordered_map& shape_hints) { + const ShapeInfoMap& shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs) { OperatorDef op; op.set_type("Onnxifi"); auto* onnx_model_arg = op.add_arg(); @@ -549,9 +550,9 @@ OperatorDef OnnxifiTransformer::buildOnnxifiOp( int nominal_batch_idx{0}; for (const auto& input : external_inputs) { if (!initialization_list.count(input)) { - const auto it = shape_hints.find(input); + const auto it = shape_hints_max_bs.find(input); CAFFE_ENFORCE( - it != shape_hints.end(), "Input shape for ", input, " not found"); + it != shape_hints_max_bs.end(), "Input shape for ", input, " not found"); const auto& info = it->second; if (info.getDimType(0) == TensorBoundShape_DimType_BATCH && getBlob1stDimSize(info) == max_batch_size) { @@ -562,15 +563,15 @@ OperatorDef OnnxifiTransformer::buildOnnxifiOp( } } - // Add output size hints + // Add output size hints for max batch size auto* output_shape_info_arg = op.add_arg(); output_shape_info_arg->set_name("output_shape_info"); auto* output_qshape_info_arg = op.add_arg(); output_qshape_info_arg->set_name("output_qshape_info"); for (int i = 0; i < op.output_size(); ++i) { const auto& o = op.output(i); - const auto it = shape_hints.find(o); - if (it != shape_hints.end()) { + const auto it = shape_hints_max_bs.find(o); + if (it != shape_hints_max_bs.end()) { if (!it->second.is_quantized) { output_shape_info_arg->mutable_tensors()->Add()->CopyFrom( wrapShapeInfoIntoTensorProto(o, it->second)); @@ -582,6 +583,33 @@ OperatorDef OnnxifiTransformer::buildOnnxifiOp( } } + // Add output size hints for per batch size + AddArgument("use_passed_output_shapes", shape_hints_per_bs.empty() ? 
0 : 1, &op); + if (!shape_hints_per_bs.empty()) { + for (int bs = 1; bs < opts_.bound_shape_spec.max_batch_size; ++bs) { + auto it = shape_hints_per_bs.find(bs); + CAFFE_ENFORCE(it != shape_hints_per_bs.end()); + const auto& shape_hints_current_bs = it->second; + + auto* output_shape_arg = op.add_arg(); + output_shape_arg->set_name("output_shapes_bs_" + caffe2::to_string(bs)); + auto* output_qshape_arg = op.add_arg(); + output_qshape_arg->set_name("output_qshapes_bs_" + caffe2::to_string(bs)); + + for (int output_idx = 0; output_idx < op.output_size(); ++output_idx) { + const auto& output_name = op.output(output_idx); + auto it_output = shape_hints_current_bs.find(output_name); + if (it_output != shape_hints_current_bs.end()) { + if (!it_output->second.is_quantized) { + output_shape_arg->mutable_tensors()->Add()->CopyFrom(wrapShapeInfoIntoTensorProto(output_name, it_output->second)); + } else { + output_shape_arg->mutable_qtensors()->Add()->CopyFrom(wrapShapeInfoIntoQTensorProto(output_name, it_output->second)); + } + } + } + } + } + // Tell Onnxifi op that the model is in onnx or c2 proto format AddArgument("use_onnx", opts_.use_onnx ? 1 : 0, &op); @@ -609,7 +637,8 @@ OperatorDef OnnxifiTransformer::buildOnnxifiOp( NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2( const caffe2::NetDef& net, const std::unordered_set& weights_in_ws, - const ShapeInfoMap& shape_hints) { + const ShapeInfoMap& shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs) { int onnxifi_op_id = onnxifi_op_id_; if (opts_.debug) { WriteProtoToTextFile( @@ -647,8 +676,8 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2( op.type() == "SparseLengthsWeightedSumFused4BitRowwise") ? 1 : 0; - const auto& indices_hint = shape_hints.at(op.input(1 + weighted)); - const auto& lengths_hint = shape_hints.at(op.input(2 + weighted)); + const auto& indices_hint = shape_hints_max_bs.at(op.input(1 + weighted)); + const auto& lengths_hint = shape_hints_max_bs.at(op.input(2 + weighted)); const auto& indices_shape = indices_hint.shape; const auto& lengths_shape = lengths_hint.shape; if ((indices_hint.getDimType(0) == @@ -689,13 +718,13 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2( onnxifi_net.clear_external_input(); for (const auto& i : total_inputs_vec) { onnxifi_net.add_external_input(i); - auto info = shape_hints.at(i); + auto info = shape_hints_max_bs.at(i); if (!info.is_quantized) { shape_arg->mutable_tensors()->Add()->CopyFrom( - wrapShapeInfoIntoTensorProto(i, shape_hints.at(i))); + wrapShapeInfoIntoTensorProto(i, shape_hints_max_bs.at(i))); } else { qshape_arg->mutable_qtensors()->Add()->CopyFrom( - wrapShapeInfoIntoQTensorProto(i, shape_hints.at(i))); + wrapShapeInfoIntoQTensorProto(i, shape_hints_max_bs.at(i))); } } @@ -724,7 +753,8 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2( initialization_list, onnxifi_net_inputs, onnxifi_net_outputs, - shape_hints); + shape_hints_max_bs, + shape_hints_per_bs); NetDef net_opt = composeResultNet(onnxifi_op); // Debugging stuff @@ -746,7 +776,8 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx( const std::unordered_set& weights_in_ws, Workspace* ws, onnx::OnnxExporter* exporter, - ShapeInfoMap* shape_hints) { + ShapeInfoMap* shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs) { if (opts_.min_ops > net.op_size()) { return net; } @@ -770,7 +801,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx( TensorShape shape; shape.mutable_dims()->CopyFrom(t.dims()); auto ret = shape_hints_onnx_.emplace(t.name(), std::move(shape)); - 
shape_hints->emplace( + shape_hints_max_bs->emplace( std::piecewise_construct, std::forward_as_tuple(ret.first->first), std::forward_as_tuple( @@ -845,7 +876,8 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx( initialization_list, onnxifi_net_inputs, onnxifi_net_outputs, - *shape_hints); + *shape_hints_max_bs, + shape_hints_per_bs); NetDef net_opt = composeResultNet(onnxifi_op); // Debugging stuff @@ -1163,20 +1195,21 @@ NetDef OnnxifiTransformer::TransformViaC2( NetDef* pred_net, const std::unordered_set& weights, const std::unordered_set& blocklisted_ops, - const ShapeInfoMap& shape_hints) { + const ShapeInfoMap& shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs) { onnxBackendID backend_id = backend_ids_[idx_]; auto c2_supports = [this, - &shape_hints, + &shape_hints_max_bs, &blocklisted_ops, backend_id, &weights](const caffe2::OperatorDef& op) { - return supportOpC2(op, shape_hints, weights, blocklisted_ops, backend_id); + return supportOpC2(op, shape_hints_max_bs, weights, blocklisted_ops, backend_id); }; auto c2_converter = - [this, &weights, &shape_hints](const caffe2::NetDef& net) { - return SubnetToOnnxifiOpViaC2(net, weights, shape_hints); + [this, &weights, &shape_hints_max_bs, &shape_hints_per_bs](const caffe2::NetDef& net) { + return SubnetToOnnxifiOpViaC2(net, weights, shape_hints_max_bs, shape_hints_per_bs); }; return opt::OptimizeForBackend( @@ -1188,7 +1221,8 @@ NetDef OnnxifiTransformer::TransformViaOnnx( NetDef* pred_net, const std::unordered_set& weights, const std::unordered_set& blocklisted_ops, - ShapeInfoMap* shape_hints) { + ShapeInfoMap* shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs) { onnxBackendID backend_id = backend_ids_[idx_]; // function to tell whether the ONNXIFI backend supports a given C2 op or not @@ -1202,9 +1236,9 @@ NetDef OnnxifiTransformer::TransformViaOnnx( // the same exporter throughout the process to avoid duplicated dummy name // generation onnx::OnnxExporter exporter2(nullptr); - auto onnx_converter = [this, ws, &weights, shape_hints, &exporter2]( + auto onnx_converter = [this, ws, &weights, shape_hints_max_bs, &exporter2, &shape_hints_per_bs]( const caffe2::NetDef& net) mutable { - return SubnetToOnnxifiOpViaOnnx(net, weights, ws, &exporter2, shape_hints); + return SubnetToOnnxifiOpViaOnnx(net, weights, ws, &exporter2, shape_hints_max_bs, shape_hints_per_bs); }; return opt::OptimizeForBackend( @@ -1264,17 +1298,17 @@ void OnnxifiTransformer::transform( // blob for output is created. This causes problem if inferShape uses original // ws since it does not expect the output blob to be present. 
Workspace mapped_ws(ws, input_mapping_); - ShapeInfoMap shape_hints = inferShapes( + ShapeInfoMap shape_hints_max_bs = inferShapes( &mapped_ws, pred_net, shape_hints_mapped, opts_.bound_shape_spec); if (opts_.use_onnx) { - shape_hints_onnx_ = stripShapeInfoMap(shape_hints); + shape_hints_onnx_ = stripShapeInfoMap(shape_hints_max_bs); } if (opts_.enforce_fp32_inputs_into_fp16) { - enforceFp32InputsToFp16(weights, pred_net, &shape_hints); + enforceFp32InputsToFp16(weights, pred_net, &shape_hints_max_bs); } if (opts_.merge_fp32_inputs_into_fp16) { mergeFp32InputsAndConvertToFp16( - opts_.bound_shape_spec.max_batch_size, weights, pred_net, &shape_hints); + opts_.bound_shape_spec.max_batch_size, weights, pred_net, &shape_hints_max_bs); } if (opts_.debug) { @@ -1285,7 +1319,7 @@ void OnnxifiTransformer::transform( for (const auto& w : weights) { w_arg->add_strings(w); } - dumpNet(ssa_net, shape_hints, "debug_ssa_net.pb_txt"); + dumpNet(ssa_net, shape_hints_max_bs, "debug_ssa_net.pb_txt"); } extractPartitionInfo(*pred_net); @@ -1295,13 +1329,13 @@ void OnnxifiTransformer::transform( // Apply some filtering rules std::unordered_set new_blocklisted_ops( blocklisted_ops.begin(), blocklisted_ops.end()); - applyFilteringRules(*pred_net, shape_hints, weights, &new_blocklisted_ops); + applyFilteringRules(*pred_net, shape_hints_max_bs, weights, &new_blocklisted_ops); // Transform the net NetDef net_opt = opts_.use_onnx ? TransformViaOnnx( - ws, pred_net, weights, new_blocklisted_ops, &shape_hints) - : TransformViaC2(pred_net, weights, new_blocklisted_ops, shape_hints); + ws, pred_net, weights, new_blocklisted_ops, &shape_hints_max_bs, opts_.shape_hints_per_bs) + : TransformViaC2(pred_net, weights, new_blocklisted_ops, shape_hints_max_bs, opts_.shape_hints_per_bs); // Need to figure out a proper place to handle device option net_opt.mutable_device_option()->CopyFrom(pred_net->device_option()); @@ -1309,7 +1343,7 @@ void OnnxifiTransformer::transform( pred_net->Swap(&net_opt); - addShapeToNet(*pred_net, shape_hints); + addShapeToNet(*pred_net, shape_hints_max_bs); if (opts_.debug) { WriteProtoToTextFile(*pred_net, "debug_full_opt_net.pb_txt", false); } diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index 86e061411dd9..5836486bfd31 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -44,6 +44,9 @@ struct OnnxifiTransformerOptions final : public BackendTransformOptions { // Inference timeout int timeout{0}; + + // Mapping of batch sizes to shape infos + std::unordered_map shape_hints_per_bs; }; class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase { @@ -69,13 +72,15 @@ class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase { const std::unordered_set& weights_in_ws, Workspace* ws, onnx::OnnxExporter* exporter, - ShapeInfoMap* shape_hints); + ShapeInfoMap* shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs); // Convert a cutoff subgraph net to an Onnxifi op caffe2::NetDef SubnetToOnnxifiOpViaC2( const caffe2::NetDef& net, const std::unordered_set& weights_in_ws, - const ShapeInfoMap& shape_hints); + const ShapeInfoMap& shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs); // We already have all the ops and external inputs and outputs! 
OperatorDef buildOnnxifiOp( @@ -83,14 +88,16 @@ class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase { const std::unordered_set& initialization_list, const std::vector& external_inputs, const std::vector& external_outputs, - const std::unordered_map& shape_hints); + const ShapeInfoMap& shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs); // Transform by passing C2 proto to backend NetDef TransformViaC2( NetDef* pred_net, const std::unordered_set& weights, const std::unordered_set& blocklisted_ops, - const ShapeInfoMap& shape_hints); + const ShapeInfoMap& shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs); // Transform by passing ONNX proto to backend NetDef TransformViaOnnx( @@ -98,7 +105,8 @@ class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase { NetDef* pred_net, const std::unordered_set& weights, const std::unordered_set& blocklisted_ops, - ShapeInfoMap* shape_hints); + ShapeInfoMap* shape_hints_max_bs, + const std::unordered_map &shape_hints_per_bs); // Query whether an operator is supported by passing C2 protobuf bool supportOpC2( From 714c7020eee18404c631d584bde89e42b91a5112 Mon Sep 17 00:00:00 2001 From: "Anshul Jain (B*8)" Date: Fri, 4 Dec 2020 12:53:03 -0800 Subject: [PATCH 056/132] [Mask R-CNN]Add Int8 AABB Generate proposals Op Summary: Adds support for additional Eigen Utils for custom type defs. Reviewed By: vkuzo Differential Revision: D23898398 fbshipit-source-id: fb5f6d6ed8a56e6244f4f0cb419140b365ff7a82 --- caffe2/utils/eigen_utils.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/caffe2/utils/eigen_utils.h b/caffe2/utils/eigen_utils.h index d5dbe121f6f8..cc4d84a24636 100644 --- a/caffe2/utils/eigen_utils.h +++ b/caffe2/utils/eigen_utils.h @@ -76,6 +76,7 @@ using EArrXb = EArrXt; using EArrXI32 = EArrXt; using EArrXU16 = EArrXt; using EArrXU8 = EArrXt; +using EArr3U8 = Eigen::Array; // 2-d array, column major template @@ -95,6 +96,8 @@ using ERArrXXI32t = ERArrXXt; using ERArrXXU16t = ERArrXXt; using ERArrXXU8t = ERArrXXt; using ERArrXXi = ERArrXXt; +using ERArrXXi64t = ERArrXXt; +using ERArrXXi32t = ERArrXXt; // 1-d vector template @@ -103,14 +106,19 @@ using EVecXd = Eigen::VectorXd; using EVecXf = Eigen::VectorXf; // 1-d row vector +template +using ERVecXt = Eigen::RowVector; using ERVecXd = Eigen::RowVectorXd; using ERVecXf = Eigen::RowVectorXf; +using ERVecXU8 = Eigen::RowVectorX; // 2-d matrix, column major template using EMatXt = Eigen::Matrix; using EMatXd = Eigen::MatrixXd; using EMatXf = Eigen::MatrixXf; +using EMatXU8 = EMatXt; +using EMatXU16 = EMatXt; // 2-d matrix, row major template @@ -118,6 +126,7 @@ using ERMatXt = Eigen::Matrix; using ERMatXd = ERMatXt; using ERMatXf = ERMatXt; +using ERMatXU8 = ERMatXt; namespace utils { From 42e6951e62a15b063a96f2d012d2f268b8c1d23e Mon Sep 17 00:00:00 2001 From: jsrozner Date: Fri, 4 Dec 2020 13:16:09 -0800 Subject: [PATCH 057/132] Remove save_state_warning in LambdaLR (#46813) Summary: Fixes https://github.com/pytorch/pytorch/issues/46405, https://github.com/pytorch/pytorch/issues/43352 I updated the docstring in the local file (function level comments). Do I also need to edit somewhere else or recompile docstrings? Also, though I didn't change any types here, how is typing (for IDE type checking) documentation generated / used)? 
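For reference, a minimal sketch of the checkpointing pattern the updated docstrings describe: save and restore the optimizer state together with the scheduler state (the file name and the lambda below are illustrative only).

```python
import torch

model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)

# Save both states together; after this change no UserWarning is emitted here.
torch.save({'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()}, 'checkpoint.pt')

# Restore both states together.
checkpoint = torch.load('checkpoint.pt')
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])
```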
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46813 Reviewed By: ezyang Differential Revision: D24923112 Pulled By: vincentqb fbshipit-source-id: be7818e0d4593bfc5d74023b9c361ac2a538589a --- torch/optim/lr_scheduler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 906aa519f1bd..706d12683c27 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -19,8 +19,6 @@ "https://github.com/pytorch/pytorch/issues/new/choose." ) -SAVE_STATE_WARNING = "Please also save or load the state of the optimizer when saving or loading the scheduler." - class _LRScheduler(object): def __init__(self, optimizer, last_epoch=-1, verbose=False): @@ -211,9 +209,10 @@ def state_dict(self): is not the optimizer. The learning rate lambda functions will only be saved if they are callable objects and not if they are functions or lambdas. + + When saving or loading the scheduler, please make sure to also save or load the state of the optimizer. """ - warnings.warn(SAVE_STATE_WARNING, UserWarning) state_dict = {key: value for key, value in self.__dict__.items() if key not in ('optimizer', 'lr_lambdas')} state_dict['lr_lambdas'] = [None] * len(self.lr_lambdas) @@ -226,12 +225,13 @@ def state_dict(self): def load_state_dict(self, state_dict): """Loads the schedulers state. + When saving or loading the scheduler, please make sure to also save or load the state of the optimizer. + Arguments: state_dict (dict): scheduler state. Should be an object returned from a call to :meth:`state_dict`. """ - warnings.warn(SAVE_STATE_WARNING, UserWarning) lr_lambdas = state_dict.pop('lr_lambdas') self.__dict__.update(state_dict) # Restore state_dict keys in order to prevent side effects From 7c9ba621305f939891bbb6219cf9151092d0e301 Mon Sep 17 00:00:00 2001 From: Zrss Date: Fri, 4 Dec 2020 13:16:18 -0800 Subject: [PATCH 058/132] Server connects to its listen socket addr (#46801) Summary: Fixes https://github.com/pytorch/pytorch/issues/46800 Pull Request resolved: https://github.com/pytorch/pytorch/pull/46801 Reviewed By: heitorschueroff Differential Revision: D25293474 fbshipit-source-id: 15f75dab48a4360645436360c216885cf3bd5667 --- torch/lib/c10d/TCPStore.cpp | 6 ++++++ torch/lib/c10d/Utils.cpp | 8 ++++++++ torch/lib/c10d/Utils.hpp | 2 ++ torch/lib/c10d/test/TCPStoreTest.cpp | 15 +++++++++++++++ 4 files changed, 31 insertions(+) diff --git a/torch/lib/c10d/TCPStore.cpp b/torch/lib/c10d/TCPStore.cpp index 4151448e677a..6fcf817011d5 100644 --- a/torch/lib/c10d/TCPStore.cpp +++ b/torch/lib/c10d/TCPStore.cpp @@ -381,9 +381,15 @@ TCPStore::TCPStore( if (isServer_) { // Opening up the listening socket std::tie(masterListenSocket_, tcpStorePort_) = tcputil::listen(masterPort); + + std::string socketAddr = tcputil::getLocalSocketAddr(masterListenSocket_); + // Now start the daemon tcpStoreDaemon_ = std::unique_ptr( new TCPStoreDaemon(masterListenSocket_)); + + // Server should connect to its listen addr + tcpStoreAddr_ = socketAddr; } // Connect to the daemon storeSocket_ = tcputil::connect( diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index 62e1e195ca45..eaf09ec354b9 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -56,6 +56,14 @@ PortType getSocketPort(int fd) { } // namespace +std::string getLocalSocketAddr(int fd) { + struct ::sockaddr_storage addrStorage; + socklen_t addrLen = sizeof(addrStorage); + SYSCHECK_ERR_RETURN_NEG1(getsockname( + fd, reinterpret_cast(&addrStorage), 
&addrLen)); + return sockaddrToString(reinterpret_cast(&addrStorage)); +} + std::string sockaddrToString(struct ::sockaddr* addr) { char address[INET6_ADDRSTRLEN + 1]; if (addr->sa_family == AF_INET) { diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index e7b0f1834441..164d8ba749c6 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -652,6 +652,8 @@ inline std::string recvString(int socket) { } // Other helpers +std::string getLocalSocketAddr(int fd); + std::string sockaddrToString(struct sockaddr* addr); std::pair listen(PortType port); diff --git a/torch/lib/c10d/test/TCPStoreTest.cpp b/torch/lib/c10d/test/TCPStoreTest.cpp index 30a123dc163f..e6f9a318d668 100644 --- a/torch/lib/c10d/test/TCPStoreTest.cpp +++ b/torch/lib/c10d/test/TCPStoreTest.cpp @@ -11,6 +11,17 @@ constexpr int64_t kShortStoreTimeoutMillis = 100; +void testNewTcpStore() { + const auto unKnownMasterAddr = "192.168.11.17"; + + // Ensure that the server can still connect to itself while the master addr is wrong + c10::make_intrusive(unKnownMasterAddr, + 0, + 0, // 0 worker + true, + std::chrono::milliseconds(kShortStoreTimeoutMillis), true); +} + // Different ports for different tests. void testHelper(const std::string& prefix = "") { const auto numThreads = 16; @@ -129,6 +140,10 @@ void testHelper(const std::string& prefix = "") { } } +TEST(TCPStoreTest, testNewTcpStore) { + testNewTcpStore(); +} + TEST(TCPStoreTest, testHelper) { testHelper(); } From e1f9542d0098e0c5064d1c12e9f5db645b8031be Mon Sep 17 00:00:00 2001 From: James Reed Date: Fri, 4 Dec 2020 13:33:15 -0800 Subject: [PATCH 059/132] Revert D23898398: [Mask R-CNN]Add Int8 AABB Generate proposals Op Test Plan: revert-hammer Differential Revision: D23898398 (https://github.com/pytorch/pytorch/commit/714c7020eee18404c631d584bde89e42b91a5112) Original commit changeset: fb5f6d6ed8a5 fbshipit-source-id: 05284ff4db6c05fff3f4a6bb80f665e87c0bf085 --- caffe2/utils/eigen_utils.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/caffe2/utils/eigen_utils.h b/caffe2/utils/eigen_utils.h index cc4d84a24636..d5dbe121f6f8 100644 --- a/caffe2/utils/eigen_utils.h +++ b/caffe2/utils/eigen_utils.h @@ -76,7 +76,6 @@ using EArrXb = EArrXt; using EArrXI32 = EArrXt; using EArrXU16 = EArrXt; using EArrXU8 = EArrXt; -using EArr3U8 = Eigen::Array; // 2-d array, column major template @@ -96,8 +95,6 @@ using ERArrXXI32t = ERArrXXt; using ERArrXXU16t = ERArrXXt; using ERArrXXU8t = ERArrXXt; using ERArrXXi = ERArrXXt; -using ERArrXXi64t = ERArrXXt; -using ERArrXXi32t = ERArrXXt; // 1-d vector template @@ -106,19 +103,14 @@ using EVecXd = Eigen::VectorXd; using EVecXf = Eigen::VectorXf; // 1-d row vector -template -using ERVecXt = Eigen::RowVector; using ERVecXd = Eigen::RowVectorXd; using ERVecXf = Eigen::RowVectorXf; -using ERVecXU8 = Eigen::RowVectorX; // 2-d matrix, column major template using EMatXt = Eigen::Matrix; using EMatXd = Eigen::MatrixXd; using EMatXf = Eigen::MatrixXf; -using EMatXU8 = EMatXt; -using EMatXU16 = EMatXt; // 2-d matrix, row major template @@ -126,7 +118,6 @@ using ERMatXt = Eigen::Matrix; using ERMatXd = ERMatXt; using ERMatXf = ERMatXt; -using ERMatXU8 = ERMatXt; namespace utils { From 4eb4db7c30cb39d3d7cd7a25b17f1d6e89a9c8f2 Mon Sep 17 00:00:00 2001 From: Tom Birch Date: Fri, 4 Dec 2020 13:54:16 -0800 Subject: [PATCH 060/132] Support torch.distributed.irecv(src=None, ...) (#47137) Summary: Calling torch.distributed.irecv(src=None) fails with "The global rank None is not part of the group". 
This change calls recv_anysource if src is None. Tested locally with MPI backend. Pull Request resolved: https://github.com/pytorch/pytorch/pull/47137 Reviewed By: heitorschueroff Differential Revision: D25292656 fbshipit-source-id: beb018ba0b676924aeaabeb4a4d6acf96e4a1926 --- torch/distributed/distributed_c10d.py | 17 +++++++---- .../_internal/distributed/distributed_test.py | 28 +++++++++++++------ 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 48b132811839..51a83fa302f9 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -702,7 +702,7 @@ def isend(tensor, def irecv(tensor, - src, + src=None, group=group.WORLD, tag=0): """ @@ -710,7 +710,8 @@ def irecv(tensor, Arguments: tensor (Tensor): Tensor to fill with received data. - src (int): Source rank. + src (int, optional): Source rank. Will receive from any + process if unspecified. group (ProcessGroup, optional): The process group to work on tag (int, optional): Tag to match recv with remote send @@ -724,11 +725,15 @@ def irecv(tensor, return if group == GroupMember.WORLD: - default_pg = _check_default_pg() - return default_pg.recv([tensor], src, tag) + pg = _check_default_pg() else: - group_src_rank = _get_group_rank(group, src) - return group.recv([tensor], group_src_rank, tag) + pg = group + + if src is None: + return pg.recv_anysource([tensor], tag) + else: + group_src_rank = _get_group_rank(pg, src) + return pg.recv([tensor], group_src_rank, tag) def send(tensor, diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 86f9392f5958..5a8027a172be 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -850,6 +850,7 @@ def test_send_recv_any_source(self): rank = dist.get_rank() tensor = _build_tensor(10, value=rank) recv_ranks = set() + irecv_ranks = set() for dst in range(0, dist.get_world_size()): if dst == rank: @@ -857,19 +858,30 @@ def test_send_recv_any_source(self): for dst in range(0, dist.get_world_size()): if dst == rank: continue - output_tensor = _build_tensor(10, value=-1) - sender = dist.recv(output_tensor) - # Assert the scalar value "sender" that should be - # equal to the rank of the sender is equal to all - # values in the received tensor. - self.assertTrue(output_tensor.eq(sender).all()) - recv_ranks.add(sender) + for recv in ["recv", "irecv"]: + output_tensor = _build_tensor(10, value=-1) + + if recv == "recv": + sender = dist.recv(output_tensor) + recv_ranks.add(sender) + elif recv == "irecv": + work = dist.irecv(output_tensor) + work.wait() + sender = work._source_rank() + irecv_ranks.add(sender) + + # Assert the scalar value "sender" that should be + # equal to the rank of the sender is equal to all + # values in the received tensor. 
+ self.assertTrue(output_tensor.eq(sender).all()) else: # Send mode - dist.send(tensor, dst) + dist.send(tensor, dst) # recv + dist.send(tensor, dst) # irecv self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) + self.assertEqual(len(irecv_ranks), dist.get_world_size() - 1) self._barrier() # SEND RECV WITH TAG From 142b21fd443f14485b22c92f26c9a722ba2d7027 Mon Sep 17 00:00:00 2001 From: Rahul Manghwani Date: Fri, 4 Dec 2020 14:14:26 -0800 Subject: [PATCH 061/132] Add SparseLengthsSum4BitRowwiseSparse in c2_pt_converter (#48240) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48240 Adds the support for converting the SparseLengthsSum4BitRowwiseSparse operator from caffe2 to pytorch as a part of c2_pt_converter Test Plan: Added a unit tested buck test //caffe2/torch/fb/model_transform/c2_convert:c2_pt_converter_test Tests Passed : https://our.intern.facebook.com/intern/testinfra/testrun/2251799856412296 Reviewed By: houseroad Differential Revision: D25067833 fbshipit-source-id: 45cbc331ca35bee27e083714e65a1e87a2a2d2e0 --- caffe2/python/brew.py | 1 + caffe2/python/helpers/algebra.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/caffe2/python/brew.py b/caffe2/python/brew.py index 801a64c4f94f..25fb4892e9a0 100644 --- a/caffe2/python/brew.py +++ b/caffe2/python/brew.py @@ -74,6 +74,7 @@ class HelperWrapper(object): 'loop' : loop, 'db_input' : db_input, 'fused_8bit_rowwise_quantized_to_float' : fused_8bit_rowwise_quantized_to_float, + 'sparse_lengths_sum_4bit_rowwise_sparse': sparse_lengths_sum_4bit_rowwise_sparse, } def __init__(self, wrapped): diff --git a/caffe2/python/helpers/algebra.py b/caffe2/python/helpers/algebra.py index 2d4113f46dea..4c9c3728677b 100644 --- a/caffe2/python/helpers/algebra.py +++ b/caffe2/python/helpers/algebra.py @@ -39,3 +39,6 @@ def batch_mat_mul(model, blob_in, blob_out, kwargs['engine'] = 'TENSORCORE' return model.net.BatchMatMul(blob_in, blob_out, **kwargs) + +def sparse_lengths_sum_4bit_rowwise_sparse(model, blob_in, blob_out, **kwargs): + return model.net.SparseLengthsSum4BitRowwiseSparse(blob_in, blob_out, **kwargs) From 0f9823d88800f94b438595f124cfa0b33748da94 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 4 Dec 2020 14:34:08 -0800 Subject: [PATCH 062/132] [PyTorch] Save some space in ProcessedNode (#48861) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48861 `std::function` already has an empty state; no need to wrap it in `c10::Optional`. 
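As a standalone illustration (not PyTorch code) of the property this relies on: a default-constructed `std::function` is empty and converts to `false`, so the extra `c10::optional` wrapper carried no additional information.

```cpp
#include <functional>
#include <iostream>

int main() {
  std::function<void()> fn;  // empty: behaves like an unset optional
  std::cout << std::boolalpha << static_cast<bool>(fn) << "\n";  // prints "false"

  fn = [] { std::cout << "called\n"; };
  if (fn) {   // the same kind of check run() can now perform on fn_ directly
    fn();     // invoke directly instead of fn_->operator()(...)
  }
  return 0;
}
```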
ghstack-source-id: 117891382 Reviewed By: hlu1 Differential Revision: D25296912 fbshipit-source-id: 8291bcf11735d49db17415b5de915591ee65f781 --- torch/csrc/jit/runtime/static/impl.cpp | 4 ++-- torch/csrc/jit/runtime/static/impl.h | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index a0e665bc056e..bd1893290240 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -746,9 +746,9 @@ ProcessedNode::ProcessedNode( void ProcessedNode::run(std::vector& reg) const { if (fn_) { - fn_->operator()(this, reg); + fn_(this, reg); } else if (native_fn_) { - native_fn_->operator()(this, reg); + native_fn_(this, reg); } else { std::vector stack; const size_t size = node_->inputs().size(); diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index e6d5ec466856..2eef530e778b 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -262,7 +262,7 @@ class ProcessedNode { } bool has_out_variant() const { - return fn_.has_value(); + return static_cast(fn_); } const std::vector& input_regs() const { @@ -281,10 +281,8 @@ class ProcessedNode { private: Node* node_; c10::optional op_; - c10::optional&)>> - fn_; - c10::optional&)>> - native_fn_; + std::function&)> fn_; + std::function&)> native_fn_; std::vector input_regs_; std::vector output_regs_; From ca3ae7dc73e714329e646a9456e4426ef8e56763 Mon Sep 17 00:00:00 2001 From: Ruichao Xiao Date: Fri, 4 Dec 2020 15:03:56 -0800 Subject: [PATCH 063/132] [DI] create a new key for threadLocalDebugInfo (#48762) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48762 In distributed inference, we want to use a new type info to pass some information to operators. add a new key to threadLocalDebugInfo to unblock the development. Test Plan: Only add a new key. Should have not effect on current build. 
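As a rough sketch of the intended usage (not part of this patch), assuming the existing `DebugInfoBase`/`DebugInfoGuard` helpers declared in `c10/util/ThreadLocalDebugInfo.h`; the `InferenceContextInfo` type and its field are hypothetical:

```cpp
#include <c10/util/ThreadLocalDebugInfo.h>
#include <memory>
#include <string>

// Hypothetical payload carried to operators during distributed inference.
struct InferenceContextInfo : public c10::DebugInfoBase {
  explicit InferenceContextInfo(std::string model) : model_name(std::move(model)) {}
  std::string model_name;
};

void run_with_inference_context() {
  // Scope guard installs the payload under the new key for the current thread;
  // operators can later look it up via
  // c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::INFERENCE_CONTEXT).
  c10::DebugInfoGuard guard(
      c10::DebugInfoKind::INFERENCE_CONTEXT,
      std::make_shared<InferenceContextInfo>("my_model"));
  // ... run the inference net here ...
}
```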
Reviewed By: dzhulgakov Differential Revision: D25291242 fbshipit-source-id: c71565ff7a38cc514d7cd65246c7d5f6b2ce3b8b --- c10/util/ThreadLocalDebugInfo.h | 1 + 1 file changed, 1 insertion(+) diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h index d5b91763ccab..a1d167d0652d 100644 --- a/c10/util/ThreadLocalDebugInfo.h +++ b/c10/util/ThreadLocalDebugInfo.h @@ -13,6 +13,7 @@ enum class C10_API_ENUM DebugInfoKind : uint8_t { PRODUCER_INFO = 0, MOBILE_RUNTIME_INFO, PROFILER_STATE, + INFERENCE_CONTEXT, // for inference usage TEST_INFO, // used only in tests TEST_INFO_2, // used only in tests From b9cd774e29515c20278946373277cfcef17d213b Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 4 Dec 2020 15:08:02 -0800 Subject: [PATCH 064/132] Get rid of printf in cuda fuser debugPrint() (#46994) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46994 Reviewed By: raghuramank100, mruberry Differential Revision: D25342954 Pulled By: malfet fbshipit-source-id: 549b5b072f7f70877261a155e989a21072ec49d8 --- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 57 ++++++++++---------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index d313edcbaa1f..f1a0a634727a 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -40,52 +40,49 @@ std::vector toVector(const at::DimVector& small_vec) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-function" void debugPrint(const TensorTypePtr& type) { - printf("\nsizes:"); + std::stringstream sizes_s; if (auto sizes = type->symbolic_sizes().sizes()) { - // for (const auto& shape_symbol : sizes.value()) { - int rank = static_cast(sizes->size()); - for (int i = 0; i < rank; i++) { - const auto& shape_symbol = sizes.value()[i]; + for (const auto& shape_symbol : *sizes) { if (shape_symbol.is_static()) { - printf("%ld, ", shape_symbol.static_size()); + sizes_s << shape_symbol.static_size() << ", "; } else { - printf("s(%ld), ", *reinterpret_cast(&shape_symbol)); + sizes_s << "s(" << *reinterpret_cast(&shape_symbol) + << "), "; } } } else { - printf("no size available\n"); + sizes_s << "no size available"; } + std::cout << "sizes:" << sizes_s.str() << std::endl; if (const auto& stride_properties = type->stride_properties().sizes()) { - int rank = static_cast(stride_properties->size()); - printf("\nstride: "); - for (int i = 0; i < rank; i++) { - if ((*stride_properties)[i].has_value() && - (*stride_properties)[i]->stride_.has_value()) { - printf("%ld, ", (*stride_properties)[i]->stride_.value()); + std::stringstream stride_s; + std::stringstream index_s; + std::stringstream contig_s; + + for (const auto& stride_property : *stride_properties) { + if (stride_property.has_value() && stride_property->stride_.has_value()) { + stride_s << *stride_property->stride_ << ", "; } else { - printf("?, "); + stride_s << "?, "; } - } - printf("\nstride index: "); - for (int i = 0; i < rank; i++) { - if ((*stride_properties)[i].has_value() && - (*stride_properties)[i]->stride_index_.has_value()) { - printf("%ld, ", (*stride_properties)[i]->stride_index_.value()); + if (stride_property.has_value() && + stride_property->stride_index_.has_value()) { + index_s << *stride_property->stride_index_ << ", "; } else { - printf("?, "); + index_s << "?, "; } - } - printf("\ncontiguous: "); - for (int i = 0; i < rank; i++) { - if ((*stride_properties)[i].has_value() && - 
(*stride_properties)[i]->contiguous_.has_value()) { - printf("%d, ", (*stride_properties)[i]->contiguous_.value()); + if (stride_property.has_value() && + stride_property->contiguous_.has_value()) { + contig_s << *stride_property->contiguous_ << ", "; } else { - printf("?, "); + contig_s << "?, "; } } + std::cout << "stride: " << stride_s.str() << std::endl; + std::cout << "stride index: " << index_s.str() << std::endl; + std::cout << "contiguous: " << contig_s.str() << std::endl; } else { - printf("no stride properties available\n"); + std::cout << "no stride properties available" << std::endl; } } #pragma clang diagnostic pop From 212ec07cb7a6f8d59c1dc198b6a5456106075b9f Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Fri, 4 Dec 2020 16:19:24 -0800 Subject: [PATCH 065/132] Support torchbind as attribute in torch.fx symbolic tracing (#48732) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48732 add support for ScriptObject as attributes in symbolic trace. Test Plan: OSS CI Reviewed By: jamesr66a Differential Revision: D25116185 fbshipit-source-id: c61993c84279fcb3c91f1d44fb952a8d80d0e552 --- test/test_fx.py | 15 +++++++++++++++ torch/csrc/jit/python/script_init.cpp | 6 ++++++ torch/fx/symbolic_trace.py | 5 +++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 5a47c729f7eb..af11f9615cb6 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1139,6 +1139,21 @@ def forward(self): m = M() self.checkGraphModule(m, ()) + def test_torchbind_class_attribute_in_fx(self): + if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS: + self.skipTest("torch.classes._TorchScriptTesting._StackString is registered, skipping") + + class FooBar1234(torch.nn.Module): + def __init__(self): + super(FooBar1234, self).__init__() + self.f = torch.classes._TorchScriptTesting._StackString(["3", "4"]) + + def forward(self): + return self.f.top() + + m = FooBar1234() + self.checkGraphModule(m, ()) + if __name__ == '__main__': run_tests() diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index feab73df6d1b..426707e303d3 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -782,6 +782,12 @@ void initJitScriptBindings(PyObject* module) { }); }) .def("__copy__", &Object::copy) + .def( + "__hash__", + [](const Object& self) { + // Similar to Tensor's `__hash__`, which is `id()`. + return std::hash{}(self._ivalue().get()); + }) .def(py::pickle( [](const Object& self) -> std::tuple { // __getstate__ diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index b2e5b0961114..d48a067f5e56 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -2,6 +2,7 @@ from types import CodeType, FunctionType from typing import Any, Dict, Optional, List, Callable, Union import torch +from torch._C import ScriptObject # type: ignore from .node import Argument from .graph import Graph @@ -86,7 +87,7 @@ def create_arg(self, a: Any) -> Argument: # a get_attr to retrieve that tensor. Otherwise, we'll store away the # tensor value into a special attribute on the Module s.t. we can # retrieve it with a get_attr. 
- if isinstance(a, torch.Tensor): + if isinstance(a, (torch.Tensor, ScriptObject)): qualname : Optional[str] = self.tensor_attrs.get(a) # Tensor was not found in the Module hierarchy, stow it away in a @@ -221,7 +222,7 @@ def trace(self, root: Union[torch.nn.Module, Callable]) -> Graph: def collect_tensor_attrs(m : torch.nn.Module, prefix_atoms : List[str]): for k, v in m.__dict__.items(): - if isinstance(v, torch.Tensor): + if isinstance(v, (torch.Tensor, ScriptObject)): self.tensor_attrs[v] = '.'.join(prefix_atoms + [k]) for k, v in m.named_children(): collect_tensor_attrs(v, prefix_atoms + [k]) From 4b8d965f18e910cae7356d411a49c1e3d8cb3c8f Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Fri, 4 Dec 2020 16:54:05 -0800 Subject: [PATCH 066/132] Revert D25292656: [pytorch][PR] Support torch.distributed.irecv(src=None, ...) Test Plan: revert-hammer Differential Revision: D25292656 (https://github.com/pytorch/pytorch/commit/4eb4db7c30cb39d3d7cd7a25b17f1d6e89a9c8f2) Original commit changeset: beb018ba0b67 fbshipit-source-id: 5a13055e50ed90731fee431e81c09a1871f6cc03 --- torch/distributed/distributed_c10d.py | 17 ++++------- .../_internal/distributed/distributed_test.py | 28 ++++++------------- 2 files changed, 14 insertions(+), 31 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 51a83fa302f9..48b132811839 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -702,7 +702,7 @@ def isend(tensor, def irecv(tensor, - src=None, + src, group=group.WORLD, tag=0): """ @@ -710,8 +710,7 @@ def irecv(tensor, Arguments: tensor (Tensor): Tensor to fill with received data. - src (int, optional): Source rank. Will receive from any - process if unspecified. + src (int): Source rank. group (ProcessGroup, optional): The process group to work on tag (int, optional): Tag to match recv with remote send @@ -725,15 +724,11 @@ def irecv(tensor, return if group == GroupMember.WORLD: - pg = _check_default_pg() - else: - pg = group - - if src is None: - return pg.recv_anysource([tensor], tag) + default_pg = _check_default_pg() + return default_pg.recv([tensor], src, tag) else: - group_src_rank = _get_group_rank(pg, src) - return pg.recv([tensor], group_src_rank, tag) + group_src_rank = _get_group_rank(group, src) + return group.recv([tensor], group_src_rank, tag) def send(tensor, diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 5a8027a172be..86f9392f5958 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -850,7 +850,6 @@ def test_send_recv_any_source(self): rank = dist.get_rank() tensor = _build_tensor(10, value=rank) recv_ranks = set() - irecv_ranks = set() for dst in range(0, dist.get_world_size()): if dst == rank: @@ -858,30 +857,19 @@ def test_send_recv_any_source(self): for dst in range(0, dist.get_world_size()): if dst == rank: continue + output_tensor = _build_tensor(10, value=-1) + sender = dist.recv(output_tensor) - for recv in ["recv", "irecv"]: - output_tensor = _build_tensor(10, value=-1) - - if recv == "recv": - sender = dist.recv(output_tensor) - recv_ranks.add(sender) - elif recv == "irecv": - work = dist.irecv(output_tensor) - work.wait() - sender = work._source_rank() - irecv_ranks.add(sender) - - # Assert the scalar value "sender" that should be - # equal to the rank of the sender is equal to all - # values in the received tensor. 
- self.assertTrue(output_tensor.eq(sender).all()) + # Assert the scalar value "sender" that should be + # equal to the rank of the sender is equal to all + # values in the received tensor. + self.assertTrue(output_tensor.eq(sender).all()) + recv_ranks.add(sender) else: # Send mode - dist.send(tensor, dst) # recv - dist.send(tensor, dst) # irecv + dist.send(tensor, dst) self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) - self.assertEqual(len(irecv_ranks), dist.get_world_size() - 1) self._barrier() # SEND RECV WITH TAG From 5654fc8edd56c8640d88fa325417ed406d3e2f73 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 4 Dec 2020 17:06:25 -0800 Subject: [PATCH 067/132] Revert D25293474: [pytorch][PR] Server connects to its listen socket addr Test Plan: revert-hammer Differential Revision: D25293474 (https://github.com/pytorch/pytorch/commit/7c9ba621305f939891bbb6219cf9151092d0e301) Original commit changeset: 15f75dab48a4 fbshipit-source-id: 71ca136f2aa3204ad49f76c604f51c477cba270a --- torch/lib/c10d/TCPStore.cpp | 6 ------ torch/lib/c10d/Utils.cpp | 8 -------- torch/lib/c10d/Utils.hpp | 2 -- torch/lib/c10d/test/TCPStoreTest.cpp | 15 --------------- 4 files changed, 31 deletions(-) diff --git a/torch/lib/c10d/TCPStore.cpp b/torch/lib/c10d/TCPStore.cpp index 6fcf817011d5..4151448e677a 100644 --- a/torch/lib/c10d/TCPStore.cpp +++ b/torch/lib/c10d/TCPStore.cpp @@ -381,15 +381,9 @@ TCPStore::TCPStore( if (isServer_) { // Opening up the listening socket std::tie(masterListenSocket_, tcpStorePort_) = tcputil::listen(masterPort); - - std::string socketAddr = tcputil::getLocalSocketAddr(masterListenSocket_); - // Now start the daemon tcpStoreDaemon_ = std::unique_ptr( new TCPStoreDaemon(masterListenSocket_)); - - // Server should connect to its listen addr - tcpStoreAddr_ = socketAddr; } // Connect to the daemon storeSocket_ = tcputil::connect( diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index eaf09ec354b9..62e1e195ca45 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -56,14 +56,6 @@ PortType getSocketPort(int fd) { } // namespace -std::string getLocalSocketAddr(int fd) { - struct ::sockaddr_storage addrStorage; - socklen_t addrLen = sizeof(addrStorage); - SYSCHECK_ERR_RETURN_NEG1(getsockname( - fd, reinterpret_cast(&addrStorage), &addrLen)); - return sockaddrToString(reinterpret_cast(&addrStorage)); -} - std::string sockaddrToString(struct ::sockaddr* addr) { char address[INET6_ADDRSTRLEN + 1]; if (addr->sa_family == AF_INET) { diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 164d8ba749c6..e7b0f1834441 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -652,8 +652,6 @@ inline std::string recvString(int socket) { } // Other helpers -std::string getLocalSocketAddr(int fd); - std::string sockaddrToString(struct sockaddr* addr); std::pair listen(PortType port); diff --git a/torch/lib/c10d/test/TCPStoreTest.cpp b/torch/lib/c10d/test/TCPStoreTest.cpp index e6f9a318d668..30a123dc163f 100644 --- a/torch/lib/c10d/test/TCPStoreTest.cpp +++ b/torch/lib/c10d/test/TCPStoreTest.cpp @@ -11,17 +11,6 @@ constexpr int64_t kShortStoreTimeoutMillis = 100; -void testNewTcpStore() { - const auto unKnownMasterAddr = "192.168.11.17"; - - // Ensure that the server can still connect to itself while the master addr is wrong - c10::make_intrusive(unKnownMasterAddr, - 0, - 0, // 0 worker - true, - std::chrono::milliseconds(kShortStoreTimeoutMillis), true); -} - // Different ports for different tests. 
void testHelper(const std::string& prefix = "") { const auto numThreads = 16; @@ -140,10 +129,6 @@ void testHelper(const std::string& prefix = "") { } } -TEST(TCPStoreTest, testNewTcpStore) { - testNewTcpStore(); -} - TEST(TCPStoreTest, testHelper) { testHelper(); } From 2d07d5b50a0d8f2a2166a6349120ab6f5243dc94 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 4 Dec 2020 18:00:47 -0800 Subject: [PATCH 068/132] [te] Don't fuse integer fmod or remainder (#48700) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48700 fmod and remainder on int tensors will raise ZeroDivisionError if their divisors are 0. I don't think we should try to generate code that raises exceptions. If at some point we really wanted to fuse these, I might lean towards calling a C++ helper function from the generated code. ghstack-source-id: 117845642 Test Plan: `buck test //caffe2/test:jit -- test_binary_ops` Reviewed By: eellison Differential Revision: D25265792 fbshipit-source-id: 0be56ba3feafa1dbf3c37f6bb8c1550cb6891e6d --- test/test_jit_fuser_te.py | 22 +++++++-------- test/test_tensorexpr.py | 16 ----------- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 31 ++++++++++++++++++++++ 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 79a1a664e843..bd5f7ae3af6e 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1317,10 +1317,8 @@ def apply(fn): torch.ge, torch.gt, torch.lt, - - # FIXME: fails on CPU backend with int8 - # torch.fmod, - # torch.remainder, + torch.fmod, + torch.remainder, # FIXME: segfaults on CPU backend # operator.__rshift__, @@ -1328,6 +1326,10 @@ def apply(fn): lambda x, y: y.type_as(x), ] + fp_only = [ + torch.fmod, + torch.remainder, + ] devices = self.devices for dtype, op, device in product(dtypes, binary_ops, devices): try: @@ -1343,7 +1345,8 @@ def apply(fn): try: t = torch.jit.trace(fn, (x, y)) self.assertEqual(ref, t(x, y)) - self.assertAllFused(t.graph_for(x, y)) + if op not in fp_only or dtype.is_floating_point: + self.assertAllFused(t.graph_for(x, y)) except Exception as e: raise RuntimeError( " ".join(["Failed:", str(dtype), op.__name__, device]) @@ -1425,10 +1428,8 @@ def apply_with_scalar(fn, scalar): ] binary_ops = [ torch.div, - - # FIXME: wrong results with int8 on cpu - # torch.remainder, - # torch.fmod, + torch.remainder, + torch.fmod, ] devices = self.devices # Maybe we should split this into separate tests to speed it up by @@ -1447,10 +1448,9 @@ def apply_with_scalar(fn, scalar): try: t = torch.jit.trace(fn, (x)) self.assertEqual(ref, t(x)) - self.assertAllFused(t.graph_for(x)) except Exception as e: raise RuntimeError( - " ".join(["Failed:", str(dtype), op.__name__, device]) + "Failed: {} {} {} {}".format(dtype, op.__name__, device, scalar) ) def test_binary_cuda_only_ops(self): diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index db1e9d9a12be..5c30c312534f 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -3,7 +3,6 @@ import torch.nn.functional as F from torch import nn import unittest -import itertools from torch.testing._internal.common_utils import suppress_warnings, num_profiled_runs @@ -1012,21 +1011,6 @@ def run_remainder(x, y): y = run_remainder(nans, a) np.testing.assert_allclose(x.numpy(), y.numpy()) - def test_remainder_types(self): - def do_mod(x, y): - return x % y - - inputs = [torch.rand(10, dtype=torch.float), - torch.randint(1, 1000, (10,), dtype=torch.int32), - torch.randint(1, 1000, (10,), dtype=torch.int16) 
- ] - - scripted = torch.jit.script(do_mod) - for (a, b) in itertools.product(inputs, repeat=2): - x = warmup_and_run_forward(scripted, a, b) - self.assertLastGraphAllFused() - np.testing.assert_allclose(x, do_mod(a, b), rtol=1e-04, atol=1e-04) - def test_multioutput(self): def easy(x): b = x + 1 diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 31f9cce08481..8ee48f22d057 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -172,6 +172,8 @@ bool isSupported(Node* node) { return false; } } + + // Operator is only supported on CUDA. if (node->isMemberOf(cuda_only_operator_set)) { auto device = tensorexpr::pickDeviceType(node->inputs()); if (!device) { @@ -726,6 +728,34 @@ class TensorExprFuser { return true; } + bool typesAreSupported(const Node* node) { + // clang-format off + // breaks up the schema strings so they are no longer discoverable with ctrl-F + static const OperatorSet float_only_operator_set{ + "aten::fmod.Scalar(Tensor self, Scalar other) -> Tensor", + "aten::fmod.Tensor(Tensor self, Tensor other) -> Tensor", + "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", + "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor", + }; + // clang-format on + + // Value is only supported if operands are floats. + if (node->isMemberOf(float_only_operator_set)) { + for (const Value* v : node->inputs()) { + if (auto const& tt = v->type()->cast()) { + auto const& st = tt->scalarType(); + if (!st || !isFloatingType(*st)) { + return false; + } + } else if (!v->type()->cast()) { + return false; + } + } + } + + return true; + } + #define REQ(cond) \ if (!(cond)) { \ GRAPH_DEBUG("Failed cond " #cond "\n"); \ @@ -767,6 +797,7 @@ class TensorExprFuser { } REQ(tensorexpr::isSupported(node)); + REQ(typesAreSupported(node)); return true; } From 9bb87fa58b8d717fd7eeb4cd2a29efb6222b2684 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 4 Dec 2020 18:03:08 -0800 Subject: [PATCH 069/132] [te] Fix spacing in graph dump (#48829) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48829 The first line was a run-on. ghstack-source-id: 117845927 Test Plan: visual inspection Reviewed By: ZolotukhinM Differential Revision: D25326136 fbshipit-source-id: 3f46ad20aee5ed523b64d852d382eb06f4d60369 --- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 8ee48f22d057..c53a71eb02e8 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -274,9 +274,9 @@ void removeProfileNodesAndSpecializeTypes(Block* b) { } void RemoveProfileNodesAndSpecializeTypes(std::shared_ptr& graph) { - GRAPH_DEBUG("Before removeProfileNodesAndSpecializeTypes", *graph); + GRAPH_DEBUG("Before removeProfileNodesAndSpecializeTypes:\n", *graph); removeProfileNodesAndSpecializeTypes(graph->block()); - GRAPH_DEBUG("After removeProfileNodesAndSpecializeTypes", *graph); + GRAPH_DEBUG("After removeProfileNodesAndSpecializeTypes:\n", *graph); } void removeTensorTypeSpecialization(Value* v) { From 03abd81b8de94207560401af98c85737f6aeba32 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Fri, 4 Dec 2020 18:05:26 -0800 Subject: [PATCH 070/132] [ROCm] Enable skipped distributed global tests (#48023) Summary: The PR https://github.com/pytorch/pytorch/issues/47898 fixes the global tests. 
Hence enabling the tests. Signed-off-by: Jagadish Krishnamoorthy Pull Request resolved: https://github.com/pytorch/pytorch/pull/48023 Reviewed By: malfet, H-Huang Differential Revision: D25347289 Pulled By: rohan-varma fbshipit-source-id: 2b519a3046eae1cf1bfba98a125c09b4a6b01fde --- torch/testing/_internal/distributed/distributed_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 86f9392f5958..943eb24a0b5e 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -989,7 +989,6 @@ def test_broadcast(self): "Only Gloo and Nccl backend supports CUDA allReduce", ) @skip_if_no_gpu - @skip_if_rocm def test_broadcast_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1240,7 +1239,6 @@ def test_reduce_sum_cuda_twice(self): @skip_if_no_gpu @require_backend({"gloo", "nccl"}) - @skip_if_rocm def test_all_reduce_result_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1383,7 +1381,6 @@ def test_all_reduce_sum_cuda(self): "Only Gloo and NCCL backends will have CUDA allReduce tested", ) @skip_if_no_gpu - @skip_if_rocm def test_all_reduce_sum_cuda_async(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -2340,7 +2337,6 @@ def _test_barrier_helper( @skip_if_no_gpu @unittest.skipIf(BACKEND == "mpi", "MPI doesn't supports GPU barrier") - @skip_if_rocm def test_barrier_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() From 092e52a4da53f325d41c496a01bd433d34256ccd Mon Sep 17 00:00:00 2001 From: Horace He Date: Fri, 4 Dec 2020 18:29:55 -0800 Subject: [PATCH 071/132] [fx]added prototype of to_folder (#47544) Summary: What this does is that given a `FxModule foo`, you can call `foo.to_folder('foo_folder', 'Foo')` and dump the current FX module into runnable Python code. That is ``` foo = foo = foo.to_folder('bar', 'Foo') from bar import Foo foo2 = Foo() forall x, foo2(x) == Foo(x) ``` This has several use cases, largely lifted from jamesr66a's doc here: https://fb.quip.com/U6KHAFaP2cWa (FB-internal). 1. As we apply more heavy-weight function transformations with FX, figuring out what's going on can be quite a difficult experience. In particular, things that can typically be used for debugging (like `print` or `import pdb; pdb.set_trace()`) no longer work. This is particularly necessary if you're using a FX transform like `grad` or `vmap. With this, you simply open up the dumped file, and add `print`/`pdb` statements wherever you'd like. 2. This also provides an immense amount of user control. Some potential use-cases: - Let's say an existing FX transform has some bug, or generates suboptimal code. Instead of needing to modify that FX transform, writing another FX pass that fixes the suboptimal code, or simply giving up on FX, they can workaround it by simply modifying the resulting code themselves. - This allows users to check in their FX modules into source control. - You could even imagine using this as part of some code-gen type workflow, where you write a function, `vmap` it to get the function you actually want, and then simply copy the output of the `vmap` function without needing FX at all in the final code. 
An example: ```python class Test(nn.Module): def __init__(self): super(Test, self).__init__() self.W = torch.nn.Parameter(torch.randn(2)) self.linear = nn.Linear(2, 2) self.attr = torch.randn(2) self.attr2 = torch.randn(2) def forward(self, x): return self.linear(self.W + (self.attr + self.attr2) + x) mod = fx.symbolic_trace(Test()) mod.to_folder('foo', 'Foo') ``` results in ```python import torch class Foo(torch.nn.Module): def __init__(self): super().__init__() state_dict = torch.load('foo/state_dict.pt') self.linear = torch.load('foo/linear.pt') # Linear(in_features=2, out_features=2, bias=True) self.__tensor_constant0 = state_dict['__tensor_constant0'] self.W = torch.nn.Parameter(state_dict['W']) def forward(self, x): w = self.W tensor_constant0 = self.__tensor_constant0 add_1 = w + tensor_constant0 add_2 = add_1 + x linear_1 = self.linear(add_2) return linear_1 ``` Some current issues: 1. How do you actually ... save things like modules or parameters? I don't think FX is in the business of tracking initializations and such. Thus, the only way I see to do it is to dump the parameters/modules as blobs, and then load them in the generated initialization. This is a somewhat subpar user experience, and perhaps prevents it from being in some use cases (ie: you would need to check in the blobs into source control to save the model). 2. Currently, the only "atomic" modules we have are those in `torch.nn`. However, if we want to allow flexibility in this, and for example, allow "atomic" modules that are user-defined, then it's not clear how to allow those to be dumped in a way that we can then load elsewhere. Pull Request resolved: https://github.com/pytorch/pytorch/pull/47544 Reviewed By: jamesr66a Differential Revision: D25232917 Pulled By: Chillee fbshipit-source-id: fd2b61a5f40e614fc94256a2957ed1d57fcf5492 --- test/test_fx_experimental.py | 32 +++++++++++++++++++ torch/fx/graph_module.py | 61 +++++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 201680cec4bd..13928208316c 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -1,5 +1,6 @@ import torch import unittest +import sys from typing import Dict from torch.fx.symbolic_trace import symbolic_trace from torch.fx.graph_module import GraphModule @@ -728,6 +729,37 @@ def foo(x): traced = symbolic_trace_with_rewrite(foo) + def test_to_folder(self): + class Test(torch.nn.Module): + def __init__(self): + super(Test, self).__init__() + self.W = torch.nn.Parameter(torch.randn(2)) + self.seq = torch.nn.Sequential(torch.nn.BatchNorm1d(2, 2)) + self.linear = torch.nn.Linear(2, 2) + self.attr = torch.randn(2) + self.register_buffer('attr2', torch.randn(2)) + + def forward(self, x): + return self.linear(self.seq(self.W + self.attr + self.attr2 + x)) + + mod = symbolic_trace(Test()) + module_name = 'Foo' + import tempfile + from pathlib import Path + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir = Path(tmp_dir) + mod.to_folder(tmp_dir, module_name) + # Recipe taken from here: + # https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly + import importlib.util + spec = importlib.util.spec_from_file_location(module_name, tmp_dir / '__init__.py') + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + t = torch.randn(2, 2) + self.assertEqual(module.Foo()(t), mod(t)) + + if __name__ == "__main__": run_tests() diff --git 
a/torch/fx/graph_module.py b/torch/fx/graph_module.py index c593734eea4c..9becd6388f74 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -1,12 +1,17 @@ import torch +import torch.nn as nn import torch.overrides +from torch.nn.modules.module import _addindent import linecache -from typing import Type, Dict, List, Any, Union +from typing import Type, Dict, List, Any, Union, Optional from .graph import Graph import copy import sys import traceback import math +from pathlib import Path +import os +import warnings # normal exec loses the source code, however we can patch # the linecache module to still recover it. @@ -194,6 +199,60 @@ def graph(self, g) -> None: self._graph = g self.recompile() + + def to_folder(self, folder: Union[str, os.PathLike], module_name="FxModule"): + """Dumps out module to ``folder`` with ``module_name`` so that it can be + imported with ``from import `` + """ + folder = Path(folder) + Path(folder).mkdir(exist_ok=True) + torch.save(self.state_dict(), folder / 'state_dict.pt') + tab = " " * 4 + model_str = f""" +import torch +from torch.nn import * +class {module_name}(torch.nn.Module): + def __init__(self): + super().__init__() +""" + + def _gen_model_repr(module_name: str, module: torch.nn.Module) -> Optional[str]: + safe_reprs = [nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d] + if type(module) in safe_reprs: + return f"{module.__repr__()}" + else: + return None + + blobified_modules = [] + for module_name, module in self.named_children(): + module_str = _gen_model_repr(module_name, module) + if module_str is None: + module_file = folder / f'{module_name}.pt' + torch.save(module, module_file) + blobified_modules.append(module_name) + module_repr = module.__repr__().replace('\r', ' ').replace('\n', ' ') + module_str = f"torch.load(r'{module_file}') # {module_repr}" + model_str += f"{tab*2}self.{module_name} = {module_str}\n" + + for buffer_name, buffer in self._buffers.items(): + model_str += f"{tab*2}self.register_buffer('{buffer_name}', torch.empty({list(buffer.shape)}))\n" + + for param_name, param in self._parameters.items(): + model_str += f"{tab*2}self.{param_name} = torch.nn.Parameter(torch.empty({list(buffer.shape)}))\n" + + model_str += f"{tab*2}self.load_state_dict(torch.load(r'{folder}/state_dict.pt'))\n" + model_str += f"{_addindent(self.code, 4)}\n" + + module_file = folder / 'module.py' + module_file.write_text(model_str) + + init_file = folder / '__init__.py' + init_file.write_text('from .module import *') + + if len(blobified_modules) > 0: + warnings.warn("Was not able to save the following children modules as reprs -" + f"saved as pickled files instead: {blobified_modules}") + def recompile(self) -> None: """ Recompile this GraphModule from its `graph` attribute. This should be From 9e10e3b74f3b067872ca0a9b5630422e99bfd1e9 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 4 Dec 2020 18:33:16 -0800 Subject: [PATCH 072/132] [PyTorch] Move TensorImpl::shallow_copy_and_detach to .cpp file (#48680) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48680 It seems a bit long to put into the header (and is virtual anyway). 
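Generic illustration of the pattern applied here (names are made up, not the real TensorImpl code): keep only the declaration of a long virtual member in the header and define it in the .cpp, so header consumers no longer re-parse the body on every include.

```cpp
// widget.h
struct Widget {
  virtual ~Widget() = default;
  virtual int heavy_method() const;  // declaration only; the body lives in the .cpp
};

// widget.cpp  (would normally start with: #include "widget.h")
int Widget::heavy_method() const {
  // long body kept out of the header
  return 42;
}
```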
ghstack-source-id: 117894350 Test Plan: CI Reviewed By: bhosmer Differential Revision: D25259848 fbshipit-source-id: e3eed1f2483fc3c1ff51459159bf3bfed9d6f363 --- c10/core/TensorImpl.cpp | 15 +++++++++++++++ c10/core/TensorImpl.h | 13 +------------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 9f2ca1d2ca07..cc042f76a23b 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -289,6 +289,21 @@ c10::AutogradMetaInterface* TensorImpl::autograd_meta() const { return autograd_meta_.get(); } +c10::intrusive_ptr TensorImpl::shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const { + auto impl = c10::make_intrusive( + Storage(storage()), key_set_, data_type_); + copy_tensor_metadata( + /*src_impl=*/this, + /*dest_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + impl->refresh_contiguous(); + return impl; +} + void TensorImpl::copy_tensor_metadata( const TensorImpl* src_impl, TensorImpl* dest_impl, diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index da849b049b65..a42680e045cb 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -935,18 +935,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ virtual c10::intrusive_ptr shallow_copy_and_detach( const c10::VariableVersion& version_counter, - bool allow_tensor_metadata_change) const { - auto impl = c10::make_intrusive( - Storage(storage()), key_set_, data_type_); - copy_tensor_metadata( - /*src_impl=*/this, - /*dest_impl=*/impl.get(), - /*version_counter=*/version_counter, - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - impl->refresh_contiguous(); - return impl; - } + bool allow_tensor_metadata_change) const; /** * Shallow-copies data from another TensorImpl into this TensorImpl. From 3f10518defb40f9ef884493b71bbd5b6e43ba464 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 4 Dec 2020 18:33:16 -0800 Subject: [PATCH 073/132] [PyTorch] Add VariableVersion&& overload for TensorImpl::shallow_copy_and_detach (#48681) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48681 This should reduce reference counting traffic when creating views. The code duplication here is unfortunate and I'm open to suggestions on how to reduce it. It's especially regrettable that we create a footgun for subclasses of TensorImpl: they can accidentally override only one of the two overloads and get confusing behavior. 
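A simplified, standalone sketch of the overload pattern being added (stand-in names, not the real TensorImpl interface): the `&&` overload lets a caller hand over an already-owned version counter without reference-count traffic, and a subclass that overrides only one of the two overloads hides the other unless it re-exposes it.

```cpp
#include <memory>
#include <utility>

struct Version { std::shared_ptr<int> counter; };  // stand-in for a refcounted handle

struct Base {
  virtual ~Base() = default;
  // Copying bumps (and later drops) the reference count.
  virtual void detach(const Version& v) { v_ = v; }
  // Moving steals the handle: no refcount traffic.
  virtual void detach(Version&& v) { v_ = std::move(v); }
  Version v_;
};

struct Derived : Base {
  using Base::detach;  // keeps the && overload visible; omitting this is the footgun
  void detach(const Version& v) override { Base::detach(v); }
};

int main() {
  Derived d;
  Version v{std::make_shared<int>(0)};
  d.detach(v);             // copy path
  d.detach(std::move(v));  // move path, still reachable via the using-declaration
}
```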
ghstack-source-id: 117896685 Test Plan: internal benchmarks Reviewed By: ezyang Differential Revision: D25259741 fbshipit-source-id: 55f99b16b50f9791fdab85cbc81d7cd14e31c4cf --- aten/src/ATen/OpaqueTensorImpl.h | 39 +++++++++++++++++++++++++-- aten/src/ATen/SparseTensorImpl.h | 19 +++++++++++++ aten/src/ATen/quantized/QTensorImpl.h | 21 +++++++++++++++ c10/core/TensorImpl.cpp | 37 ++++++++++++++++++++++--- c10/core/TensorImpl.h | 33 +++++++++++++++++++++++ 5 files changed, 144 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index b00f80d232db..f91b701f8f54 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -86,14 +86,34 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl { auto impl = c10::make_intrusive>( key_set(), dtype(), device(), opaque_handle_, sizes_); copy_tensor_metadata( - /*src_impl=*/this, - /*dest_impl=*/impl.get(), + /*src_opaque_impl=*/this, + /*dest_opaque_impl=*/impl.get(), /*version_counter=*/version_counter, /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); impl->refresh_numel(); return impl; } + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive>( + key_set(), dtype(), device(), opaque_handle_, sizes_); + copy_tensor_metadata( + /*src_opaque_impl=*/this, + /*dest_opaque_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + /** * Shallow-copies data from another TensorImpl into this TensorImpl. * @@ -143,6 +163,21 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl { dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_; } + static void copy_tensor_metadata( + const OpaqueTensorImpl* src_opaque_impl, + OpaqueTensorImpl* dest_opaque_impl, + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_opaque_impl, + dest_opaque_impl, + std::move(version_counter), + allow_tensor_metadata_change); + + // OpaqueTensorImpl-specific fields. + dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_; + } + private: OpaqueHandle opaque_handle_; }; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index b8e6bb26bf7f..4373c9b97ac8 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -200,6 +200,25 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { return impl; } + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive(key_set(), dtype()); + copy_tensor_metadata( + /*src_impl=*/this, + /*dest_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + /** * Shallow-copies data from another TensorImpl into this TensorImpl. 
* diff --git a/aten/src/ATen/quantized/QTensorImpl.h b/aten/src/ATen/quantized/QTensorImpl.h index efce432d5863..1bd859e91435 100644 --- a/aten/src/ATen/quantized/QTensorImpl.h +++ b/aten/src/ATen/quantized/QTensorImpl.h @@ -51,6 +51,27 @@ struct CAFFE2_API QTensorImpl : public c10::TensorImpl { return impl; } + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive( + Storage(storage()), key_set(), data_type_, quantizer_); + copy_tensor_metadata( + /*src_impl=*/this, + /*dest_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + impl->refresh_contiguous(); + return impl; + } + /** * Shallow-copies data from another TensorImpl into this TensorImpl. * diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index cc042f76a23b..a9e8f1f6853f 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -304,10 +304,24 @@ c10::intrusive_ptr TensorImpl::shallow_copy_and_detach( return impl; } -void TensorImpl::copy_tensor_metadata( +c10::intrusive_ptr TensorImpl::shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const { + auto impl = c10::make_intrusive( + Storage(storage()), key_set_, data_type_); + copy_tensor_metadata( + /*src_impl=*/this, + /*dest_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + impl->refresh_contiguous(); + return impl; +} + +void TensorImpl::copy_tensor_metadata_except_version_counter( const TensorImpl* src_impl, TensorImpl* dest_impl, - const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) { dest_impl->storage_ = src_impl->storage_; dest_impl->sizes_ = src_impl->sizes_; @@ -324,13 +338,30 @@ void TensorImpl::copy_tensor_metadata( dest_impl->is_non_overlapping_and_dense_ = src_impl->is_non_overlapping_and_dense_; dest_impl->is_wrapped_number_ = src_impl->is_wrapped_number_; dest_impl->reserved_ = src_impl->reserved_; - dest_impl->set_version_counter(version_counter); dest_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); if (src_impl->named_tensor_meta_ != nullptr) { dest_impl->named_tensor_meta_ = src_impl->named_tensor_meta_->clone(); } } +void TensorImpl::copy_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) { + copy_tensor_metadata_except_version_counter(src_impl, dest_impl, allow_tensor_metadata_change); + dest_impl->set_version_counter(version_counter); +} + +void TensorImpl::copy_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) { + copy_tensor_metadata_except_version_counter(src_impl, dest_impl, allow_tensor_metadata_change); + dest_impl->set_version_counter(std::move(version_counter)); +} + namespace impl { namespace { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index a42680e045cb..269976a7e148 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -937,6 +937,16 @@ struct C10_API TensorImpl : 
public c10::intrusive_ptr_target { const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const; + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + virtual c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const; + /** * Shallow-copies data from another TensorImpl into this TensorImpl. * @@ -958,6 +968,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { version_counter_ = version_counter; } + void set_version_counter( + c10::VariableVersion&& version_counter) noexcept { + version_counter_ = std::move(version_counter); + } + const c10::VariableVersion& version_counter() const noexcept { return version_counter_; } @@ -1570,6 +1585,24 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change); + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / storage_offset) + * from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE [ TensorImpl Shallow-Copying ]. + */ + static void copy_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change); + +private: + static void copy_tensor_metadata_except_version_counter( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + bool allow_tensor_metadata_change); + protected: // Error message to show when the user tries to change tensor metadata on // Tensor created from .data or .detach(). From a3298c2f64b1534c93197aaf96ed84cf8db7cd95 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Fri, 4 Dec 2020 18:39:08 -0800 Subject: [PATCH 074/132] Implement JIT serialization of ProcessGroup (#48544) Summary: This diff enables JIT serialization of `ProcessGroup`, including both base `ProcessGroup` class and derived classes like `ProcessGroupNCCL`. If a `ProcessGroup` is created via high-level APIs like `dist_c10d.frontend().new_process_group_helper()`, they are automatically serializable. If a `ProcessGroup` is created via its derived class TorchBind APIs like `dist_c10d.ProcessGroupNCCL()`, then it has to be given a name and registered with `dist_c10d.frontend().register_process_group_name` to be uniquely identifiable and serializable. * Fixed a minor bug in new dist_c10d frontend which fails to check whether a process group is used or not * Fixed an issue where `test_jit_c10d.py` wasn't really run due to a configuration bug. 
Now tests are run as a slow test (need ci-all/* branch) Pull Request resolved: https://github.com/pytorch/pytorch/pull/48544 Reviewed By: wanchaol Differential Revision: D25298309 Pulled By: gmagogsfm fbshipit-source-id: ed27ce37373c88277dc0c78704c48d4c19d46d46 --- .jenkins/pytorch/multigpu-test.sh | 1 + .../check_backward_compatibility.py | 1 + test/distributed/test_jit_c10d.py | 116 ++++++++++++++++-- torch/csrc/distributed/c10d/init.cpp | 89 ++++++++++++-- torch/lib/c10d/frontend.cpp | 103 ++++++++++++++-- torch/lib/c10d/frontend.hpp | 13 +- 6 files changed, 287 insertions(+), 36 deletions(-) diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index fb5e6f54d013..9a2c486610c4 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -17,6 +17,7 @@ fi python tools/download_mnist.py --quiet -d test/cpp/api/mnist OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api +time python test/run_test.py --verbose -i distributed/test_jit_c10d time python test/run_test.py --verbose -i distributed/test_distributed_fork time python test/run_test.py --verbose -i distributed/test_c10d time python test/run_test.py --verbose -i distributed/test_c10d_spawn diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 24796d74ceaa..5949515fccbe 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -204,6 +204,7 @@ def allow_listed(schema, allow_list): dont_parse_list = [ ("_TorchScriptTesting.*", datetime.date(2099, 9, 17)), ("test_backend", datetime.date(2099, 9, 17)), + ("c10d.frontend", datetime.date(2020, 12, 30)), ] diff --git a/test/distributed/test_jit_c10d.py b/test/distributed/test_jit_c10d.py index 51579b6cd07f..85788b914059 100644 --- a/test/distributed/test_jit_c10d.py +++ b/test/distributed/test_jit_c10d.py @@ -3,11 +3,13 @@ from sys import platform import torch import torch.distributed as c10d +import time from typing import List import torch.testing._internal.common_utils as common from torch.testing._internal.common_distributed import requires_nccl, skip_if_rocm_single_process -from torch.testing._internal.common_utils import TestCase, load_tests, TEST_WITH_TSAN +from torch.testing._internal.common_utils import load_tests, TEST_WITH_TSAN, run_tests, IS_WINDOWS +from torch.testing._internal.jit_utils import JitTestCase # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -17,17 +19,24 @@ print('c10d not available, skipping tests', file=sys.stderr) sys.exit(0) - if platform == 'darwin': LOOPBACK = 'lo0' else: LOOPBACK = 'lo' +def unique_process_group_name(prefix): + # Append timestamp to process group name to make it unique, so + # that when tests run multiple times or in parallel there + # wouldn't be name conflicts. 
+ now = int(time.time() * 1000) + return "%s_%d" % (prefix, now) + @unittest.skipIf( TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment", ) -class ProcessGroupNCCLJitTest(TestCase): +@unittest.skipIf(IS_WINDOWS, "TCPStore not available on Windows") +class ProcessGroupNCCLJitTest(JitTestCase): MAIN_PROCESS_RANK = 0 def setUp(self): @@ -38,31 +47,33 @@ def setUp(self): if self.num_gpus < 2: raise unittest.SkipTest("NCCL test requires 2+ GPUs") - def _create_nccl_pg(self): + def _create_nccl_pg(self, name_prefix): addr = "localhost" port = common.find_free_port() tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) opts = torch.classes.dist_c10d.ProcessGroupNCCLOptions(0, True) - return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts) + name = unique_process_group_name(name_prefix) + + return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) - def _create_nccl_pg_as_base_process_group(self): + def _create_nccl_pg_as_base_process_group(self, name): addr = "localhost" port = common.find_free_port() tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) - return torch.classes.c10d.frontend().newProcessGroupHelper( - self.world_size, self.rank, [], "NCCL", tcp_store, "test_process_group", 0) + return torch.classes.dist_c10d.frontend().new_process_group_helper( + self.world_size, self.rank, [], "nccl", tcp_store, name, 0) @requires_nccl() @skip_if_rocm_single_process def test_init_process_group_nccl_torchbind(self): - self._create_nccl_pg() + self._create_nccl_pg("raw_process_group_nccl_torchbind") @requires_nccl() @skip_if_rocm_single_process def test_process_group_nccl_torchbind_alltoall(self): - nccl_pg = self._create_nccl_pg() + nccl_pg = self._create_nccl_pg("process_group_nccl_as_base_class") input = torch.rand(16).cuda() output = torch.rand(16).cuda() @@ -84,12 +95,14 @@ def run_pg_nccl_alltoall( @requires_nccl() @skip_if_rocm_single_process def test_init_process_group_nccl_as_base_process_group_torchbind(self): - self._create_nccl_pg_as_base_process_group() + name = unique_process_group_name("creation_test_process_group") + self._create_nccl_pg_as_base_process_group(name) @requires_nccl() @skip_if_rocm_single_process def test_process_group_nccl_as_base_process_group_torchbind_alltoall(self): - nccl_pg = self._create_nccl_pg_as_base_process_group() + name = unique_process_group_name("alltoall_test_process_group") + nccl_pg = self._create_nccl_pg_as_base_process_group(name) input = torch.rand(16).cuda() output = torch.rand(16).cuda() @@ -107,3 +120,82 @@ def run_pg_nccl_alltoall( return work.result() run_pg_nccl_alltoall(nccl_pg, output, input) + + @requires_nccl() + @skip_if_rocm_single_process + def test_process_group_nccl_serialization(self): + class TestModule(torch.nn.Module): + def __init__(self, pg_nccl): + super(TestModule, self).__init__() + self.pg = pg_nccl + + def forward(self, input: torch.Tensor): + if self.pg is None: + return input + 1 + else: + return input + 2 + + pg_nccl = self._create_nccl_pg("nccl_process_group_as_module_member") + self.checkModule(TestModule(pg_nccl), (torch.rand((2, 3)),)) + + +@unittest.skipIf(IS_WINDOWS, "TCPStore not available on Windows") +class C10dFrontendJitTest(JitTestCase): + def setUp(self): + self.rank = 0 + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) + self.num_gpus = torch.cuda.device_count() + if self.num_gpus < 2: + raise unittest.SkipTest("NCCL test requires 2+ 
GPUs") + + @requires_nccl() + @skip_if_rocm_single_process + def test_frontend_singleton(self): + frontend1 = torch.classes.dist_c10d.frontend() + frontend2 = torch.classes.dist_c10d.frontend() + + addr = "localhost" + port = common.find_free_port() + tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + + pg_name = unique_process_group_name("singleton_test_process_group") + + ProcessGroupNCCL1 = frontend1.new_process_group_helper( + self.world_size, self.rank, [], "nccl", tcp_store, pg_name, 0) + + ProcessGroupNCCL2 = frontend2.get_process_group_by_name(pg_name) + self.assertEqual(frontend2.get_name_of_process_group(ProcessGroupNCCL2), pg_name) + +@unittest.skipIf(IS_WINDOWS, "TCPStore not available on Windows") +class C10dProcessGroupSerialization(JitTestCase): + def setUp(self): + self.num_gpus = torch.cuda.device_count() + if self.num_gpus < 2: + raise unittest.SkipTest("NCCL test requires 2+ GPUs") + + @requires_nccl() + @skip_if_rocm_single_process + def test_process_group_as_module_member(self): + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + addr = "localhost" + port = common.find_free_port() + tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + + name = unique_process_group_name("module_member_process_group") + self.pg = torch.classes.dist_c10d.frontend().new_process_group_helper( + 1, 0, [], "nccl", tcp_store, name, 0) + + def forward(self, input: torch.Tensor): + if self.pg is None: + return input + 1 + else: + return input + 2 + + self.checkModule(TestModule(), (torch.rand((2, 3)),)) + + +if __name__ == "__main__": + run_tests() diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 58556c8831ab..14d0a373e83d 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1263,6 +1263,28 @@ static const auto ProcessGroupWorkTorchBind = // TODO: Support argument names in Python API. 
static const auto ProcessGroupTorchBind = torch::class_<::c10d::ProcessGroup>("dist_c10d", "ProcessGroup") + .def_pickle( + [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self) { + auto name = + ::c10d::DistributedC10d::get()->getNameOfProcessGroup(self); + return std::vector{name}; + }, + [](std::vector state) { + TORCH_CHECK( + state.size() == 1, + "Expecting exactly 1 state when restoring ProcessGroup, got: ", + state.size()); + const auto& process_group_name = state.front(); + auto process_group = + ::c10d::DistributedC10d::get()->getProcessGroupByName( + process_group_name); + TORCH_CHECK( + process_group.defined(), + "Needed process group not found, ", + "please create a process group with name: ", + process_group_name); + return process_group; + }) .def( "rank", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self) { @@ -1527,11 +1549,49 @@ static const auto ProcessGroupNCCLOptionsTorchBind = static const auto ProcessGroupNCCLTorchBind = torch::class_<::c10d::ProcessGroupNCCL>("dist_c10d", "ProcessGroupNCCL") - .def(torch::init< - const c10::intrusive_ptr<::c10d::Store>&, - int64_t, - int64_t, - const c10::intrusive_ptr<::c10d::ProcessGroupNCCL::Options>&>()) + .def_pickle( + [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + auto base_process_group = + static_intrusive_pointer_cast<::c10d::ProcessGroup>(self); + auto name = + ::c10d::DistributedC10d::get()->getNameOfProcessGroup(self); + return std::vector{name}; + }, + [](std::vector state) { + TORCH_CHECK( + state.size() == 1, + "Expecting exactly 1 state when restoring ProcessGroupNCCL, got: ", + state.size()); + const auto& process_group_name = state.front(); + auto base_process_group = + ::c10d::DistributedC10d::get()->getProcessGroupByName( + process_group_name); + TORCH_CHECK( + base_process_group.defined(), + "Needed process group not found, ", + "please create a process group with name: ", + process_group_name); + c10::intrusive_ptr<::c10d::ProcessGroupNCCL> process_group_nccl = + dynamic_intrusive_pointer_cast<::c10d::ProcessGroupNCCL>( + base_process_group); + TORCH_CHECK( + process_group_nccl.defined(), + "Process group ", + process_group_name, + " isn't configured for NCCL backend"); + return process_group_nccl; + }) + .def(torch::init( + [](const c10::intrusive_ptr<::c10d::Store>& store, + int64_t rank, + int64_t size, + c10::intrusive_ptr<::c10d::ProcessGroupNCCL::Options> options, + const std::string& name) { + auto pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>(store, rank, size, options); + ::c10d::DistributedC10d::get()->registerProcessGroupName( + pg, name); + return pg; + })) .def( "alltoall_base", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self, @@ -1549,14 +1609,17 @@ static const auto ProcessGroupNCCLTorchBind = #endif static const auto DistributedC10dFrontendTorchBind = - torch::class_<::c10d::DistributedC10d>("c10d", "frontend") - .def(torch::init([]() { - static c10::intrusive_ptr<::c10d::DistributedC10d> - c10d_frontend_singleton = - c10::make_intrusive<::c10d::DistributedC10d>(); - return c10d_frontend_singleton; - })) - .def("new_process_group_helper", &::c10d::DistributedC10d::newProcessGroupHelper); + torch::class_<::c10d::DistributedC10d>("dist_c10d", "frontend") + .def(torch::init([]() { return ::c10d::DistributedC10d::get(); })) + .def( + "new_process_group_helper", + &::c10d::DistributedC10d::newProcessGroupHelper) + .def( + "get_process_group_by_name", + &::c10d::DistributedC10d::getProcessGroupByName) + .def( + "get_name_of_process_group", + 
&::c10d::DistributedC10d::getNameOfProcessGroup); } // namespace diff --git a/torch/lib/c10d/frontend.cpp b/torch/lib/c10d/frontend.cpp index bcacd830277e..bb8eb2045a28 100644 --- a/torch/lib/c10d/frontend.cpp +++ b/torch/lib/c10d/frontend.cpp @@ -52,8 +52,7 @@ void assertReduceOpSupportsComplexTensor(ReduceOp op) { case ReduceOp::MAX: case ReduceOp::MIN: case ReduceOp::PRODUCT: - TORCH_CHECK( - false, + AT_ERROR( "all_reduce does not support requested Reduce op on complex tensors"); default: return; @@ -70,6 +69,61 @@ void Backend::registerBackend() { TORCH_CHECK(false, "Registering third-party backend is currently not supported by TorchScript-friendly c10d"); } +c10::intrusive_ptr DistributedC10d::get() { + static c10::intrusive_ptr singleton = + c10::make_intrusive(); + + return singleton; +} + +c10::intrusive_ptr DistributedC10d::getProcessGroupByName(const std::string& name) const { + auto it = std::find_if( + pg_names_.begin(), + pg_names_.end(), + [&](const std::pair, std::string>& + pg_name) { return pg_name.second == name; }); + + if (it == pg_names_.end()) { + std::stringstream error; + error << "Unable to find process group with name: "; + error << name; + error << " , instead we have "; + error << pg_names_.size() << " process groups: {"; + for (const auto& pg : pg_names_) { + error << static_cast(pg.first.get()); + error << " with name: "; + error << pg.second; + error << ", "; + } + error << "}"; + AT_ERROR(error.str()); + } + + TORCH_CHECK(it->first.defined(), "found a process group that's null"); + + return it->first; +} + +std::string DistributedC10d::getNameOfProcessGroup(const c10::intrusive_ptr& pg) const { + auto it = pg_names_.find(pg); + if (it == pg_names_.end()) { + std::stringstream error; + error << "Unable to find name of process group "; + error << static_cast(pg.get()); + error << "instead we have " << pg_names_.size() << " process groups: {"; + for (const auto& pg : pg_names_) { + error << static_cast(pg.first.get()); + error << " with name: "; + error << pg.second; + error << ", "; + } + error << "}"; + AT_ERROR(error.str()); + } + + return it->second; +} + c10::intrusive_ptr DistributedC10d::newProcessGroupHelper( const int64_t world_size, const int64_t rank, @@ -89,13 +143,13 @@ c10::intrusive_ptr DistributedC10d::newProcessGroupHelper( [&](const std::pair, std::string>& pg_name) { return pg_name.second == *group_name; }); - if (it == pg_names_.end()) { + if (it != pg_names_.end()) { throw std::runtime_error( "The specified group name has already been " "created, please use a different group name"); } - bool is_default_group = pg_group_ranks_.size() == 0; + bool is_default_group = (group_ranks.size() == 0); c10::intrusive_ptr pg; @@ -107,7 +161,7 @@ c10::intrusive_ptr DistributedC10d::newProcessGroupHelper( std::vector group_ranks_copy(group_ranks.begin(), group_ranks.end()); pg = ProcessGroupMPI::createProcessGroupMPI(group_ranks_copy); #else - throw std::runtime_error( + AT_ERROR( "Distributed package doesn't have MPI built in." 
" MPI is only included if you build PyTorch from" " source on a host that has MPI installed."); @@ -146,7 +200,10 @@ c10::intrusive_ptr DistributedC10d::newProcessGroupHelper( options.threads = options.devices.size() * 2; pg = c10::make_intrusive( prefix_store, rank, world_size, options); -#endif +#else + AT_ERROR( + "Attempting to create GLOO-based process group while GLOO is either not enabled or built"); +#endif // USE_C10D_GLOO } else if (backend == "nccl") { #ifdef USE_C10D_NCCL auto options = c10::make_intrusive(); @@ -155,10 +212,13 @@ c10::intrusive_ptr DistributedC10d::newProcessGroupHelper( options->opTimeout = timeout; pg = c10::make_intrusive( prefix_store, rank, world_size, options); -#endif +#else + AT_ERROR( + "Attempting to create NCCL-based process group while NCCL is either not enabled or built"); +#endif // USE_C10D_NCCL } else { // TODO: discuss to figure out how to extend this to third party backends? - return pg; + AT_ERROR("Unsupported backend type: ", backend); } } @@ -266,7 +326,7 @@ int64_t DistributedC10d::getGlobalRank( } } - TORCH_CHECK(false, "The group rank is not part of the group"); + AT_ERROR("The group rank is not part of the group"); } std::string DistributedC10d::getBackend( @@ -833,4 +893,29 @@ c10::intrusive_ptr DistributedC10d::barrier( return empty_work; } +void DistributedC10d::registerProcessGroupName(const c10::intrusive_ptr& process_group, const std::string& name) { + auto it = std::find_if( + pg_names_.begin(), + pg_names_.end(), + [&](const std::pair, std::string>& + pg_name) { return pg_name.second == name; }); + + if (it != pg_names_.end()) { + TORCH_CHECK( + it->first == process_group, + "Requested name already exists: ", + name, + " and it is associated with a different process group"); + return; + } + + it = pg_names_.find(process_group); + TORCH_CHECK( + it == pg_names_.end(), + "Given process group has been registered before with a different name: ", + it->second); + + pg_names_[process_group] = name; +} + } // namespace c10d diff --git a/torch/lib/c10d/frontend.hpp b/torch/lib/c10d/frontend.hpp index eb3a5b2c2037..642c59458f32 100644 --- a/torch/lib/c10d/frontend.hpp +++ b/torch/lib/c10d/frontend.hpp @@ -43,9 +43,11 @@ class Backend { std::unordered_set registered_backends_; }; -class DistributedC10d : public torch::CustomClassHolder { +class TORCH_PYTHON_API DistributedC10d : public torch::CustomClassHolder { public: - DistributedC10d(){}; + static c10::intrusive_ptr get(); + + DistributedC10d() = default; void initProcessGroup( const std::string& backend, @@ -210,6 +212,13 @@ class DistributedC10d : public torch::CustomClassHolder { c10::optional group_name, int64_t timeout_milisesonds); + c10::intrusive_ptr getProcessGroupByName( + const std::string& name) const; + + std::string getNameOfProcessGroup( + const c10::intrusive_ptr& pg) const; + + void registerProcessGroupName(const c10::intrusive_ptr& process_group, const std::string& name); private: From 02d89f9f1d7f32ebf7ec509d5c14b2f39690997a Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Fri, 4 Dec 2020 18:40:24 -0800 Subject: [PATCH 075/132] scatter_object_list API for c10d (#43930) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43930 Closes #23232. As part of addressing #23232, this PR adds support for scatter_object_list which is an API to scatter arbitrary picklable objects to all the other ranks. The implementation approach follows a similar approach as https://github.com/pytorch/pytorch/pull/42189. 
The result of the `scatter` is stored as the first element of `scatter_object_output_list`, and the src rank is expected to provide an input list `scatter_object_input_list` which contains the objects to scatter. Note that this API requires 1 broadcast and 2 scatters. This is because we must communicate the maximum object size to be scattered, which only the src rank knows about. After that, we also need to communicate the objects themselves as well as the true sizes of the object. Note that the API is designed to match the tensor-based collectives other than supporting async_op. For now, it is a blocking call. If we see demand to support async_op, we will have to make more progress on merging work/future to support this. It only works for Gloo because NCCL doesn't support scatter. ghstack-source-id: 117904065 Reviewed By: mrshenli Differential Revision: D23430686 fbshipit-source-id: f033b89cd82dadd194f2b036312a98423449c26b --- torch/distributed/distributed_c10d.py | 84 +++++++++++++++++++ .../_internal/distributed/distributed_test.py | 29 +++++++ 2 files changed, 113 insertions(+) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 48b132811839..13a950024af9 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1601,6 +1601,90 @@ def broadcast_object_list(object_list, src, group=group.WORLD): object_list[i] = _tensor_to_object(obj_view, obj_size) +def scatter_object_list( + scatter_object_output_list, scatter_object_input_list, src=0, group=group.WORLD +): + """ + Scatters picklable objects in ``scatter_object_input_list`` to the whole + group. Similar to :func:`scatter`, but Python objects can be passed in. On + each rank, the scattered object will be stored as the first element of + ``scatter_object_output_list``. Note that all objects in + ``scatter_object_input_list`` must be picklable in order to be scattered. + + Arguments: + scatter_object_output_list (List[Any]): Non-empty list whose first + element will store the object scattered to this rank. + scatter_object_input_list (List[Any]): List of input objects to scatter. + Each object must be picklable. Only objects on the ``src`` rank will + be scattered, and the argument can be ``None`` for non-src ranks. + src (int): Source rank from which to scatter + ``scatter_object_input_list``. + group: (ProcessGroup, optional): The process group to work on. + + Returns: + ``None``. If rank is part of the group, ``scatter_object_output_list`` + will have its first element set to the scattered object for this rank. + + .. note:: Note that this API differs slightly from the scatter collective + since it does not provide an ``async_op`` handle and thus will be a + blocking call. + + .. warning:: + :func:`scatter_object_list` uses ``pickle`` module implicitly, which + is known to be insecure. It is possible to construct malicious pickle + data which will execute arbitrary code during unpickling. Only call this + function with data you trust. + """ + if _rank_not_in_group(group): + return + + if ( + not isinstance(scatter_object_output_list, list) + or len(scatter_object_output_list) < 1 + ): + raise RuntimeError( + "Expected argument scatter_object_output_list to be a list of size at least 1." 
+ ) + + my_rank = get_rank(group) + if my_rank == src: + tensor_list, tensor_sizes = zip( + *[_object_to_tensor(obj) for obj in scatter_object_input_list] + ) + tensor_list, tensor_sizes = list(tensor_list), list(tensor_sizes) + + obj_tensor_size = torch.LongTensor([0]) + # Src rank broadcasts the maximum tensor size. This is because all ranks are + # expected to call into scatter() with equal-sized tensors. + if my_rank == src: + max_tensor_size = max(tensor_sizes) + for tensor in tensor_list: + tensor.resize_(max_tensor_size) + else: + max_tensor_size = torch.LongTensor([0]) + broadcast(max_tensor_size, src=src, group=group) + + # Scatter actual serialized objects + output_tensor = torch.ByteTensor(max_tensor_size.item()) + scatter( + output_tensor, + scatter_list=None if my_rank != src else tensor_list, + src=src, + group=group, + ) + + # Scatter per-object sizes to trim tensors when deserializing back to object + scatter( + obj_tensor_size, + scatter_list=None if my_rank != src else tensor_sizes, + src=src, + group=group, + ) + + # Deserialize back to object + scatter_object_output_list[0] = _tensor_to_object(output_tensor, obj_tensor_size) + + def all_gather(tensor_list, tensor, group=group.WORLD, diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 943eb24a0b5e..cbe8e9d630bf 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -4264,3 +4264,32 @@ def forward(self, x): ) if i == 1 else suppress(): loss = model(random_input).sum() loss.backward() + + @require_backend({"gloo"}) + @unittest.skipIf(BACKEND == "nccl", "NCCL does not support scatter") + def test_scatter_object_list(self): + src_rank = 0 + scatter_list = ( + collectives_object_test_list + if self.rank == src_rank + else [None for _ in collectives_object_test_list] + ) + world_size = dist.get_world_size() + scatter_list = scatter_list[: world_size] + i = 0 + while len(scatter_list) < world_size: + scatter_list.append(scatter_list[i]) + i += 1 + + output_obj_list = [None] + dist.scatter_object_list(output_obj_list, scatter_list, src=src_rank) + self.assertEqual( + output_obj_list[0], + collectives_object_test_list[self.rank % len(collectives_object_test_list)], + ) + # Ensure errors are raised upon incorrect arguments. + with self.assertRaisesRegex( + RuntimeError, + "Expected argument scatter_object_output_list to be a list of size at least 1.", + ): + dist.scatter_object_list([], scatter_list, src=src_rank) From f0f315c33bee6732dc2774c0c1e31e73afff06e1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 4 Dec 2020 20:45:58 -0800 Subject: [PATCH 076/132] [PyTorch] Inline RecordFunctionCallback::shouldRun (#48286) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48286 RecordFunction initialization is a hot path. shouldRun often does little enough work that the function prologue takes a significant proportion of its time. So, this diff forces it to be inline. 
ghstack-source-id: 117892387 Test Plan: FB-internal benchmarks Reviewed By: ezyang Differential Revision: D25108879 fbshipit-source-id: 7121413e714c5ca22c8bf10c1d2535a878c15aec --- aten/src/ATen/record_function.cpp | 97 +++++++++++++++++-------------- aten/src/ATen/record_function.h | 4 +- c10/macros/Macros.h | 8 +++ 3 files changed, 63 insertions(+), 46 deletions(-) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index e9c0f8455380..102931fd4aa7 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -32,21 +33,26 @@ thread_local uint64_t current_thread_id_ = 0; thread_local bool tls_record_function_enabled_ = true; // Low probability constant -const double kLowProb = 0.001; -thread_local int tries_left_ = 0; +static const double kLowProb = 0.001; +struct CoinflipTLS { + int tries_left_; + std::mt19937 genGeo_; + std::mt19937 genZeroOne_; + std::geometric_distribution distGeo_; + std::uniform_real_distribution distZeroOne_; + CoinflipTLS(); +}; + +CoinflipTLS::CoinflipTLS() + : tries_left_(0), genGeo_(std::random_device()()), genZeroOne_(std::random_device()()), distGeo_(kLowProb), distZeroOne_(0.0, 1.0) {} +thread_local CoinflipTLS coinflip_tls_; int sample_geometric() { - static thread_local auto gen = - std::make_unique(std::random_device()()); - std::geometric_distribution dist(kLowProb); - return dist(*gen); + return coinflip_tls_.distGeo_(coinflip_tls_.genGeo_); } double sample_zero_one() { - static thread_local auto gen = - std::make_unique(std::random_device()()); - std::uniform_real_distribution dist(0.0, 1.0); - return dist(*gen); + return coinflip_tls_.distZeroOne_(coinflip_tls_.genZeroOne_); } } // namespace @@ -117,6 +123,42 @@ class CallbackManager { return !rf_tls_.sorted_tls_callbacks_.empty(); } + // We need this function to be inlined: init() is a hot path and + // callbackShouldRun is even hotter because it's called multiple + // times per init(). Profiling shows that the function prologue is + // taking up a significant fraction of the time. + static bool C10_ALWAYS_INLINE callbackShouldRun(const RecordFunctionCallback& cb, RecordScope scope) { + // first check whether this callback is interested in + // the given scope type + if (!cb.checkScope(scope)) { + return false; + } + // if we have registered should_run_ function, use it + if (cb.should_run_) { + return cb.should_run_(cb); + } + + if (cb.sampling_prob_ == 1.0) { + return true; + } + // model the low probability events as events happening + // with probability kLowProb followed by another sampling with + // probability (sampling_prob__ / kLowProb), then replace the coin + // flip for kLowProb with a thread local number of tries tries_left_ + // sampled from the geometric distribution. 
+ if (cb.sampling_prob_ < kLowProb) { + if (coinflip_tls_.tries_left_ == 0) { + coinflip_tls_.tries_left_ = sample_geometric(); + return (sample_zero_one() < cb.sampling_prob_ / kLowProb); + } else { + --coinflip_tls_.tries_left_; + return false; + } + } else { + return (sample_zero_one() < cb.sampling_prob_); + } + } + // init is called by RecordFunction in constructor to // determine which thread local and global callbacks are going // to be executed and whether any of them need inputs @@ -125,7 +167,7 @@ class CallbackManager { bool found_needs_ids = false; for (const auto& cb: rf_tls_.sorted_tls_callbacks_) { - if (cb.first.shouldRun(scope)) { + if (callbackShouldRun(cb.first, scope)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -140,7 +182,7 @@ class CallbackManager { } for (const auto& cb: sorted_global_callbacks_) { - if (cb.first.shouldRun(scope)) { + if (callbackShouldRun(cb.first, scope)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -266,37 +308,6 @@ namespace { } } // namespace -bool RecordFunctionCallback::shouldRun(RecordScope scope) const { - // first check whether this callback is interested in - // the given scope type - if (!checkScope(scope)) { - return false; - } - // if we have registered should_run_ function, use it - if (should_run_) { - return should_run_(*this); - } - // otherwise potentially do the uniform sampling - if (sampling_prob_ != 1.0) { - // model the low probability events as events happening - // with prob. kLowProb followed by another sampling with - // prob. (sampling_prob_ / kLowProb), then replace the coin - // flip for kLowProb with a thread local number of tries tries_left_ - // sampled from the geometric distribution - if (sampling_prob_ < kLowProb) { - if (tries_left_ == 0) { - tries_left_ = sample_geometric(); - return (sample_zero_one() < sampling_prob_ / kLowProb); - } else { - --tries_left_; - return false; - } - } else { - return (sample_zero_one() < sampling_prob_); - } - } - return true; -} RecordFunctionCallbacks _getTLSCallbacks() { return rf_tls_.sorted_tls_callbacks_; diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 788a2a66a60c..4b07d13aa747 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -379,10 +379,8 @@ class TORCH_API RecordFunctionCallback { return end_; } - // whether the callbacks should run in the given scope - bool shouldRun(RecordScope scope) const; - private: + friend class CallbackManager; std::function(const RecordFunction&)> start_; std::function end_; bool(*should_run_)(const RecordFunctionCallback&) = nullptr; diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 521d3cc69860..46ff50621417 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -186,6 +186,14 @@ namespace at { namespace cuda { using namespace c10::hip; }} #define C10_NOINLINE #endif +#if __has_attribute(always_inline) || defined(__GNUC__) +#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline +#elif defined(_MSC_VER) +#define C10_ALWAYS_INLINE __forceinline +#else +#define C10_ALWAYS_INLINE inline +#endif + #include #include From af30a89068593b97fd68b66b944e7ad9570e33ef Mon Sep 17 00:00:00 2001 From: James Donald Date: Fri, 4 Dec 2020 21:11:43 -0800 Subject: [PATCH 077/132] [caffe2][a10] Remove unreferenced local variable e (#48601) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48601 Fix this spurious warning: ``` caffe2\aten\src\aten\core\ivalue_inl.h(412): warning C4101: 'e': unreferenced 
local variable ``` Test Plan: Local build & continuous integration Reviewed By: gmagogsfm Differential Revision: D25194281 fbshipit-source-id: 3ba469d1cbff6f16394b95c4c33d95efcaea5e3e --- aten/src/ATen/core/ivalue_inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 92718560aab5..3068bda5f5a5 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -409,7 +409,7 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { [fut](std::function cb) { try { fut->markCompleted(cb()); - } catch (std::exception& e) { + } catch (std::exception&) { fut->setError(std::current_exception()); } }, From 55b93735ac93803cfbf8ecf826a142637de2b11f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 4 Dec 2020 21:45:22 -0800 Subject: [PATCH 078/132] [PyTorch] Save refcount decrements in StaticRuntime::deallocate_registers (#48859) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48859 Code comment should explain what's going on. If not, please request changes. ghstack-source-id: 117889942 Test Plan: Internal benchmarks Reviewed By: hlu1 Differential Revision: D25288842 fbshipit-source-id: 6bddebb99c4744e2f7aceb279fdf995821404606 --- torch/csrc/jit/runtime/static/impl.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index bd1893290240..07d41fb1f642 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -585,8 +585,12 @@ void StaticRuntime::deallocate_registers(const std::vector& internals) { // they will be re-created in the next iteration regardless for (auto i : internals) { if (reg_[i].isTensor()) { - if (reg_[i].toTensor().storage().nbytes() > 0) { - reg_[i] = IValue(); + // If the tensor has no storage, we can keep it around. We + // implement by moving out of the register (leaving behind an + // empty IValue for free!) and possibly moving back. 
+ at::Tensor asTensor = std::move(reg_[i]).toTensor(); + if (asTensor.storage().nbytes() == 0) { + reg_[i] = std::move(asTensor); } } else { reg_[i] = IValue(); From 6317e0b2f1090ea4189e88557d4ff6656fb758cc Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 4 Dec 2020 23:13:33 -0800 Subject: [PATCH 079/132] [BE] Fix signed-unsigned warnings (#48848) Summary: Switch to range loops when possible Replace `ptrdiff_t`(signed type) with `size_t`(unsigned type) Pull Request resolved: https://github.com/pytorch/pytorch/pull/48848 Reviewed By: walterddr Differential Revision: D25338250 Pulled By: malfet fbshipit-source-id: e840618b113b8bc0d8bb067c2fdf06e3ec9233d4 --- aten/src/ATen/BatchingRegistrations.cpp | 4 ++-- aten/src/ATen/NamedTensorUtils.cpp | 4 ++-- aten/src/ATen/TensorIterator.cpp | 4 ++-- aten/src/ATen/TensorNames.cpp | 4 ++-- aten/src/ATen/native/Convolution.cpp | 8 ++++---- aten/src/ATen/native/ForeachOpsKernels.cpp | 2 +- aten/src/TH/generic/THStorage.cpp | 3 +-- aten/src/TH/generic/THStorageCopy.cpp | 5 ++--- caffe2/serialize/crc_alt.h | 8 ++++---- 9 files changed, 20 insertions(+), 22 deletions(-) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 16470f39ad54..0f9b31efefb9 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -941,8 +941,8 @@ Tensor new_empty_strided_batching_rule( size.size(), ") must match dimensionality of strides (", stride.size(), ")"); auto storage_size = native::storage_size_for(size, stride); - for (int64_t idx = 0; idx < physical_strides.size(); ++idx) { - physical_strides[idx] *= storage_size; + for (auto& physical_stride : physical_strides) { + physical_stride *= storage_size; } // physical_strides = [B1 * B2 * S, B2 * S, S] + strides diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 668838877123..5f8de486dc78 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -264,11 +264,11 @@ static std::vector compute_dot_product_outnames( } std::vector outnames(num_outnames, Dimname::wildcard()); int64_t index = 0; - for (int64_t j = 0; j < tensor_names.size(); ++j) { + for (size_t j = 0; j < tensor_names.size(); ++j) { if (j == tensor_dotted_dim) continue; outnames[index++] = tensor_names[j]; } - for (int64_t j = 0; j < other_names.size(); ++j) { + for (size_t j = 0; j < other_names.size(); ++j) { if (j == other_dotted_dim) continue; outnames[index++] = other_names[j]; } diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 43acc9a070d5..0f18d941feff 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -939,8 +939,8 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso } void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { - for (int i = 0; i < config.tensors_.size(); i++) { - operands_.emplace_back(std::move(config.tensors_[i])); + for (auto& tensor: config.tensors_) { + operands_.emplace_back(std::move(tensor)); } num_outputs_ = config.num_outputs_; } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index 844ff4ba2bad..a7dc0bd68036 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -61,10 +61,10 @@ TensorNames::TensorNames(ArrayRef names, int64_t start, int64_t end) { } TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const char* op_name) { - int64_t size_diff = 
std::labs(names_.size() - other.names_.size()); + size_t size_diff = std::labs(names_.size() - other.names_.size()); if (names_.size() > other.names_.size()) { - for (int64_t idx = size_diff; idx < names_.size(); ++idx) { + for (size_t idx = size_diff; idx < names_.size(); ++idx) { names_[idx] = names_[idx].unify(other.names_[idx - size_diff], op_name); } } else { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 6dbf1e5535ed..801925214a99 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -177,13 +177,13 @@ auto ConvParams::needs_64bit_indexing_no_split(const at::Tensor& input, const at int64_t outsize = 1; if (transposed) { std::vector o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups); - for (int64_t i = 1; i < o.size(); i++) { - outsize *= o[i]; + for (const auto& e: o) { + outsize *= e; } } else { std::vector o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation); - for (int64_t i = 1; i < o.size(); i++) { - outsize *= o[i]; + for (const auto& e: o) { + outsize *= e; } } return outsize > int_max; diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 5fbc1506bfaa..a4a796ca26d9 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -201,7 +201,7 @@ std::vector foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList \ std::vector result; \ result.reserve(tensors1.size()); \ - for (int i = 0; i < tensors1.size(); i++) { \ + for (size_t i = 0; i < tensors1.size(); i++) { \ result.emplace_back(at::NAME(tensors1[i], tensors2[i])); \ } \ \ diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 2db795719557..a085f31c740f 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -115,10 +115,9 @@ void THStorage_(resizeBytes)(THStorage* storage, ptrdiff_t size_bytes) { void THStorage_(fill)(THStorage *storage, scalar_t value) { - ptrdiff_t i; auto type_meta = caffe2::TypeMeta::Make(); size_t numel = storage->nbytes() / type_meta.itemsize(); - for (i = 0; i < numel; i++) + for (size_t i = 0; i < numel; i++) THStorage_(data)(storage)[i] = value; } diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index dc19deea7652..2d6ec8a05eb6 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -8,7 +8,7 @@ void THStorage_(copy)(THStorage *storage, THStorage *src) scalar_t *scalar_src = THStorage_(data)(src); scalar_t *data = THStorage_(data)(storage); uint64_t numel = storage->nbytes() / sizeof(scalar_t); - for (ptrdiff_t i = 0; i < numel; ++i) { + for (uint64_t i = 0; i < numel; ++i) { data[i] = scalar_src[i]; } } @@ -19,11 +19,10 @@ void THStorage_(copy)(THStorage *storage, THStorage *src) #define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)( \ THStorage * storage, TH##TYPENAMESRC##Storage * src) { \ - ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ uint64_t numel = storage->nbytes() / sizeof(scalar_t); \ - for (i = 0; i < numel; i++) \ + for (uint64_t i = 0; i < numel; i++) \ data[i] = static_cast(src_data[i]); \ } diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h index be51083fec0e..e7c986ff89fb 100644 --- a/caffe2/serialize/crc_alt.h +++ b/caffe2/serialize/crc_alt.h @@ -680,12 
+680,12 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) // put operator for one zero bit in odd odd[0] = Polynomial; // CRC-32 polynomial - for (int i = 1; i < CrcBits; i++) + for (uint32_t i = 1; i < CrcBits; i++) odd[i] = 1 << (i - 1); // put operator for two zero bits in even // same as gf2_matrix_square(even, odd); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = odd[i]; even[i] = 0; @@ -695,7 +695,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) } // put operator for four zero bits in odd // same as gf2_matrix_square(odd, even); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = even[i]; odd[i] = 0; @@ -711,7 +711,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) for (; lengthB > 0; lengthB >>= 1) { // same as gf2_matrix_square(a, b); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = b[i]; a[i] = 0; From 7439bc4dd61b0d7b252bc441113fa9de53446cca Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 5 Dec 2020 01:38:35 -0800 Subject: [PATCH 080/132] [Gradient Compression] Add an index field to GradBucket for PowerSGD (#48757) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48757 Add an index field to GradBucekt, so error_dict is keyed by this index instead of the hashcode of input tensor. The replacement will be done in a separate diff, as the definition of this new method somehow couldn't be recognized in the OSS version. Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 117939208 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl Reviewed By: rohan-varma Differential Revision: D25288496 fbshipit-source-id: 6f71977809690a0367e408bd59601ee62c9c03ea --- torch/csrc/distributed/c10d/init.cpp | 13 ++++++++++--- torch/lib/c10d/comm.hpp | 11 ++++++++++- torch/lib/c10d/reducer.cpp | 3 ++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 14d0a373e83d..54fc33e54424 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -163,7 +163,8 @@ PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { } auto torch_C_m = py::handle(torch_C_module).cast(); - auto m = torch_C_m.def_submodule("_distributed_c10d", "distributed c10d bindings"); + auto m = + torch_C_m.def_submodule("_distributed_c10d", "distributed c10d bindings"); auto module = py::handle(m).cast(); @@ -184,14 +185,20 @@ PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { shared_ptr_class_<::c10d::GradBucket>(module, "_GradBucket") .def( py::init< + size_t, const std::vector&, const std::vector&, const std::vector&, const std::vector&>(), + py::arg("index"), py::arg("tensors"), py::arg("offsets"), py::arg("lengths"), py::arg("sizes_list")) + .def( + "get_index", + &::c10d::GradBucket::getIndex, + py::call_guard()) .def( "get_tensors", &::c10d::GradBucket::getTensors, @@ -1095,7 +1102,8 @@ that adds a prefix to each key inserted to the store. 
&::c10d::ProcessGroup::Work::wait, py::arg("timeout") = kNoTimeout, py::call_guard()) - .def("get_future", + .def( + "get_future", [](::c10d::ProcessGroup::Work& work) -> std::shared_ptr { return std::make_shared(work.getFuture()); @@ -1259,7 +1267,6 @@ static const auto ProcessGroupWorkTorchBind = }) .def("result", &::c10d::ProcessGroup::Work::result); - // TODO: Support argument names in Python API. static const auto ProcessGroupTorchBind = torch::class_<::c10d::ProcessGroup>("dist_c10d", "ProcessGroup") diff --git a/torch/lib/c10d/comm.hpp b/torch/lib/c10d/comm.hpp index e1bde1f03ec0..3a39baccc953 100644 --- a/torch/lib/c10d/comm.hpp +++ b/torch/lib/c10d/comm.hpp @@ -20,15 +20,22 @@ void broadcast_coalesced( class GradBucket { public: explicit GradBucket( + size_t index, const std::vector& tensors, const std::vector& offsets = {}, const std::vector& lengths = {}, const std::vector& sizes_vec = {}) - : tensors_(tensors), + : index_(index), + tensors_(tensors), offsets_(offsets), lengths_(lengths), sizes_vec_(sizes_vec) {} + // Returns the index of the bucket, which is unique across all the buckets. + size_t getIndex() const { + return index_; + } + // Each tensor in the list that getTensors returns refers to the replica on // each device. There will be multiple replicas only in the case of single // process multiple device mode. In the single process single device mode, @@ -37,6 +44,7 @@ class GradBucket { return tensors_; } + // Returns a mutable tensor vector compared with the above method. std::vector& getTensorsRef() { return tensors_; } @@ -58,6 +66,7 @@ class GradBucket { } private: + size_t index_; std::vector tensors_; // Per-variable info in tensors_[0]. diff --git a/torch/lib/c10d/reducer.cpp b/torch/lib/c10d/reducer.cpp index d0edd904ca94..56d427909155 100644 --- a/torch/lib/c10d/reducer.cpp +++ b/torch/lib/c10d/reducer.cpp @@ -2,11 +2,11 @@ #include -#include #include #include #include #include +#include #include #include #include @@ -713,6 +713,7 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { bucket.work = process_group_->allreduce(tensors); } else { GradBucket grad_bucket( + next_bucket_, tensors, // Since currently we do not support single-process multiple-device // mode, we can assume only one replica in the bucket. From 5180caeeb49f8bdf1ec58e37a40e10691d0f4c92 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sat, 5 Dec 2020 04:11:00 -0800 Subject: [PATCH 081/132] Remove deprecated spectral ops from torch namespace (#48594) Summary: Ref https://github.com/pytorch/pytorch/issues/42175 This removes the 4 deprecated spectral functions: `torch.{fft,rfft,ifft,irfft}`. `torch.fft` is also now imported by by default. The actual `at::native` functions are still used in `torch.stft` so can't be full removed yet. But will once https://github.com/pytorch/pytorch/issues/47601 has been merged. 
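For call sites still using the removed ops, a rough migration sketch (assumes a real input `x` of shape `(..., n)` and default normalization; the old ops used a trailing dimension of size 2 for real/imaginary parts, while the `torch.fft` functions use native complex tensors, so `view_as_real` / `view_as_complex` bridge the two layouts):

```python
import torch

x = torch.randn(8, 16)
n = x.shape[-1]

# torch.rfft(x, 1)  ->
X = torch.view_as_real(torch.fft.rfft(x))

# torch.irfft(X, 1, signal_sizes=(n,))  ->
y = torch.fft.irfft(torch.view_as_complex(X), n=n)
```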
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48594 Reviewed By: heitorschueroff Differential Revision: D25298929 Pulled By: mruberry fbshipit-source-id: e36737fe8192fcd16f7e6310f8b49de478e63bf0 --- aten/src/ATen/native/SpectralOps.cpp | 24 +- aten/src/ATen/native/native_functions.yaml | 16 - docs/source/fft.rst | 5 - docs/source/notes/cuda.rst | 2 +- docs/source/tensors.rst | 4 - docs/source/torch.rst | 4 - .../check_backward_compatibility.py | 4 + test/cpp/api/fft.cpp | 10 - test/test_autograd.py | 93 +---- test/test_spectral_ops.py | 118 ++---- torch/__init__.py | 2 +- torch/_tensor_docs.py | 27 -- torch/_torch_docs.py | 381 ------------------ torch/csrc/api/include/torch/all.h | 1 + torch/fft/__init__.py | 18 - torch/overrides.py | 23 +- 16 files changed, 67 insertions(+), 665 deletions(-) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 52f9f14e5d34..d1fabaec6093 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -645,46 +645,30 @@ void _cufft_clear_plan_cache(int64_t device_index) { detail::getCUDAHooks().cuFFTClearPlanCache(device_index); } -Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { - TORCH_WARN_ONCE( - "The function torch.fft is deprecated and will be removed in PyTorch 1.8. " - "Use the new torch.fft module functions, instead, by importing torch.fft " - "and calling torch.fft.fft or torch.fft.fftn."); +static Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, /* onesided */ false); } -Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { - TORCH_WARN_ONCE( - "The function torch.ifft is deprecated and will be removed in a future " - "PyTorch release. Use the new torch.fft module functions, instead, by " - "importing torch.fft and calling torch.fft.ifft or torch.fft.ifftn."); +static Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ true, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, /* onesided */ false); } -Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, +static Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided) { - TORCH_WARN_ONCE( - "The function torch.rfft is deprecated and will be removed in a future " - "PyTorch release. Use the new torch.fft module functions, instead, by " - "importing torch.fft and calling torch.fft.fft or torch.fft.rfft."); return _fft(self, signal_ndim, /* complex_input */ false, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, onesided); } -Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, +static Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided, IntArrayRef signal_sizes) { - TORCH_WARN_ONCE( - "The function torch.irfft is deprecated and will be removed in a future " - "PyTorch release. 
Use the new torch.fft module functions, instead, by " - "importing torch.fft and calling torch.fft.ifft or torch.fft.irfft."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ false, /* inverse */ true, signal_sizes, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c7c1dc33d112..4d8ea72761af 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2047,18 +2047,6 @@ dispatch: CPU, CUDA: native_group_norm_backward -- func: ifft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor - use_c10_dispatcher: full - variants: function, method - -- func: rfft(Tensor self, int signal_ndim, bool normalized=False, bool onesided=True) -> Tensor - use_c10_dispatcher: full - variants: function, method - -- func: irfft(Tensor self, int signal_ndim, bool normalized=False, bool onesided=True, int[] signal_sizes=[]) -> Tensor - use_c10_dispatcher: full - variants: function, method - - func: _fft_with_size(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, bool normalized, bool onesided, int[] output_sizes) -> Tensor use_c10_dispatcher: full variants: function @@ -9396,10 +9384,6 @@ use_c10_dispatcher: full variants: function -- func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor - use_c10_dispatcher: full - variants: function, method - ## Functions for linear algebra and the torch.linalg namespace # Note [linalg namespace binding] # Functions in the linalg python module should have their names start with diff --git a/docs/source/fft.rst b/docs/source/fft.rst index cd947ce6faa3..dfce2503f70d 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -1,16 +1,11 @@ .. role:: hidden :class: hidden-section -.. _torch-fft-module: - torch.fft ========= Discrete Fourier transforms and related functions. -To use these functions the torch.fft module must be imported since its name -conflicts with the :func:`torch.fft` function. - .. automodule:: torch.fft :noindex: diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index b7a4004756b0..6deea675f265 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -266,7 +266,7 @@ cuFFT plan cache ---------------- For each CUDA device, an LRU cache of cuFFT plans is used to speed up repeatedly -running FFT methods (e.g., :func:`torch.fft`) on CUDA tensors of same geometry +running FFT methods (e.g., :func:`torch.fft.fft`) on CUDA tensors of same geometry with same configuration. Because some cuFFT plans may allocate GPU memory, these caches have a maximum capacity. diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index f523d604ade7..b2e25189540b 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -315,7 +315,6 @@ view of a storage and defines numeric operations on it. .. automethod:: exponential_ .. automethod:: fix .. automethod:: fix_ - .. automethod:: fft .. automethod:: fill_ .. automethod:: flatten .. automethod:: flip @@ -359,7 +358,6 @@ view of a storage and defines numeric operations on it. .. automethod:: igamma_ .. automethod:: igammac .. automethod:: igammac_ - .. automethod:: ifft .. automethod:: index_add_ .. automethod:: index_add .. automethod:: index_copy_ @@ -375,7 +373,6 @@ view of a storage and defines numeric operations on it. .. automethod:: int .. automethod:: int_repr .. automethod:: inverse - .. 
automethod:: irfft .. automethod:: isclose .. automethod:: isfinite .. automethod:: isinf @@ -538,7 +535,6 @@ view of a storage and defines numeric operations on it. .. automethod:: resize_as_ .. automethod:: retain_grad :noindex: - .. automethod:: rfft .. automethod:: roll .. automethod:: rot90 .. automethod:: round diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 98934e5e9849..2bb6c0204395 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -423,10 +423,6 @@ Spectral Ops :toctree: generated :nosignatures: - fft - ifft - rfft - irfft stft istft bartlett_window diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 5949515fccbe..ccb4a6457537 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -183,6 +183,10 @@ ("__caffe2::BBoxTransform", datetime.date(2020, 11, 30)), ("__caffe2::GenerateProposals", datetime.date(2020, 11, 30)), ("__caffe2::RoIAlignRotated", datetime.date(2020, 11, 30)), + ("aten::fft", datetime.date(2021, 1, 31)), + ("aten::ifft", datetime.date(2021, 1, 31)), + ("aten::irfft", datetime.date(2021, 1, 31)), + ("aten::rfft", datetime.date(2021, 1, 31)), ] def allow_listed(schema, allow_list): diff --git a/test/cpp/api/fft.cpp b/test/cpp/api/fft.cpp index f8f9d5f1d906..e78e358862e6 100644 --- a/test/cpp/api/fft.cpp +++ b/test/cpp/api/fft.cpp @@ -4,16 +4,6 @@ #include -// Tests that the fft function can be called as usual -TEST(FFTTest, unclobbered_fft) { - auto t = torch::randn({64, 2}, torch::dtype(torch::kDouble)); - torch::fft(t, 1); -} - -// Clobbers torch::fft the function with torch::fft the namespace -#include - - // Naive DFT of a 1 dimensional tensor torch::Tensor naive_dft(torch::Tensor x, bool forward=true) { TORCH_INTERNAL_ASSERT(x.dim() == 1); diff --git a/test/test_autograd.py b/test/test_autograd.py index 125fa7a41ba9..dfcc7221e528 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -29,7 +29,7 @@ record_function, emit_nvtx) import torch.autograd.functional as autogradF from torch.utils.checkpoint import checkpoint -from torch.testing._internal.common_utils import (TEST_MKL, TEST_WITH_ROCM, TestCase, run_tests, skipIfNoLapack, +from torch.testing._internal.common_utils import (TEST_WITH_ROCM, TestCase, run_tests, skipIfNoLapack, suppress_warnings, slowTest, load_tests, random_symmetric_matrix, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) @@ -57,9 +57,6 @@ def getattr_qualified(obj, qname, default=None): e.g. 
getattr(torch, 'fft.rfft') """ path = qname.split('.') - if len(path) > 1 and path[0] == 'fft': - import torch.fft # noqa: F401 - for name in path: obj = getattr(obj, name, _END_SENTINEL) if obj is _END_SENTINEL: @@ -2772,94 +2769,6 @@ def func(A, upper): for upper, dims in product([True, False], [(3, 3), (5, 5)]): _test_with_size(upper, dims) - @unittest.skipIf(not TEST_MKL, "PyTorch is built without MKL support") - def test_fft_ifft_rfft_irfft(self): - def _test_complex(sizes, signal_ndim): - x = torch.randn(sizes, requires_grad=True, dtype=torch.double) - - for normalized in (True, False): - def fft(x): - return x.fft(signal_ndim, normalized=normalized) - - gradcheck(fft, [x]) - gradgradcheck(fft, [x], gen_non_contig_grad_outputs=True) - - def ifft(fx): - return fx.ifft(signal_ndim, normalized=normalized) - - # Use output of fft(x) for inverse fft, due to symmetry requirements - fx = fft(x).detach() - fx.requires_grad = True - gradcheck(ifft, [fx]) - gradgradcheck(ifft, [fx], gen_non_contig_grad_outputs=True) - - def _test_real(sizes, signal_ndim): - x = torch.randn(sizes, requires_grad=True, dtype=torch.double) - if x.dim() == signal_ndim: - start_dim = 0 - else: - start_dim = 1 - signal_sizes = x.size()[start_dim:start_dim + signal_ndim] - - for normalized, onesided in product((True, False), repeat=2): - def rfft(x): - return x.rfft(signal_ndim, normalized=normalized, onesided=onesided) - - gradcheck(rfft, [x]) - gradgradcheck(rfft, [x], gen_non_contig_grad_outputs=True) - - # Generally speaking, irfft itself won't and can't pass the - # current gradcheck as it assumes the input follows conjugate - # symmetry, an requirement that is never true with our point - # numerical Jacobian estimate. Without input symmtry, irfft's - # behavior is undefined. - # - # Even onesided results can't remove all redundancy. For - # example, consider the .select(last_signal_dim, 0) slice. - # It is entirely represented in the onesided results (except - # for 1D), and will be reflected onto itself! - # - # So only 1D onesided irfft should pass grad check as it is - # guaranteed that the input has no symmetrical values. - # - # In other cases, we test a function that first uses rfft to - # generate a tensor that follows the conjugate symmetry irfft - # expects, and then feeds it into irfft. Since rfft is already - # tested above, we thereby verify the correctness of irfft. - if signal_ndim == 1 and onesided: - def irfft(fx): - return fx.irfft(signal_ndim, normalized=normalized, - onesided=onesided, signal_sizes=signal_sizes) - - # Use output of rfft(x) for inverse rfft, due to symmetry requirements - fx = rfft(x).detach() - fx.requires_grad = True - gradcheck(irfft, [fx]) - gradgradcheck(irfft, [fx], gen_non_contig_grad_outputs=True) - else: - # Test this function: f(x) = ifft(rfft(x) + rfft(z)), where - # z is some fixed tensor of same size as x. rfft(z) term is - # needed because otherwise f becomes identity. 
- z = torch.randn(sizes, dtype=torch.double) - fz = z.rfft(signal_ndim, normalized=normalized, onesided=onesided) - - def rfft_irfft(x): - fx = x.rfft(signal_ndim, normalized=normalized, onesided=onesided) - y = fx + fz - return y.irfft(signal_ndim, normalized=normalized, - onesided=onesided, signal_sizes=signal_sizes) - - gradcheck(rfft_irfft, [x]) - gradgradcheck(rfft_irfft, [x], gen_non_contig_grad_outputs=True) - - _test_real((2, 10), 1) - _test_real((2, 3, 4), 2) - _test_real((2, 3, 4, 3), 3) - - _test_complex((2, 2, 10, 2), 1) - _test_complex((1, 2, 3, 4, 2), 2) - _test_complex((2, 1, 3, 4, 3, 2), 3) - def test_gradcheck_fail_when_no_differentiable_outputs_and_num_grad_not_zero(self): def autograd_fn(input): output = torch.detach(input) diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 919feb74d171..9310a6448ef5 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -23,10 +23,6 @@ if TEST_LIBROSA: import librosa -# saves the torch.fft function that's clobbered by importing the torch.fft module -fft_fn = torch.fft -import torch.fft - def _complex_stft(x, *args, **kwargs): # Transform real and imaginary components separably @@ -100,21 +96,6 @@ def _stft_reference(x, hop_length, window): class TestFFT(TestCase): exact_dtype = True - @skipCPUIfNoMkl - @skipCUDAIfRocm - def test_fft_function_clobbered(self, device): - t = torch.randn((100, 2), device=device) - eager_result = fft_fn(t, 1) - - def method_fn(t): - return t.fft(1) - scripted_method_fn = torch.jit.script(method_fn) - - self.assertEqual(scripted_method_fn(t), eager_result) - - with self.assertRaisesRegex(TypeError, "'module' object is not callable"): - torch.fft(t, 1) - @skipCPUIfNoMkl @skipCUDAIfRocm @onlyOnCPUAndCUDA @@ -690,53 +671,36 @@ def test_fftshift_frequencies(self, device, dtype): # Legacy fft tests def _test_fft_ifft_rfft_irfft(self, device, dtype): + complex_dtype = { + torch.float16: torch.complex32, + torch.float32: torch.complex64, + torch.float64: torch.complex128 + }[dtype] + def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): - x = prepro_fn(torch.randn(*sizes, dtype=dtype, device=device)) - for normalized in (True, False): - res = x.fft(signal_ndim, normalized=normalized) - rec = res.ifft(signal_ndim, normalized=normalized) + x = prepro_fn(torch.randn(*sizes, dtype=complex_dtype, device=device)) + dim = tuple(range(-signal_ndim, 0)) + for norm in ('ortho', None): + res = torch.fft.fftn(x, dim=dim, norm=norm) + rec = torch.fft.ifftn(res, dim=dim, norm=norm) self.assertEqual(x, rec, atol=1e-8, rtol=0, msg='fft and ifft') - res = x.ifft(signal_ndim, normalized=normalized) - rec = res.fft(signal_ndim, normalized=normalized) + res = torch.fft.ifftn(x, dim=dim, norm=norm) + rec = torch.fft.fftn(res, dim=dim, norm=norm) self.assertEqual(x, rec, atol=1e-8, rtol=0, msg='ifft and fft') def _test_real(sizes, signal_ndim, prepro_fn=lambda x: x): x = prepro_fn(torch.randn(*sizes, dtype=dtype, device=device)) signal_numel = 1 signal_sizes = x.size()[-signal_ndim:] - for normalized, onesided in product((True, False), repeat=2): - res = x.rfft(signal_ndim, normalized=normalized, onesided=onesided) - if not onesided: # check Hermitian symmetry - def test_one_sample(res, test_num=10): - idxs_per_dim = [torch.LongTensor(test_num).random_(s).tolist() for s in signal_sizes] - for idx in zip(*idxs_per_dim): - reflected_idx = tuple((s - i) % s for i, s in zip(idx, res.size())) - idx_val = res.__getitem__(idx) - reflected_val = res.__getitem__(reflected_idx) - 
self.assertEqual(idx_val[0], reflected_val[0], msg='rfft hermitian symmetry on real part') - self.assertEqual(idx_val[1], -reflected_val[1], msg='rfft hermitian symmetry on imaginary part') - if len(sizes) == signal_ndim: - test_one_sample(res) - else: - output_non_batch_shape = res.size()[-(signal_ndim + 1):] - flatten_batch_res = res.view(-1, *output_non_batch_shape) - nb = flatten_batch_res.size(0) - test_idxs = torch.LongTensor(min(nb, 4)).random_(nb) - for test_idx in test_idxs.tolist(): - test_one_sample(flatten_batch_res[test_idx]) - # compare with C2C - xc = torch.stack([x, torch.zeros_like(x)], -1) - xc_res = xc.fft(signal_ndim, normalized=normalized) - self.assertEqual(res, xc_res) - test_input_signal_sizes = [signal_sizes] - rec = res.irfft(signal_ndim, normalized=normalized, - onesided=onesided, signal_sizes=signal_sizes) + dim = tuple(range(-signal_ndim, 0)) + for norm in (None, 'ortho'): + res = torch.fft.rfftn(x, dim=dim, norm=norm) + rec = torch.fft.irfftn(res, s=signal_sizes, dim=dim, norm=norm) self.assertEqual(x, rec, atol=1e-8, rtol=0, msg='rfft and irfft') - if not onesided: # check that we can use C2C ifft - rec = res.ifft(signal_ndim, normalized=normalized) - self.assertEqual(x, rec.select(-1, 0), atol=1e-8, rtol=0, msg='twosided rfft and ifft real') - self.assertEqual(rec.select(-1, 1).abs().mean(), 0, atol=1e-8, - rtol=0, msg='twosided rfft and ifft imaginary') + res = torch.fft.fftn(x, dim=dim, norm=norm) + rec = torch.fft.ifftn(res, dim=dim, norm=norm) + x_complex = torch.complex(x, torch.zeros_like(x)) + self.assertEqual(x_complex, rec, atol=1e-8, rtol=0, msg='fft and ifft (from real)') # contiguous case _test_real((100,), 1) @@ -746,12 +710,12 @@ def test_one_sample(res, test_num=10): _test_real((50, 40, 70), 3) _test_real((30, 1, 50, 25, 20), 3) - _test_complex((100, 2), 1) - _test_complex((100, 100, 2), 1) - _test_complex((100, 100, 2), 2) - _test_complex((1, 20, 80, 60, 2), 2) - _test_complex((50, 40, 70, 2), 3) - _test_complex((6, 5, 50, 25, 20, 2), 3) + _test_complex((100,), 1) + _test_complex((100, 100), 1) + _test_complex((100, 100), 2) + _test_complex((1, 20, 80, 60), 2) + _test_complex((50, 40, 70), 3) + _test_complex((6, 5, 50, 25, 20), 3) # non-contiguous case _test_real((165,), 1, lambda x: x.narrow(0, 25, 100)) # input is not aligned to complex type @@ -761,20 +725,10 @@ def test_one_sample(res, test_num=10): _test_real((65, 80, 115), 3, lambda x: x[10:60, 13:53, 10:80]) _test_real((30, 20, 50, 25), 3, lambda x: x.transpose(1, 2).transpose(2, 3)) - _test_complex((2, 100), 1, lambda x: x.t()) - _test_complex((100, 2), 1, lambda x: x.expand(100, 100, 2)) - _test_complex((300, 200, 3), 2, lambda x: x[:100, :100, 1:]) # input is not aligned to complex type - _test_complex((20, 90, 110, 2), 2, lambda x: x[:, 5:85].narrow(2, 5, 100)) - _test_complex((40, 60, 3, 80, 2), 3, lambda x: x.transpose(2, 0).select(0, 2)[5:55, :, 10:]) - _test_complex((30, 55, 50, 22, 2), 3, lambda x: x[:, 3:53, 15:40, 1:21]) - - # non-contiguous with strides not representable as aligned with complex type - _test_complex((50,), 1, lambda x: x.as_strided([5, 5, 2], [3, 2, 1])) - _test_complex((50,), 1, lambda x: x.as_strided([5, 5, 2], [4, 2, 2])) - _test_complex((50,), 1, lambda x: x.as_strided([5, 5, 2], [4, 3, 1])) - _test_complex((50,), 2, lambda x: x.as_strided([5, 5, 2], [3, 3, 1])) - _test_complex((50,), 2, lambda x: x.as_strided([5, 5, 2], [4, 2, 2])) - _test_complex((50,), 2, lambda x: x.as_strided([5, 5, 2], [4, 3, 1])) + _test_complex((100,), 1, lambda x: x.expand(100, 
100)) + _test_complex((20, 90, 110), 2, lambda x: x[:, 5:85].narrow(2, 5, 100)) + _test_complex((40, 60, 3, 80), 3, lambda x: x.transpose(2, 0).select(0, 2)[5:55, :, 10:]) + _test_complex((30, 55, 50, 22), 3, lambda x: x[:, 3:53, 15:40, 1:21]) @skipCUDAIfRocm @skipCPUIfNoMkl @@ -825,7 +779,7 @@ def plan_cache_max_size(device, n): # Test that different GPU has different cache x0 = torch.randn(2, 3, 3, device=devices[0]) x1 = x0.to(devices[1]) - self.assertEqual(x0.rfft(2), x1.rfft(2)) + self.assertEqual(torch.fft.rfftn(x0, dim=(-2, -1)), torch.fft.rfftn(x1, dim=(-2, -1))) # If a plan is used across different devices, the following line (or # the assert above) would trigger illegal memory access. Other ways # to trigger the error include @@ -1117,18 +1071,18 @@ def test_fft_input_modification(self, device): signal = torch.ones((2, 2, 2), device=device) signal_copy = signal.clone() - spectrum = signal.fft(2) + spectrum = torch.fft.fftn(signal, dim=(-2, -1)) self.assertEqual(signal, signal_copy) spectrum_copy = spectrum.clone() - _ = torch.ifft(spectrum, 2) + _ = torch.fft.ifftn(spectrum, dim=(-2, -1)) self.assertEqual(spectrum, spectrum_copy) - half_spectrum = torch.rfft(signal, 2) + half_spectrum = torch.fft.rfftn(signal, dim=(-2, -1)) self.assertEqual(signal, signal_copy) half_spectrum_copy = half_spectrum.clone() - _ = torch.irfft(half_spectrum_copy, 2, signal_sizes=(2, 2)) + _ = torch.fft.irfftn(half_spectrum_copy, s=(2, 2), dim=(-2, -1)) self.assertEqual(half_spectrum, half_spectrum_copy) @onlyOnCPUAndCUDA diff --git a/torch/__init__.py b/torch/__init__.py index 49049583f5ba..403c192b47e9 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -568,7 +568,7 @@ def _assert(condition, message): import torch.cuda import torch.autograd from torch.autograd import no_grad, enable_grad, set_grad_enabled -# import torch.fft # TODO: enable once torch.fft() is removed +import torch.fft import torch.futures import torch.nn import torch.nn.intrinsic diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index ef7d71586d32..79ce982da5e9 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -4356,33 +4356,6 @@ def callable(a, b) -> number See :func:`torch.istft` """) -add_docstr_all('fft', r""" -fft(signal_ndim, normalized=False) -> Tensor - -See :func:`torch.fft` -""") - -add_docstr_all('ifft', - r""" -ifft(signal_ndim, normalized=False) -> Tensor - -See :func:`torch.ifft` -""") - -add_docstr_all('rfft', - r""" -rfft(signal_ndim, normalized=False, onesided=True) -> Tensor - -See :func:`torch.rfft` -""") - -add_docstr_all('irfft', - r""" -irfft(signal_ndim, normalized=False, onesided=True, signal_sizes=None) -> Tensor - -See :func:`torch.irfft` -""") - add_docstr_all('det', r""" det() -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 5cc796dfaf7d..e57d44e41525 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -9375,387 +9375,6 @@ def merge_dicts(*dicts): .. _[2]: https://www.jstor.org/stable/2156365 """) -add_docstr(torch.fft, r""" -fft(input, signal_ndim, normalized=False) -> Tensor - -Complex-to-complex Discrete Fourier Transform. - -.. warning:: - The function :func:`torch.fft` is deprecated and will be removed in - PyTorch 1.8. Use the new :ref:`torch.fft ` module - functions, instead, by importing :ref:`torch.fft ` and - calling :func:`torch.fft.fft` or :func:`torch.fft.fftn`. - -This method computes the complex-to-complex discrete Fourier transform. -Ignoring the batch dimensions, it computes the following expression: - -.. 
math:: - X[\omega_1, \dots, \omega_d] = - \sum_{n_1=0}^{N_1-1} \dots \sum_{n_d=0}^{N_d-1} x[n_1, \dots, n_d] - e^{-j\ 2 \pi \sum_{i=0}^d \frac{\omega_i n_i}{N_i}}, - -where :math:`d` = :attr:`signal_ndim` is number of dimensions for the -signal, and :math:`N_i` is the size of signal dimension :math:`i`. - -This method supports 1D, 2D and 3D complex-to-complex transforms, indicated -by :attr:`signal_ndim`. :attr:`input` must be a tensor with last dimension -of size 2, representing the real and imaginary components of complex -numbers, and should have at least ``signal_ndim + 1`` dimensions with optionally -arbitrary number of leading batch dimensions. If :attr:`normalized` is set to -``True``, this normalizes the result by dividing it with -:math:`\sqrt{\prod_{i=1}^K N_i}` so that the operator is unitary. - -Returns the real and the imaginary parts together as one tensor of the same -shape of :attr:`input`. - -The inverse of this function is :func:`~torch.ifft`. - -.. note:: - For CUDA tensors, an LRU cache is used for cuFFT plans to speed up - repeatedly running FFT methods on tensors of same geometry with same - configuration. See :ref:`cufft-plan-cache` for more details on how to - monitor and control the cache. - -.. warning:: - If the torch.fft module is imported then "torch.fft" will refer to the - module and not this function. Use :meth:`torch.Tensor.fft` instead. - -.. warning:: - Due to limited dynamic range of half datatype, performing this operation in half - precision may cause the first element of result to overflow for certain inputs. - -.. warning:: - For CPU tensors, this method is currently only available with MKL. Use - :func:`torch.backends.mkl.is_available` to check if MKL is installed. - -Arguments: - input (Tensor): the input tensor of at least :attr:`signal_ndim` ``+ 1`` - dimensions - signal_ndim (int): the number of dimensions in each signal. - :attr:`signal_ndim` can only be 1, 2 or 3 - normalized (bool, optional): controls whether to return normalized results. - Default: ``False`` - -Returns: - Tensor: A tensor containing the complex-to-complex Fourier transform result - -Example:: - - >>> # unbatched 2D FFT - >>> x = torch.randn(4, 3, 2) - >>> torch.fft(x, 2) - tensor([[[-0.0876, 1.7835], - [-2.0399, -2.9754], - [ 4.4773, -5.0119]], - - [[-1.5716, 2.7631], - [-3.8846, 5.2652], - [ 0.2046, -0.7088]], - - [[ 1.9938, -0.5901], - [ 6.5637, 6.4556], - [ 2.9865, 4.9318]], - - [[ 7.0193, 1.1742], - [-1.3717, -2.1084], - [ 2.0289, 2.9357]]]) - >>> # batched 1D FFT - >>> torch.fft(x, 1) - tensor([[[ 1.8385, 1.2827], - [-0.1831, 1.6593], - [ 2.4243, 0.5367]], - - [[-0.9176, -1.5543], - [-3.9943, -2.9860], - [ 1.2838, -2.9420]], - - [[-0.8854, -0.6860], - [ 2.4450, 0.0808], - [ 1.3076, -0.5768]], - - [[-0.1231, 2.7411], - [-0.3075, -1.7295], - [-0.5384, -2.0299]]]) - >>> # arbitrary number of batch dimensions, 2D FFT - >>> x = torch.randn(3, 3, 5, 5, 2) - >>> y = torch.fft(x, 2) - >>> y.shape - torch.Size([3, 3, 5, 5, 2]) - -""") - -add_docstr(torch.ifft, r""" -ifft(input, signal_ndim, normalized=False) -> Tensor - -Complex-to-complex Inverse Discrete Fourier Transform. - -.. warning:: - The function :func:`torch.ifft` is deprecated and will be removed in a - future PyTorch release. Use the new :ref:`torch.fft ` - module functions, instead, by importing :ref:`torch.fft ` - and calling :func:`torch.fft.ifft` or :func:`torch.fft.ifftn`. - -This method computes the complex-to-complex inverse discrete Fourier -transform. 
Ignoring the batch dimensions, it computes the following -expression: - -.. math:: - X[\omega_1, \dots, \omega_d] = - \frac{1}{\prod_{i=1}^d N_i} \sum_{n_1=0}^{N_1-1} \dots \sum_{n_d=0}^{N_d-1} x[n_1, \dots, n_d] - e^{\ j\ 2 \pi \sum_{i=0}^d \frac{\omega_i n_i}{N_i}}, - -where :math:`d` = :attr:`signal_ndim` is number of dimensions for the -signal, and :math:`N_i` is the size of signal dimension :math:`i`. - -The argument specifications are almost identical with :func:`~torch.fft`. -However, if :attr:`normalized` is set to ``True``, this instead returns the -results multiplied by :math:`\sqrt{\prod_{i=1}^d N_i}`, to become a unitary -operator. Therefore, to invert a :func:`~torch.fft`, the :attr:`normalized` -argument should be set identically for :func:`~torch.fft`. - -Returns the real and the imaginary parts together as one tensor of the same -shape of :attr:`input`. - -The inverse of this function is :func:`~torch.fft`. - -.. note:: - For CUDA tensors, an LRU cache is used for cuFFT plans to speed up - repeatedly running FFT methods on tensors of same geometry with same - configuration. See :ref:`cufft-plan-cache` for more details on how to - monitor and control the cache. - -.. warning:: - Due to limited dynamic range of half datatype, performing this operation in half - precision may cause the first element of result to overflow for certain inputs. - -.. warning:: - For CPU tensors, this method is currently only available with MKL. Use - :func:`torch.backends.mkl.is_available` to check if MKL is installed. - -Arguments: - input (Tensor): the input tensor of at least :attr:`signal_ndim` ``+ 1`` - dimensions - signal_ndim (int): the number of dimensions in each signal. - :attr:`signal_ndim` can only be 1, 2 or 3 - normalized (bool, optional): controls whether to return normalized results. - Default: ``False`` - -Returns: - Tensor: A tensor containing the complex-to-complex inverse Fourier transform result - -Example:: - - >>> x = torch.randn(3, 3, 2) - >>> x - tensor([[[ 1.2766, 1.3680], - [-0.8337, 2.0251], - [ 0.9465, -1.4390]], - - [[-0.1890, 1.6010], - [ 1.1034, -1.9230], - [-0.9482, 1.0775]], - - [[-0.7708, -0.8176], - [-0.1843, -0.2287], - [-1.9034, -0.2196]]]) - >>> y = torch.fft(x, 2) - >>> torch.ifft(y, 2) # recover x - tensor([[[ 1.2766, 1.3680], - [-0.8337, 2.0251], - [ 0.9465, -1.4390]], - - [[-0.1890, 1.6010], - [ 1.1034, -1.9230], - [-0.9482, 1.0775]], - - [[-0.7708, -0.8176], - [-0.1843, -0.2287], - [-1.9034, -0.2196]]]) - -""") - -add_docstr(torch.rfft, r""" -rfft(input, signal_ndim, normalized=False, onesided=True) -> Tensor - -Real-to-complex Discrete Fourier Transform. - -.. warning:: - The function :func:`torch.rfft` is deprecated and will be removed in a - future PyTorch release. Use the new :ref:`torch.fft ` - module functions, instead, by importing :ref:`torch.fft ` - and calling :func:`torch.fft.rfft` for one-sided output, or - :func:`torch.fft.fft` for two-sided output. - -This method computes the real-to-complex discrete Fourier transform. It is -mathematically equivalent with :func:`~torch.fft` with differences only in -formats of the input and output. - -This method supports 1D, 2D and 3D real-to-complex transforms, indicated -by :attr:`signal_ndim`. :attr:`input` must be a tensor with at least -``signal_ndim`` dimensions with optionally arbitrary number of leading batch -dimensions. 
If :attr:`normalized` is set to ``True``, this normalizes the result -by dividing it with :math:`\sqrt{\prod_{i=1}^K N_i}` so that the operator is -unitary, where :math:`N_i` is the size of signal dimension :math:`i`. - -The real-to-complex Fourier transform results follow conjugate symmetry: - -.. math:: - X[\omega_1, \dots, \omega_d] = X^*[N_1 - \omega_1, \dots, N_d - \omega_d], - -where the index arithmetic is computed modulus the size of the corresponding -dimension, :math:`\ ^*` is the conjugate operator, and -:math:`d` = :attr:`signal_ndim`. :attr:`onesided` flag controls whether to avoid -redundancy in the output results. If set to ``True`` (default), the output will -not be full complex result of shape :math:`(*, 2)`, where :math:`*` is the shape -of :attr:`input`, but instead the last dimension will be halfed as of size -:math:`\lfloor \frac{N_d}{2} \rfloor + 1`. - -The inverse of this function is :func:`~torch.irfft`. - -.. note:: - For CUDA tensors, an LRU cache is used for cuFFT plans to speed up - repeatedly running FFT methods on tensors of same geometry with same - configuration. See :ref:`cufft-plan-cache` for more details on how to - monitor and control the cache. - -.. warning:: - Due to limited dynamic range of half datatype, performing this operation in half - precision may cause the first element of result to overflow for certain inputs. - -.. warning:: - For CPU tensors, this method is currently only available with MKL. Use - :func:`torch.backends.mkl.is_available` to check if MKL is installed. - -Arguments: - input (Tensor): the input tensor of at least :attr:`signal_ndim` dimensions - signal_ndim (int): the number of dimensions in each signal. - :attr:`signal_ndim` can only be 1, 2 or 3 - normalized (bool, optional): controls whether to return normalized results. - Default: ``False`` - onesided (bool, optional): controls whether to return half of results to - avoid redundancy. Default: ``True`` - -Returns: - Tensor: A tensor containing the real-to-complex Fourier transform result - -Example:: - - >>> x = torch.randn(5, 5) - >>> torch.rfft(x, 2).shape - torch.Size([5, 3, 2]) - >>> torch.rfft(x, 2, onesided=False).shape - torch.Size([5, 5, 2]) - -""") - - -add_docstr(torch.irfft, r""" -irfft(input, signal_ndim, normalized=False, onesided=True, signal_sizes=None) -> Tensor - -Complex-to-real Inverse Discrete Fourier Transform. - -.. warning:: - The function :func:`torch.irfft` is deprecated and will be removed in a - future PyTorch release. Use the new :ref:`torch.fft ` - module functions, instead, by importing :ref:`torch.fft ` - and calling :func:`torch.fft.irfft` for one-sided input, or - :func:`torch.fft.ifft` for two-sided input. - -This method computes the complex-to-real inverse discrete Fourier transform. -It is mathematically equivalent with :func:`ifft` with differences only in -formats of the input and output. - -The argument specifications are almost identical with :func:`~torch.ifft`. -Similar to :func:`~torch.ifft`, if :attr:`normalized` is set to ``True``, -this normalizes the result by multiplying it with -:math:`\sqrt{\prod_{i=1}^K N_i}` so that the operator is unitary, where -:math:`N_i` is the size of signal dimension :math:`i`. - -.. note:: - Due to the conjugate symmetry, :attr:`input` do not need to contain the full - complex frequency values. Roughly half of the values will be sufficient, as - is the case when :attr:`input` is given by :func:`~torch.rfft` with - ``rfft(signal, onesided=True)``. 
In such case, set the :attr:`onesided` - argument of this method to ``True``. Moreover, the original signal shape - information can sometimes be lost, optionally set :attr:`signal_sizes` to be - the size of the original signal (without the batch dimensions if in batched - mode) to recover it with correct shape. - - Therefore, to invert an :func:`~torch.rfft`, the :attr:`normalized` and - :attr:`onesided` arguments should be set identically for :func:`~torch.irfft`, - and preferably a :attr:`signal_sizes` is given to avoid size mismatch. See the - example below for a case of size mismatch. - - See :func:`~torch.rfft` for details on conjugate symmetry. - -The inverse of this function is :func:`~torch.rfft`. - -.. warning:: - Generally speaking, input to this function should contain values - following conjugate symmetry. Note that even if :attr:`onesided` is - ``True``, often symmetry on some part is still needed. When this - requirement is not satisfied, the behavior of :func:`~torch.irfft` is - undefined. Since :func:`torch.autograd.gradcheck` estimates numerical - Jacobian with point perturbations, :func:`~torch.irfft` will almost - certainly fail the check. - -.. note:: - For CUDA tensors, an LRU cache is used for cuFFT plans to speed up - repeatedly running FFT methods on tensors of same geometry with same - configuration. See :ref:`cufft-plan-cache` for more details on how to - monitor and control the cache. - -.. warning:: - Due to limited dynamic range of half datatype, performing this operation in half - precision may cause the first element of result to overflow for certain inputs. - -.. warning:: - For CPU tensors, this method is currently only available with MKL. Use - :func:`torch.backends.mkl.is_available` to check if MKL is installed. - -Arguments: - input (Tensor): the input tensor of at least :attr:`signal_ndim` ``+ 1`` - dimensions - signal_ndim (int): the number of dimensions in each signal. - :attr:`signal_ndim` can only be 1, 2 or 3 - normalized (bool, optional): controls whether to return normalized results. - Default: ``False`` - onesided (bool, optional): controls whether :attr:`input` was halfed to avoid - redundancy, e.g., by :func:`rfft`. Default: ``True`` - signal_sizes (list or :class:`torch.Size`, optional): the size of the original - signal (without batch dimension). 
Default: ``None`` - -Returns: - Tensor: A tensor containing the complex-to-real inverse Fourier transform result - -Example:: - - >>> x = torch.randn(4, 4) - >>> torch.rfft(x, 2, onesided=True).shape - torch.Size([4, 3, 2]) - >>> - >>> # notice that with onesided=True, output size does not determine the original signal size - >>> x = torch.randn(4, 5) - - >>> torch.rfft(x, 2, onesided=True).shape - torch.Size([4, 3, 2]) - >>> - >>> # now we use the original shape to recover x - >>> x - tensor([[-0.8992, 0.6117, -1.6091, -0.4155, -0.8346], - [-2.1596, -0.0853, 0.7232, 0.1941, -0.0789], - [-2.0329, 1.1031, 0.6869, -0.5042, 0.9895], - [-0.1884, 0.2858, -1.5831, 0.9917, -0.8356]]) - >>> y = torch.rfft(x, 2, onesided=True) - >>> torch.irfft(y, 2, onesided=True, signal_sizes=x.shape) # recover x - tensor([[-0.8992, 0.6117, -1.6091, -0.4155, -0.8346], - [-2.1596, -0.0853, 0.7232, 0.1941, -0.0789], - [-2.0329, 1.1031, 0.6869, -0.5042, 0.9895], - [-0.1884, 0.2858, -1.5831, 0.9917, -0.8356]]) - -""") - - add_docstr(torch.hann_window, """ hann_window(window_length, periodic=True, *, dtype=None, \ diff --git a/torch/csrc/api/include/torch/all.h b/torch/csrc/api/include/torch/all.h index 5bcc8eec93ab..5717bccf6017 100644 --- a/torch/csrc/api/include/torch/all.h +++ b/torch/csrc/api/include/torch/all.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/torch/fft/__init__.py b/torch/fft/__init__.py index 437c1b11eef2..7efdb04a52d3 100644 --- a/torch/fft/__init__.py +++ b/torch/fft/__init__.py @@ -49,7 +49,6 @@ Example: - >>> import torch.fft >>> t = torch.arange(4) >>> t tensor([0, 1, 2, 3]) @@ -87,7 +86,6 @@ Example: - >>> import torch.fft >>> t = torch.tensor([ 6.+0.j, -2.+2.j, -2.+0.j, -2.-2.j]) >>> torch.fft.ifft(t) tensor([0.+0.j, 1.+0.j, 2.+0.j, 3.+0.j]) @@ -133,7 +131,6 @@ Example: - >>> import torch.fft >>> x = torch.rand(10, 10, dtype=torch.complex64) >>> fft2 = torch.fft.fft2(t) @@ -177,7 +174,6 @@ Example: - >>> import torch.fft >>> x = torch.rand(10, 10, dtype=torch.complex64) >>> ifft2 = torch.fft.ifft2(t) @@ -229,7 +225,6 @@ Example: - >>> import torch.fft >>> x = torch.rand(10, 10, dtype=torch.complex64) >>> fftn = torch.fft.fftn(t) @@ -272,7 +267,6 @@ Example: - >>> import torch.fft >>> x = torch.rand(10, 10, dtype=torch.complex64) >>> ifftn = torch.fft.ifftn(t) @@ -314,7 +308,6 @@ Example: - >>> import torch.fft >>> t = torch.arange(4) >>> t tensor([0, 1, 2, 3]) @@ -376,7 +369,6 @@ Example: - >>> import torch.fft >>> t = torch.arange(5) >>> t tensor([0, 1, 2, 3, 4]) @@ -433,7 +425,6 @@ Example: - >>> import torch.fft >>> t = torch.rand(10, 10) >>> rfft2 = torch.fft.rfft2(t) >>> rfft2.size() @@ -506,7 +497,6 @@ Example: - >>> import torch.fft >>> t = torch.rand(10, 9) >>> T = torch.fft.rfft2(t) @@ -564,7 +554,6 @@ Example: - >>> import torch.fft >>> t = torch.rand(10, 10) >>> rfftn = torch.fft.rfftn(t) >>> rfftn.size() @@ -636,7 +625,6 @@ Example: - >>> import torch.fft >>> t = torch.rand(10, 9) >>> T = torch.fft.rfftn(t) @@ -714,7 +702,6 @@ Taking a real-valued frequency signal and bringing it into the time domain gives Hermitian symmetric output: - >>> import torch.fft >>> t = torch.arange(5) >>> t tensor([0, 1, 2, 3, 4]) @@ -769,7 +756,6 @@ Example: - >>> import torch.fft >>> t = torch.arange(5) >>> t tensor([0, 1, 2, 3, 4]) @@ -817,7 +803,6 @@ Example: - >>> import torch.fft >>> torch.fft.fftfreq(5) tensor([ 0.0000, 0.2000, 0.4000, -0.4000, -0.2000]) @@ -861,7 +846,6 @@ Example: - >>> import torch.fft >>> torch.fft.rfftfreq(5) tensor([ 0.0000, 
0.2000, 0.4000]) @@ -903,7 +887,6 @@ Example: - >>> import torch.fft >>> f = torch.fft.fftfreq(4) >>> f tensor([ 0.0000, 0.2500, -0.5000, -0.2500]) @@ -947,7 +930,6 @@ Example: - >>> import torch.fft >>> f = torch.fft.fftfreq(5) >>> f tensor([ 0.0000, 0.2000, 0.4000, -0.4000, -0.2000]) diff --git a/torch/overrides.py b/torch/overrides.py index f96549612812..e8a3933a1954 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -132,6 +132,8 @@ def get_ignored_functions() -> Set[Callable]: torch.empty_strided, torch.empty_quantized, torch.eye, + torch.fft.fftfreq, + torch.fft.rfftfreq, torch.from_file, torch.full, torch.hamming_window, @@ -389,8 +391,24 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.fbgemm_pack_quantized_matrix: lambda input, a, b: -1, torch.feature_alpha_dropout: lambda input, p, train: -1, torch.feature_dropout: lambda input, p, train: -1, + torch.fft.fft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.ifft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.rfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.irfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.hfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.ihfft: lambda input, n=None, dim=-1, norm=None: -1, + torch.fft.fftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.ifftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.rfftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.irfftn: lambda input, s=None, dim=None, norm=None: -1, + torch.fft.fft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.ifft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.rfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.irfft2: lambda input, s=None, dim=(-2, -1), norm=None: -1, + torch.fft.fftshift: lambda input, dim=None: -1, + torch.fft.ifftshift: lambda input, dim=None: -1, + torch.fft.fft: lambda input, n=None, dim=-1, norm=None: -1, torch.fix: lambda input, out=None: -1, - torch.fft: lambda input, signal_ndim, normalized=False: -1, torch.flatten: lambda input, start_dim=0, end_dim=-1: -1, torch.flip: lambda input, dims: -1, torch.fliplr: lambda input: -1, @@ -427,7 +445,6 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.hspmm: lambda mat1, mat2, out=None: -1, torch.hstack: lambda tensors, out=None: -1, torch.hypot: lambda input, other, out=None: -1, - torch.ifft: lambda input, signal_ndim, normalized=False: -1, torch.igamma: lambda input, other, out=None: -1, torch.igammac: lambda input, other, out=None: -1, torch.imag: lambda input, out=None: -1, @@ -445,7 +462,6 @@ def get_testing_overrides() -> Dict[Callable, Callable]: cudnn_enabled: -1), torch.int_repr: lambda input: -1, torch.inverse: lambda input, out=None: -1, - torch.irfft: lambda input, signal_ndim, normalized=False, onesided=True, signal_sizes=None: -1, torch.is_complex: lambda input: -1, torch.is_distributed: lambda input: -1, torch.is_floating_point: lambda input: -1, @@ -732,7 +748,6 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.renorm: lambda input, p, dim, maxnorm, out=None: -1, torch.repeat_interleave: lambda input, dim=None: -1, torch.reshape: lambda input, shape: -1, - torch.rfft: lambda input, signal_ndim, normalized=False, onesided=True: -1, torch.rnn_relu: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1, torch.rnn_relu_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1, torch.rnn_tanh: lambda input, hx, params, 
has_biases, num_layers, dropout, train, bidirectional, batch_first: -1, From 799b700ada6cb6815d716779bdebf56c5f7516ea Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Sat, 5 Dec 2020 06:07:21 -0800 Subject: [PATCH 082/132] add a unit test for lack of devices (#48858) Summary: add a unit test for the situation where devices do not have enough memory Pull Request resolved: https://github.com/pytorch/pytorch/pull/48858 Reviewed By: malfet, gcatron Differential Revision: D25341254 Pulled By: scottxu0730 fbshipit-source-id: c0524c22717b6c8afd67f5b0ad0f1851b973e4b7 --- test/test_fx_experimental.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 13928208316c..57201ded332e 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -141,6 +141,26 @@ def forward(self, a, b): self.assertEqual(traced(a, b), module_with_submodules(a, b)) assert dag.nodes[0].logical_device_ids == [0] + def test_lack_of_devices(self): + class TestModule(torch.nn.Module): + def forward(self, a, b): + return a + b + + m = TestModule() + traced = symbolic_trace(m) + a = torch.rand(4) + b = torch.rand(4) + graph_manipulation.get_size_of_all_nodes(traced, [a, b]) + partitioner = Partitioner() + devices = [Device("dev_0", 4, 0), Device("dev_1", 4, 1)] + partitioner_config = PartitionerConfig(devices, PartitionMode.size_based) + catch_runtime_error = False + try: + ret = partitioner.partition_graph(traced, m, partitioner_config) + except RuntimeError: + catch_runtime_error = True + assert catch_runtime_error + def test_partition_node_manipulation(self): class TestModule(torch.nn.Module): def forward(self, a, b): From 63a71a82cf69dc01c2d169d242f2e6eefbe3d39c Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Sat, 5 Dec 2020 06:54:40 -0800 Subject: [PATCH 083/132] [ROCm] add 3.10 to nightly builds (#48866) Summary: Depends on https://github.com/pytorch/builder/pull/603.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48866 Reviewed By: malfet, janeyx99 Differential Revision: D25345895 Pulled By: walterddr fbshipit-source-id: 5d1c754b36fa7ebd60832af58cbcbed2bc0da3bd --- .circleci/cimodel/data/dimensions.py | 2 +- .circleci/config.yml | 208 +++++++++++++-------------- 2 files changed, 105 insertions(+), 105 deletions(-) diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 57489ebe7915..c9aab39ddd2a 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -8,8 +8,8 @@ ] ROCM_VERSIONS = [ - "3.8", "3.9", + "3.10", ] ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS] diff --git a/.circleci/config.yml b/.circleci/config.yml index b8e8aed96ae7..8bdfb3c9c7bd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2162,8 +2162,8 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda110" - binary_linux_build: - name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build - build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm3.9 devtoolset7" filters: branches: only: @@ -2171,10 +2171,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" - binary_linux_build: - name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build - build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_build + build_environment: "manywheel 3.7m rocm3.9 devtoolset7" filters: branches: only: @@ -2182,10 +2182,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" - binary_linux_build: - name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build - build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm3.9 devtoolset7" filters: branches: only: @@ -2193,10 +2193,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" - binary_linux_build: - name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_build - build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" filters: branches: only: @@ -2204,10 +2204,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" - binary_linux_build: - name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.6m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm3.10 devtoolset7" filters: branches: only: @@ -2215,10 +2215,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: - name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.7m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_build + 
build_environment: "manywheel 3.7m rocm3.10 devtoolset7" filters: branches: only: @@ -2226,10 +2226,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: - name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.8m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm3.10 devtoolset7" filters: branches: only: @@ -2237,10 +2237,10 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: - name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build - build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm3.10 devtoolset7" filters: branches: only: @@ -2248,7 +2248,7 @@ workflows: tags: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -3732,8 +3732,8 @@ workflows: use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test - build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm3.9 devtoolset7" filters: branches: only: @@ -3742,13 +3742,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test - build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm3.9 devtoolset7" filters: branches: only: @@ -3757,13 +3757,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test - build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm3.9 devtoolset7" filters: branches: only: @@ -3772,13 +3772,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_test - build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + 
name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" filters: branches: only: @@ -3787,13 +3787,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.6m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm3.10 devtoolset7" filters: branches: only: @@ -3802,13 +3802,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.7m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm3.10 devtoolset7" filters: branches: only: @@ -3817,13 +3817,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.8m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm3.10 devtoolset7" filters: branches: only: @@ -3832,13 +3832,13 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: - name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test - build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm3.10 devtoolset7" filters: branches: only: @@ -3847,8 +3847,8 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build - docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - binary_linux_test: @@ -5558,10 +5558,10 @@ workflows: package_type: manywheel upload_subfolder: cu110 - binary_upload: - name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + - 
binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test filters: branches: only: @@ -5570,12 +5570,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.8 + upload_subfolder: rocm3.9 - binary_upload: - name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + - binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_test filters: branches: only: @@ -5584,12 +5584,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.8 + upload_subfolder: rocm3.9 - binary_upload: - name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + - binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_test filters: branches: only: @@ -5598,12 +5598,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.8 + upload_subfolder: rocm3.9 - binary_upload: - name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_test + - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test filters: branches: only: @@ -5612,12 +5612,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.8 + upload_subfolder: rocm3.9 - binary_upload: - name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly_test filters: branches: only: @@ -5626,12 +5626,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: - name: binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly_test filters: branches: only: @@ -5640,12 +5640,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: - name: binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly_test filters: branches: only: @@ -5654,12 +5654,12 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: - name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_upload + name: binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_upload context: org-member requires: - - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test + - binary_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly_test 
filters: branches: only: @@ -5668,7 +5668,7 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel - upload_subfolder: rocm3.9 + upload_subfolder: rocm3.10 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -8460,99 +8460,99 @@ workflows: use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly - build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + name: smoke_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm3.9 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly - build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + name: smoke_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm3.9 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly - build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + name: smoke_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly + build_environment: "manywheel 3.8m rocm3.9 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly - build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + name: smoke_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.8" + docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.6m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_6m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_7m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.7m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_7m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_8m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.8m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_8m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.8m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - 
docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: - name: smoke_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly - build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + name: smoke_linux_manywheel_3_9m_rocm3_10_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm3.10 devtoolset7" requires: - update_s3_htmls filters: branches: only: - postnightly - docker_image: "pytorch/manylinux-rocm:3.9" + docker_image: "pytorch/manylinux-rocm:3.10" use_cuda_docker_runtime: "1" resource_class: gpu.medium - smoke_linux_test: From fa5f7d87bfb6c869563c1b8fed16bf45064faf46 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Sat, 5 Dec 2020 08:42:13 -0800 Subject: [PATCH 084/132] fx quant: add typing for fuser (#48844) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48844 Add types to function I/O for `Fuser` to improve readability Test Plan: ``` mypy torch/quantization/ ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25337314 fbshipit-source-id: e5074d71c7834f24975169d36bf49357e53650ff --- torch/quantization/fx/fuse.py | 23 ++++++++++++++++----- torch/quantization/fx/pattern_utils.py | 5 +++-- torch/quantization/fx/quantization_types.py | 3 +++ 3 files changed, 24 insertions(+), 7 deletions(-) create mode 100644 torch/quantization/fx/quantization_types.py diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 490b34ae518e..b5cf78b05f33 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -2,6 +2,7 @@ from torch.fx import ( # type: ignore GraphModule, + Node, map_arg ) @@ -18,8 +19,14 @@ from .fusion_patterns import * # noqa: F401 +from .quantization_types import Pattern + +from typing import Callable, Tuple, Optional + + class Fuser: - def fuse(self, model, fuse_custom_config_dict=None): + def fuse(self, model: GraphModule, + fuse_custom_config_dict: Dict[str, Any] = None) -> GraphModule: if fuse_custom_config_dict is None: fuse_custom_config_dict = {} @@ -27,10 +34,13 @@ def fuse(self, model, fuse_custom_config_dict=None): input_graph = model.graph self.modules = dict(input_root.named_modules()) - additional_fusion_patterns = fuse_custom_config_dict.get("additional_quant_pattern", {}) - fusion_patterns = get_combined_dict(get_default_fusion_patterns(), additional_fusion_patterns) + additional_fusion_patterns = \ + fuse_custom_config_dict.get("additional_quant_pattern", {}) + fusion_patterns = get_combined_dict( + get_default_fusion_patterns(), additional_fusion_patterns) # find fusion - fusion_pairs = self._find_matches(input_root, input_graph, fusion_patterns) + fusion_pairs = self._find_matches( + input_root, input_graph, fusion_patterns) self.fused_graph = Graph() env: Dict[Any, Any] = {} @@ -40,6 +50,7 @@ def load_arg(a): for node in input_graph.nodes: root_node, obj = fusion_pairs.get(node.name, (None, None)) if root_node is node: + assert obj is not None env[node.name] = obj.fuse(self, load_arg) elif root_node is None: env[node.name] = self.fused_graph.node_copy(node, load_arg) @@ -48,7 +59,9 @@ def load_arg(a): model = GraphModule(input_root, self.fused_graph) return model - def _find_matches(self, root, graph, patterns): + def _find_matches(self, root: GraphModule, graph: Graph, + patterns: Dict[Pattern, Callable] + ) -> Dict[str, Tuple[Node, Optional[Any]]]: modules = dict(root.named_modules()) match_map = {} # node name -> (root_node, match_value?) 
diff --git a/torch/quantization/fx/pattern_utils.py b/torch/quantization/fx/pattern_utils.py index 8d218df7bd86..146dad1eab2e 100644 --- a/torch/quantization/fx/pattern_utils.py +++ b/torch/quantization/fx/pattern_utils.py @@ -1,11 +1,12 @@ import torch import sys from collections import OrderedDict -from typing import Dict, Any, Union, Tuple, Callable +from typing import Dict, Any + +from .quantization_types import Pattern # TODO(future PR): fix the typing on QuantizeHandler (currently a circular dependency) QuantizeHandler = Any -Pattern = Union[Callable, Tuple[Callable, Callable], Tuple[Callable, Callable, Callable]] # pattern for conv bn fusion DEFAULT_FUSION_PATTERNS = OrderedDict() diff --git a/torch/quantization/fx/quantization_types.py b/torch/quantization/fx/quantization_types.py new file mode 100644 index 000000000000..9a87119dc0db --- /dev/null +++ b/torch/quantization/fx/quantization_types.py @@ -0,0 +1,3 @@ +from typing import Union, Callable, Tuple + +Pattern = Union[Callable, Tuple[Callable, Callable], Tuple[Callable, Callable, Callable]] From 0923d19601e9c4853a90d0b6b2f73a29b0a28af6 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Sat, 5 Dec 2020 08:42:13 -0800 Subject: [PATCH 085/132] fx quant: add types to quantization_patterns (#48851) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48851 Adding typing to improve readability. Note: this uncovered a few missing return statements, we should fix that before landing. Test Plan: ``` mypy torch/quantization/ ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25338644 fbshipit-source-id: 0ac4405db05fdd2737bc3415217bc1937c2db684 --- .../quantization/fx/quantization_patterns.py | 95 +++++++++++++------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 72e165f8351e..176cd7603286 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -37,6 +37,13 @@ import operator import warnings +from typing import Any, Callable, Dict + +# This is the Quantizer class instance from torch/quantization/fx/quantize.py. +# Define separately to prevent circular imports. +# TODO(future PR): improve this. 
+QuantizerCls = Any + # ------------------------- # Pattern Registrations # ------------------------- @@ -47,7 +54,7 @@ class QuantizeHandler(ABC): """ Base handler class for the quantizer patterns """ - def __init__(self, quantizer, node): + def __init__(self, quantizer: QuantizerCls, node: Node): """ Records pattern information in __init__, which will be used in convert """ @@ -58,7 +65,9 @@ def __init__(self, quantizer, node): self.all_node_args = True @abstractmethod - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: """ Convert the given node to a quantized node and insert it to the quantized graph """ @@ -71,18 +80,20 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern((torch.nn.functional.relu, operator.add)) @register_quant_pattern((torch.nn.functional.relu, torch.add)) class Add(QuantizeHandler): - def __init__(self, quantizer, node): + def __init__(self, quantizer: QuantizerCls, node: Node): super().__init__(quantizer, node) self.relu_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ (node.op == 'call_module' and isinstance(quantizer.modules[node.target], torch.nn.ReLU)): self.relu_node = node - node = node.args[0] + node = node.args[0] # type: ignore assert node.op == 'call_function' and node.target in [operator.add, torch.add] self.add_node = node self.num_node_args = len([a for a in self.add_node.args[:2] if isinstance(a, Node)]) - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: if self.num_node_args == 1: # add scalar if self.relu_node is not None: @@ -119,18 +130,20 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern((torch.nn.functional.relu, operator.mul)) @register_quant_pattern((torch.nn.functional.relu, torch.mul)) class Mul(QuantizeHandler): - def __init__(self, quantizer, node): + def __init__(self, quantizer: QuantizerCls, node: Node): super().__init__(quantizer, node) self.relu_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ (node.op == 'call_module' and isinstance(quantizer.modules[node.target], torch.nn.ReLU)): self.relu_node = node - node = node.args[0] + node = node.args[0] # type: ignore assert node.op == 'call_function' and node.target in [operator.mul, torch.mul] self.mul_node = node self.num_node_args = len([a for a in self.mul_node.args[:2] if isinstance(a, Node)]) - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: if self.num_node_args == 1: # mul scalar if self.relu_node is not None: @@ -159,7 +172,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.cat) class Cat(QuantizeHandler): - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + 
convert_custom_config_dict: Dict[str, Any] = None) -> Node: if not self.all_node_args: return NotImplemented activation_post_process = quantizer.activation_post_process_map[node.name] @@ -191,18 +206,20 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) class ConvRelu(QuantizeHandler): - def __init__(self, quantizer, node): + def __init__(self, quantizer: QuantizerCls, node: Node): super().__init__(quantizer, node) self.relu_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ (node.op == 'call_module' and isinstance(quantizer.modules[node.target], torch.nn.ReLU)): self.relu_node = node - node = node.args[0] + node = node.args[0] # type: ignore self.conv_node = node if node.op == 'call_module': self.conv = quantizer.modules[self.conv_node.target] - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: # TODO: debug option for conv module qconfig = quantizer.qconfig_map[node.name] activation_statically_quantized = activation_is_statically_quantized(qconfig) @@ -230,7 +247,8 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ self.conv_node.target, (load_arg(quantized=True)(self.conv_node.args[0]),), {}) - elif self.conv_node.op == 'call_function': + else: # call_function + assert self.conv_node.op == 'call_function' if self.relu_node is not None: raise Exception("functional conv + relu is not supported yet") if debug: @@ -273,18 +291,20 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern((torch.nn.ReLU, torch.nn.Linear)) @register_quant_pattern((torch.nn.functional.relu, torch.nn.Linear)) class LinearReLUQuantizeHandler(QuantizeHandler): - def __init__(self, quantizer, node): + def __init__(self, quantizer: QuantizerCls, node: Node): super().__init__(quantizer, node) self.relu_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ (node.op == 'call_module' and isinstance(quantizer.modules[node.target], torch.nn.ReLU)): self.relu_node = node - node = node.args[0] + node = node.args[0] # type: ignore self.linear_node = node if node.op == 'call_module': self.linear = quantizer.modules[self.linear_node.target] - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: # Supported combinations are: # quant_type | activation (compute_type) | weight # static quint8 qint8 @@ -338,7 +358,8 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ 'call_module', self.linear_node.target, (load_arg(quantized=activation_statically_quantized)(self.linear_node.args[0]),), {}) - elif self.linear_node.op == 'call_function': + else: # call_function + assert self.linear_node.op == 'call_function' if debug: quantized_input_idxs = [] if activation_statically_quantized: @@ -405,13 +426,15 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.nn.intrinsic.BNReLU2d) @register_quant_pattern(torch.nn.intrinsic.BNReLU3d) 
class BatchNorm(QuantizeHandler): - def __init__(self, quantizer, node): + def __init__(self, quantizer: QuantizerCls, node: Node): super().__init__(quantizer, node) assert node.op == 'call_module' self.bn_node = node self.bn = quantizer.modules[self.bn_node.target] - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: if convert_custom_config_dict is None: convert_custom_config_dict = {} additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) @@ -431,10 +454,12 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern(torch.nn.EmbeddingBag) @mark_input_output_not_observed() class Embedding(QuantizeHandler): - def __init__(self, quantizer, node): + def __init__(self, quantizer: QuantizerCls, node: Node): super().__init__(quantizer, node) - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: # Supported combinations are: # quant_type | activation | weight | activation_compute_type # weight_only | float32 | quint8 | None @@ -486,7 +511,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ class DefaultNode(QuantizeHandler): ''' Common quantized op, first input and first output will be quantized ''' - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: if not self.all_node_args: return NotImplemented assert node.op in ['call_module', 'call_function'], 'Only call_module and ' + \ @@ -528,7 +555,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ # TODO: elu is using scale/zero_point instead of output_scale, output_zero_point @register_quant_pattern(torch.nn.functional.elu) class ELU(QuantizeHandler): - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: activation_post_process = quantizer.activation_post_process_map[node.name] scale, zero_point = activation_post_process.calculate_qparams() scale = float(scale) @@ -553,7 +582,9 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('tanh', default_symmetric_fixed_qparams_fake_quant) @register_quant_pattern('tanh_', default_symmetric_fixed_qparams_fake_quant) class FixedQParamsOpQuantizeHandler(QuantizeHandler): - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) # these ops have quantized equivalents that do not need any extra information @@ -622,13 +653,17 @@ def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_ @register_quant_pattern('unsqueeze_') 
@register_quant_pattern('view') class CopyNode(QuantizeHandler): - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) # Default quantization handler, used for quantization of input and output # of quantizable objects (e.g. modules and functionals) class DefaultQuantizeHandler(QuantizeHandler): - def convert(self, quantizer, node): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: assert self.all_node_args root_module = quantizer.modules[''] return quantize_node( @@ -637,7 +672,9 @@ def convert(self, quantizer, node): node, quantizer.activation_post_process_map[node.name]) class CustomModuleQuantizeHandler(QuantizeHandler): - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: """ Convert a float custom module to quantized custom module """ assert node.op == 'call_module' @@ -666,7 +703,9 @@ class StandaloneModuleQuantizeHandler(QuantizeHandler): """ Converts an observed standalone module to quantized standalone module by calling convert_fx on the observed standalone module. """ - def convert(self, quantizer, node, load_arg, debug=False, convert_custom_config_dict=None): + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: assert node.op == 'call_module' qconfig = quantizer.qconfig_map[node.name] convert = torch.quantization.quantize_fx._convert_standalone_module_fx # type: ignore From 251398acca732fcdf78c194328ff9ede8adb56d1 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Sat, 5 Dec 2020 10:45:45 -0800 Subject: [PATCH 086/132] Force a sync on non-CPU tensors for the benchmark to reflect the timing accurately. 
(#48856) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48856 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D25339803 Pulled By: AshkanAliabadi fbshipit-source-id: fdfd9a0e0cc37245d7671419f492e445396fbdb8 --- binaries/speed_benchmark_torch.cc | 58 +++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index 09f1cabb8e15..88cc0b5dd956 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -70,6 +70,8 @@ C10_DEFINE_bool( C10_DEFINE_int(pytext_len, 0, "Length of input sequence."); C10_DEFINE_bool(vulkan, false, "Whether to use Vulkan backend (GPU)."); +namespace { + std::vector split(char separator, const std::string& string, bool ignore_empty = true) { std::vector pieces; @@ -143,14 +145,11 @@ std::vector create_inputs() { "Unsupported input memory format: ", input_memory_format_list[i]); } - const auto input_tensor = torch::ones( - input_dims, - at::TensorOptions(input_type).memory_format(input_memory_format)); - if (FLAGS_vulkan) { - inputs.push_back(input_tensor.vulkan()); - } else { - inputs.push_back(input_tensor); - } + inputs.push_back( + torch::ones( + input_dims, + at::TensorOptions(input_type). + memory_format(input_memory_format))); } if (FLAGS_pytext_len > 0) { @@ -161,6 +160,39 @@ std::vector create_inputs() { return inputs; } +class Runner { + public: + virtual ~Runner() = default; + virtual c10::IValue run( + torch::jit::Module& module, + const std::vector& inputs) { + return module.forward(inputs); + } +}; + +class vkRunner final : public Runner { + public: + virtual ~vkRunner() = default; + virtual c10::IValue run( + torch::jit::Module& module, + const std::vector& inputs) override { + // Upload the input tensor(s) to GPU memory. + inputs_.clear(); + inputs_.reserve(inputs.size()); + for (const auto& input : inputs) { + inputs_.emplace_back(input.toTensor().vulkan()); + } + + // Run, and download the output tensor to system memory. + return module.forward(inputs_).toTensor().cpu(); + } + + private: + std::vector inputs_; +}; + +} // namespace + int main(int argc, char** argv) { c10::SetUsageMessage( "Run speed benchmark for pytorch model.\n" @@ -199,9 +231,13 @@ int main(int argc, char** argv) { inputs = all_inputs.get(FLAGS_use_bundled_input).toTuple()->elements(); } + const std::unique_ptr runner = + FLAGS_vulkan ? std::make_unique() : + std::make_unique(); + module.eval(); if (FLAGS_print_output) { - std::cout << module.forward(inputs) << std::endl; + std::cout << runner->run(module, inputs) << std::endl; } c10::CPUCachingAllocator caching_allocator; @@ -217,7 +253,7 @@ int main(int argc, char** argv) { FLAGS_warmup, "."); for (int i = 0; i < FLAGS_warmup; ++i) { - module.forward(inputs); + runner->run(module, inputs); } std::cout << "Main runs." 
<< std::endl; @@ -231,7 +267,7 @@ int main(int argc, char** argv) { auto micros = timer.MicroSeconds(); for (int i = 0; i < FLAGS_iter; ++i) { auto start = high_resolution_clock::now(); - module.forward(inputs); + runner->run(module, inputs); auto stop = high_resolution_clock::now(); auto duration = duration_cast(stop - start); times.push_back(duration.count()); From 0fb58d76a1dd7b0ff6cdd374aff83e9a294b3047 Mon Sep 17 00:00:00 2001 From: Newsha Ardalani Date: Sat, 5 Dec 2020 16:33:54 -0800 Subject: [PATCH 087/132] Support ArgMin in c2_pt_converter Summary: + Add ArgMin support to Caffe2 to PyTorch converter + Using hypothesis to parameterize different conditions for test Test Plan: buck test //caffe2/torch/fb/model_transform/c2_convert:c2_pt_converter_test Reviewed By: houseroad Differential Revision: D25016203 fbshipit-source-id: 94489fcf1ed3183ec96f9796a5b4fb348fbde5bc --- caffe2/python/brew.py | 1 + caffe2/python/helpers/algebra.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/caffe2/python/brew.py b/caffe2/python/brew.py index 25fb4892e9a0..69a4561aae10 100644 --- a/caffe2/python/brew.py +++ b/caffe2/python/brew.py @@ -55,6 +55,7 @@ class HelperWrapper(object): 'sum': sum, 'reduce_sum': reduce_sum, 'sub': sub, + 'arg_min': arg_min, 'transpose': transpose, 'iter': iter, 'accuracy': accuracy, diff --git a/caffe2/python/helpers/algebra.py b/caffe2/python/helpers/algebra.py index 4c9c3728677b..2b626677b029 100644 --- a/caffe2/python/helpers/algebra.py +++ b/caffe2/python/helpers/algebra.py @@ -33,6 +33,10 @@ def mat_mul(model, blob_in, blob_out, **kwargs): return model.net.MatMul(blob_in, blob_out, **kwargs) +def arg_min(model, blob_in, blob_out, **kwargs): + """ArgMin""" + return model.net.ArgMin(blob_in, blob_out, **kwargs) + def batch_mat_mul(model, blob_in, blob_out, enable_tensor_core=False, **kwargs): if enable_tensor_core: From ae9f39eb580c4d92157236d64548b055f71cf14b Mon Sep 17 00:00:00 2001 From: James Reed Date: Sat, 5 Dec 2020 17:22:00 -0800 Subject: [PATCH 088/132] [FX][1/2] Make docstrings pretty when rendered (#48738) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48738 Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D25280867 Pulled By: jamesr66a fbshipit-source-id: d08641c19a6c69b4042389c800a48e699f0be628 --- torch/fx/__init__.py | 140 ++++++------- torch/fx/graph.py | 408 ++++++++++++++++++++++++------------- torch/fx/graph_module.py | 99 +++++---- torch/fx/node.py | 20 +- torch/fx/proxy.py | 4 +- torch/fx/symbolic_trace.py | 69 ++++--- 6 files changed, 454 insertions(+), 286 deletions(-) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index f3804c515612..7a3eb03de1ef 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -4,84 +4,86 @@ FX is a toolkit for capturing and transforming functional PyTorch programs. It consists of GraphModule and a corresponding intermediate representation (IR). When GraphModule is constructed -with an `nn.Module` instance as its argument, GraphModule will trace through the computation of that Module's -`forward` method symbolically and record those operations in the FX intermediate representation. +with an ``nn.Module`` instance as its argument, GraphModule will trace through the computation of that Module's +``forward`` method symbolically and record those operations in the FX intermediate representation. -``` -import torch -from torch.fx import symbolic_trace +.. 
code-block:: python -class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.param = torch.nn.Parameter(torch.rand(3, 4)) - self.linear = torch.nn.Linear(4, 5) + import torch + import torch.fx - def forward(self, x): - return torch.topk(torch.sum(self.linear(x + self.linear.weight).relu(), dim=-1), 3) - -m = MyModule() -gm = symbolic_trace(m) -``` - -The Intermediate Representation centers around a 5-opcode format: - -``` -print(gm.graph) -``` - -``` -graph(x): - %linear_weight : [uses=1] = self.linear.weight - %add_1 : [uses=1] = call_function[target=](args = (%x, %linear_weight), kwargs = {}) - %linear_1 : [uses=1] = call_module[target=linear](args = (%add_1,), kwargs = {}) - %relu_1 : [uses=1] = call_method[target=relu](args = (%linear_1,), kwargs = {}) - %sum_1 : [uses=1] = call_function[target=](args = (%relu_1,), kwargs = {dim: -1}) # noqa: B950 - %topk_1 : [uses=1] = call_function[target=](args = (%sum_1, 3), kwargs = {}) # noqa: B950 - return topk_1 -``` - -The semantics are as follows: - -- `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. - `target` is similarly the name of the argument. `args` holds either: 1) nothing, or 2) a single argument - denoting the default parameter of the function input. `kwargs` is don't-care. Placeholders correspond to - the function parameters (e.g. `x`) in the graph printout. -- `get_attr` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the - fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. - `args` and `kwargs` are don't-care -- `call_function` applies a free function to some values. `name` is similarly the name of the value to assign - to. `target` is the function to be applied. `args` and `kwargs` represent the arguments to the function, + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.param = torch.nn.Parameter(torch.rand(3, 4)) + self.linear = torch.nn.Linear(4, 5) + + def forward(self, x): + return torch.topk(torch.sum(self.linear(x + self.linear.weight).relu(), dim=-1), 3) + + m = MyModule() + gm = torch.fx.symbolic_trace(m) + +The Intermediate Representation centers around a 5-opcode format:: + + print(gm.graph) + +.. code-block:: text + + graph(x): + %linear_weight : [#users=1] = self.linear.weight + %add_1 : [#users=1] = call_function[target=](args = (%x, %linear_weight), kwargs = {}) + %linear_1 : [#users=1] = call_module[target=linear](args = (%add_1,), kwargs = {}) + %relu_1 : [#users=1] = call_method[target=relu](args = (%linear_1,), kwargs = {}) + %sum_1 : [#users=1] = call_function[target=](args = (%relu_1,), kwargs = {dim: -1}) # noqa: B950 + %topk_1 : [#users=1] = call_function[target=](args = (%sum_1, 3), kwargs = {}) # noqa: B950 + return topk_1 + +The Node semantics are as follows: + +- ``placeholder`` represents a function input. The ``name`` attribute specifies the name this value will take on. + ``target`` is similarly the name of the argument. ``args`` holds either: 1) nothing, or 2) a single argument + denoting the default parameter of the function input. ``kwargs`` is don't-care. Placeholders correspond to + the function parameters (e.g. ``x``) in the graph printout. +- ``get_attr`` retrieves a parameter from the module hierarchy. ``name`` is similarly the name the result of the + fetch is assigned to. ``target`` is the fully-qualified name of the parameter's position in the module hierarchy. 
+ ``args`` and ``kwargs`` are don't-care +- ``call_function`` applies a free function to some values. ``name`` is similarly the name of the value to assign + to. ``target`` is the function to be applied. ``args`` and ``kwargs`` represent the arguments to the function, following the Python calling convention -- `call_module` applies a module in the module hierarchy's `forward()` method to given arguments. `name` is - as previous. `target` is the fully-qualified name of the module in the module hierarchy to call. - `args` and `kwargs` represent the arguments to invoke the module on, _including the self argument_. -- `call_method` calls a method on a value. `name` is as similar. `target` is the string name of the method - to apply to the `self` argument. `args` and `kwargs` represent the arguments to invoke the module on, - _including the self argument_. -- `output` contains the output of the traced function in its `args[0]` attribute. This corresponds to the "return" statement +- ``call_module`` applies a module in the module hierarchy's ``forward()`` method to given arguments. ``name`` is + as previous. ``target`` is the fully-qualified name of the module in the module hierarchy to call. + ``args`` and ``kwargs`` represent the arguments to invoke the module on, *including the self argument*. +- ``call_method`` calls a method on a value. ``name`` is as similar. ``target`` is the string name of the method + to apply to the ``self`` argument. ``args`` and ``kwargs`` represent the arguments to invoke the module on, + *including the self argument* +- ``output`` contains the output of the traced function in its ``args[0]`` attribute. This corresponds to the "return" statement in the Graph printout. -GraphModule automatically generates Python code for the operations it symbolically observed: +GraphModule automatically generates Python code for the operations it symbolically observed:: -``` -print(gm.code) -``` + print(gm.code) -``` -def forward(self, x): - linear_weight = self.linear.weight - add_1 = x + linear_weight - linear_1 = self.linear(add_1) - relu_1 = linear_1.relu() - sum_1 = torch.sum(relu_1, dim = -1) - topk_1 = torch.topk(sum_1, 3) - return topk_1 +.. code-block:: python -``` - -Because this code is valid PyTorch code, the resulting `GraphModule` can be used in any context another -`nn.Module` can be used, including in TorchScript tracing/compilation. + import torch + def forward(self, x): + linear_weight = self.linear.weight + add_1 = x + linear_weight + x = linear_weight = None + linear_1 = self.linear(add_1) + add_1 = None + relu_1 = linear_1.relu() + linear_1 = None + sum_1 = torch.sum(relu_1, dim = -1) + relu_1 = None + topk_1 = torch.topk(sum_1, 3) + sum_1 = None + return topk_1 + topk_1 = None + +Because this code is valid PyTorch code, the resulting ``GraphModule`` can be used in any context another +``nn.Module`` can be used, including in TorchScript tracing/compilation. ''' from .graph_module import GraphModule diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 65db80f8d919..072aef6e3b93 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -62,9 +62,9 @@ def _type_repr(obj): typically enough to uniquely identify a type. For everything else, we fall back on repr(obj). """ - # HACK: In Python 3.6, type aliases from `typing` are instances of `type`, but in - # later Python versions, type aliases are not instances of `type`!! 
We want - # all type aliases to fall through to `repr`, so if we have a type that is + # HACK: In Python 3.6, type aliases from ``typing`` are instances of ``type``, but in + # later Python versions, type aliases are not instances of ``type``!! We want + # all type aliases to fall through to ``repr``, so if we have a type that is # in the module typing, don't go down this path. if isinstance(obj, type) and obj.__module__ != 'typing': if obj.__module__ == 'builtins': @@ -109,67 +109,65 @@ def __reversed__(self): class Graph: """ - `Graph` is the main data structure used in the FX Intermediate Representation. - It consists of a series of `Node`s, each representing callsites (or other - syntactic constructs). The list of `Node`s, taken together, constitute a + ``Graph`` is the main data structure used in the FX Intermediate Representation. + It consists of a series of ``Node`` s, each representing callsites (or other + syntactic constructs). The list of ``Node`` s, taken together, constitute a valid Python function. For example, the following code - ``` - import torch - from torch.fx import symbolic_trace + .. code-block:: python - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.param = torch.nn.Parameter(torch.rand(3, 4)) - self.linear = torch.nn.Linear(4, 5) + import torch + import torch.fx - def forward(self, x): - return torch.topk(torch.sum(self.linear(x + self.linear.weight).relu(), dim=-1), 3) + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.param = torch.nn.Parameter(torch.rand(3, 4)) + self.linear = torch.nn.Linear(4, 5) - m = MyModule() - gm = symbolic_trace(m) - ``` + def forward(self, x): + return torch.topk(torch.sum(self.linear(x + self.linear.weight).relu(), dim=-1), 3) - Will produce the following Graph: + m = MyModule() + gm = torch.fx.symbolic_trace(m) - ``` - print(gm.graph) - ``` + Will produce the following Graph:: - ``` - graph(x): - %linear_weight : [uses=1] = self.linear.weight - %add_1 : [uses=1] = call_function[target=](args = (%x, %linear_weight), kwargs = {}) - %linear_1 : [uses=1] = call_module[target=linear](args = (%add_1,), kwargs = {}) - %relu_1 : [uses=1] = call_method[target=relu](args = (%linear_1,), kwargs = {}) - %sum_1 : [uses=1] = call_function[target=](args = (%relu_1,), kwargs = {dim: -1}) # noqa: B950 - %topk_1 : [uses=1] = call_function[target=](args = (%sum_1, 3), kwargs = {}) # noqa: B950 - return topk_1 - ``` + print(gm.graph) + + .. code-block:: text + + graph(x): + %linear_weight : [#users=1] = self.linear.weight + %add_1 : [#users=1] = call_function[target=](args = (%x, %linear_weight), kwargs = {}) + %linear_1 : [#users=1] = call_module[target=linear](args = (%add_1,), kwargs = {}) + %relu_1 : [#users=1] = call_method[target=relu](args = (%linear_1,), kwargs = {}) + %sum_1 : [#users=1] = call_function[target=](args = (%relu_1,), kwargs = {dim: -1}) # noqa: B950 + %topk_1 : [#users=1] = call_function[target=](args = (%sum_1, 3), kwargs = {}) # noqa: B950 + return topk_1 The Node semantics are as follows: - - `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. - `target` is similarly the name of the argument. `args` holds either: 1) nothing, or 2) a single argument - denoting the default parameter of the function input. `kwargs` is don't-care. Placeholders correspond to - the function parameters (e.g. `x`) in the graph printout. - - `get_attr` retrieves a parameter from the module hierarchy. 
`name` is similarly the name the result of the - fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. - `args` and `kwargs` are don't-care - - `call_function` applies a free function to some values. `name` is similarly the name of the value to assign - to. `target` is the function to be applied. `args` and `kwargs` represent the arguments to the function, - following the Python calling convention - - `call_module` applies a module in the module hierarchy's `forward()` method to given arguments. `name` is - as previous. `target` is the fully-qualified name of the module in the module hierarchy to call. - `args` and `kwargs` represent the arguments to invoke the module on, _including the self argument_. - - `call_method` calls a method on a value. `name` is as similar. `target` is the string name of the method - to apply to the `self` argument. `args` and `kwargs` represent the arguments to invoke the module on, - _including the self argument_. - - `output` contains the output of the traced function in its `args[0]` attribute. This corresponds to the "return" statement - in the Graph printout. + - ``placeholder`` represents a function input. The ``name`` attribute specifies the name this value will take on. + ``target`` is similarly the name of the argument. ``args`` holds either: 1) nothing, or 2) a single argument + denoting the default parameter of the function input. ``kwargs`` is don't-care. Placeholders correspond to + the function parameters (e.g. ``x``) in the graph printout. + - ``get_attr`` retrieves a parameter from the module hierarchy. ``name`` is similarly the name the result of the + fetch is assigned to. ``target`` is the fully-qualified name of the parameter's position in the module hierarchy. + ``args`` and ``kwargs`` are don't-care + - ``call_function`` applies a free function to some values. ``name`` is similarly the name of the value to assign + to. ``target`` is the function to be applied. ``args`` and ``kwargs`` represent the arguments to the function, + following the Python calling convention + - ``call_module`` applies a module in the module hierarchy's ``forward()`` method to given arguments. ``name`` is + as previous. ``target`` is the fully-qualified name of the module in the module hierarchy to call. + ``args`` and ``kwargs`` represent the arguments to invoke the module on, *including the self argument*. + - ``call_method`` calls a method on a value. ``name`` is as similar. ``target`` is the string name of the method + to apply to the ``self`` argument. ``args`` and ``kwargs`` represent the arguments to invoke the module on, + *including the self argument* + - ``output`` contains the output of the traced function in its ``args[0]`` attribute. This corresponds to the "return" statement + in the Graph printout. """ def __init__(self): """ @@ -183,19 +181,34 @@ def __init__(self): @property def nodes(self) -> _node_list: """ - Get the list of `Node`s that constitute this Graph. + Get the list of Nodes that constitute this Graph. - Note that this `Node` list representation is a doubly-linked list. Mutations + Note that this ``Node`` list representation is a doubly-linked list. Mutations during iteration (e.g. delete a Node, add a Node) are safe. + + Returns: + + A doubly-linked list of Nodes. Note that ``reversed`` can be called on + this list to switch iteration order. 
""" return _node_list(self) - def graph_copy(self, g : 'Graph', val_map : Dict[Node, Node]) -> Optional[Argument]: + def graph_copy(self, g : 'Graph', val_map : Dict[Node, Node]) -> 'Optional[Argument]': """ - Append all nodes from graph `g` to this graph. `val_map` should be a dictionary - that maps nodes in `g` to nodes in `self. `val_map` will be populated with more - items by this function. Returns the equivalent output value of `g` with - Nodes switched to refer to nodes in `self`. + Copy all nodes from a given graph into ``self``. + + Args: + + g (Graph): The source graph from which to copy Nodes. + + val_map (Dict[Node, Node]): a dictionary that will be populated with a mapping + from nodes in ``g`` to nodes in ``self``. Note that ``val_map`` can be passed + in with values in it already to override copying of certain values. + + Returns: + + The value in ``self`` that is now equivalent to the output value in ``g``, + if ``g`` had an ``output`` node. ``None`` otherwise. """ for node in g.nodes: if node in val_map: @@ -220,25 +233,35 @@ def __deepcopy__(self, memo=None) -> 'Graph': g.output(output_val) return g - def create_node(self, op: str, target: Target, - args: Optional[Tuple[Argument, ...]] = None, - kwargs: Optional[Dict[str, Argument]] = None, + def create_node(self, op: str, target: 'Target', + args: Optional[Tuple['Argument', ...]] = None, + kwargs: Optional[Dict[str, 'Argument']] = None, name: Optional[str] = None, type_expr: Optional[Any] = None) -> Node: """ - Create a `Node` and add it to the `Graph` at the current insert-point. - Note that the current insert-point can be set via `Graph.inserting_before` - and `Graph.inserting_after`. + Create a ``Node`` and add it to the ``Graph`` at the current insert-point. + Note that the current insert-point can be set via :meth:`Graph.inserting_before` + and :meth:`Graph.inserting_after`. + + Args: + op (str): the opcode for this Node. One of 'call_function', 'call_method', 'get_attr', + 'call_module', 'placeholder', or 'output'. The semantics of these opcodes are + described in the ``Graph`` docstring. + + args (Optional[Tuple[Argument, ...]]): is a tuple of arguments to this node. + + kwargs (Optional[Dict[str, Argument]]): the kwargs of this Node + + name (Optional[str]): an optional string name for the ``Node``. + This will influence the name of the value assigned to in the + Python generated code. + + type_expr (Optional[Any]): an optional type annotation representing the + Python type the output of this node will have. + + Returns: - - op is the opcode for this Node. One of 'call_function', 'call_method', 'get_attr', - 'call_module', 'placeholder', or 'output'. The semantics of these opcodes are - described in the `Graph` docstring. - - args is a tuple of arguments to this node. - - kwargs is a dict from string to argument, representing the kwargs of this Node - - name is an optional string name for the `Node`. This will influence the name - of the value assigned to in the Python generated code. - - type_expr is an optional type annotation representing the Python type - the output of this node will have. + The newly-created and inserted node. """ assert op in ('call_function', 'call_method', 'get_attr', 'call_module', 'placeholder', 'output') args = () if args is None else args @@ -249,10 +272,14 @@ def create_node(self, op: str, target: Target, self._len += 1 return n - def erase_node(self, to_erase : Node): + def erase_node(self, to_erase : Node) -> None: """ - Erases the node `to_erase` from the `Graph`. 
Throws an exception if - there are still users of that node in the `Graph`. + Erases a ``Node`` from the ``Graph``. Throws an exception if + there are still users of that node in the ``Graph``. + + Args: + + to_erase (Node): The ``Node`` to erase from the ``Graph``. """ if len(to_erase.users) > 0: raise RuntimeError(f'Tried to erase Node {to_erase} but it still had {len(to_erase.users)} ' @@ -263,7 +290,7 @@ def erase_node(self, to_erase : Node): self._len -= 1 # Null out this Node's argument nodes so that the Nodes referred to - # can update their `users` accordingly + # can update their ``users`` accordingly new_args = map_arg(to_erase.args, lambda n: None) assert isinstance(new_args, tuple) to_erase.args = new_args @@ -274,7 +301,7 @@ def erase_node(self, to_erase : Node): def inserting_before(self, n: Optional[Node] = None): """Set the point at which create_node and companion methods will insert into the graph. When used within a 'with' statement, this will temporary set the insert point and - then restore it when the with statement exits: + then restore it when the with statement exits:: with g.inserting_before(n): ... # inserting before node n @@ -286,7 +313,7 @@ def inserting_before(self, n: Optional[Node] = None): the beginning of the entire graph. Returns: - A resource manager that will restore the insert point on `__exit__`. + A resource manager that will restore the insert point on ``__exit__``. """ if n is None: return self.inserting_after(self._root) @@ -296,7 +323,7 @@ def inserting_before(self, n: Optional[Node] = None): def inserting_after(self, n: Optional[Node] = None): """Set the point at which create_node and companion methods will insert into the graph. When used within a 'with' statement, this will temporary set the insert point and - then restore it when the with statement exits: + then restore it when the with statement exits:: with g.inserting_after(n): ... # inserting after node n @@ -308,7 +335,7 @@ def inserting_after(self, n: Optional[Node] = None): the beginning of the entire graph. Returns: - A resource manager that will restore the insert point on `__exit__`. + A resource manager that will restore the insert point on ``__exit__``. """ if n is None: return self.inserting_before(self._root) @@ -318,97 +345,178 @@ def inserting_after(self, n: Optional[Node] = None): # sugar for create_node when you know the op def placeholder(self, name: str, type_expr: Optional[Any] = None) -> Node: """ - Insert a `placeholder` node into the Graph. A `placeholder` represents - a function input. This function takes a string `name` for the input - value as well as an optional `type_expr`, which is a type expression - describing the type of value this input will take. The type expression - is needed in some cases for proper code generation. + Insert a ``placeholder`` node into the Graph. A ``placeholder`` represents + a function input. + + Args: - The same insertion point rules apply for this method as `Graph.create_node`. + name (str): A name for the input value. This corresponds to the name + of the positional argument to the function this ``Graph`` represents. + + type_expr (Optional[Any]): an optional type annotation representing the + Python type the output of this node will have. This is needed in some + cases for proper code generation (e.g. when the function is used + subsequently in TorchScript compilation). + + .. note:: + The same insertion point and type expression rules apply for this method + as ``Graph.create_node``. 
""" return self.create_node('placeholder', name, type_expr=type_expr) def get_attr(self, qualified_name: str, type_expr: Optional[Any] = None) -> Node: """ - Insert a `get_attr` node into the Graph. A `get_attr` `Node` represents the - fetch of an attribute from the `Module` hierarchy. `qualified_name` is the - fully-qualified name of the attribute to be retrieved. For example, if - the traced Module has a submodule named `foo`, which has a submodule named - `bar`, which has an attribute named `baz`, the qualified name `foo.bar.baz` - should be passed as `qualified_name`. + Insert a ``get_attr`` node into the Graph. A ``get_attr`` ``Node`` represents the + fetch of an attribute from the ``Module`` hierarchy. + + Args: + + qualified_name (str): the fully-qualified name of the attribute to be retrieved. + For example, if the traced Module has a submodule named ``foo``, which has a + submodule named ``bar``, which has an attribute named ``baz``, the qualified + name ``foo.bar.baz`` should be passed as ``qualified_name``. - The same insertion point and type expression rules apply for this method - as `Graph.create_node`. + type_expr (Optional[Any]): an optional type annotation representing the + Python type the output of this node will have. + + + Returns: + + The newly-created and inserted ``get_attr`` node. + + .. note:: + The same insertion point and type expression rules apply for this method + as ``Graph.create_node``. """ return self.create_node('get_attr', qualified_name, type_expr=type_expr) def call_module(self, module_name: str, - args: Optional[Tuple[Argument, ...]] = None, - kwargs: Optional[Dict[str, Argument]] = None, + args: Optional[Tuple['Argument', ...]] = None, + kwargs: Optional[Dict[str, 'Argument']] = None, type_expr: Optional[Any] = None) -> Node: """ - Insert a `call_module` `Node` into the `Graph`. A `call_module` node - represents a call to the forward() function of a `Module` in the `Module` - hierarchy. For example, if the traced `Module` has a submodule named `foo`, - which has a submodule named `bar`, the qualified name `foo.bar` should - be passed as `module_name` to call that module. + Insert a ``call_module`` ``Node`` into the ``Graph``. A ``call_module`` node + represents a call to the forward() function of a ``Module`` in the ``Module`` + hierarchy. + + Args: + + module_name (str): The qualified name of the ``Module`` in the ``Module`` + hierarchy to be called. For example, if the traced ``Module`` has a + submodule named ``foo``, which has a submodule named ``bar``, the + qualified name ``foo.bar`` should be passed as ``module_name`` to + call that module. + + args (Optional[Tuple[Argument, ...]]): The positional arguments to be passed + to the called method. Note that this should *not* include a ``self`` argument. - `args` and `kwargs` represent the args and kwargs passed to the called - `Module`, respectively. + kwargs (Optional[Dict[str, Argument]]): The keyword arguments to be passed + to the called method - The same insertion point and type expression rules apply for this method - as `Graph.create_node`. + type_expr (Optional[Any]): an optional type annotation representing the + Python type the output of this node will have. + + Returns: + + The newly-created and inserted ``call_module`` node. + + .. note:: + The same insertion point and type expression rules apply for this method + as :meth:`Graph.create_node`. 
""" return self.create_node('call_module', module_name, args, kwargs, type_expr=type_expr) def call_method(self, method_name: str, - args: Optional[Tuple[Argument, ...]] = None, - kwargs: Optional[Dict[str, Argument]] = None, + args: Optional[Tuple['Argument', ...]] = None, + kwargs: Optional[Dict[str, 'Argument']] = None, type_expr: Optional[Any] = None) -> Node: """ - Insert a `call_method` `Node` into the `Graph`. A `call_method` node + Insert a ``call_method`` ``Node`` into the ``Graph``. A ``call_method`` node represents a call to a given method on the 0th element of `args. - For example, if args[0] is a `Node` representing a `Tensor`, then to call - `relu()` on that `Tensor`, pass `relu` to `method_name`. - `args` and `kwargs` represent the args and kwargs passed to the called - method, respectively. + Args: + + method_name (str): The name of the method to apply to the self argument. + For example, if args[0] is a ``Node`` representing a ``Tensor``, + then to call ``relu()`` on that ``Tensor``, pass ``relu`` to ``method_name``. + + args (Optional[Tuple[Argument, ...]]): The positional arguments to be passed + to the called method. Note that this *should* include a ``self`` argument. + + kwargs (Optional[Dict[str, Argument]]): The keyword arguments to be passed + to the called method - The same insertion point and type expression rules apply for this method - as `Graph.create_node`. + type_expr (Optional[Any]): an optional type annotation representing the + Python type the output of this node will have. + + Returns: + + The newly created and inserted ``call_method`` node. + + .. note:: + The same insertion point and type expression rules apply for this method + as :meth:`Graph.create_node`. """ return self.create_node('call_method', method_name, args, kwargs, type_expr=type_expr) def call_function(self, the_function: Callable[..., Any], - args: Optional[Tuple[Argument, ...]] = None, - kwargs: Optional[Dict[str, Argument]] = None, + args: Optional[Tuple['Argument', ...]] = None, + kwargs: Optional[Dict[str, 'Argument']] = None, type_expr: Optional[Any] = None) -> Node: """ - Insert a `call_function` `Node` into the `Graph`. A `call_function` node - represents a call to a Python callable, specified by `the_function`. `the_function` - can be any PyTorch operator, Python function, or member of the `builtins` - or `operator` namespaces. + Insert a ``call_function`` ``Node`` into the ``Graph``. A ``call_function`` node + represents a call to a Python callable, specified by ``the_function``. ``the_function`` + can be + + Args: + + the_function (Callable[..., Any]): The function to be called. Can be any PyTorch + operator, Python function, or member of the ``builtins`` or ``operator`` + namespaces. + + args (Optional[Tuple[Argument, ...]]): The positional arguments to be passed + to the called function. + + kwargs (Optional[Dict[str, Argument]]): The keyword arguments to be passed + to the called function + + type_expr (Optional[Any]): an optional type annotation representing the + Python type the output of this node will have. + + Returns - `args` and `kwargs` represent the args and kwargs passed to the called - method, respectively. + The newly created and inserted ``call_function`` node. - The same insertion point and type expression rules apply for this method - as `Graph.create_node`. + .. note:: + The same insertion point and type expression rules apply for this method + as :meth:`Graph.create_node`. 
""" return self.create_node('call_function', the_function, args, kwargs, type_expr=type_expr) - def node_copy(self, node: Node, arg_transform: Callable[[Node], Argument] = lambda x: x) -> Node: - """ Copy a node from one graph into another. arg_transform needs to transform arguments from the graph of node - to the graph of self. Example: + def node_copy(self, node: Node, arg_transform: Callable[[Node], 'Argument'] = lambda x: x) -> Node: + """ + Copy a node from one graph into another. ``arg_transform`` needs to transform arguments from + the graph of node to the graph of self. Example:: + # Copying all the nodes in `g` into `new_graph` g : torch.fx.Graph = ... new_graph = torch.fx.graph() value_remap = {} for node in g.nodes: value_remap[node] = new_graph.node_copy(node, lambda n : value_remap[n]) + + Args: + + node (Node): The node to copy into ``self``. + + arg_transform (Callable[[Node], Argument]): A function that transforms + ``Node`` arguments in node's ``args`` and ``kwargs`` into the + equivalent argument in ``self``. In the simplest case, this should + retrieve a value out of a table mapping Nodes in the original + graph to ``self``. """ args = map_arg(node.args, arg_transform) kwargs = map_arg(node.kwargs, arg_transform) @@ -416,14 +524,23 @@ def node_copy(self, node: Node, arg_transform: Callable[[Node], Argument] = lamb assert isinstance(kwargs, dict) return self.create_node(node.op, node.target, args, kwargs, node.name, node.type) - def output(self, result: Argument, type_expr: Optional[Any] = None): + def output(self, result: 'Argument', type_expr: Optional[Any] = None): """ - Insert an `output` `Node` into the `Graph`. An `output` node represents - a `return` statement in the Python code. `result` is the value that should + Insert an ``output`` ``Node`` into the ``Graph``. An ``output`` node represents + a ``return`` statement in Python code. ``result`` is the value that should be returned. - The same insertion point and type expression rules apply for this method - as `Graph.create_node`. + Args: + + result (Argument): The value to be returned. + + type_expr (Optional[Any]): an optional type annotation representing the + Python type the output of this node will have. + + .. note:: + + The same insertion point and type expression rules apply for this method + as ``Graph.create_node``. """ return self.create_node(op='output', target='output', args=(result,), type_expr=type_expr) @@ -463,7 +580,16 @@ def illegal_shadowing_name(name : str) -> bool: def python_code(self, root_module: str) -> str: """ - Turn this `Graph` into valid Python code. + Turn this ``Graph`` into valid Python code. + + Args: + + root_module (str): The name of the root module on which to look-up + qualified name targets. This is usually 'self'. + + Returns: + + The string source code generated from this ``Graph``. """ free_vars: List[str] = [] modules_used : Set[str] = set() @@ -569,7 +695,7 @@ def emit_node(node : Node): delete_unused_values(node) # repr() for inf and nan floating point values aren't parseable by - # python as literals. Explicitly import the names from the `math` module. + # python as literals. Explicitly import the names from the ``math`` module. 
import_strs = [f'import {name}' for name in sorted(modules_used)] import_block = '\n'.join(import_strs) @@ -589,7 +715,7 @@ def __str__(self) -> str: of this Graph """ placeholder_names : List[str] = [] - # This is a one-element array just so `format_node` can modify the closed + # This is a one-element array just so ``format_node`` can modify the closed # over value maybe_return_typename : List[str] = [''] @@ -644,7 +770,13 @@ def lint(self, root : Optional[torch.nn.Module] = None): particular: - Checks Nodes have correct ownership (owned by this graph) - Checks Nodes appear in topological order - - If `root` is provided, checks that `target`s exist in `root` + - If ``root`` is provided, checks that targets exist in ``root`` + + Args: + + root (Optional[torch.nn.Module]): The root module with which to check + for targets. This is equivalent to the ``root`` argument that is + passed when constructing a ``GraphModule``. """ # Check topo order @@ -653,7 +785,7 @@ def check_arg(arg : Node, n : Optional[Node] = None) -> None: if arg.graph is not self: raise RuntimeError(f'Argument \'{arg}\'{context_str}does not belong to this Graph, ' f'but was used as an argument! If you are copying nodes from another graph, make ' - f'sure to use `arg_transform` on node_copy() to remap values\n{self}') + f'sure to use ``arg_transform`` on node_copy() to remap values\n{self}') if arg not in seen_values: raise RuntimeError(f'Argument \'{arg}\'{context_str}was used before it has been ' f'defined! Please check that Nodes in the graph are topologically ordered\n{self}') diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 9becd6388f74..fc68cdab5677 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -55,7 +55,12 @@ def __init__(self, body): super().__init__() self.__dict__ = body - CodeOnlyModule.forward = _forward_from_src(body['code']) + try: + CodeOnlyModule.forward = _forward_from_src(body['_code']) + except KeyError: + # BC: attribute name was changed from `code` to `_code` to facilitate + # making `code` into a property and adding a docstring to it + CodeOnlyModule.forward = _forward_from_src(body['code']) from .symbolic_trace import Tracer @@ -112,17 +117,17 @@ def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str): class GraphModule(torch.nn.Module): """ - GraphModule is an nn.Module generated from an fx.Graph. GraphModule has - important attributes: + GraphModule is an nn.Module generated from an fx.Graph. Graphmodule has a + ``graph`` attribute, as well as ``code`` and ``forward`` attributes generated + from that ``graph``. - graph : The graph from which this GraphModule was generated - code : The Python source code for the function generated from `graph` - forward : The Python method generated from `graph` + .. warning:: + + When ``graph`` is reassigned, ``code`` and ``forward`` will be automatically + regenerated. However, if you edit the contents of the ``graph`` without reassigning + the ``graph`` attribute itself, you must call ``recompile()`` to update the generated + code. - Note that when `graph` is reassigned, `code` and `forward` will be automatically - regenerated. However, if you edit the contents of the `graph` without reassigning - the `graph` attribute itself, you must call `recompile()` to update the generated - code. 
""" def __new__(cls: 'Type[GraphModule]', *args, **kwargs): # each instance of a graph module needs its own forward method @@ -137,14 +142,20 @@ class GraphModuleImpl(cls): # type: ignore def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): """ Construct a GraphModule. - root - `root` can either be an nn.Module instance or a Dict mapping strings to any attribute type. - - In the case that `root` is a Module, any references to Module-based objects (via qualified - name) in the Graph's Nodes' `target` field will be copied over from the respective place - within `root`'s Module hierarchy into the GraphModule's module hierarchy. - - In the case that `root` is a dict, the qualified name found in a Node's `target` will be - looked up directly in the dict's keys. The object mapped to by the Dict will be copied - over into the appropriate place within the GraphModule's module hierarchy. - graph - `graph` contains the nodes this GraphModule should use for code generation + + Args: + + root (Union[torch.nn.Module, Dict[str, Any]): + ``root`` can either be an nn.Module instance or a Dict mapping strings to any attribute type. + In the case that ``root`` is a Module, any references to Module-based objects (via qualified + name) in the Graph's Nodes' ``target`` field will be copied over from the respective place + within ``root``'s Module hierarchy into the GraphModule's module hierarchy. + In the case that ``root`` is a dict, the qualified name found in a Node's ``target`` will be + looked up directly in the dict's keys. The object mapped to by the Dict will be copied + over into the appropriate place within the GraphModule's module hierarchy. + + graph (Graph): ``graph`` contains the nodes this GraphModule should use for code generation + """ super().__init__() if isinstance(root, torch.nn.Module): @@ -161,14 +172,14 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): assert isinstance(node.target, str) if node.target not in root: raise RuntimeError('Node ' + str(node) + ' referenced target ' + node.target + - ' but that target was not provided in `root`!') + ' but that target was not provided in ``root``!') targets_to_copy.append(node.target) # Sort targets in ascending order of the # of atoms. # This will ensure that less deeply nested attributes are assigned # before more deeply nested attributes. For example, foo.bar # will be assigned before foo.bar.baz. Otherwise, we might assign - # the user-provided `foo.bar` and wipe out the previously-assigned - # `foo.bar.baz` + # the user-provided ``foo.bar`` and wipe out the previously-assigned + # ``foo.bar.baz`` targets_to_copy.sort(key=lambda t: t.count('.')) for target_to_copy in targets_to_copy: _assign_attr(root[target_to_copy], self, target_to_copy) @@ -183,26 +194,32 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): __jit_unused_properties__ = ['graph'] @property - def graph(self): + def graph(self) -> Graph: """ - Return the `Graph` underlying this `GraphModule` + Return the ``Graph`` underlying this ``GraphModule`` """ return self._graph @graph.setter def graph(self, g) -> None: """ - Set the underlying `Graph` for this `GraphModule`. This will internally - recompile the `GraphModule` so that the generated `forward()` function - corresponds to `g` + Set the underlying ``Graph`` for this ``GraphModule``. 
This will internally + recompile the ``GraphModule`` so that the generated ``forward()`` function + corresponds to ``g`` """ self._graph = g self.recompile() - - def to_folder(self, folder: Union[str, os.PathLike], module_name="FxModule"): + def to_folder(self, folder: Union[str, os.PathLike], module_name : str = "FxModule"): """Dumps out module to ``folder`` with ``module_name`` so that it can be imported with ``from import `` + + Args: + + folder (Union[str, os.PathLike]): The folder to write the code out to + + module_name (str): Top-level name to use for the ``Module`` while + writing out the code """ folder = Path(folder) Path(folder).mkdir(exist_ok=True) @@ -253,15 +270,25 @@ def _gen_model_repr(module_name: str, module: torch.nn.Module) -> Optional[str]: warnings.warn("Was not able to save the following children modules as reprs -" f"saved as pickled files instead: {blobified_modules}") + @property + def code(self) -> str: + """ + Return the Python code generated from the ``Graph`` underlying this + ``GraphModule``. + """ + if not hasattr(self, '_code'): + raise RuntimeError('Code has not been generated! Please report a bug to PyTorch') + return self._code + def recompile(self) -> None: """ - Recompile this GraphModule from its `graph` attribute. This should be - called after editing the contained `graph`, otherwise the generated - code of this `GraphModule` will be out of date. + Recompile this GraphModule from its ``graph`` attribute. This should be + called after editing the contained ``graph``, otherwise the generated + code of this ``GraphModule`` will be out of date. """ - self.code = self._graph.python_code(root_module='self') + self._code = self._graph.python_code(root_module='self') cls = type(self) - cls.forward = _forward_from_src(self.code) + cls.forward = _forward_from_src(self._code) cls_call = cls.__call__ @@ -280,10 +307,10 @@ def wrapped_call(self, *args, **kwargs): def __reduce__(self): """ Serialization of GraphModule. We serialize only the generated code, not - the underlying `Graph`. This is because `Graph` does not have on-disk + the underlying ``Graph``. This is because ``Graph`` does not have on-disk backward-compatibility guarantees, whereas Python source code does. On the deserialization side, we symbolically trace through the generated - code to regenerate the underlying `Graph` + code to regenerate the underlying ``Graph`` """ dict_without_graph = self.__dict__.copy() del dict_without_graph['_graph'] @@ -302,7 +329,7 @@ def __copy__(self): def __str__(self) -> str: orig_str = super().__str__() - return '\n'.join([orig_str, self.code]) + return '\n'.join([orig_str, self._code]) # workarounds for issues in __torch_function__ diff --git a/torch/fx/node.py b/torch/fx/node.py index 8c484e0ab421..1cc94be83e7e 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -37,7 +37,7 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: Target, self._update_args_kwargs(map_arg(args, lambda x: x), map_arg(kwargs, lambda x: x)) # type: ignore # All of the nodes that use the value produced by this Node - # Note one user may correspond to several uses, e.g. the node fo `x + x` + # Note one user may correspond to several uses, e.g. the node fo ``x + x`` # would appear once here, but represents two uses. # # Is a dict to act as an "ordered set". 
Keys are significant, value dont-care @@ -49,9 +49,9 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: Target, # For placeholder nodes, this value will be used to type-annotate the # generated function parameters. # For the return ndoe, this value will be used to type-annotate the - # generated function return type. (Note this is a special case. `return` + # generated function return type. (Note this is a special case. ``return`` # does not produce a value, it's more of a notation. Thus, this value - # describes the type of args[0] in the `return` node. + # describes the type of args[0] in the ``return`` node. self.type : Optional[Any] = type self._prev = self self._next = self @@ -89,7 +89,7 @@ def prepend(self, x: 'Node'): def append(self, x: 'Node'): """Insert x after this node in the list of nodes in the graph. - Equvalent to `self.next.prepend(x)` + Equvalent to ``self.next.prepend(x)`` Args: x (Node): The node to put after this node. Must be a member of the same graph. @@ -104,7 +104,7 @@ def _remove_from_list(self): def args(self) -> Tuple[Argument, ...]: """ Return the tuple of arguments to this Node. The interpretation of arguments - depends on the node's opcode. See the `fx.Graph` docstring for more + depends on the node's opcode. See the ``fx.Graph`` docstring for more information. """ return self._args @@ -113,7 +113,7 @@ def args(self) -> Tuple[Argument, ...]: def args(self, a : Tuple[Argument, ...]): """ Set the tuple of arguments to this Node. The interpretation of arguments - depends on the node's opcode. See the `fx.Graph` docstring for more + depends on the node's opcode. See the ``fx.Graph`` docstring for more information. """ self._update_args_kwargs(map_arg(a, lambda x: x), self._kwargs) # type: ignore @@ -122,7 +122,7 @@ def args(self, a : Tuple[Argument, ...]): def kwargs(self) -> Dict[str, Argument]: """ Return the dict of kwargs to this Node. The interpretation of arguments - depends on the node's opcode. See the `fx.Graph` docstring for more + depends on the node's opcode. See the ``fx.Graph`` docstring for more information. """ return self._kwargs @@ -131,7 +131,7 @@ def kwargs(self) -> Dict[str, Argument]: def kwargs(self, k : Dict[str, Argument]): """ Set the dict of kwargs to this Node. The interpretation of arguments - depends on the node's opcode. See the `fx.Graph` docstring for more + depends on the node's opcode. See the ``fx.Graph`` docstring for more information. """ self._update_args_kwargs(self._args, map_arg(k, lambda x: x)) # type: ignore @@ -140,7 +140,7 @@ def kwargs(self, k : Dict[str, Argument]): def all_input_nodes(self) -> List['Node']: """ Return all Nodes that are inputs to this Node. This is equivalent to - iterating over `args` and `kwargs` and only collecting the values that + iterating over ``args`` and ``kwargs`` and only collecting the values that are Nodes """ all_nodes : List['Node'] = [] @@ -167,7 +167,7 @@ def __repr__(self) -> str: def replace_all_uses_with(self, replace_with : 'Node') -> List['Node']: """ - Replace all uses of `self` in the Graph with the Node `replace_with`. + Replace all uses of ``self`` in the Graph with the Node ``replace_with``. Returns the list of nodes on which this change was made. 
""" to_process = list(self.users) diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index da005f0640b0..f8c4aa8d8366 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -33,8 +33,8 @@ def create_proxy(self, kind: str, target: Target, args: Tuple[Any, ...], kwargs: If kind = 'placeholder', then we're creating a Node that represents the parameter of a function. If we need to encode - a default parameter, we use the `args` tuple. `args` is - otherwise empty for `placeholder` Nodes. + a default parameter, we use the ``args`` tuple. ``args`` is + otherwise empty for ``placeholder`` Nodes. ''' args_ = self.create_arg(args) kwargs_ = self.create_arg(kwargs) diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index d48a067f5e56..6bdc8dd1070b 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -40,9 +40,9 @@ def _patch_function(fn: FunctionType, nargs: int) -> FunctionType: class Tracer(TracerBase): """ - `Tracer` is the class that implements the symbolic tracing functionality - of `torch.fx.symbolic_trace`. A call to `symbolic_trace(m)` is equivalent - to `Tracer().trace(m)`. + ``Tracer`` is the class that implements the symbolic tracing functionality + of ``torch.fx.symbolic_trace``. A call to ``symbolic_trace(m)`` is equivalent + to ``Tracer().trace(m)``. Tracer can be subclassed to override various behaviors of the tracing process. The different behaviors that can be overridden are described @@ -54,14 +54,14 @@ def __init__(self): def create_arg(self, a: Any) -> Argument: """ A method to specify the behavior of tracing when preparing values to - be used as arguments to nodes in the `Graph`. + be used as arguments to nodes in the ``Graph``. By default, the behavior includes: - Iterate through collection types (e.g. tuple, list, dict) and recursively - call `create_args` on the elements. - - Given a Proxy object, return a reference to the underlying IR `Node` + call ``create_args`` on the elements. + - Given a Proxy object, return a reference to the underlying IR ``Node`` - Given a non-Proxy Tensor object, emit IR for various cases: - - For a Parameter, emit a `get_attr` node referring to that Parameter + - For a Parameter, emit a ``get_attr`` node referring to that Parameter - For a non-Parameter Tensor, store the Tensor away in a special attribute referring to that attribute. @@ -106,10 +106,10 @@ def create_arg(self, a: Any) -> Argument: def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: """ - A method to specify whether a given `nn.Module` is a "leaf" module. + A method to specify whether a given ``nn.Module`` is a "leaf" module. Leaf modules are the atomic units that appear in - the IR, referenced by `call_module` calls. By default, + the IR, referenced by ``call_module`` calls. By default, Modules in the PyTorch standard library namespace (torch.nn) are leaf modules. All other modules are traced through and their constituent ops are recorded, unless specified otherwise @@ -118,17 +118,17 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo Args m - The module itself module_qualified_name - The path to root of this module. For example, - if you have a module hierarchy where submodule `foo` contains - submodule `bar`, which contains submodule `baz`, that module will - appear with the qualified name `foo.bar.baz` here. 
+ if you have a module hierarchy where submodule ``foo`` contains + submodule ``bar``, which contains submodule ``baz``, that module will + appear with the qualified name ``foo.bar.baz`` here. """ return m.__module__.startswith('torch.nn') and not isinstance(m, torch.nn.Sequential) def path_of_module(self, mod) -> str: """ - Helper method to find the qualified name of `mod` in the Module hierarchy - of `root`. For example, if `root` has a submodule named `foo`, which has - a submodule named `bar`, passing `bar` into this function will return + Helper method to find the qualified name of ``mod`` in the Module hierarchy + of ``root``. For example, if ``root`` has a submodule named ``foo``, which has + a submodule named ``bar``, passing ``bar`` into this function will return the string "foo.bar". """ for n, p in self.root.named_modules(): @@ -138,17 +138,17 @@ def path_of_module(self, mod) -> str: def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args, kwargs): """ - Method that specifies the behavior of this `Tracer` when it encounters - a call to an `nn.Module` instance. + Method that specifies the behavior of this ``Tracer`` when it encounters + a call to an ``nn.Module`` instance. By default, the behavior is to check if the called module is a leaf module - via `is_leaf_module`. If it is, emit a `call_module` node referring to - `m` in the `Graph`. Otherwise, call the `Module` normally, tracing through - the operations in its `forward` function. + via ``is_leaf_module``. If it is, emit a ``call_module`` node referring to + ``m`` in the ``Graph``. Otherwise, call the ``Module`` normally, tracing through + the operations in its ``forward`` function. This method can be overridden to--for example--create nested traced GraphModules, or any other behavior you would want while tracing across - `Module` boundaries. + ``Module`` boundaries. """ module_qualified_name = self.path_of_module(m) if not self.is_leaf_module(m, module_qualified_name): @@ -157,12 +157,12 @@ def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args, kwa def create_args_for_root(self, root_fn, is_module): """ - Create `placeholder` nodes corresponding to the signature of the `root` - Module. This method introspects `root`'s signature and emits those + Create ``placeholder`` nodes corresponding to the signature of the ``root`` + Module. This method introspects ``root``'s signature and emits those nodes accordingly, also supporting *args and **kwargs. """ # In some cases, a function or method has been decorated with a wrapper - # defined via `functools.wraps`. In this case, the outer code object + # defined via ``functools.wraps``. In this case, the outer code object # will likely not contain the actual parameters we care about, so unwrap # the function to get to the innermost callable. fn_for_analysis = inspect.unwrap(root_fn) @@ -173,7 +173,7 @@ def create_args_for_root(self, root_fn, is_module): skip_arg_idx = 0 if is_module: if total_args == 0: - raise RuntimeError('`self` argument cannot be part of *args expansion!') + raise RuntimeError('``self`` argument cannot be part of *args expansion!') skip_arg_idx = 1 next(names_iter) # skip self args.append(self.root) @@ -203,8 +203,8 @@ def proxy_placeholder(name: str): def trace(self, root: Union[torch.nn.Module, Callable]) -> Graph: """ - Trace `root` and return the corresponding FX `Graph` representation. `root` - can either be an `nn.Module` instance or a Python callable. 
+ Trace ``root`` and return the corresponding FX ``Graph`` representation. ``root`` + can either be an ``nn.Module`` instance or a Python callable. """ if isinstance(root, torch.nn.Module): self.root = root @@ -269,10 +269,17 @@ def forward(*args, **kwargs): def symbolic_trace(root : Union[torch.nn.Module, Callable]) -> GraphModule: - """ - Symbolic tracing API + """Symbolic tracing API + + Given an ``nn.Module`` or function instance ``root``, this function will return a ``GraphModule`` + constructed by recording operations seen while tracing through ``root``. + + Args: + root (Union[torch.nn.Module, Callable]): Module or function to be traced and converted + into a Graph representation. + + Returns: + GraphModule: a Module created from the recorded operations from ``root``. - Given an `nn.Module` or function instance `root`, this function will return a `GraphModule` - constructed by recording operations seen while tracing through `root`. """ return GraphModule(root if isinstance(root, torch.nn.Module) else torch.nn.Module(), Tracer().trace(root)) From 0185a05ceb257854fc71981cb84a16911f270cc3 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Sat, 5 Dec 2020 18:06:56 -0800 Subject: [PATCH 089/132] Revert D25338250: [pytorch][PR] [BE] Fix signed-unsigned warnings Test Plan: revert-hammer Differential Revision: D25338250 (https://github.com/pytorch/pytorch/commit/6317e0b2f1090ea4189e88557d4ff6656fb758cc) Original commit changeset: e840618b113b fbshipit-source-id: dbecb068892dc118f257fe5c50692ede2b2462ca --- aten/src/ATen/BatchingRegistrations.cpp | 4 ++-- aten/src/ATen/NamedTensorUtils.cpp | 4 ++-- aten/src/ATen/TensorIterator.cpp | 4 ++-- aten/src/ATen/TensorNames.cpp | 4 ++-- aten/src/ATen/native/Convolution.cpp | 8 ++++---- aten/src/ATen/native/ForeachOpsKernels.cpp | 2 +- aten/src/TH/generic/THStorage.cpp | 3 ++- aten/src/TH/generic/THStorageCopy.cpp | 5 +++-- caffe2/serialize/crc_alt.h | 8 ++++---- 9 files changed, 22 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 0f9b31efefb9..16470f39ad54 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -941,8 +941,8 @@ Tensor new_empty_strided_batching_rule( size.size(), ") must match dimensionality of strides (", stride.size(), ")"); auto storage_size = native::storage_size_for(size, stride); - for (auto& physical_stride : physical_strides) { - physical_stride *= storage_size; + for (int64_t idx = 0; idx < physical_strides.size(); ++idx) { + physical_strides[idx] *= storage_size; } // physical_strides = [B1 * B2 * S, B2 * S, S] + strides diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 5f8de486dc78..668838877123 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -264,11 +264,11 @@ static std::vector compute_dot_product_outnames( } std::vector outnames(num_outnames, Dimname::wildcard()); int64_t index = 0; - for (size_t j = 0; j < tensor_names.size(); ++j) { + for (int64_t j = 0; j < tensor_names.size(); ++j) { if (j == tensor_dotted_dim) continue; outnames[index++] = tensor_names[j]; } - for (size_t j = 0; j < other_names.size(); ++j) { + for (int64_t j = 0; j < other_names.size(); ++j) { if (j == other_dotted_dim) continue; outnames[index++] = other_names[j]; } diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 0f18d941feff..43acc9a070d5 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ 
b/aten/src/ATen/TensorIterator.cpp @@ -939,8 +939,8 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso } void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { - for (auto& tensor: config.tensors_) { - operands_.emplace_back(std::move(tensor)); + for (int i = 0; i < config.tensors_.size(); i++) { + operands_.emplace_back(std::move(config.tensors_[i])); } num_outputs_ = config.num_outputs_; } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index a7dc0bd68036..844ff4ba2bad 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -61,10 +61,10 @@ TensorNames::TensorNames(ArrayRef names, int64_t start, int64_t end) { } TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const char* op_name) { - size_t size_diff = std::labs(names_.size() - other.names_.size()); + int64_t size_diff = std::labs(names_.size() - other.names_.size()); if (names_.size() > other.names_.size()) { - for (size_t idx = size_diff; idx < names_.size(); ++idx) { + for (int64_t idx = size_diff; idx < names_.size(); ++idx) { names_[idx] = names_[idx].unify(other.names_[idx - size_diff], op_name); } } else { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 801925214a99..6dbf1e5535ed 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -177,13 +177,13 @@ auto ConvParams::needs_64bit_indexing_no_split(const at::Tensor& input, const at int64_t outsize = 1; if (transposed) { std::vector o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups); - for (const auto& e: o) { - outsize *= e; + for (int64_t i = 1; i < o.size(); i++) { + outsize *= o[i]; } } else { std::vector o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation); - for (const auto& e: o) { - outsize *= e; + for (int64_t i = 1; i < o.size(); i++) { + outsize *= o[i]; } } return outsize > int_max; diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index a4a796ca26d9..5fbc1506bfaa 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -201,7 +201,7 @@ std::vector foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList \ std::vector result; \ result.reserve(tensors1.size()); \ - for (size_t i = 0; i < tensors1.size(); i++) { \ + for (int i = 0; i < tensors1.size(); i++) { \ result.emplace_back(at::NAME(tensors1[i], tensors2[i])); \ } \ \ diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index a085f31c740f..2db795719557 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -115,9 +115,10 @@ void THStorage_(resizeBytes)(THStorage* storage, ptrdiff_t size_bytes) { void THStorage_(fill)(THStorage *storage, scalar_t value) { + ptrdiff_t i; auto type_meta = caffe2::TypeMeta::Make(); size_t numel = storage->nbytes() / type_meta.itemsize(); - for (size_t i = 0; i < numel; i++) + for (i = 0; i < numel; i++) THStorage_(data)(storage)[i] = value; } diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 2d6ec8a05eb6..dc19deea7652 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -8,7 +8,7 @@ void THStorage_(copy)(THStorage *storage, THStorage *src) scalar_t *scalar_src = THStorage_(data)(src); scalar_t *data = THStorage_(data)(storage); uint64_t 
numel = storage->nbytes() / sizeof(scalar_t); - for (uint64_t i = 0; i < numel; ++i) { + for (ptrdiff_t i = 0; i < numel; ++i) { data[i] = scalar_src[i]; } } @@ -19,10 +19,11 @@ void THStorage_(copy)(THStorage *storage, THStorage *src) #define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)( \ THStorage * storage, TH##TYPENAMESRC##Storage * src) { \ + ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ uint64_t numel = storage->nbytes() / sizeof(scalar_t); \ - for (uint64_t i = 0; i < numel; i++) \ + for (i = 0; i < numel; i++) \ data[i] = static_cast(src_data[i]); \ } diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h index e7c986ff89fb..be51083fec0e 100644 --- a/caffe2/serialize/crc_alt.h +++ b/caffe2/serialize/crc_alt.h @@ -680,12 +680,12 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) // put operator for one zero bit in odd odd[0] = Polynomial; // CRC-32 polynomial - for (uint32_t i = 1; i < CrcBits; i++) + for (int i = 1; i < CrcBits; i++) odd[i] = 1 << (i - 1); // put operator for two zero bits in even // same as gf2_matrix_square(even, odd); - for (uint32_t i = 0; i < CrcBits; i++) + for (int i = 0; i < CrcBits; i++) { uint32_t vec = odd[i]; even[i] = 0; @@ -695,7 +695,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) } // put operator for four zero bits in odd // same as gf2_matrix_square(odd, even); - for (uint32_t i = 0; i < CrcBits; i++) + for (int i = 0; i < CrcBits; i++) { uint32_t vec = even[i]; odd[i] = 0; @@ -711,7 +711,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) for (; lengthB > 0; lengthB >>= 1) { // same as gf2_matrix_square(a, b); - for (uint32_t i = 0; i < CrcBits; i++) + for (int i = 0; i < CrcBits; i++) { uint32_t vec = b[i]; a[i] = 0; From 5de22d3f6914fd39a63aa910994276cc50b81982 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Sat, 5 Dec 2020 19:23:50 -0800 Subject: [PATCH 090/132] Removes redundant method_test entries (#48828) Summary: Now that Lilyjjo's [stack of OpInfo updates](https://github.com/pytorch/pytorch/pull/48627) is landed, we can port method_test entries to OpInfos. This PR doesn't port any method_test entries, but it removes redundant entries. These entries previously tested both multi-dim and zero-dim tensors, so a new zero-dim tensor input is added to UnaryUfuncInfo's sample inputs. To recap, this PR: - removes method_test() entries that are redundant with OpInfo entries - adds a new sample input to unary ufunc OpInfos that tests them on 0d tensors cc kshitij12345 as an fyi. Going forward we should have a goal of not only porting all the MathTestMeta objects to use the OpInfo pattern but also all the current method_test entries. For each entry the function needs to be added as an OpInfo and the inputs need to be added as sample inputs. 
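As a rough, non-authoritative sketch of what that porting buys us: the new 0d sample input added in this PR reuses the same `make_tensor` helper that `sample_inputs_unary` already uses, so a single OpInfo entry now exercises both the multi-dim and the scalar cases that previously needed two method_test rows. The device/dtype values below are placeholders, not part of this PR:

```python
import torch
from torch.testing._internal.common_utils import make_tensor

# Illustrative only: the two shapes generated by sample_inputs_unary after
# this change -- a 1-D tensor and a 0-d ("scalar") tensor.
multi_dim = make_tensor((5,), 'cpu', torch.float32,
                        low=None, high=None, requires_grad=True)
zero_dim = make_tensor((), 'cpu', torch.float32,
                       low=None, high=None, requires_grad=True)
print(multi_dim.dim(), zero_dim.dim())  # 1 0
```
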
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48828 Reviewed By: malfet Differential Revision: D25336071 Pulled By: mruberry fbshipit-source-id: 6b3f6c347195233d6b8ad57e2be68fd772663d9b --- .../_internal/common_methods_invocations.py | 100 +++++------------- 1 file changed, 27 insertions(+), 73 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 226dbdd6ad5d..2e35c5582ce0 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -69,16 +69,16 @@ def __init__(self, # with the dtypes support on the tested device test_inplace_grad=True, # whether to gradcheck and gradgradcheck the inplace variant test_complex_grad=True, # whether to gradcheck and gradgradcheck for complex dtypes - skip_bfloat16_grad=False, # whether to skip grad and gradgradcheck for bfloat16 dtype + skip_bfloat16_grad=False, # whether to skip grad and gradgradcheck for bfloat16 dtype assert_autodiffed=False, # if a op's aten::node is expected to be symbolically autodiffed - autodiff_nonfusible_nodes=None, # a list of strings with node names that are expected to be in a - # DifferentiableGraph when autodiffed. Ex: ['aten::add', 'aten::mm'], + autodiff_nonfusible_nodes=None, # a list of strings with node names that are expected to be in a + # DifferentiableGraph when autodiffed. Ex: ['aten::add', 'aten::mm'], # default is populated to be ['aten::(name of Python operator)'] autodiff_fusible_nodes=None, # a list of strings with node names that are expected to be in FusionGroups - # inside of DifferentiableGraphs when this operation is autodiffed. - # Ex: ['aten::add', 'aten::mm'], defaults to an empty list - # Note: currently no ops use fusible nodes - output_func=lambda x: x, # fn mapping output to part that should be gradcheck'ed + # inside of DifferentiableGraphs when this operation is autodiffed. 
+ # Ex: ['aten::add', 'aten::mm'], defaults to an empty list + # Note: currently no ops use fusible nodes + output_func=lambda x: x, # fn mapping output to part that should be gradcheck'ed supports_tensor_out=True, # whether the op supports the out kwarg, returning a Tensor skips=tuple(), # information about which tests to skip decorators=None, # decorators to apply to generated tests @@ -147,10 +147,7 @@ def get_inplace(self): def sample_inputs(self, device, dtype, requires_grad=False): """Returns an iterable of SampleInputs.""" - if self.sample_inputs: - return self.sample_inputs_func(self, device, dtype, requires_grad) - else: - return tuple() + return self.sample_inputs_func(self, device, dtype, requires_grad) # Returns True if the test should be skipped and False otherwise def should_skip(self, cls_name, test_name, device_type, dtype): @@ -195,14 +192,17 @@ def default_test_dtypes(self, device_type): S = 5 -def sample_inputs_unary(self, device, dtype, requires_grad): - low, high = self.domain - low = low if low is None else low + self._domain_eps - high = high if high is None else high - self._domain_eps +def sample_inputs_unary(op_info, device, dtype, requires_grad): + low, high = op_info.domain + low = low if low is None else low + op_info._domain_eps + high = high if high is None else high - op_info._domain_eps return (SampleInput(make_tensor((L,), device, dtype, low=low, high=high, - requires_grad=requires_grad)),) + requires_grad=requires_grad)), + SampleInput(make_tensor((), device, dtype, + low=low, high=high, + requires_grad=requires_grad))) # Metadata class for unary "universal functions (ufuncs)" that accept a single # tensor and have common properties like: @@ -251,16 +251,16 @@ def __init__(self, self._domain_eps = 1e-5 -def sample_inputs_addmm(self, device, dtype, requires_grad): +def sample_inputs_addmm(op_info, device, dtype, requires_grad): return (SampleInput((make_tensor((S, S), device, dtype, - low=None, high=None, - requires_grad=requires_grad), - make_tensor((S, S), device, dtype, - low=None, high=None, - requires_grad=requires_grad), - make_tensor((S, S), device, dtype, - low=None, high=None, - requires_grad=False))),) + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=False))),) # Operator database (sorted alphabetically) @@ -599,9 +599,9 @@ def sample_inputs_addmm(self, device, dtype, requires_grad): SkipInfo('TestGradients', 'test_inplace_gradgrad', dtypes=[torch.cdouble]), SkipInfo('TestCommon', 'test_variant_consistency_eager', - dtypes=[torch.cfloat, torch.cdouble]), + dtypes=[torch.cfloat, torch.cdouble]), SkipInfo('TestCommon', 'test_variant_consistency_jit', - dtypes=[torch.cfloat, torch.cdouble])), + dtypes=[torch.cfloat, torch.cdouble])), promotes_integers_to_float=True, handles_complex_extremals=False), ] @@ -767,8 +767,6 @@ def ident(x): def method_tests(): set_rng_seed(0) return [ - ('acosh', torch.rand(S, S, S).add(1), NO_ARGS, ''), - ('acosh', torch.rand(tuple()).add(1), NO_ARGS, 'scalar'), ('add', (S, S, S), ((S, S, S),), '', (True,)), ('add', (S, S, S), ((S, S),), 'broadcast_rhs', (True,)), ('add', (S, S), ((S, S, S),), 'broadcast_lhs', (True,)), @@ -779,10 +777,6 @@ def method_tests(): ('add', (S, S, S), (3.14,), 'constant', (True,)), ('add', (), (3.14,), 'scalar_constant', (True,)), ('add', (S, S, S), (3.14j,), 'complex_scalar_constant', (True,)), - ('asinh', (S, S, S), 
NO_ARGS, ''), - ('asinh', (), NO_ARGS, 'scalar'), - ('atanh', torch.rand(S, S, S), NO_ARGS, ''), - ('atanh', torch.rand(tuple()), NO_ARGS, 'scalar'), ('__radd__', (S, S, S), (3.14,), 'constant', (True, 'aten::add')), ('__radd__', (), (3.14,), 'scalar_constant', (True, 'aten::add')), ('sub', (S, S, S), ((S, S, S),), '', (True,)), @@ -929,42 +923,14 @@ def method_tests(): ('expand_as', (S, 1, 1), (torch.rand(S, S, S),), '', (False,)), ('exp', (S, S, S), NO_ARGS, '', (True,)), ('exp', (), NO_ARGS, 'scalar', (True,)), - ('exp2', (S, S, S), NO_ARGS, '', (False,)), - ('exp2', (), NO_ARGS, 'scalar', (False,)), ('expm1', (S, S, S), NO_ARGS, '', (True,)), ('expm1', (), NO_ARGS, 'scalar', (True,)), - ('erf', torch.rand(S, S, S), NO_ARGS, '', (True,)), - ('erf', uniform_scalar(requires_grad=True), NO_ARGS, 'scalar', (True,)), - ('erfc', torch.rand(S, S, S), NO_ARGS, '', (True,)), - ('erfc', uniform_scalar(requires_grad=True), NO_ARGS, 'scalar', (True,)), ('erfinv', torch.rand(S, S, S).clamp(-0.9, 0.9), NO_ARGS), ('erfinv', normal_scalar_clamp(-0.9, 0.9, requires_grad=True), NO_ARGS, 'scalar'), - ('log', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), - ('log', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), - ('log10', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), - ('log10', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), - ('log1p', torch.rand(S, S, S), NO_ARGS, '', (True,)), - ('log1p', uniform_scalar(requires_grad=True), NO_ARGS, 'scalar', (True,)), - ('log2', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), - ('log2', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), - ('log', torch.randn(S, S, S, dtype=torch.cdouble) + 1e-2, NO_ARGS, 'complex', (True,)), - ('log', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), - ('log10', torch.randn(S, S, S, dtype=torch.cdouble) + 1e-2, NO_ARGS, 'complex', (True,)), - ('log10', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), - ('log2', torch.randn(S, S, S, dtype=torch.cdouble) + 1e-2, NO_ARGS, 'complex', (True,)), - ('log2', uniform_scalar(1e-2j, requires_grad=True), NO_ARGS, 'complex_scalar', (True,)), - ('tanh', (S, S, S), NO_ARGS, '', (True,)), - ('tanh', (), NO_ARGS, 'scalar', (True,)), - ('sigmoid', (S, S, S), NO_ARGS, '', (True,)), - ('sigmoid', (), NO_ARGS, 'scalar', (True,)), ('logit', torch.randn(S, S, S).clamp(0.1, 0.9).requires_grad_(True), NO_ARGS, ''), ('logit', torch.randn(S, S, S).clamp(0.1, 0.9).requires_grad_(True), (0.2,), 'eps'), ('logit', uniform_scalar().clamp(0.1, 0.9).requires_grad_(True), NO_ARGS, 'scalar'), ('logit', uniform_scalar().clamp(0.1, 0.9).requires_grad_(True), (0.2,), 'scalar_eps'), - ('sinh', (S, S, S), NO_ARGS, '', (True,)), - ('sinh', (), NO_ARGS, 'scalar', (True,)), - ('cosh', (S, S, S), NO_ARGS, '', (True,)), - ('cosh', (), NO_ARGS, 'scalar', (True,)), ('conj', (S, S, S), NO_ARGS), ('copysign', (S, S, S), ((S, S, S),), '', (False,)), ('copysign', (S, S, S), ((S, S),), 'broadcast_rhs', (False,)), @@ -991,18 +957,6 @@ def method_tests(): ('clamp', (), (None, 0.5), 'min_scalar', (True,)), ('clamp', (), (0.5, None), 'max_scalar', (True,)), ('clamp', (S, S), (), 'max_scalar_kwarg', (True,), (), (), ident, {'max': 1}), - ('sqrt', torch.rand(S, S, S) + 5e-4, NO_ARGS, '', (True,)), - ('sqrt', uniform_scalar(5e-4, requires_grad=True), NO_ARGS, 'scalar', (True,)), - ('sin', (S, S, S), NO_ARGS, '', (True,)), - ('sin', (), NO_ARGS, 'scalar', (True,)), - ('cos', (S, S, S), NO_ARGS, '', 
(True,)), - ('cos', (), NO_ARGS, 'scalar', (True,)), - ('tan', torch.randn(S, S, S).clamp(-1, 1), NO_ARGS, '', (True,)), - ('tan', (S, S, S), NO_ARGS, 'complex', (True,)), - ('asin', torch.randn(S, S, S).clamp(-0.9, 0.9), NO_ARGS, '', (True,)), - ('acos', torch.randn(S, S, S).clamp(-0.9, 0.9), NO_ARGS, '', (True,)), - ('atan', (S, S, S), NO_ARGS, '', (True,)), - ('atan', (), NO_ARGS, 'scalar', (True,)), ('atan2', (S, S, S), ((S, S, S),)), ('atan2', (), ((),), 'scalar'), ('atan2', (S, S, S), ((S,),), 'broadcast_rhs'), From 85121a7a0f24faccdb36093e92262c186fc65ee0 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Sat, 5 Dec 2020 20:16:50 -0800 Subject: [PATCH 091/132] Added CUDA support for complex input for torch.cholesky_solve (#47047) Summary: `torch.cholesky_solve` now works for complex inputs on GPU. I moved the existing tests to `test_linalg.py` and modified them to test complex and float32 dtypes. Differentiation also works correctly with complex inputs now. Ref. https://github.com/pytorch/pytorch/issues/33152 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47047 Reviewed By: ngimel Differential Revision: D24730020 Pulled By: mruberry fbshipit-source-id: 95402da5789c56e5a682019790985207fa28fa1f --- .../ATen/native/cuda/BatchLinearAlgebra.cu | 47 +++- test/test_autograd.py | 21 -- test/test_linalg.py | 220 ++++++++++-------- tools/autograd/gen_variable_type.py | 2 +- torch/_torch_docs.py | 3 + torch/csrc/autograd/FunctionsManual.cpp | 4 +- torch/testing/_internal/common_utils.py | 4 +- 7 files changed, 178 insertions(+), 123 deletions(-) diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index eaee3c87b1f8..5b16adaa2e5f 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -535,6 +535,28 @@ void magmaCholeskySolve( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholeskySolve>( + magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, + c10::complex* dB, magma_int_t lddb, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zpotrs_gpu(uplo, n, nrhs, + reinterpret_cast(dA), ldda, + reinterpret_cast(dB), lddb, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholeskySolve>( + magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, + c10::complex* dB, magma_int_t lddb, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cpotrs_gpu(uplo, n, nrhs, + reinterpret_cast(dA), ldda, + reinterpret_cast(dB), lddb, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaCholeskySolveBatched( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, double** dA_array, magma_int_t ldda, @@ -551,6 +573,26 @@ void magmaCholeskySolveBatched( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholeskySolveBatched>( + magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, + c10::complex** dB_array, magma_int_t lddb, magma_int_t& info, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + info = magma_zpotrs_batched(uplo, n, nrhs, + reinterpret_cast(dA_array), ldda, + reinterpret_cast(dB_array), lddb, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholeskySolveBatched>( + magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, + c10::complex** dB_array, magma_int_t lddb, magma_int_t& info, magma_int_t 
batchsize, const MAGMAQueue& magma_queue) { + info = magma_cpotrs_batched(uplo, n, nrhs, + reinterpret_cast(dA_array), ldda, + reinterpret_cast(dB_array), lddb, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaCholesky( magma_uplo_t uplo, magma_int_t n, double* dA, @@ -1376,7 +1418,7 @@ Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upp int64_t info = 0; auto self_working_copy = cloneBatchedColumnMajor(self); auto A_working_copy = cloneBatchedColumnMajor(A); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_solve_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "cholesky_solve_cuda", [&]{ apply_cholesky_solve(self_working_copy, A_working_copy, upper, info); }); TORCH_CHECK(info == 0, "MAGMA cholesky_solve : invalid argument: ", -info); @@ -1418,10 +1460,11 @@ AT_ERROR("cholesky: MAGMA library not found in " MAGMAQueue magma_queue(self.get_device()); - constexpr int64_t batch_limit = 262140; + int64_t batch_limit = self.is_complex() ? 65535 : 262140; // Compute as many batches of 262140 possible // 262140 is the size of the largest batch of matrices that can be run with // violating maximum kernel configuration + // For complex input the batch limit is 65535 (determined experimentally, see https://github.com/pytorch/pytorch/pull/47047#discussion_r516086923 for more information) // The number of "mini"-batches are floor(batch_size / batch_limit) // and these cover floor(batch_size / batch_limit) * batch_limit cholesky calls int64_t mini_batches = batch_size / batch_limit, mini_idx; diff --git a/test/test_autograd.py b/test/test_autograd.py index dfcc7221e528..7c2082b1ed1d 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2611,27 +2611,6 @@ def test_var_mean_differentiable(self): torch.autograd.backward(r2, grad) self.assertTrue(torch.allclose(input1.grad, input2.grad, rtol=0.01, atol=0.0)) - @skipIfNoLapack - def test_cholesky_solve(self): - def _test_with_size(A_dims, B_dims, upper): - root = torch.rand(*A_dims).requires_grad_() - b = torch.rand(*B_dims).requires_grad_() - - def func(root, b, upper): - if upper: - A = root.triu() - else: - A = root.tril() - return torch.cholesky_solve(b, A, upper) - - gradcheck(func, [root, b, upper]) - gradgradcheck(func, [root, b, upper]) - - for (a_size, b_size), upper in product([((3, 3), (3, 4)), ((3, 3), (3, 2)), - ((2, 3, 3), (2, 3, 4)), ((2, 3, 3), (2, 3, 2))], - [True, False]): - _test_with_size(a_size, b_size, upper) - @skipIfNoLapack def test_eig(self): def func(B): diff --git a/test/test_linalg.py b/test/test_linalg.py index 114ee1842c42..b6ff817a59fa 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1837,6 +1837,131 @@ def test_nuclear_norm_exceptions_old(self, device): self.assertRaisesRegex(RuntimeError, "duplicate or invalid", torch.norm, x, "nuc", (0, 0)) self.assertRaisesRegex(IndexError, "Dimension out of range", torch.norm, x, "nuc", (0, 2)) + def cholesky_solve_test_helper(self, A_dims, b_dims, upper, device, dtype): + from torch.testing._internal.common_utils import random_hermitian_pd_matrix + + b = torch.randn(*b_dims, dtype=dtype, device=device) + A = random_hermitian_pd_matrix(*A_dims, dtype=dtype, device=device) + L = torch.cholesky(A, upper=upper) + return b, A, L + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 
1e-8}) + def test_cholesky_solve(self, device, dtype): + for (k, n), upper in itertools.product(zip([2, 3, 5], [3, 5, 7]), [True, False]): + b, A, L = self.cholesky_solve_test_helper((n,), (n, k), upper, device, dtype) + x = torch.cholesky_solve(b, L, upper=upper) + self.assertEqual(b, A.mm(x)) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) + def test_cholesky_solve_batched(self, device, dtype): + def cholesky_solve_batch_helper(A_dims, b_dims, upper): + b, A, L = self.cholesky_solve_test_helper(A_dims, b_dims, upper, device, dtype) + x_exp_list = [] + for i in range(b_dims[0]): + x_exp_list.append(torch.cholesky_solve(b[i], L[i], upper=upper)) + x_exp = torch.stack(x_exp_list) # Stacked output + x_act = torch.cholesky_solve(b, L, upper=upper) # Actual output + self.assertEqual(x_act, x_exp) # Equality check + Ax = torch.matmul(A, x_act) + self.assertEqual(b, Ax) # Correctness check + + for upper, batchsize in itertools.product([True, False], [1, 3, 4]): + cholesky_solve_batch_helper((5, batchsize), (batchsize, 5, 10), upper) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_cholesky_solve_batched_non_contiguous(self, device, dtype): + from numpy.linalg import solve + from torch.testing._internal.common_utils import random_hermitian_pd_matrix + + for upper in [True, False]: + A = random_hermitian_pd_matrix(2, 2, dtype=dtype, device='cpu') + b = torch.randn(2, 2, 2, dtype=dtype, device='cpu') + x_exp = solve(A.permute(0, 2, 1).numpy(), b.permute(2, 1, 0).numpy()) + A = A.to(device).permute(0, 2, 1) + b = b.to(device).permute(2, 1, 0) + assert not A.is_contiguous() and not b.is_contiguous(), "contiguous inputs" + L = torch.cholesky(A, upper) + x = torch.cholesky_solve(b, L, upper=upper) + self.assertEqual(x, x_exp) + + @slowTest + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) + def test_cholesky_solve_batched_many_batches(self, device, dtype): + for A_dims, b_dims in zip([(5, 256, 256), (5,)], [(5, 10), (512, 512, 5, 10)]): + for upper in [True, False]: + b, A, L = self.cholesky_solve_test_helper(A_dims, b_dims, upper, device, dtype) + x = torch.cholesky_solve(b, L, upper) + Ax = torch.matmul(A, x) + self.assertEqual(Ax, b.expand_as(Ax)) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) + def test_cholesky_solve_batched_broadcasting(self, device, dtype): + from numpy.linalg import solve + from torch.testing._internal.common_utils import random_hermitian_pd_matrix + + def run_test(A_dims, b_dims, upper): + A_matrix_size = A_dims[-1] + A_batch_dims = A_dims[:-2] + A = random_hermitian_pd_matrix(A_matrix_size, *A_batch_dims, + dtype=dtype, device='cpu') + b = torch.randn(*b_dims, dtype=dtype, device='cpu') + x_exp = torch.tensor(solve(A.numpy(), b.numpy()), dtype=dtype, device=device) + A, b = A.to(dtype=dtype, device=device), b.to(dtype=dtype, device=device) + L = torch.cholesky(A, upper) + x = torch.cholesky_solve(b, L, upper=upper) + self.assertEqual(x, x_exp) + # 
https://github.com/pytorch/pytorch/issues/42695 + x = torch.cholesky_solve(b, L, upper=upper, out=x) + self.assertEqual(x, x_exp) + + # test against numpy.linalg.solve + for upper in [True, False]: + run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6), upper) # no broadcasting + run_test((2, 1, 3, 4, 4), (4, 6), upper) # broadcasting b + run_test((4, 4), (2, 1, 3, 4, 2), upper) # broadcasting A + run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5), upper) # broadcasting A & b + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float64, torch.complex128) + def test_cholesky_solve_autograd(self, device, dtype): + def run_test(A_dims, B_dims, upper): + root = torch.randn(*A_dims, device=device, dtype=dtype).requires_grad_() + b = torch.randn(*B_dims, device=device, dtype=dtype).requires_grad_() + + def func(root, b, upper): + if upper: + A = root.triu() + else: + A = root.tril() + return torch.cholesky_solve(b, A, upper) + + gradcheck(func, [root, b, upper]) + gradgradcheck(func, [root, b, upper], atol=1e-3) + + for (a_size, b_size), upper in itertools.product([((3, 3), (3, 4)), ((3, 3), (3, 2)), + ((2, 3, 3), (2, 3, 4)), ((2, 3, 3), (2, 3, 2))], + [True, False]): + run_test(a_size, b_size, upper) + @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) @@ -4739,101 +4864,6 @@ def run_test(matsize, batchdims, mat_chars): run_test(matsize, batchdims, mat_chars=['sym', 'sym_pd', 'sym_psd']) run_test(matsize, batchdims, mat_chars=['sing', 'non_sing']) - def cholesky_solve_test_helper(self, A_dims, b_dims, upper, device, dtype): - from torch.testing._internal.common_utils import random_symmetric_pd_matrix - - b = torch.randn(*b_dims, dtype=dtype, device=device) - A = random_symmetric_pd_matrix(*A_dims, dtype=dtype, device=device) - L = torch.cholesky(A, upper=upper) - return b, A, L - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(torch.double) - def test_cholesky_solve(self, device, dtype): - for (k, n), upper in itertools.product(zip([2, 3, 5], [3, 5, 7]), [True, False]): - b, A, L = self.cholesky_solve_test_helper((n,), (n, k), upper, device, dtype) - x = torch.cholesky_solve(b, L, upper=upper) - self.assertLessEqual(b.dist(A.mm(x)), 1e-11) - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(torch.double) - def test_cholesky_solve_batched(self, device, dtype): - def cholesky_solve_batch_helper(A_dims, b_dims, upper): - b, A, L = self.cholesky_solve_test_helper(A_dims, b_dims, upper, device, dtype) - x_exp_list = [] - for i in range(b_dims[0]): - x_exp_list.append(torch.cholesky_solve(b[i], L[i], upper=upper)) - x_exp = torch.stack(x_exp_list) # Stacked output - x_act = torch.cholesky_solve(b, L, upper=upper) # Actual output - self.assertEqual(x_act, x_exp) # Equality check - self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 2e-12) # Correctness check - - for upper, batchsize in itertools.product([True, False], [1, 3, 4]): - cholesky_solve_batch_helper((5, batchsize), (batchsize, 5, 10), upper) - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(torch.double) - def test_cholesky_solve_batched_non_contiguous(self, device, dtype): - from numpy.linalg import solve - from torch.testing._internal.common_utils import random_symmetric_pd_matrix - - for upper in [True, False]: - A = random_symmetric_pd_matrix(2, 2, dtype=dtype, device='cpu') - b = torch.randn(2, 2, 2, dtype=dtype, device='cpu') - x_exp = torch.Tensor(solve(A.permute(0, 2, 1).numpy(), b.permute(2, 1, 0).numpy())).to(dtype=dtype, device=device) - A = 
A.to(device).permute(0, 2, 1) - b = b.to(device).permute(2, 1, 0) - assert not A.is_contiguous() and not b.is_contiguous(), "contiguous inputs" - L = torch.cholesky(A, upper) - x = torch.cholesky_solve(b, L, upper=upper) - self.assertEqual(x, x_exp) - - @slowTest - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(torch.double) - def test_cholesky_solve_batched_many_batches(self, device, dtype): - for upper in [True, False]: - b, A, L = self.cholesky_solve_test_helper((5, 256, 256), (5, 10), upper, device, dtype) - x = torch.cholesky_solve(b, L, upper) - self.assertEqual(torch.matmul(A, x), b.expand(A.shape[:-2] + (5, 10))) - - b, A, L = self.cholesky_solve_test_helper((5,), (512, 512, 5, 10), upper, device, dtype) - x = torch.cholesky_solve(b, L, upper) - self.assertEqual(torch.matmul(A, x), b) - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(torch.double) - def test_cholesky_solve_batched_broadcasting(self, device, dtype): - from numpy.linalg import solve - from torch.testing._internal.common_utils import random_symmetric_pd_matrix - - def run_test(A_dims, b_dims, upper): - A_matrix_size = A_dims[-1] - A_batch_dims = A_dims[:-2] - A = random_symmetric_pd_matrix(A_matrix_size, *A_batch_dims, - dtype=dtype, device='cpu') - b = torch.randn(*b_dims, dtype=dtype, device='cpu') - x_exp = torch.tensor(solve(A.numpy(), b.numpy()), dtype=dtype, device=device) - A, b = A.to(dtype=dtype, device=device), b.to(dtype=dtype, device=device) - L = torch.cholesky(A, upper) - x = torch.cholesky_solve(b, L, upper=upper) - self.assertEqual(x, x_exp) - # issue gh-42695 - x = torch.cholesky_solve(b, L, upper=upper, out=x) - self.assertEqual(x, x_exp) - - # test against numpy.linalg.solve - for upper in [True, False]: - run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6), upper) # no broadcasting - run_test((2, 1, 3, 4, 4), (4, 6), upper) # broadcasting b - run_test((4, 4), (2, 1, 3, 4, 2), upper) # broadcasting A - run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5), upper) # broadcasting A & b - @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index e60d90e4ecf8..f0ecd55a9b66 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,7 +78,7 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_eigh', + 'matrix_exp', 'linalg_eigh', 'cholesky_solve', } # Some operators invalidate the grad_accumulator. Let's reset it. diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index e57d44e41525..56aec4668b0d 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1638,6 +1638,9 @@ def merge_dicts(*dicts): batches of 2D matrices. If the inputs are batches, then returns batched outputs `c` +Supports real-valued and complex-valued inputs. +For the complex-valued inputs the transpose operator above is the conjugate transpose. 
+ Args: input (Tensor): input matrix :math:`b` of size :math:`(*, m, k)`, where :math:`*` is zero or more batch dimensions diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 5e9a22f9ebcb..91bad195f47e 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -2312,8 +2312,8 @@ std::tuple cholesky_solve_backward( if (grad_x.defined()) { grad_self = grad_x.cholesky_solve(input2, /*upper=*/upper); - Tensor common_term = at::matmul(grad_self, result.transpose(-2, -1)); - common_term = common_term + common_term.transpose(-2, -1); + Tensor common_term = at::matmul(grad_self, result.conj().transpose(-2, -1)); + common_term = common_term + common_term.conj().transpose(-2, -1); if (upper) { grad_input2 = -at::matmul(input2, common_term); diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 298727df152a..cf997ddb894b 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1540,13 +1540,13 @@ def random_hermitian_pd_matrix(matrix_size, *batch_dims, dtype, device): """ Returns a batch of random Hermitian positive-definite matrices. The shape of the result is batch_dims + (matrix_size, matrix_size) - The following example creates a tensor of size 2 x 4 x 3 x 3 >>> matrices = random_hermitian_pd_matrix(3, 2, 4, dtype=dtype, device=device) """ A = torch.randn(*(batch_dims + (matrix_size, matrix_size)), dtype=dtype, device=device) - return torch.matmul(A, A.transpose(-2, -1).conj()) + return torch.matmul(A, A.transpose(-2, -1).conj()) \ + + torch.eye(matrix_size, dtype=dtype, device=device) def make_nonzero_det(A, sign=None, min_singular_value=0.1): From 195ab5e864ebc5e380b05a30bb5081afbf5666d0 Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Sat, 5 Dec 2020 20:57:22 -0800 Subject: [PATCH 092/132] remove non-default settings in fuser.py (#48862) Summary: I've noticed we are setting `_jit_set_num_profiled_runs` to 2 (which isn't our default) and sometimes we don't. We are also setting `_jit_set_bailout_depth` to 20 which **is** our default. I suggest we remove this logic altogether. I did a quick run to see if there's any impact and thankfully, the numbers seem to be consistent, but we should try avoding testing configurations that aren't default or aren't considered to become default. numactl -C 3 python -m fastrnns.bench --fuser=te --executor=profiling non-defaults: ``` Namespace(cnns=None, cuda_pointwise_block_count=None, cuda_pointwise_block_size=None, cuda_pointwise_loop_level=None, device='cuda', executor='profiling', fuser='te', group=['cnns', 'rnns'], hiddenSize=512, inputSize=512, miniBatch=64, nloops=100, numLayers=1, print_json=None, rnns=None, sep=' ', seqLength=100, variable_lstms=False, warmup=10) Benchmarking LSTMs... name avg_fwd std_fwd info_fwd avg_bwd std_bwd info_bwd cudnn 5.057 0.06287 None 7.322 0.07404 None aten 5.602 0.06303 None 13.64 0.4078 None jit 7.019 0.07995 None 13.77 0.554 None jit_premul 5.324 0.06203 None 12.01 0.2996 None jit_premul_bias 5.148 0.08061 None 11.62 0.4104 None jit_simple 6.69 0.2317 None 13.37 0.3791 None jit_multilayer 7.006 0.251 None 13.67 0.2239 None py 19.05 0.1119 None 28.28 0.6346 None Benchmarking ResNets... 
name avg_fwd std_fwd info_fwd avg_bwd std_bwd info_bwd resnet18 8.712 0.01628 None 19.93 0.03512 None resnet18_jit 8.688 0.01374 None 19.79 0.07518 None resnet50 31.04 0.08049 None 66.44 0.08187 None resnet50_jit 31.11 0.07171 None 66.45 0.09157 None ``` defaults: ``` Namespace(cnns=None, cuda_pointwise_block_count=None, cuda_pointwise_block_size=None, cuda_pointwise_loop_level=None, device='cuda', executor='profiling', fuser='te', group=['cnns', 'rnns'], hiddenSize=512, inputSize=512, miniBatch=64, nloops=100, numLayers=1, print_json=None, rnns=None, sep=' ', seqLength=100, variable_lstms=False, warmup=10) Benchmarking LSTMs... name avg_fwd std_fwd info_fwd avg_bwd std_bwd info_bwd cudnn 5.086 0.115 None 7.394 0.1743 None aten 5.611 0.2559 None 13.54 0.387 None jit 7.062 0.3358 None 13.24 0.3688 None jit_premul 5.379 0.2086 None 11.57 0.3987 None jit_premul_bias 5.202 0.2127 None 11.13 0.06748 None jit_simple 6.648 0.05794 None 12.84 0.3047 None jit_multilayer 6.964 0.1104 None 13.24 0.3283 None py 19.14 0.09959 None 28.17 0.4946 None Benchmarking ResNets... name avg_fwd std_fwd info_fwd avg_bwd std_bwd info_bwd resnet18 8.713 0.01563 None 19.93 0.02759 None resnet18_jit 8.697 0.01792 None 19.78 0.06916 None resnet50 31.14 0.07431 None 66.57 0.07418 None resnet50_jit 31.21 0.0677 None 66.56 0.08655 None ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/48862 Reviewed By: bertmaher Differential Revision: D25342097 Pulled By: Krovatkin fbshipit-source-id: 8d2f72c2770793ec8cecee9dfab9aaaf2e1ad2b1 --- benchmarks/fastrnns/fuser.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/fastrnns/fuser.py b/benchmarks/fastrnns/fuser.py index 5b85f87291dc..e1daab594c50 100644 --- a/benchmarks/fastrnns/fuser.py +++ b/benchmarks/fastrnns/fuser.py @@ -5,8 +5,6 @@ def set_fuser(fuser_name, executor_name): if fuser_name == 'te': torch._C._jit_set_profiling_executor(True) torch._C._jit_set_profiling_mode(True) - torch._C._jit_set_bailout_depth(20) - torch._C._jit_set_num_profiled_runs(1) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_override_can_fuse_on_gpu(True) torch._C._jit_set_texpr_fuser_enabled(True) @@ -28,8 +26,6 @@ def set_fuser(fuser_name, executor_name): if executor_name == 'profiling': torch._C._jit_set_profiling_executor(True) torch._C._jit_set_profiling_mode(True) - torch._C._jit_set_bailout_depth(20) - torch._C._jit_set_num_profiled_runs(2) elif executor_name == 'simple': torch._C._jit_set_profiling_executor(True) torch._C._jit_set_profiling_mode(False) From 2e600feda9bb9359700ea21dac4c23c5aa552887 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Sat, 5 Dec 2020 22:03:09 -0800 Subject: [PATCH 093/132] [numpy] `torch.sinh`: promote integer inputs to float (#48644) Summary: Reference: https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48644 Reviewed By: heitorschueroff Differential Revision: D25298436 Pulled By: mruberry fbshipit-source-id: 675ad8e3c34e61fbbab77eca15048df09b09c1ed --- aten/src/ATen/native/UnaryOps.cpp | 4 +- .../ATen/native/cuda/UnaryGeometricKernels.cu | 2 +- torch/csrc/jit/tensorexpr/kernel.cpp | 5 ++- .../_internal/common_methods_invocations.py | 39 +++++++++++++++++-- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index daea7e7f68bb..900f5ee72f7a 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -343,8 +343,8 @@ Tensor& cos_out(Tensor& result, 
const Tensor& self) { return unary_op_impl_float Tensor cos(const Tensor& self) { return unary_op_impl_float(self, cos_stub); } Tensor& cos_(Tensor& self) { return unary_op_impl_(self, at::cos_out); } -Tensor& sinh_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sinh_stub); } -Tensor sinh(const Tensor& self) { return unary_op_impl(self, at::sinh_out); } +Tensor& sinh_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, sinh_stub); } +Tensor sinh(const Tensor& self) { return unary_op_impl_float(self, sinh_stub); } Tensor& sinh_(Tensor& self) { return unary_op_impl_(self, at::sinh_out); } Tensor& cosh_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, cosh_stub); } diff --git a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu index 46281b6573aa..2488528f5e2c 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu @@ -51,7 +51,7 @@ void cos_kernel_cuda(TensorIterator& iter) { } void sinh_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.dtype(), "sinh_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "sinh_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::sinh(a); }); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index d36f4f428c53..50f285104d95 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1148,8 +1148,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::sinh: { - return computeOneOperand( - "aten_sinh", v, [](const ExprHandle& a) { return sinh(a); }); + return computeOneOperand("aten_sinh", v, [](const ExprHandle& a) { + return sinh(promoteIntegerToFloat(a)); + }); } break; case aten::atan: { diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 2e35c5582ce0..96d1cd03557e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1,4 +1,4 @@ -from functools import reduce +from functools import reduce, wraps from operator import mul, itemgetter import collections @@ -21,7 +21,8 @@ random_symmetric_matrix, random_symmetric_psd_matrix, random_symmetric_pd_matrix, make_nonzero_det, random_fullrank_matrix_distinct_singular_value, set_rng_seed, - TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, make_tensor, TEST_SCIPY) + TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, make_tensor, TEST_SCIPY, + torch_to_numpy_dtype_dict) if TEST_SCIPY: import scipy.special @@ -262,6 +263,31 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad): low=None, high=None, requires_grad=False))),) +def np_unary_ufunc_integer_promotion_wrapper(fn): + # Wrapper that passes PyTorch's default scalar + # type as an argument to the wrapped NumPy + # unary ufunc when given an integer input. + # This mimicks PyTorch's integer->floating point + # type promotion. + # + # This is necessary when NumPy promotes + # integer types to double, since PyTorch promotes + # integer types to the default scalar type. 
+ + # Helper to determine if promotion is needed + def is_integral(dtype): + return dtype in [np.bool, np.uint8, np.int8, np.int16, np.int32, np.int64] + + # NOTE: Promotion in PyTorch is from integer types to the default dtype + np_dtype = torch_to_numpy_dtype_dict[torch.get_default_dtype()] + + @wraps(fn) + def wrapped_fn(x): + if is_integral(x.dtype): + return fn(x, dtype=np_dtype) + return fn(x) + + return wrapped_fn # Operator database (sorted alphabetically) op_db: List[Any] = [ @@ -508,8 +534,10 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad): dtypes=[torch.float], active_if=TEST_WITH_ROCM), )), UnaryUfuncInfo('sinh', - ref=np.sinh, - dtypesIfCPU=floating_and_complex_types(), + ref=np_unary_ufunc_integer_promotion_wrapper(np.sinh), + dtypesIfCPU=all_types_and_complex_and(torch.bool), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half), + promotes_integers_to_float=True, assert_autodiffed=True, decorators=(precisionOverride({torch.float16: 1e-2}),), skips=( @@ -519,6 +547,9 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', device_type='cuda', dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS), + # Reference: https://github.com/pytorch/pytorch/issues/48641 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + device_type='cpu', dtypes=[torch.int8]), SkipInfo('TestCommon', 'test_variant_consistency_jit', device_type='cuda', dtypes=[torch.float16]), )), From 17f53bffefa7bd46e134a99bcd4c4e471b81c86b Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 5 Dec 2020 23:51:34 -0800 Subject: [PATCH 094/132] [Gradient Compression] Replace the key of error_dict in PowerSGD state with bucket index (#48867) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48867 Previously the key of error_dict is the hashcode of tensor. Now replaced with bucket index. Bucket index can have a few advantages over the hashcode of tensor. 1) Error dict in the state never removes any key. If the bucket rebuild process occurs frequently, the size of error dict can increase. For now, such rebuild process is infrequent, so it is probably fine. 2) Integer index has a better readability than hashcode, and it can facilitate debugging. If the user wants to debug the tensor values, usually only a specific bucket needs to be targeted. It's easy to specify such condition (e..g, bucket_index = 0), but it's hard to specify a hashcode in advance, as it can only be determined at runtime. Note that sometimes the buckets can be rebuilt in the forward pass. In this case, the shape of the bucket with the same index will not be consistent with the one in the previous iteration, and hence the error tensor will be re--initialized as a zero tensor of the new shape. Therefore, `and state.error_dict[bucket_index].shape[0] == padded_total_length` is added to the condition of applying the local error from the previous iteration. 
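For illustration, the bookkeeping described above amounts to the following sketch (a simplified illustration only, not the actual hook; the function name and arguments are hypothetical stand-ins for the values the real hook derives from the gradient bucket):

```python
import torch

# Hypothetical helper sketching the bucket-index keyed error feedback described above.
def apply_error_feedback(state, bucket_index, input_tensor, padded_total_length, device):
    # Reuse the stored error only when the bucket kept the same padded length,
    # i.e. it was not rebuilt since the previous iteration.
    if (
        bucket_index in state.error_dict
        and state.error_dict[bucket_index].shape[0] == padded_total_length
    ):
        input_tensor.add_(state.error_dict[bucket_index])
    else:
        # First use of this bucket index, or the bucket was rebuilt with a new shape:
        # restart from a zero error tensor of the new padded length.
        state.error_dict[bucket_index] = torch.zeros(padded_total_length, device=device)
```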
Deleted the arg type of `dist._GradBucket` in powerSGD_hook.py, because somehow test_run_mypy - TestTypeHints failed: AssertionError: mypy failed: torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py:128: error: "_GradBucket" has no attribute "get_index" [attr-defined] Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 117951402 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl Reviewed By: rohan-varma Differential Revision: D25346347 fbshipit-source-id: 8348aa103002ec1c69e3ae759504b431140b3b0d --- .../ddp_comm_hooks/powerSGD_hook.py | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index 17414df3024d..e1d475a34425 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -1,3 +1,4 @@ +import logging import math import numpy as np @@ -63,16 +64,13 @@ def __init__( # there will be differences between the gradients that are never synchronized. self.rng = np.random.RandomState(random_seed) # Since there is only a single state instance for all the input buckets, - # need to maintain a dictionary that maps each bucket to the local error. - # TODO(wayi): Currently the key is the (hashcode of) input tensor, which may change across steps, - # since the bucket can be rebuilt in the forward pass (to save peak memory usage). - # Need to add an index field to the input bucket of comm hook. + # need to maintain a dictionary that maps each bucket index to the local error. self.error_dict = {} def powerSGD_hook( state: PowerSGDState, - bucket: dist._GradBucket, + bucket, ) -> torch.futures.Future: """ This DDP communication hook implements a simplified PowerSGD gradient compression @@ -127,11 +125,26 @@ def powerSGD_hook( input_tensor[total_length:padded_total_length].fill_(0) # Incorporate the error from the previous state into the gradients. + bucket_index = bucket.get_index() if state.use_error_feedback: - if input_tensor in state.error_dict: - input_tensor.add_(state.error_dict[input_tensor]) + # The buckets can be rebuilt during training. + # In this case, the error tensor shape will not be aligned with the input tensor, + # and the error will be re-initialized as zeros. + if ( + bucket_index in state.error_dict + and state.error_dict[bucket_index].shape[0] == padded_total_length + ): + input_tensor.add_(state.error_dict[bucket_index]) else: - state.error_dict[input_tensor] = torch.zeros(padded_total_length, device=device) + logging.info( + "A zero tensor of length {} that represents local error is created.".format( + padded_total_length + ) + ) + state.error_dict[bucket_index] = torch.zeros( + padded_total_length, device=device + ) + # Keep a copy of the input tensor, # so that we can compute the local error caused by compression later, # by comparing this copy and the input tensor updated after decompression. @@ -181,7 +194,7 @@ def decompress(fut): if state.use_error_feedback: # Memorize the local errors. 
- state.error_dict[input_tensor] = input_tensor_cp - input_tensor + state.error_dict[bucket_index] = input_tensor_cp - input_tensor ret = input_tensor.resize_(total_length) return [ret] From ea2a568cca71aaf690051782c225ca9dd2e5e1f9 Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Sun, 6 Dec 2020 08:01:00 -0800 Subject: [PATCH 095/132] Fixed einsum compatibility/performance issues (#46398) (#47860) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47860 This PR makes torch.einsum compatible with numpy.einsum except for the sublist input option as requested here https://github.com/pytorch/pytorch/issues/21412. It also fixed 2 performance issues linked below and adds a check for reducing to torch.dot instead of torch.bmm which is faster in some cases. fixes #45854, #37628, #30194, #15671 fixes #41467 with benchmark below ```python import torch from torch.utils.benchmark import Timer a = torch.randn(10000, 100, 101, device='cuda') b = torch.randn(10000, 101, 3, device='cuda') c = torch.randn(10000, 100, 1, device='cuda') d = torch.randn(10000, 100, 1, 3, device='cuda') print(Timer( stmt='torch.einsum("bij,bjf->bif", a, b)', globals={'a': a, 'b': b} ).blocked_autorange()) print() print(Timer( stmt='torch.einsum("bic,bicf->bif", c, d)', globals={'c': c, 'd': d} ).blocked_autorange()) ``` ``` torch.einsum("bij,bjf->bif", a, b) Median: 4.53 ms IQR: 0.00 ms (4.53 to 4.53) 45 measurements, 1 runs per measurement, 1 thread torch.einsum("bic,bicf->bif", c, d) Median: 63.86 us IQR: 1.52 us (63.22 to 64.73) 4 measurements, 1000 runs per measurement, 1 thread ``` fixes #32591 with benchmark below ```python import torch from torch.utils.benchmark import Timer a = torch.rand(1, 1, 16, 2, 16, 2, 16, 2, 2, 2, 2, device="cuda") b = torch.rand(729, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, device="cuda") print(Timer( stmt='(a * b).sum(dim = (-3, -2, -1))', globals={'a': a, 'b': b} ).blocked_autorange()) print() print(Timer( stmt='torch.einsum("...ijk, ...ijk -> ...", a, b)', globals={'a': a, 'b': b} ).blocked_autorange()) ``` ``` (a * b).sum(dim = (-3, -2, -1)) Median: 17.86 ms 2 measurements, 10 runs per measurement, 1 thread torch.einsum("...ijk, ...ijk -> ...", a, b) Median: 296.11 us IQR: 1.38 us (295.42 to 296.81) 662 measurements, 1 runs per measurement, 1 thread ``` TODO - [x] add support for ellipsis broadcasting - [x] fix corner case issues with sumproduct_pair - [x] update docs and add more comments - [x] add tests for error cases Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D24923679 Pulled By: heitorschueroff fbshipit-source-id: 47e48822cd67bbcdadbdfc5ffa25ee8ba4c9620a --- aten/src/ATen/native/Linear.cpp | 501 +++++++++++++++++++------------- test/test_linalg.py | 219 +++++++++----- torch/functional.py | 171 ++++++----- 3 files changed, 543 insertions(+), 348 deletions(-) diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index c9e03aaa3b6b..bac2f80e8a7c 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -136,241 +136,334 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra return result; } -Tensor einsum(std::string eqn, TensorList tensors) { - constexpr size_t number_of_letters = 26; - std::string in_eqn; - size_t pos; - // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis. 
- // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter - // mapped to an index and the ellipsis ('...') being mapped to a number of consequtive indices. - // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that - // the letter has not been assigned an index yet (because it has not been seen). - // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices). - // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet. - // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below. - - std::array letter_mapping; // map letter to internal (numerical) label - letter_mapping.fill(-1); - int64_t num_ell_idxes = -1; - int64_t first_ell_idx = 0; - - // The internal representation of the left hand side fo the equation (with ellipsis expanded) is stored in input_op_idxes. - // For each operand, we have a vector mapping each dimension to an internal index. - // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and - // of the last occurrence of each index. - std::vector> input_op_idxes; // the parsed operand indices - std::array num_letter_occurrences; // number of occurrence in the equation of this letter - num_letter_occurrences.fill(0); - std::vector last_idx_occurrence; // the last operator (left to right) using this index - - if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side - in_eqn = eqn.substr(0, pos); - } else { - in_eqn = eqn; - } - // remove spaces for einsum compatibility (#9929) - in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); - - // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index - int64_t operand = 0; - std::stringstream eqn_stream(in_eqn); - std::string term; - int64_t num_total_idxes = 0; - while (! eqn_stream.eof()) { - std::getline(eqn_stream, term, ','); // term = string with indices of current term - TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension - - int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' - // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions - int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; - int64_t dims_in_term = 0; // dimensions we have seen - std::vector current_op_idxes; // mapping of operand dimensions to indices for current term - for (auto &c : term) { // c = character with a single letter or '.' - if (c == '.') { - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' 
in term ", operand, " of the equation"); - if (ell_char_count == 3) { // this completes the ellipsis - if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size - first_ell_idx = num_total_idxes; - num_ell_idxes = candidate_num_ell_idxes; - num_total_idxes += num_ell_idxes; - } - else { // we have seen an ellipsis before, so we check compatibility - TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, - "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); - } - for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices - current_op_idxes.push_back(first_ell_idx + i); - last_idx_occurrence.push_back(operand); - } - dims_in_term += num_ell_idxes; // keep track of dimensions - } - } else { // a letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; // letter_num = position in letter_mapping - if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping - letter_mapping[letter_num] = num_total_idxes; - num_total_idxes++; - last_idx_occurrence.push_back(operand); - } else { // letter we have already seen - last_idx_occurrence[letter_mapping[letter_num]] = operand; - } - num_letter_occurrences[letter_num]++; - current_op_idxes.push_back(letter_mapping[letter_num]); - dims_in_term++; - } +// There are roughly three parts to compute einsum: +// 1. Parse equation to extract the labels for each input operand and output +// 2. Unsqueeze missing dimensions from input operands and permute to align them +// 3. Compute result by multiplying input operands and summing contraction +// dimensions We do the last part by reducing to bmm. +Tensor einsum(std::string equation, TensorList operands) { + TORCH_CHECK(!operands.empty(), "einsum() must provide at least one operand"); + checkDeviceType("einsum()", operands, operands[0].device().type()); + + // Code for encoding ellipsis ("...") with labels + constexpr int ELLIPSIS = '.'; + + // Find arrow (->) to split equation into lhs and rhs + const auto arrow_pos = equation.find("->"); + const auto lhs = equation.substr(0, arrow_pos); + + // Convert labels for input operands into an index in [0, 25] and store + // them in op_labels for each operand along with ELLIPSIS. + std::vector> op_labels(operands.size()); + bool found_ell = false; + std::string::size_type curr_op = 0; + for (auto i = decltype(lhs.length()){0}; i < lhs.length(); ++i) { + switch (lhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // Only one ellipsis per operand can be given + !found_ell, + "einsum() found \'.\' for operand ", + curr_op, + " for which an ellipsis was already found"); + TORCH_CHECK( + // Ensure it's a valid ellipsis + i + 2 < lhs.length() && lhs[++i] == '.' 
&& lhs[++i] == '.', + "einsum() found \'.\' for operand ", + curr_op, + " that is not part of any ellipsis"); + op_labels[curr_op].push_back(ELLIPSIS); + found_ell = true; + break; + + case ',': + // Move onto next operand + ++curr_op; + TORCH_CHECK( + curr_op < operands.size(), + "einsum() fewer operands were provided than specified in the equation"); + found_ell = false; + break; + + default: + // Parse label + TORCH_CHECK( + lhs[i] >= 'a' && lhs[i] <= 'z', + "einsum() operand subscript must be in range [a, z] but found ", + lhs[i], + " for operand ", + curr_op); + // Convert label to index in [0, 25] and store + op_labels[curr_op].push_back(lhs[i] - 'a'); } - TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); - input_op_idxes.push_back(std::move(current_op_idxes)); - operand++; } - // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. - TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); - - // the following parses or infers output (right hand side) - // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the output indices. -1 means that the index has not been assigned a dimension yet - std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions - int64_t num_output_dims = 0; - if (pos != std::string::npos) { // parse the user provided right hand side - int64_t ell_char_count = 0; - for (auto &c : eqn.substr(pos+2)) { - if (c == '.') { // '.' as part of ellipsis - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); - if (ell_char_count == 3) { // ellipsis complete - TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; - } - } - } else if (! isspace(c)) { // letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; - TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, " occurs twice in output"); - idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + curr_op == operands.size() - 1, + "einsum() more operands were provided than specified in the equation"); + + // Labels must be within [a, z]. + constexpr int TOTAL_LABELS = 'z' - 'a' + 1; + std::vector label_count(TOTAL_LABELS, 0); + + // The maximum number of dimensions covered by any ellipsis, needed when + // unsqueezing missing dimensions from operands to permute and broadcast + int64_t ell_num_dim = 0; + + // Compute label frequency and number of dimensions covered by ellipsis + // We do this after parsing labels to make it more readable and simpler + // to compute the number of dimensions covered by ellipsis. 
+ for (std::size_t i = 0; i < operands.size(); ++i) { + const Tensor operand = operands[i]; + std::vector labels = op_labels[i]; + int64_t nlabels = labels.size(); + int64_t ndims = operand.dim(); + bool has_ellipsis = false; + + for (int label : labels) { + if (label == ELLIPSIS) { + --nlabels; + has_ellipsis = true; + ell_num_dim = std::max(ell_num_dim, ndims - nlabels); + } else { + ++label_count[label]; } } - } else { // create an inferred right hand side - // the ellipsis (if in the lhs) comes first - if (num_ell_idxes >= 0) { - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + has_ellipsis ? nlabels <= ndims : nlabels == ndims, + "einsum() the number of subscripts in the equation (", + nlabels, + has_ellipsis ? ") is more than the number of dimensions (" + : ") does not match the number of dimensions (", + ndims, + ") for operand ", + i, + has_ellipsis ? "" : " and no ellipsis was given"); + } + + // Mapping of label to index in the permuted tensors (out_dims + sum_dims) + // This will be used for aligning the dimensions of all input operands + std::vector label_perm_index(TOTAL_LABELS, -1); + + // Current index in the permuted shape + int perm_index = 0; + + // Start index of ellipsis dimensions in the permuted shape + int64_t ell_index = 0; + found_ell = false; + + if (arrow_pos == std::string::npos) { + // Implicit output is ellipsis (...) + labels seen only once + perm_index = ell_num_dim; + found_ell = true; + for (int label = 0; label < TOTAL_LABELS; ++label) { + if (label_count[label] == 1) { + label_perm_index[label] = perm_index++; } } - // then the indices that occur exactly once in alphabetic order - for (size_t idx = 0; idx < number_of_letters; idx++) { - if (num_letter_occurrences[idx] == 1) { - idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; - num_output_dims++; + } else { + // Parse explicit output + const std::string rhs = equation.substr(arrow_pos + 2); + for (std::size_t i = 0; i < rhs.length(); ++i) { + switch (rhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // There can only be one ellipsis in the output + !found_ell, + "einsum() found \'.\' for output but an ellipsis (...) was already found"); + TORCH_CHECK( + // Ensure ellipsis is correct + i + 2 < rhs.length() && rhs[++i] == '.' && rhs[++i] == '.', + "einsum() found \'.\' for output that is not part of any ellipsis (...)"); + ell_index = perm_index; + perm_index += ell_num_dim; + found_ell = true; + break; + + default: + TORCH_CHECK( + rhs[i] >= 'a' && rhs[i] <= 'z', + "einsum() subscripts must be in range [a, z] but found ", + rhs[i], + " for the output"); + TORCH_CHECK( + // Ensure label appeared at least once for some input operand and at + // most once for the output + label_count[rhs[i] - 'a'] > 0, + "einsum() output subscript ", + rhs[i], + label_count[rhs[i] - 'a'] == -1 + ? 
" appears more than once in the output" + : " does not appear in the equation for any input operand"); + label_perm_index[rhs[i] - 'a'] = perm_index++; + + // Set to -1 to mark that this label already appeared in the output + label_count[rhs[i] - 'a'] = -1; } } } - // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the non-output indices - those that are eventually summed over - int64_t position = num_output_dims; - for (int64_t i = 0; i < num_total_idxes; i++) { - if (idxes_to_preprocessed_dims[i]==-1) { - idxes_to_preprocessed_dims[i] = position; - position++; + + // Save output size before adding sum dims + const int out_size = perm_index; + + // If ellipsis is not part of the output, add to contraction dimensions + if (ell_num_dim > 0 && !found_ell) { + ell_index = perm_index; + perm_index += ell_num_dim; + } + + // Add contraction labels (labels not present in output) + for (int label = 0; label < TOTAL_LABELS; ++label) { + if (label_count[label] > 0 && label_perm_index[label] == -1) { + label_perm_index[label] = perm_index++; } } - // we now "homogenize the dimensions", i.e. - // - take diagonals for duplicated indices - // - permute the dimensions to match the order given by idxes_to_preprocessed_dims - // - unsqueeze to create all dimensions for each index in each tensor where they are missing - // we also check that sizes match - // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) - std::vector preprocessed_operands; - std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet - for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { - auto preprocessed_op = tensors[op]; - std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear - std::vector& current_op_input_idxes = input_op_idxes[op]; - int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input - for (size_t i = 0; i < current_op_input_idxes.size(); i++) { - auto idx = current_op_input_idxes[i]; - auto dim_out = idxes_to_preprocessed_dims[idx]; - if (idx_to_dim[dim_out] == -1) { // first appearance - idx_to_dim[dim_out] = dim; - if (size_of_dims[idx] == -1) { // keep track of sizes - size_of_dims[idx] = preprocessed_op.size(dim); - } - else { - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + // Here we unsqueeze missing dimensions to make all operands have the same + // number of dimensions. We take diagonals for repeated labels within the + // same operand. Finally we permute the operands to align dimensions as + // per the perm_out_index we computed above. 
+ std::vector permuted_operands; + for (std::size_t i = 0; i < operands.size(); ++i) { + std::vector perm_shape(perm_index, -1); + std::vector label_dim(TOTAL_LABELS, -1); + const std::vector labels = op_labels[i]; + Tensor operand = operands[i]; + const auto sizes = operand.sizes(); + std::size_t j = 0; + + for (int label : labels) { + if (label == ELLIPSIS) { + // Add missing dimensions under ellipsis + int64_t num_dim_diff = + ell_num_dim - (operand.dim() - labels.size() + 1); + for (int64_t k = 0; k < num_dim_diff; ++k) { + operand = operand.unsqueeze(j); } - dim++; - } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); - preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); - // diagonal moves the diagonal dimension to the back - // now we permute the last dim back to idx_to_dim[dim_out] - std::vector perm(preprocessed_op.dim(), 0); - for (int64_t d = 0; d < preprocessed_op.dim(); d++) { - if (d == idx_to_dim[dim_out]) { - perm[d] = preprocessed_op.dim() - 1; - } else { - perm[d] = d - (d > idx_to_dim[dim_out]); - } + for (int64_t k = 0; k < ell_num_dim; ++k) { + perm_shape[ell_index + k] = j++; } - preprocessed_op = preprocessed_op.permute(perm); + } else if (label_dim[label] != -1) { + // Repeated label, take diagonal + int64_t dim = label_dim[label]; + TORCH_CHECK( + sizes[j] == sizes[dim], + "einsum() subscript ", + char(label + 'a'), + " is repeated for operand ", + i, + " but the sizes don't match, ", + sizes[j], + " != ", + sizes[dim]); + operand = operand.diagonal(0, j, dim).movedim(-1, dim); + } else { + // Lookup output index for label + label_dim[label] = j; + perm_shape[label_perm_index[label]] = j++; } } - // now we permute the dimensions in the right order - std::vector permutation; // permutation for this tensor - for (auto &d : idx_to_dim) { - if (d > -1) { - permutation.push_back(d); + + // Add dimensions for missing labels + for (int64_t& index : perm_shape) { + if (index == -1) { + operand = operand.unsqueeze(-1); + index = j++; } } - preprocessed_op = preprocessed_op.permute(permutation); - // finally, we insert dimensions for idxes not in the operand - for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { - if (idx_to_dim[dim] == -1) { - preprocessed_op = preprocessed_op.unsqueeze(dim); + + permuted_operands.push_back(operand.permute(perm_shape)); + } + + // Check if operands broadcast and keep track of last operand with + // dimension size != 1 for optimizing reductions + std::vector dim_last_op(perm_index, 0); + bool has_zero_size_dim = false; + for (int dim = 0; dim < perm_index; ++dim) { + int64_t broadcast_size = permuted_operands[0].size(dim); + for (std::size_t i = 1; i < permuted_operands.size(); ++i) { + int64_t dim_size = permuted_operands[i].size(dim); + if (broadcast_size != dim_size && broadcast_size != 1 && dim_size != 1) { + std::ostringstream msg; + msg << "einsum() operands do not broadcast with remapped shapes [original->remapped]:"; + for (std::size_t j = 0; j < operands.size(); ++j) { + msg << " " << operands[j].sizes() << "->" + << permuted_operands[j].sizes(); + } + TORCH_CHECK(false, msg.str()); + } + if (dim_size != 1) { + broadcast_size = dim_size; + dim_last_op[dim] = i; } } + has_zero_size_dim |= broadcast_size == 0; + } + + // Compute result + Tensor result = permuted_operands[0]; - 
preprocessed_operands.push_back(std::move(preprocessed_op)); + // Fast path for when an operand has zero sized dim + if (has_zero_size_dim) { + std::vector out_shape(out_size); + for (int i = 0; i < out_size; ++i) { + out_shape[i] = permuted_operands[dim_last_op[i]].size(i); + } + return at::zeros(out_shape, result.options()); } - // now we reduce the indices from left to right - // numpy allows to optimize the path using various - // algorithms (see eigen_path in numpy docs) - // we start with the leftmost operator and reduce indices that - // appear only there - Tensor result = std::move(preprocessed_operands[0]); - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == 0) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - result = result.sum(idxes_to_preprocessed_dims[idx], true); + // Sum out or squeeze dimensions that are size 1 for all later operands + int dim = out_size; + for (int i = dim; i < perm_index; ++i, ++dim) { + if (dim_last_op[i] == 0) { + if (result.size(dim) == 1) { + result = result.squeeze(dim--); + } else { + result = result.sum(dim--); + } } } - // now we process each tensor using sumproduct_pair - for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { + for (std::size_t i = 1; i < permuted_operands.size(); ++i) { + Tensor operand = permuted_operands[i]; std::vector sum_dims; - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == i) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - sum_dims.push_back(idxes_to_preprocessed_dims[idx]); + + // Sum out or squeeze dimensions that are size 1 for all later operands + dim = out_size; + for (int j = dim; j < perm_index; ++j, ++dim) { + if (dim_last_op[j] < i) { + operand = operand.squeeze(dim); + --dim; + } else if (dim_last_op[j] == i) { + if (result.size(dim) == 1) { + operand = operand.sum(dim); + result = result.squeeze(dim); + --dim; + } else { + sum_dims.push_back(dim); + } } } - result = at::native::sumproduct_pair(result, std::move(preprocessed_operands[i]), sum_dims, true); - } - // finally, we squeeze out all non-result dimensions - auto sizes = result.sizes().vec(); - for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--) { - sizes.erase(sizes.begin() + dim); + + // Multiply tensors and sum out dimensions in sum_dims + if (sum_dims.empty()) { + result = result.mul(operand); + } else if (sum_dims.size() == result.sizes().size()) { + result = result.flatten().dot(operand.flatten()); + } else { + result = sumproduct_pair(result, operand, sum_dims, false); + } } - result = result.view(sizes); return result; } diff --git a/test/test_linalg.py b/test/test_linalg.py index b6ff817a59fa..3fa677d2b1de 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2588,6 +2588,151 @@ def test_old_matrix_rank(self, device, dtype): self.assertEqual(torch.matrix_rank(aaT, True), np.linalg.matrix_rank(aaT.cpu().numpy(), True)) self.assertEqual(torch.matrix_rank(aaT, 0.01, True), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01, True)) + @dtypes(torch.double) + def test_einsum(self, device, dtype): + def check(equation, *operands): + ref = np.einsum(equation, *[operand.cpu().numpy() for operand in operands]) + res = torch.einsum(equation, operands) + self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) + + # Check autograd + ops = [op.detach().requires_grad_() for op in operands] + self.assertTrue(torch.autograd.gradcheck(lambda *ops: torch.einsum(equation, ops), ops)) + for op in ops: + 
self.assertTrue(op._version == 0) + + # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f + x = torch.rand(5, device=device, dtype=dtype) + y = torch.rand(7, device=device, dtype=dtype) + A = torch.randn(3, 5, device=device, dtype=dtype) + B = torch.randn(2, 5, device=device, dtype=dtype) + C = torch.randn(2, 3, 5, device=device, dtype=dtype) + D = torch.randn(2, 5, 7, device=device, dtype=dtype) + E = torch.randn(7, 9, device=device, dtype=dtype) + F = torch.randn(2, 3, 3, 5, device=device, dtype=dtype) + G = torch.randn(5, 4, 6, device=device, dtype=dtype) + H = torch.randn(4, 4, device=device, dtype=dtype) + I = torch.rand(2, 3, 2, device=device, dtype=dtype) + + # Note: gradcheck fails if the same input is given multiple times which is why the + # calls to clone below. (see https://github.com/pytorch/pytorch/issues/9282) + + # Vector operations + check('i->', x) # sum + check('i,i->', x, x.clone()) # dot + check('i,i->i', x, x.clone()) # vector element-wisem mul + check('i,j->ij', x, y) # outer + + # Matrix operations + check("ij->ji", A) # transpose + check("ij->j", A) # row sum + check("ij->i", A) # col sum + check("ij,ij->ij", A, A.clone()) # matrix element-wise mul + check("ij,j->i", A, x) # matrix vector multiplication + check("ij,kj->ik", A, B) # matmul + check("ij,ab->ijab", A, E) # matrix outer product + + # Tensor operations + check("aij,ajk->aik", C, D) # batch matmul + check("ijk,jk->i", C, A) # tensor matrix contraction + check("aij,jk->aik", D, E) # tensor matrix contraction + check("abcd,dfg->abcfg", F, G) # tensor tensor contraction + check("ijk,jk->ik", C, A) # tensor matrix contraction with double indices + check("ijk,jk->ij", C, A) # tensor matrix contraction with double indices + check("ijk,ik->j", C, B) # non contiguous + check("ijk,ik->jk", C, B) # non contiguous with double indices + + # Test diagonals + check("ii", H) # trace + check("ii->i", H) # diagonal + check('iji->j', I) # non-contiguous trace + + # Test ellipsis + check("i...->...", H) + check("ki,...k->i...", A.t(), B) + check("k...,jk->...", A.t(), B) + check('...ik, ...j -> ...ij', C, x) + check('bik,k...j->i...j', C, torch.rand(5, 3, device=device, dtype=dtype)) + check('i...j, ij... 
-> ...ij', C, torch.rand(2, 5, 2, 3, device=device, dtype=dtype)) + + # torch.bilinear with discontiguous tensors + l = torch.randn(10, 5, device=device, dtype=dtype).transpose(0, 1) + r = torch.randn(20, 5, device=device, dtype=dtype).transpose(0, 1) + w = torch.randn(15, 10, 20, device=device, dtype=dtype) + check("bn,anm,bm->ba", l, w, r) + + # with strided tensors + check("bn,anm,bm->ba", l[:, ::2], w[:, ::2, ::2], r[:, ::2]) + + def test_einsum_corner_cases(self, device): + def check(equation, *operands, expected_output): + tensors = [torch.tensor(operand, dtype=torch.float32, device=device) if not isinstance(operand, tuple) + else torch.rand(operand, dtype=torch.float32, device=device) for operand in operands] + output = torch.einsum(equation, tensors) + self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) + + # Test equation variantions + check(' ', 1, expected_output=1) + check(' -> ', 1, expected_output=1) + check(' , ', 2, 2, expected_output=4) + check(' , , ', 2, 2, 2, expected_output=8) + check(' , -> ', 2, 2, expected_output=4) + check(' i ', [1], expected_output=[1]) + check(' i -> ', [1], expected_output=1) + check(' i -> i ', [1], expected_output=[1]) + check(' i , i ', [2], [2], expected_output=4) + check(' i , i -> i ', [2], [2], expected_output=[4]) + + # Test tensors with 0 size dimensions + check('i', [], expected_output=[]) + check(' i j -> j', [[], []], expected_output=[]) + check('ij->i', [[], []], expected_output=[0., 0.]) + check(' i j k , k -> i j ', (3, 0, 6), (6,), expected_output=[[], [], []]) + + # Test broadcasting + check('i,j', [2], [1, 2], expected_output=[[2, 4]]) + check('i,ij->ij', [1, 2], [[1, 2, 3], [2, 3, 4]], expected_output=[[1, 2, 3], [4, 6, 8]]) + + # Test ellipsis broadcasting + check('...', 1, expected_output=1) + check('...->', 1, expected_output=1) + check('...->...', 1, expected_output=1) + check('...', [1], expected_output=[1]) + check('...->', [1], expected_output=1) + check('i...->i', [1], expected_output=[1]) + check('i...->...i', [1], expected_output=[1]) + check('...a->', [[2], [4]], expected_output=6) + check('a...b->ab', [[[1], [2]], [[3], [4]]], expected_output=[[3], [7]]) + + def test_einsum_error_cases(self, device): + def check(equation, operands, regex, exception=RuntimeError): + with self.assertRaisesRegex(exception, r'einsum\(\) ' + regex): + torch.einsum(equation, operands) + + x = torch.rand(2) + y = torch.rand(2, 3) + + check('', [], r'must provide at least one operand') + check('. ..', [x], r'found \'.\' for operand 0 that is not part of any ellipsis') + check('... ...', [x], r'found \'.\' for operand 0 for which an ellipsis was already found') + check('A', [x], r'operand subscript must be in range \[a, z\] but found A for operand 0') + check(',', [x], r'fewer operands were provided than specified in the equation') + check('', [x, x], r'more operands were provided than specified in the equation') + check('', [x], r'the number of subscripts in the equation \(0\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai', [x], r'the number of subscripts in the equation \(2\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai...', [x], r'the number of subscripts in the equation \(2\) is more than the number ' + r'of dimensions \(1\) for operand 0') + check('a->... 
.', [x], r'found \'.\' for output but an ellipsis \(...\) was already found') + check('a->..', [x], r'found \'.\' for output that is not part of any ellipsis \(...\)') + check('a->A', [x], r'subscripts must be in range \[a, z\] but found A for the output') + check('a->aa', [x], r'output subscript a appears more than once in the output') + check('a->i', [x], r'output subscript i does not appear in the equation for any input operand') + check('aa', [y], r'subscript a is repeated for operand 0 but the sizes don\'t match, 3 != 2') + check('a, ba', [x, y], r'operands do not broadcast with remapped shapes \[original->remapped\]: ' + r'\[2\]->\[1, 2\] \[2, 3\]->\[2, 3\]') + def triangular_solve_test_helper(self, A_dims, b_dims, upper, unitriangular, device, dtype): triangle_function = torch.triu if upper else torch.tril @@ -3240,80 +3385,6 @@ def run_test(pivot): if self.device_type == 'cuda': run_test(False) - @onlyCPU - @slowTest - @dtypes(torch.double) - def test_einsum(self, device: torch.device, dtype: torch.dtype) -> None: - # test cases taken from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = torch.randn(5, dtype=dtype, device=device) - y = torch.randn(7, dtype=dtype, device=device) - A = torch.randn(3, 5, dtype=dtype, device=device) - B = torch.randn(2, 5, dtype=dtype, device=device) - C = torch.randn(2, 3, 5, dtype=dtype, device=device) - D = torch.randn(2, 5, 7, dtype=dtype, device=device) - E = torch.randn(7, 9, dtype=dtype, device=device) - F = torch.randn(2, 3, 5, 7, dtype=dtype, device=device) - G = torch.randn(7, 11, 13, dtype=dtype, device=device) - H = torch.randn(4, 4, dtype=dtype, device=device) - I = torch.randn(3, 4, 4, dtype=dtype, device=device) - l = torch.randn(5, 10, dtype=dtype, device=device) - r = torch.randn(5, 20, dtype=dtype, device=device) - w = torch.randn(30, 10, 20, dtype=dtype, device=device) - test_list: List[Union[Tuple[str, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor, torch.Tensor]]] = [ - # -- Vector - ("i->", x), # sum - ("i,i->", x, x), # dot - ("i,i->i", x, x), # vector element-wise mul - ("i,j->ij", x, y), # outer - # -- Matrix - ("ij->ji", A), # transpose - ("ij->j", A), # row sum - ("ij->i", A), # col sum - ("ij,ij->ij", A, A), # matrix element-wise mul - ("ij,j->i", A, x), # matrix vector multiplication - ("ij,kj->ik", A, B), # matmul - ("ij,ab->ijab", A, E), # matrix outer product - # -- Tensor - ("aij,ajk->aik", C, D), # batch matmul - ("ijk,jk->i", C, A), # tensor matrix contraction - ("aij,jk->aik", D, E), # tensor matrix contraction - ("abcd,dfg->abcfg", F, G), # tensor tensor contraction - ("ijk,jk->ik", C, A), # tensor matrix contraction with double indices - ("ijk,jk->ij", C, A), # tensor matrix contraction with double indices - ("ijk,ik->j", C, B), # non contiguous - ("ijk,ik->jk", C, B), # non contiguous with double indices - # -- Diagonal - ("ii", H), # trace - ("ii->i", H), # diagonal - # -- Ellipsis - ("i...->...", H), - ("ki,...k->i...", A.t(), B), - ("k...,jk", A.t(), B), - ("...ii->...i", I), # batch diagonal - # -- Other - ("bn,anm,bm->ba", l, w, r), # as torch.bilinear - ("... 
ii->...i ", I), # batch diagonal with spaces - ] - for test in test_list: - actual = torch.einsum(test[0], test[1:]) - expected = np.einsum(test[0], *[t.numpy() for t in test[1:]]) - self.assertEqual(expected.shape, actual.shape, msg=test[0]) - self.assertEqual(expected, actual, msg=test[0]) - # test vararg - actual2 = torch.einsum(test[0], *test[1:]) - self.assertEqual(expected.shape, actual2.shape, msg=test[0]) - self.assertEqual(expected, actual2, msg=test[0]) - - def do_einsum(*args): - return torch.einsum(test[0], args) - # FIXME: following test cases fail gradcheck - if test[0] not in {"i,i->", "i,i->i", "ij,ij->ij"}: - gradcheck_inps = tuple(t.detach().requires_grad_() for t in test[1:]) - self.assertTrue(torch.autograd.gradcheck(do_einsum, gradcheck_inps)) - self.assertTrue(A._version == 0) # check that we do not use inplace ops - @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) diff --git a/torch/functional.py b/torch/functional.py index 62076a9dc29a..72739018889c 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -296,76 +296,107 @@ def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor -This function provides a way of computing multilinear expressions (i.e. sums of products) using the -Einstein summation convention. - -Args: - equation (string): The equation is given in terms of lower case letters (indices) to be associated - with each dimension of the operands and result. The left hand side lists the operands - dimensions, separated by commas. There should be one index letter per tensor dimension. - The right hand side follows after `->` and gives the indices for the output. - If the `->` and right hand side are omitted, it implicitly defined as the alphabetically - sorted list of all indices appearing exactly once in the left hand side. - The indices not apprearing in the output are summed over after multiplying the operands - entries. - If an index appears several times for the same operand, a diagonal is taken. - Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, - the ellipsis dimensions are at the beginning of the output. - operands (Tensor): The operands to compute the Einstein sum of. - -.. note:: - - This function does not optimize the given expression, so a different formula for the same computation may - run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) - can optimize the formula for you. 
- -Examples:: - - >>> x = torch.randn(5) - >>> y = torch.randn(4) - >>> torch.einsum('i,j->ij', x, y) # outer product - tensor([[-0.0570, -0.0286, -0.0231, 0.0197], - [ 1.2616, 0.6335, 0.5113, -0.4351], - [ 1.4452, 0.7257, 0.5857, -0.4984], - [-0.4647, -0.2333, -0.1883, 0.1603], - [-1.1130, -0.5588, -0.4510, 0.3838]]) - - - >>> A = torch.randn(3,5,4) - >>> l = torch.randn(2,5) - >>> r = torch.randn(2,4) - >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear - tensor([[-0.3430, -5.2405, 0.4494], - [ 0.3311, 5.5201, -3.0356]]) - - - >>> As = torch.randn(3,2,5) - >>> Bs = torch.randn(3,5,4) - >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication - tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], - [-1.6706, -0.8097, -0.8025, -2.1183]], - - [[ 4.2239, 0.3107, -0.5756, -0.2354], - [-1.4558, -0.3460, 1.5087, -0.8530]], - - [[ 2.8153, 1.8787, -4.3839, -1.2112], - [ 0.3728, -2.1131, 0.0921, 0.8305]]]) - - >>> A = torch.randn(3, 3) - >>> torch.einsum('ii->i', A) # diagonal - tensor([-0.7825, 0.8291, -0.1936]) - - >>> A = torch.randn(4, 3, 3) - >>> torch.einsum('...ii->...i', A) # batch diagonal - tensor([[-1.0864, 0.7292, 0.0569], - [-0.9725, -1.0270, 0.6493], - [ 0.5832, -1.1716, -1.5084], - [ 0.4041, -1.1690, 0.8570]]) - - >>> A = torch.randn(2, 3, 4, 5) - >>> torch.einsum('...ij->...ji', A).shape # batch permute - torch.Size([2, 3, 5, 4]) -""" + Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation + based on the Einstein summation convention. + + Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them + in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of + this format are described below, but the general idea is to label every dimension of the input :attr:`operands` + with some subscript and define which subscripts are part of the output. The output is then computed by summing + the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the + output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`. + Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why). + + Equation: + + The :attr:`equation` string specifies the subscripts (lower case letters `['a', 'z']`) for each dimension of + the input :attr:`operands` in the same order as the dimensions, separating subcripts for each operand by a + comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript + must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is + repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand + must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that + appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order. + The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based + on the subscripts, and then summing out the dimensions whose subscripts are not part of the output. + + Optionally, the output subscripts can be explicitly defined by adding an arrow ('->') at the end of the equation + followed by the subscripts for the output. 
For instance, the following equation computes the transpose of a + matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and + at most once for the output. + + Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis. + Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts, + e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` cover the third and fourth + dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the + 'shape' of the ellipsis (the size of the dimensions covered by them) must broadcast together. If the output is not + explicitly defined with the arrow ('->') notation, the ellipsis will come first in the output (left-most dimensions), + before the subscript labels that appear exactly once for the input operands. e.g. the following equation implements + batch matrix multiplication `'...ij,...jk'`. + + A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis, + arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands. + + .. note:: + + ``torch.einsum`` handles ellipsis ('...') differently from NumPy in that it allows dimensions + covered by the ellipsis to be summed over, that is, ellipsis are not required to be part of the output. + + .. note:: + + This function does not optimize the given expression, so a different formula for the same computation may + run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) + can optimize the formula for you. + + Args: + equation (string): The subscripts for the Einstein summation. + operands (Tensor): The operands to compute the Einstein sum of. 
+ + Examples:: + + # trace + >>> torch.einsum('ii', torch.randn(4, 4)) + tensor(-1.2104) + + # diagonal + >>> torch.einsum('ii->i', torch.randn(4, 4)) + tensor([-0.1034, 0.7952, -0.2433, 0.4545]) + + # outer product + >>> x = torch.randn(5) + >>> y = torch.randn(4) + >>> torch.einsum('i,j->ij', x, y) + tensor([[ 0.1156, -0.2897, -0.3918, 0.4963], + [-0.3744, 0.9381, 1.2685, -1.6070], + [ 0.7208, -1.8058, -2.4419, 3.0936], + [ 0.1713, -0.4291, -0.5802, 0.7350], + [ 0.5704, -1.4290, -1.9323, 2.4480]]) + + # batch matrix multiplication + >>> As = torch.randn(3,2,5) + >>> Bs = torch.randn(3,5,4) + >>> torch.einsum('bij,bjk->bik', As, Bs) + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + # batch permute + >>> A = torch.randn(2, 3, 4, 5) + >>> torch.einsum('...ij->...ji', A).shape + torch.Size([2, 3, 5, 4]) + + # equivalent to torch.nn.functional.bilinear + >>> A = torch.randn(3,5,4) + >>> l = torch.randn(2,5) + >>> r = torch.randn(2,4) + >>> torch.einsum('bn,anm,bm->ba', l, A, r) + tensor([[-0.3430, -5.2405, 0.4494], + [ 0.3311, 5.5201, -3.0356]]) + """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) From e429d05015bfa878f2ae660a1e0dd96b51d743d5 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 6 Dec 2020 10:21:08 -0800 Subject: [PATCH 096/132] Fixing error: "member may not be initialized" due to constexpr at Windows (#48836) Summary: Fixes https://github.com/pytorch/pytorch/issues/48835 Fixes https://github.com/pytorch/pytorch/issues/48716 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48836 Reviewed By: malfet Differential Revision: D25335829 Pulled By: datumbox fbshipit-source-id: 807182e9afa3bb314dbb85bfcd9589a2c319a7db --- torch/csrc/jit/ir/ir.cpp | 4 ++-- torch/csrc/jit/ir/ir.h | 4 ++-- torch/csrc/jit/serialization/pickler.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index fe79091c946f..ceb0fd1dbfcf 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -2061,8 +2061,8 @@ TypePtr NamedValue::type() const { } } -constexpr Symbol ProfileOp::Kind; -constexpr Symbol ProfileOptionalOp::Kind; +const Symbol ProfileOp::Kind = ::c10::prim::profile; +const Symbol ProfileOptionalOp::Kind = ::c10::prim::profile_optional; OperatorSet::OperatorSet(std::initializer_list sig_literals) { for (const char* sig : sig_literals) { diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 9db2dbdf2516..64c8031bd601 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -1326,7 +1326,7 @@ inline const Graph* Value::owningGraph() const { /************* All nodes not required to be defined before Graph **************/ struct ProfileOp : public Node { - static constexpr Symbol Kind = ::c10::prim::profile; + static const Symbol Kind; ProfileOp(Graph* graph, std::function&)> callback) : Node(graph, ::c10::prim::profile), callback_(std::move(callback)) {} @@ -1346,7 +1346,7 @@ struct ProfileOp : public Node { }; struct TORCH_API ProfileOptionalOp : public Node { - static constexpr Symbol Kind = ::c10::prim::profile_optional; + static const Symbol Kind; ProfileOptionalOp( Graph* graph, std::function&)> callback) diff --git 
a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 4473b0cb50dd..6a557e6e53f3 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -209,7 +209,7 @@ class TORCH_API Pickler { // the left of a '::', its type cannot be deduced by the compiler so one must // explicitly instantiate the template, i.e. push(int) works, push(int) // does not) - static constexpr size_t kBufferSize = 256; + static CONSTEXPR_EXCEPT_WIN_CUDA size_t kBufferSize = 256; template void push(typename std::common_type::type value) { const char* begin = reinterpret_cast(&value); From 19f4c5110e8bcad5e7e75375194262fca0a6293a Mon Sep 17 00:00:00 2001 From: Liang Liu Date: Sun, 6 Dec 2020 18:08:07 -0800 Subject: [PATCH 097/132] Add another torch::jit::load API to load PyTorch model with shared_ptr PyTorchStreamReader input (#48802) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48802 Current torch::jit::load API only supports unique_ptr ReadAdaptInterface input, but for some cases, torch::jit::load may not be the only consumer of the reader adapter. This diff enables an overload of torch::jit::load to load shared_ptr PyTorchStreamReader. Reviewed By: malfet, houseroad Differential Revision: D25241904 fbshipit-source-id: aa403bac9ed820cc0e94342aebfe524a1d5bf913 --- caffe2/serialize/inline_container.cc | 2 +- caffe2/serialize/inline_container.h | 4 ++-- torch/csrc/jit/serialization/import.cpp | 13 +++++-------- torch/csrc/jit/serialization/import.h | 4 ++-- torch/csrc/jit/serialization/import_legacy.cpp | 6 +++--- torch/csrc/jit/serialization/import_legacy.h | 2 +- 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 7928d5e3de86..3d9701274ba3 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -65,7 +65,7 @@ PyTorchStreamReader::PyTorchStreamReader(std::istream* in) } PyTorchStreamReader::PyTorchStreamReader( - std::unique_ptr in) + std::shared_ptr in) : ar_(std::make_unique()), in_(std::move(in)) { init(); } diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index 2e841d0ad824..ee7e971344ea 100644 --- a/caffe2/serialize/inline_container.h +++ b/caffe2/serialize/inline_container.h @@ -156,7 +156,7 @@ class CAFFE2_API PyTorchStreamReader final { public: explicit PyTorchStreamReader(const std::string& file_name); explicit PyTorchStreamReader(std::istream* in); - explicit PyTorchStreamReader(std::unique_ptr in); + explicit PyTorchStreamReader(std::shared_ptr in); // return dataptr, size std::tuple getRecord(const std::string& name); @@ -180,7 +180,7 @@ class CAFFE2_API PyTorchStreamReader final { std::unique_ptr ar_; std::string archive_name_; std::string archive_name_plus_slash_; - std::unique_ptr in_; + std::shared_ptr in_; int64_t version_; }; diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp index 3956d0283487..ed5dde0b08b0 100644 --- a/torch/csrc/jit/serialization/import.cpp +++ b/torch/csrc/jit/serialization/import.cpp @@ -108,7 +108,7 @@ class ScriptModuleDeserializer final { public: ScriptModuleDeserializer( std::shared_ptr cu, - std::unique_ptr reader) + std::shared_ptr reader) : compilation_unit_(std::move(cu)), reader_(std::move(reader)), source_importer_( @@ -128,7 +128,7 @@ class ScriptModuleDeserializer final { IValue readArchive(const std::string& archive_name); std::shared_ptr compilation_unit_; 
- std::unique_ptr reader_; + std::shared_ptr reader_; c10::optional device_; std::vector constants_table_; SourceImporter source_importer_; @@ -175,7 +175,6 @@ IValue ScriptModuleDeserializer::readArchive(const std::string& archive_name) { return obj; } }; - return readArchiveAndTensors( archive_name, type_resolver, obj_loader, device_, *reader_.get()); } @@ -257,8 +256,7 @@ Module ScriptModuleDeserializer::deserialize( } if (reader_->hasRecord("model.json")) { #if !defined(C10_MOBILE) && !defined(C10_DISABLE_LEGACY_IMPORT) - return torch::jit::LEGACY_deserialize( - compilation_unit_, std::move(reader_), device_); + return torch::jit::LEGACY_deserialize(compilation_unit_, reader_, device_); #else AT_ERROR("Legacy model format is not supported on mobile."); #endif @@ -271,7 +269,6 @@ Module ScriptModuleDeserializer::deserialize( rewriteQuantizedConvForBC(m); return m; } - } // namespace Module import_ir_module( @@ -323,7 +320,7 @@ Module load( } Module load( - std::unique_ptr rai, + std::shared_ptr rai, c10::optional device, ExtraFilesMap& extra_files) { // Verify that we're loading a zip archive and not a torch.save pickle archive @@ -347,7 +344,7 @@ Module load( " produced by `torch.jit.save()`"); } - auto reader = torch::make_unique(std::move(rai)); + auto reader = std::make_shared(std::move(rai)); auto cu = std::make_shared(); ScriptModuleDeserializer deserializer(std::move(cu), std::move(reader)); diff --git a/torch/csrc/jit/serialization/import.h b/torch/csrc/jit/serialization/import.h index 543a1ca32aaf..cbfb765a6350 100644 --- a/torch/csrc/jit/serialization/import.h +++ b/torch/csrc/jit/serialization/import.h @@ -55,13 +55,13 @@ TORCH_API Module load( c10::optional device = c10::nullopt, ExtraFilesMap& extra_files = default_extra_files); -/// Loads a serialized `Module` from the given `rai`. +/// Loads a serialized `Module` from the given shared_ptr `rai`. /// /// The reader adapter, which is for customized input stream, must contain a /// serialized `Module`, exported either via `ScriptModule.save()` in /// Python or `torch::jit::ExportModule` in C++. TORCH_API Module load( - std::unique_ptr rai, + std::shared_ptr rai, c10::optional device = c10::nullopt, ExtraFilesMap& extra_files = default_extra_files); diff --git a/torch/csrc/jit/serialization/import_legacy.cpp b/torch/csrc/jit/serialization/import_legacy.cpp index 7a8279e0199c..40e035b82090 100644 --- a/torch/csrc/jit/serialization/import_legacy.cpp +++ b/torch/csrc/jit/serialization/import_legacy.cpp @@ -40,7 +40,7 @@ class ScriptModuleDeserializer final { public: ScriptModuleDeserializer( std::shared_ptr cu, - std::unique_ptr reader, + std::shared_ptr reader, const c10::optional& device) : compilation_unit_(std::move(cu)), reader_(std::move(reader)), @@ -76,7 +76,7 @@ class ScriptModuleDeserializer final { std::shared_ptr sourceLoader(const std::string& qualifier); std::shared_ptr compilation_unit_; - std::unique_ptr reader_; + std::shared_ptr reader_; c10::optional device_; // Legacy only tensor can be a constant. 
std::vector constant_table_; @@ -383,7 +383,7 @@ Module ScriptModuleDeserializer::LEGACY_convertModule( Module LEGACY_deserialize( std::shared_ptr cu, - std::unique_ptr reader, + std::shared_ptr reader, const c10::optional& device) { ScriptModuleDeserializer deserializer( std::move(cu), std::move(reader), device); diff --git a/torch/csrc/jit/serialization/import_legacy.h b/torch/csrc/jit/serialization/import_legacy.h index 64f8a7da1968..a26182810959 100644 --- a/torch/csrc/jit/serialization/import_legacy.h +++ b/torch/csrc/jit/serialization/import_legacy.h @@ -16,7 +16,7 @@ struct CompilationUnit; // Deserializes a model in legacy format. Module LEGACY_deserialize( std::shared_ptr cu, - std::unique_ptr reader, + std::shared_ptr reader, const c10::optional& device); } // namespace jit From a39398b9e5d528e4a6ca293f1703833932f0d9b2 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Sun, 6 Dec 2020 23:38:15 -0800 Subject: [PATCH 098/132] CUDA BF16 norm (#48806) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48806 Reviewed By: mruberry Differential Revision: D25358465 Pulled By: ngimel fbshipit-source-id: 1a2afd86f39e96db0754d04bf81de045b1e1235c --- aten/src/ATen/native/cuda/ReduceNormKernel.cu | 2 -- test/test_torch.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu index 39a355a96756..a857dbc52b8a 100644 --- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu @@ -40,14 +40,12 @@ static void norm_kernel_cuda(TensorIterator& iter, Scalar p) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, p); } - #ifdef __HIP_PLATFORM_HCC__ else if(iter.dtype() == kBFloat16) { return norm_kernel_cuda_impl(iter, p); } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, p); } - #endif AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cuda", [&]() { norm_kernel_cuda_impl(iter, p); }); diff --git a/test/test_torch.py b/test/test_torch.py index fde60ca4174f..2d181c3b9400 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6328,10 +6328,6 @@ def test_copy_broadcast(self, device) -> None: _float_types_no_half = [torch.float, torch.double] -# _float_types2 adds bfloat16 type to _float_types only on ROCm. 
Should eventually be unified -# with _float_types when bfloat16 bringup is complete on all platforms -_float_types2 = _float_types + [torch.bfloat16] if TEST_WITH_ROCM else _float_types - _signed_types = [ torch.half, torch.bfloat16, torch.float, torch.double, torch.int8, torch.short, torch.int, torch.long @@ -6689,10 +6685,12 @@ def inner(self, device, dtype): ('narrow', '', _small_3d, lambda t, d: [1, 3, 2], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('narrow', 'neg_dim', _small_3d, lambda t, d: [-1, 3, 2], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('nonzero', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('norm', '', _small_3d, lambda t, d: [], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), - ('norm', '3_norm', _small_3d, lambda t, d: [3], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), - ('norm', '3_norm_dim', _small_3d, lambda t, d: [3, 0], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), - ('norm', '3_norm_neg_dim', _small_3d, lambda t, d: [3, -2], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), + ('norm', '', _small_3d, lambda t, d: [], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False), + ('norm', '3_norm', _small_3d, lambda t, d: [3], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False), + ('norm', '3_norm_dim', _small_3d, lambda t, d: [3, 0], 1e-1, 1e-1, 1e-5, + torch.testing.get_all_fp_dtypes(), _cpu_types, False), + ('norm', '3_norm_neg_dim', _small_3d, lambda t, d: [3, -2], 1e-1, 1e-1, 1e-5, + torch.testing.get_all_fp_dtypes(), _cpu_types, False), ('new_ones', '', _small_3d, lambda t, d: [1, 2, 3, 4, 5], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('permute', '', _new_t((1, 2, 3, 4)), lambda t, d: [2, 1, 3, 0], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('put_', '', _new_t((2, 5, 3)), From 00f01791a37bb88e4d2140ffb3eb3eef1754786f Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 7 Dec 2020 07:41:17 -0800 Subject: [PATCH 099/132] [Caffe2]Add more error message in ComputeBinaryBroadcastForwardDims Summary: Add more error message in ComputeBinaryBroadcastForwardDims Test Plan: buck test mode/opt caffe2/caffe2/python/operator_test:gather_ranges_op_test buck test mode/opt caffe2/caffe2/python/operator_test:reduce_ops_test buck test mode/opt caffe2/caffe2/python/operator_test:elementwise_ops_test Reviewed By: BIT-silence Differential Revision: D24949525 fbshipit-source-id: 762d913a6615a6394072f5bebbcb5cc36f0b8603 --- caffe2/operators/elementwise_ops_utils.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/caffe2/operators/elementwise_ops_utils.cc b/caffe2/operators/elementwise_ops_utils.cc index 5bb6c768ea3e..0f76a1b35aa4 100644 --- a/caffe2/operators/elementwise_ops_utils.cc +++ b/caffe2/operators/elementwise_ops_utils.cc @@ -53,7 +53,10 @@ std::vector ComputeBinaryBroadcastForwardDims( for (; i >= 0 && j >= 0; --k) { const int A_dim = A_dims[i]; const int B_dim = B_dims[j]; - CAFFE_ENFORCE(A_dim == B_dim || A_dim == 1 || B_dim == 1); + CAFFE_ENFORCE( + A_dim == B_dim || A_dim == 1 || B_dim == 1, + "A_dim: ", A_dim , ",B_dim: ", B_dim + ); if (A_dim == 0 || B_dim == 0) { C_dims[k] = 0; } else { From 1febd2225b03516340a4c799c4920e0d3dc82417 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 7 Dec 2020 08:05:25 -0800 Subject: [PATCH 100/132] Add explicit cast to cuda_atomic_ops_test.cu (#48886) Summary: Should fix linking error reported in https://github.com/pytorch/pytorch/issues/48870 Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/48886 Reviewed By: walterddr Differential Revision: D25356601 Pulled By: malfet fbshipit-source-id: 25282d4606251b27d047917f096868ddb662a723 --- aten/src/ATen/test/cuda_atomic_ops_test.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/test/cuda_atomic_ops_test.cu b/aten/src/ATen/test/cuda_atomic_ops_test.cu index 285623349e52..920a72452916 100644 --- a/aten/src/ATen/test/cuda_atomic_ops_test.cu +++ b/aten/src/ATen/test/cuda_atomic_ops_test.cu @@ -11,7 +11,7 @@ template __global__ void addition_test_kernel(T * a, T * sum) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int idx = (tid) % arraysize; - + gpuAtomicAdd(&sum[idx], a[idx]); } @@ -19,7 +19,7 @@ template __global__ void mul_test_kernel(T * a, T * sum) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int idx = (tid) % arraysize; - + gpuAtomicMul(&sum[idx], a[idx]); } @@ -29,7 +29,7 @@ void test_atomic_add() { dim3 dimGrid(1, 1); T *a, *sum, *answer, *ad, *sumd; - + a = (T*)malloc(arraysize * sizeof(T)); sum = (T*)malloc(arraysize * sizeof(T)); answer = (T*)malloc(arraysize * sizeof(T)); @@ -42,7 +42,7 @@ void test_atomic_add() { cudaMalloc((void**)&ad, arraysize * sizeof(T)); cudaMalloc((void**)&sumd, arraysize * sizeof(T)); - + cudaMemcpy(ad, a, arraysize * sizeof(T), cudaMemcpyHostToDevice); cudaMemcpy(sumd, sum, arraysize * sizeof(T), cudaMemcpyHostToDevice); @@ -67,7 +67,7 @@ void test_atomic_mul() { dim3 dimGrid(1, 1); T *a, *sum, *answer, *ad, *sumd; - + a = (T*)malloc(arraysize * sizeof(T)); sum = (T*)malloc(arraysize * sizeof(T)); answer = (T*)malloc(arraysize * sizeof(T)); @@ -75,12 +75,12 @@ void test_atomic_mul() { for (int i = 0; i < arraysize; ++i) { a[i] = 2; sum[i] = 2; - answer[i] = pow(sum[i], factor); + answer[i] = pow(sum[i], static_cast(factor)); } cudaMalloc((void**)&ad, arraysize * sizeof(T)); cudaMalloc((void**)&sumd, arraysize * sizeof(T)); - + cudaMemcpy(ad, a, arraysize * sizeof(T), cudaMemcpyHostToDevice); cudaMemcpy(sumd, sum, arraysize * sizeof(T), cudaMemcpyHostToDevice); @@ -105,7 +105,7 @@ TEST(TestAtomicOps, TestAtomicAdd) { test_atomic_add(); test_atomic_add(); test_atomic_add(); - + test_atomic_add(); test_atomic_add(); test_atomic_add(); From 8bc6023d7a822ea6936b7460027f29558149008d Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Mon, 7 Dec 2020 08:21:42 -0800 Subject: [PATCH 101/132] Add type annotations to torch.onnx.* modules (#48782) Summary: Fixes https://github.com/pytorch/pytorch/issues/45215 This is a follow up PR of https://github.com/pytorch/pytorch/issues/45258 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48782 Reviewed By: heitorschueroff Differential Revision: D25304229 Pulled By: ezyang fbshipit-source-id: b01b21ddbf86f908ca08173e68b81fb25851bc81 --- mypy.ini | 24 ------------ torch/_C/__init__.pyi.in | 68 ++++++++++++++++++++++++++++++++- torch/_C/_onnx.pyi | 1 + torch/onnx/symbolic_helper.py | 23 ++++++----- torch/onnx/symbolic_opset8.py | 2 +- torch/onnx/symbolic_opset9.py | 9 +++-- torch/onnx/symbolic_registry.py | 5 ++- torch/onnx/utils.py | 24 +++++++----- 8 files changed, 105 insertions(+), 51 deletions(-) diff --git a/mypy.ini b/mypy.ini index f4b37f15a820..0b9f5497162c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -143,30 +143,6 @@ ignore_errors = True [mypy-torch.nn.intrinsic.qat.modules.conv_fused] ignore_errors = True -[mypy-torch.onnx.operators] -ignore_errors = True - -[mypy-torch.onnx.symbolic_opset8] -ignore_errors = True - -[mypy-torch.onnx.symbolic_opset9] 
-ignore_errors = True - -[mypy-torch.onnx.symbolic_opset11] -ignore_errors = True - -[mypy-torch.onnx.symbolic_caffe2] -ignore_errors = True - -[mypy-torch.onnx.symbolic_helper] -ignore_errors = True - -[mypy-torch.onnx.symbolic_registry] -ignore_errors = True - -[mypy-torch.onnx.utils] -ignore_errors = True - [mypy-torch.multiprocessing.pool] ignore_errors = True diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index cbb5b2452e21..1452718ed793 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -165,7 +165,7 @@ def wait(fut: Future) -> Any: ... def _collect_all(futures: List[Future]) -> Future: ... def unify_type_list(types: List[JitType]) -> JitType: ... -def _freeze_module(module: ScriptModule, preserved_attrs: List[str], freeze_interfaces: _bool = True) -> ScriptModule: ... +def _freeze_module(module: ScriptModule, preserved_attrs: List[str] = [], freeze_interfaces: _bool = True) -> ScriptModule: ... def _is_tracing() -> _bool: ... def _jit_init() -> _bool: ... def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... @@ -217,6 +217,8 @@ def _jit_get_trigger_value(trigger_name: str) -> _int: ... # Defined in torch/csrc/jit/python/script_init.cpp ResolutionCallback = Callable[[str], Callable[..., Any]] +# Defined in torch/csrc/jit/python/script_init.cpp +# and torch/csrc/jit/python/init.cpp def _create_function_from_graph(qualname: str, graph: Graph) -> Graph: ... def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... @@ -246,6 +248,54 @@ def _resolve_type_from_object(obj: Any, range: SourceRange, rcb: ResolutionCallb def _create_module_with_type(ty: JitType) -> ScriptModule: ... def _run_emit_module_hook(m: ScriptModule): ... def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def, new_name: str) -> Def: ... + +def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... +def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... +def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, opset_version: _int) -> None: ... +def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], onnx_shape_inference: _bool = False) -> None: ... +def _jit_pass_fixup_onnx_loop_node_inputs(n: Node) -> None: ... +def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph) -> None: ... +def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... +def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... +def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... +def _jit_pass_fuse_addmm(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... +def _jit_pass_onnx_prepare_inplace_ops_for_onnx(graph: Graph) -> None: ... +def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... +def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... +def _jit_pass_onnx_unpack_quantized_weights( + graph: Graph, + paramsDict: Dict[str, IValue] +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_quantization_insert_permutes( + graph: Graph, + paramsDict: Dict[str, IValue] +) -> Dict[str, IValue]: ... +def _jit_pass_custom_pattern_based_rewrite_graph(pattern: str, fused_node_name: str, graph: Graph) -> None: ... +def _jit_pass_erase_number_types(graph: Graph) -> None: ... 
+def _jit_pass_onnx(graph: Graph, _jit_pass_onnx: _onnx.OperatorExportTypes) -> Graph: ... +def _jit_pass_onnx_scalar_type_analysis(graph: Graph) -> None: ... +def _jit_pass_onnx_peephole(graph: Graph, opset_version: _int, fixed_batch_size: _bool) -> None: ... +def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... +def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... +def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... +def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... +def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... +def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... +def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... +def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... +def _jit_pass_filter_non_tensor_arguments(params: Dict[str, IValue]) -> Dict[str, Tensor]: ... +def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... +def _jit_pass_onnx_node_shape_type_inference(n: Node, opset_version: _int) -> None: ... +def _jit_pass_onnx_block( + old_block: Block, + new_block: Block, + operator_export_type: _onnx.OperatorExportTypes, + env: Dict[Value, Value] +) -> None: ... +def _jit_pass_fixup_onnx_controlflow_node(n: Node, opset_version: _int) -> Node: ... + def _jit_script_interface_compile(name: str, class_def: ClassDef, rcb: ResolutionCallback, is_module: _bool): ... def _jit_script_compile_overload( qualname: str, @@ -281,8 +331,18 @@ def import_ir_module_from_buffer( extra_files: Dict[str, Any] ) -> ScriptModule: ... +def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... +def _check_onnx_proto(proto: str) -> None: ... +def _propagate_and_assign_input_shapes( + graph: Graph, + inputs: Tuple[Tensor, ...], + with_grad: _bool, + propagate: _bool +) -> Graph: ... + # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: + def eraseInput(self, i: _int) -> None: ... ... # Defined in torch/csrc/jit/ir/ir.h @@ -366,8 +426,8 @@ class ScriptFunction: def qualified_name(self) -> str: ... class ScriptMethod: + graph: Graph ... - class ModuleDict: def __init__(self, mod: ScriptModule) -> None: ... def items(self) -> List[Tuple[str, Any]]: ... @@ -378,6 +438,10 @@ class ParameterDict: class BufferDict: def __init__(self, mod: ScriptModule) -> None: ... +# Defined in torch/csrc/jit/api/module.h +class Module: + ... + # Defined in torch/csrc/Module.cpp def _initExtension(shm_manager_path: str) -> None: ... # THPModule_initExtension def _autograd_init() -> _bool: ... # THPAutograd_initExtension diff --git a/torch/_C/_onnx.pyi b/torch/_C/_onnx.pyi index 51f16566ce6c..7ab3cd9c567d 100644 --- a/torch/_C/_onnx.pyi +++ b/torch/_C/_onnx.pyi @@ -29,6 +29,7 @@ class OperatorExportTypes(Enum): ONNX_ATEN = ... ONNX_ATEN_FALLBACK = ... RAW = ... + ONNX_FALLTHROUGH = ... class TrainingMode(Enum): EVAL = ... 
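As a rough illustration of what the new stubs cover, here is a minimal sketch of user code that exercises the newly declared `ONNX_FALLTHROUGH` member (this snippet is not taken from the patch; the model and output file name are placeholders). With the stub entry above, mypy can resolve the enum member instead of treating it as an unknown attribute:

```python
import torch

model = torch.nn.Linear(4, 4)
dummy_input = torch.randn(1, 4)

# OperatorExportTypes (including the newly declared ONNX_FALLTHROUGH member)
# now resolves through the torch._C._onnx stubs.
torch.onnx.export(
    model,
    dummy_input,
    "linear.onnx",
    operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH,
)
```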
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 5e9430f995f8..8fd8ce3ea760 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -2,6 +2,7 @@ import torch import warnings from sys import maxsize as maxsize +from typing import Set import torch.onnx # This import monkey-patches graph manipulation methods on Graph, used for the @@ -125,7 +126,7 @@ def decorator(fn): def wrapper(g, *args, **kwargs): # some args may be optional, so the length may be smaller assert len(arg_descriptors) >= len(args) - args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] + args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] # type: ignore # only support _outputs in kwargs assert len(kwargs) <= 1 if len(kwargs) == 1: @@ -232,11 +233,11 @@ def _select_helper(g, self, dim, index, apply_reshape=True): def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if _export_onnx_opset_version <= 9: - from torch.onnx.symbolic_opset9 import _slice - return _slice(g, input, axes, starts, ends) + from torch.onnx.symbolic_opset9 import _slice as _slice9 + return _slice9(g, input, axes, starts, ends) else: - from torch.onnx.symbolic_opset10 import _slice - return _slice(g, input, axes, starts, ends, steps, dynamic_slice) + from torch.onnx.symbolic_opset10 import _slice as _slice10 + return _slice10(g, input, axes, starts, ends, steps, dynamic_slice) def _hardtanh_helper(g, input, min_val, max_val): if _export_onnx_opset_version <= 10: @@ -380,7 +381,7 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ size = g.op("Concat", *size, axis_i=0) scale_factor = _interpolate_size_to_scales(g, input, size, dim) else: - return _unimplemented("Both size and scales are None in __interpolate") + return _unimplemented("interpolate", "Both size and scales are None in __interpolate") return scale_factor, mode @@ -388,7 +389,7 @@ def _unbind_helper(g, self, dim, _outputs): if _export_onnx_opset_version <= 9: from torch.onnx.symbolic_opset9 import unbind else: - from torch.onnx.symbolic_opset11 import unbind + from torch.onnx.symbolic_opset11 import unbind # type: ignore[no-redef] return unbind(g, self, dim, _outputs) @@ -396,7 +397,8 @@ def _scatter_helper(g, self, dim, index, src): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - from torch.onnx.symbolic_opset11 import scatter + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore return scatter(g, self, dim, index, src) @@ -444,7 +446,8 @@ def _index_fill_reshape_helper(g, self, dim, index): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - from torch.onnx.symbolic_opset11 import scatter + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore if self.type().dim() is None: return _unimplemented("index_fill", "input rank not accesible") @@ -632,4 +635,4 @@ def _cast_func_template(to_i, g, input, non_blocking): # Global set to store the list of quantized operators in the network. # This is currently only used in the conversion of quantized ops from PT -> C2 via ONNX. 
-_quantized_ops = set() +_quantized_ops: Set[int] = set() diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py index c0c1d48ebec0..e4023dab2320 100644 --- a/torch/onnx/symbolic_opset8.py +++ b/torch/onnx/symbolic_opset8.py @@ -4,7 +4,7 @@ import torch.onnx.symbolic_opset9 as sym_opset9 from torch.onnx.symbolic_helper import parse_args, _unimplemented, _block_list_in_opset, _try_get_scalar_type -from torch.onnx.symbolic_opset9 import _cast_Float +from torch.onnx.symbolic_opset9 import _cast_Float # type: ignore import warnings diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index e395ce5c703f..8630f48a62ad 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -13,6 +13,8 @@ import torch.onnx.symbolic_helper as sym_help from torch.onnx.symbolic_helper import parse_args, _parse_arg, _unimplemented +from typing import Optional + import numpy import math import warnings @@ -311,7 +313,7 @@ def _maybe_cast_reduce_op_input(g, self): if dtype is not None: # pytorch reduce-ops cast all other integral types to int64 if not sym_help._is_fp(self) and not (dtype == 'Long'): - self = _cast_Long(g, self, False) + self = _cast_Long(g, self, False) # type: ignore return self @@ -2092,7 +2094,7 @@ def _pack_padded_sequence(g, input, lengths, batch_first): # It's really only necessary because those operators expand to something that # only works with int32 types in Caffe2... if lengths.type().scalarType() != 'Int': - lengths = _cast_Int(g, lengths, False) + lengths = _cast_Int(g, lengths, False) # type: ignore return g.op("prim::PackPadded", input, lengths, outputs=2) @@ -2436,7 +2438,7 @@ def _get_arange_dtype(dtype): def masked_fill(g, self, mask, value): - mask = _cast_Bool(g, mask, False) + mask = _cast_Bool(g, mask, False) # type: ignore value = sym_help._maybe_get_scalar(value) return g.op('Where', mask, sym_help._if_scalar_type_as(g, value, self), self) @@ -2734,6 +2736,7 @@ def as_strided(g, self, sizes, strides, offset=None): sizes = sym_help._maybe_get_const(sizes, 'is') rank = len(strides) self_1d = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + ind: Optional[torch.Tensor] if not sym_help._is_value(sizes): ind = torch.tensor([0], dtype=torch.long) for i, (size, stride) in enumerate(zip(sizes, strides)): diff --git a/torch/onnx/symbolic_registry.py b/torch/onnx/symbolic_registry.py index 48114d6c472b..c059e8f2eb31 100644 --- a/torch/onnx/symbolic_registry.py +++ b/torch/onnx/symbolic_registry.py @@ -1,6 +1,7 @@ import warnings import importlib from inspect import getmembers, isfunction +from typing import Dict, Tuple, Any, Union # The symbolic registry "_registry" is a dictionary that maps operators # (for a specific domain and opset version) to their symbolic functions. @@ -8,9 +9,9 @@ # The keys are tuples (domain, version), (where domain is a string, and version is an int), # and the operator's name (string). 
# The map's entries are as follows : _registry[(domain, version)][op_name] = op_symbolic -_registry = {} +_registry: Dict[Tuple[str, int], Dict] = {} -_symbolic_versions = {} +_symbolic_versions: Dict[Union[int, str], Any] = {} from torch.onnx.symbolic_helper import _onnx_stable_opsets for opset_version in _onnx_stable_opsets: module = importlib.import_module('torch.onnx.symbolic_opset{}'.format(opset_version)) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 5c41306b9ee2..3fe19a56c124 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -18,6 +18,7 @@ from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes, TrainingMode from torch._C import ListType, OptionalType, _propagate_and_assign_input_shapes, _check_onnx_proto +from typing import Union, Tuple, List # the flag to tell the user whether it's in the middle of ONNX export or not @@ -76,7 +77,7 @@ def export(model, args, f, export_params=True, verbose=False, training=None, if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK @@ -351,6 +352,7 @@ def _trace_and_get_graph_from_model(model, args): def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): torch_out = None + params: Union[List, Tuple] if isinstance(model, torch.jit.ScriptModule): try: graph = model.forward.graph @@ -442,7 +444,7 @@ def _model_to_graph(model, args, verbose=False, param_names = input_and_param_names[len(input_and_param_names) - len(params):] params_dict = dict(zip(param_names, params)) - if training is None or training == TrainingMode.EVAL or (training == TrainingMode.PRESERVE and not is_originally_training): + if training is None or training == TrainingMode.EVAL: params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: @@ -476,7 +478,7 @@ def export_to_pretty_string(model, args, f, export_params=True, verbose=False, t if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: operator_export_type = OperatorExportTypes.ONNX return _export_to_pretty_string(model, args, f, export_params, verbose, training, @@ -1051,6 +1053,10 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): dims = [1] isscalar = True type = type.lower() + tensor: Union[torch.CharTensor, torch.ShortTensor, + torch.IntTensor, torch.LongTensor, + torch.HalfTensor, torch.FloatTensor, + torch.DoubleTensor] if type == "char": tensor = torch.CharTensor(*dims) elif type == "short": @@ -1068,7 +1074,7 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): else: raise ValueError("Unknown type, type should be one of the following strings: " "char, short, int, long, half, float, double") - tensor.fill_(value) + tensor.fill_(value) # type: ignore if isscalar: return g.op("Constant", *args, value_z=tensor, **kwargs) return g.op("Constant", *args, value_t=tensor, **kwargs) @@ 
-1141,8 +1147,8 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): dynamic_axes[key] = value_dict -torch._C.Graph.op = _graph_op -torch._C.Graph.at = _graph_at -torch._C.Block.op = _block_op -torch._C.Graph.constant = _graph_constant -torch._C.Node.__getitem__ = _node_getitem +torch._C.Graph.op = _graph_op # type: ignore +torch._C.Graph.at = _graph_at # type: ignore +torch._C.Block.op = _block_op # type: ignore +torch._C.Graph.constant = _graph_constant # type: ignore +torch._C.Node.__getitem__ = _node_getitem # type: ignore From 36df25334f89aca54232a3947cdaaaa066f289ac Mon Sep 17 00:00:00 2001 From: X Wang <24860335+xwang233@users.noreply.github.com> Date: Mon, 7 Dec 2020 08:24:16 -0800 Subject: [PATCH 102/132] Fix incorrect usage of CUDACachingAllocator [v2] (#48817) Summary: This is similar to https://github.com/pytorch/pytorch/issues/46605, where the c10::complex part of the code was not merged yet at that moment. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48817 Reviewed By: malfet Differential Revision: D25333179 Pulled By: ezyang fbshipit-source-id: a92bdad5ad4b36bef7f050b21a59676c38e7b1fc --- aten/src/ATen/cuda/CUDASolver.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/cuda/CUDASolver.cpp b/aten/src/ATen/cuda/CUDASolver.cpp index 00329acda4a9..bcd630a06b9e 100644 --- a/aten/src/ATen/cuda/CUDASolver.cpp +++ b/aten/src/ATen/cuda/CUDASolver.cpp @@ -46,14 +46,14 @@ void getrf>( TORCH_CUSOLVER_CHECK(cusolverDnZgetrf_bufferSize( handle, m, n, reinterpret_cast(dA), ldda, &lwork)); auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - void* buffer = allocator.allocate(sizeof(cuDoubleComplex) * lwork).get(); + auto dataPtr = allocator.allocate(sizeof(cuDoubleComplex) * lwork); TORCH_CUSOLVER_CHECK(cusolverDnZgetrf( handle, m, n, reinterpret_cast(dA), ldda, - static_cast(buffer), + static_cast(dataPtr.get()), ipiv, info)); } @@ -71,14 +71,14 @@ void getrf>( TORCH_CUSOLVER_CHECK(cusolverDnCgetrf_bufferSize( handle, m, n, reinterpret_cast(dA), ldda, &lwork)); auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - void* buffer = allocator.allocate(sizeof(cuComplex) * lwork).get(); + auto dataPtr = allocator.allocate(sizeof(cuComplex) * lwork); TORCH_CUSOLVER_CHECK(cusolverDnCgetrf( handle, m, n, reinterpret_cast(dA), ldda, - static_cast(buffer), + static_cast(dataPtr.get()), ipiv, info)); } From 21ba48fe4955f7fe144a1c9dd239726d24ed67cd Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Mon, 7 Dec 2020 08:43:01 -0800 Subject: [PATCH 103/132] [vulkan] test_app for mobilenetV2 on vulkan api (#48924) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48924 Test Plan: Imported from OSS Reviewed By: SS-JIA Differential Revision: D25365000 Pulled By: IvanKobzarev fbshipit-source-id: 79295b5781d2494681dbb4e4a741de49ff9c058c --- android/test_app/app/build.gradle | 20 +++++----- .../ATen/native/vulkan/ops/Convolution.cpp | 37 +++++++++++++++---- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/android/test_app/app/build.gradle b/android/test_app/app/build.gradle index 37bdb35e2f19..df7b758e3b31 100644 --- a/android/test_app/app/build.gradle +++ b/android/test_app/app/build.gradle @@ -60,20 +60,20 @@ android { //} flavorDimensions "model", "build", "activity" productFlavors { - mbq { + mnet { dimension "model" - applicationIdSuffix ".mbq" - buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2q.pt\"") - addManifestPlaceholders([APP_NAME: "MBQ"]) - 
buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbq\"") + applicationIdSuffix ".mnet" + buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet.pt\"") + addManifestPlaceholders([APP_NAME: "MNET"]) + buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet\"") } - mbvulkan { + mnetVulkan { dimension "model" - applicationIdSuffix ".mbvulkan" - buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2-vulkan.pt\"") + applicationIdSuffix ".mnet_vulkan" + buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet_vulkan.pt\"") buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true') - addManifestPlaceholders([APP_NAME: "MBQ"]) - buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbvulkan\"") + addManifestPlaceholders([APP_NAME: "MNET_VULKAN"]) + buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet-vulkan\"") } resnet18 { dimension "model" diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index a77b1935eda6..5af2c14b80cb 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -119,17 +119,40 @@ vTensor pack_weights( } // shader KO4C4HW_to_image - float image[4 * C_4][OC_4][KH * KW][4]; - memset(image, 0.f, 16 * C_4 * OC_4 * KH * KW * sizeof(float)); + struct Image3D { + float* data_; + uint32_t dim0_, dim1_, dim2_; + + Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { + dim0_ = dim0; + dim1_ = dim1; + dim2_ = dim2; + data_ = new float[dim0 * dim1 * dim2 * 4]; + memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); + } + + inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; + } + + void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { + data_[idx(i0, i1, i2, i3)] = value; + } + + float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return data_[idx(i0, i1, i2, i3)]; + } + } image{4 * C_4, OC_4, KH * KW}; + for (uint32_t sx = 0; sx < C_4; ++sx) { for (uint32_t sy = 0; sy < OC_4; ++sy) { for (uint32_t sz = 0; sz < (KH * KW); ++sz) { for (uint32_t vi = 0; vi < 4; ++vi) { int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; - image[4 * sx + 0][sy][sz][vi] = dst[4 * (bufferVIdx + 0) + vi]; - image[4 * sx + 1][sy][sz][vi] = dst[4 * (bufferVIdx + 1) + vi]; - image[4 * sx + 2][sy][sz][vi] = dst[4 * (bufferVIdx + 2) + vi]; - image[4 * sx + 3][sy][sz][vi] = dst[4 * (bufferVIdx + 3) + vi]; + image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); + image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); + image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); + image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); } } } @@ -143,7 +166,7 @@ vTensor pack_weights( for (uint32_t sy = 0; sy < H; ++sy) { for (uint32_t sz = 0; sz < D; ++sz) { for (uint32_t szvi = 0; szvi < 4; ++szvi) { - dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image[sx][sy][sz][szvi]; + dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); } } } From f2c3efd51fa7040b6390ee2b483176d97a530102 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 7 Dec 2020 09:36:20 -0800 Subject: [PATCH 104/132] Fix generator exhaustion in SparseAdam (#47724) Summary: Fixes https://github.com/pytorch/pytorch/issues/47594 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47724 Reviewed By: heitorschueroff Differential Revision: D25304131 Pulled By: albanD fbshipit-source-id: 
67c058b0836b9b4fba4f7b966396e4f3fa61f939 --- torch/optim/sparse_adam.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py index e1315e370269..909aa0c6cc62 100644 --- a/torch/optim/sparse_adam.py +++ b/torch/optim/sparse_adam.py @@ -32,6 +32,8 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8): if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + params = list(params) + sparse_params = [] for index, param in enumerate(params): if isinstance(param, dict): From ba6511b304a50ef8261692922a7038ca3eb48dd3 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 7 Dec 2020 10:37:38 -0800 Subject: [PATCH 105/132] pyi codegen update - remove Declarations.yaml (#48754) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48754 The goal of this PR is to kill Declarations.yaml in the pyi codegen, in favor of native_functions + the existing python object model. **High-level design** Since the python signatures used by the `python_arg_parser` are “supposed” to resemble the corresponding pyi type hint signatures, I re-used the existing python object model that Jiakai defined in `tools/codegen/api/python.py`. This means that the pyi codegen now reads `native_functions.yaml`, parses it into a bunch of `PythonSignatureGroup` objects, and emits corresponding method + function variants of type-hint signatures for each one, respectively into `__init__.pyi` and `_VariableFunctions.pyi`. What makes this uglier is that pyi and the python arg parser have a number of differences in how they’re emitted. I expressed that through a `pyi` flag on the `PythonSignature` dataclass, that tells it whether or not to print itself as a pyi vs. arg_parser signature. One thing worth noting is how pyi generates signatures differently for native / deprecated op signatures. For native ops: - The pyi codegen fuses functional and out variants of each op into a single signature with an optional `out` argument. Ops without an `out` variant just get an ordinary functional signature. - Some ops that fit certain criteria also get a second “varargs” signature - basically ops with a single positional argument of type List[int]. For deprecated signatures: - Functional and out variants are not fused - they each get their own signature entry - There are no varargs signatures This is currently implemented through the `signature_str()` and `signature_str_vararg()` methods on the `PythonSignature`/`PythonSignatureDeprecated` classes. `signature_str()` knows how to print itself with/without out arguments, differently for native/deprecated ops. `signature_str_vararg()` optionally returns a vararg variant of the signature if one exists. **Calling out the gap between python_arg_parser vs. pyi** The two formats are notably different, so I don’t think we can expect to unify them completely. That said, I encountered a number of differences in the pyi codegen that looked wrong- I tried to call them out in the PR, to be removed later. Just as an example, looking at the `svd` signature in the python_arg_parser vs. 
the pyi type hint:

python_arg_parser
```
Static PythonArgParser parser({
  "svd(Tensor input, bool some=True, bool compute_uv=True, *, TensorList[3] out=None",
}, /*traceable=*/true);
```

Pyi
```
def svd(input: Tensor, some: _bool=True, compute_uv: _bool=True, *, out: Optional[Tensor]=None) -> namedtuple_U_S_V: ...
```

The two have obvious syntactic differences that we probably don't plan on changing: the python_arg_parser doesn't include `def` or return types, and it includes the type hint before the variable name. But the type of `out` in pyi is probably wrong, since `svd` has multiple output params.

I tried to clearly call out any instances of the pyi codegen diverging in a way that looks buggy, so we can clean it up in a later PR (see the comments for details).

Another particularly ugly "bug" that I kept in to maintain byte-for-byte compatibility is the fact that the pyi codegen groups operator overloads together. It turns out that the only reason it does this (as far as I can tell) is because it tacks on an out argument to signatures that don't have one, if ANY overloads of that op have an out variant. E.g. consider the pyi type hints generated for `nanmedian` in `_VF.pyi`:
```
overload
def nanmedian(input: Tensor, *, out: Optional[Tensor]=None) -> Tensor: ...
overload
def nanmedian(input: Tensor, dim: _int, keepdim: _bool=False, *, out: Optional[Tensor]=None) -> namedtuple_values_indices: ...
overload
def nanmedian(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool=False, *, out: Optional[Tensor]=None) -> namedtuple_values_indices: ...
```
And the corresponding native_functions.yaml entries:
```
- func: nanmedian(Tensor self) -> Tensor
- func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
- func: nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
- func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
- func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
```
Signature 2 corresponds to entries 2 and 3 in native_functions, and Signature 3 corresponds to entries 4 and 5. But signature 1 has an optional out argument, even though entry 1 in native_functions.yaml has no out variant. I'd like to delete that logic in a later PR; that will also have the added benefit of no longer requiring the pyi codegen to group overloads together. We can just operate independently on each PythonSignatureGroup.

**More detailed accounting of the changes**

Per file:

gen_python_functions.py
- `load_signatures()` can now skip deprecated signatures. Needed because pyi only includes deprecated functions, and skips their method variants (maybe we should add them in…?)
- Moved `namedtuple_fieldnames` into python.py
- `group_overloads()` can now opt to not sort the overloads (needed for byte-for-byte compat, pyi doesn't sort for some reason)

Python.py:
- Gave `PythonSignature` and `PythonSignatureDeprecated` a `pyi` flag that tells it whether or not to print itself in pyi vs. python_arg_parser format
- Added a `PythonReturns` dataclass, which is now a member of PythonSignature. It is only used by pyi. I found this useful because python returns need to know how to deal with named tuple returns properly (a sketch of one generated alias follows below).
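For a concrete picture of what `PythonReturns` produces, a `(Tensor values, Tensor indices)` return is expected to turn into a named-tuple alias roughly like the one below. This is a sketch based on `named_tuple_pyi()` rather than a copy of the emitted stub; in the real `.pyi`, `Tensor` and the other names come from the stub file's own imports.

```python
from typing import NamedTuple

from torch import Tensor

# One alias is emitted per distinct set of field names; the nanmedian
# overloads shown above annotate their return type with it.
namedtuple_values_indices = NamedTuple(
    "namedtuple_values_indices", [("values", Tensor), ("indices", Tensor)]
)
```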
I also moved `namedtuple_fieldnames` into this file from gen_python_functions gen_pyi.py - Merged `get_py_torch_functions` and `get_py_variable_methods` into a single function, since they’re very similar - Lifted out all of the pyi type hint type-mapping mess and dropped it into python.py. This required updating the mapping to deal with NativeFunction objects instead of the outputs of Declarations.yaml (this was most of the logic in `type_to_python`, `arg_to_type_hint`, and `generate_type_hints`). `generate_type_hints` is now a small orchestration function that gathers the different signatures for each PythonSignatureGroup. - NamedTuples are now generated by calling `PythonReturn.named_tuple()` (in `generate_named_tuples()`), rather than appending to a global list A lot of hardcoded pyi signatures still live in `gen_pyi.py`. I didn’t look to closely into whether or not any of that can be removed as part of this PR. Test Plan: Imported from OSS Reviewed By: ljk53 Differential Revision: D25343802 Pulled By: bdhirsh fbshipit-source-id: f73e99e1afef934ff41e4aca3dabf34273459a52 --- .jenkins/pytorch/codegen-test.sh | 2 + mypy-strict.ini | 1 + tools/autograd/gen_python_functions.py | 47 ++- tools/autograd/utils.py | 16 +- tools/codegen/api/python.py | 277 +++++++++++++++-- tools/pyi/gen_pyi.py | 397 +++++++------------------ torch/CMakeLists.txt | 4 +- 7 files changed, 381 insertions(+), 363 deletions(-) diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 0f015df045c2..44f1e9449bf0 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -38,6 +38,8 @@ mkdir -p "$OUT"/pyi/torch/_C mkdir -p "$OUT"/pyi/torch/nn python -m tools.pyi.gen_pyi \ --declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --deprecated-functions-path tools/autograd/deprecated.yaml \ --out "$OUT"/pyi # autograd codegen (called by torch codegen but can run independently) diff --git a/mypy-strict.ini b/mypy-strict.ini index 42fc73abf1cc..ddd369ebe621 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -35,6 +35,7 @@ files = tools/codegen/gen.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, tools/autograd/load_derivatives.py, + tools/pyi/gen_pyi.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, torch/utils/benchmark/utils/valgrind_wrapper/*.py, diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 123a47f2aac2..63438a527b4c 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -193,25 +193,28 @@ def load_signatures( deprecated_yaml_path: str, *, method: bool, + skip_deprecated: bool = False, + pyi: bool = False, ) -> Sequence[PythonSignatureNativeFunctionPair]: native_functions = list(filter(should_generate_py_binding, parse_native_yaml(native_yaml_path))) @with_native_function def gen_signature_pairs(f: NativeFunction) -> PythonSignatureNativeFunctionPair: return PythonSignatureNativeFunctionPair( - signature=signature(f, method=method), + signature=signature(f, method=method, pyi=pyi), function=f, ) pairs = list(map(gen_signature_pairs, native_functions)) - deprecated = load_deprecated_signatures(pairs, deprecated_yaml_path, method=method) - return pairs + deprecated + deprecated = load_deprecated_signatures(pairs, deprecated_yaml_path, method=method, pyi=pyi) + return pairs if skip_deprecated else pairs + deprecated def 
load_deprecated_signatures( pairs: Sequence[PythonSignatureNativeFunctionPair], deprecated_yaml_path: str, *, method: bool, + pyi: bool, ) -> List[PythonSignatureNativeFunctionPair]: # The deprecated.yaml doesn't have complete type information, we need # find and leverage the original ATen signature (to which it delegates @@ -225,6 +228,10 @@ def signature_original(f: NativeFunction) -> str: opname = str(f.func.name.name.base) if f.func.is_out_fn(): opname += '_out' + # TODO: remove HACK + # I think we want to differentiate inplace functions here.. but we currently don't for the arg parser + if f.func.name.name.inplace and pyi: + opname += '_' args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() # Simply ignore TensorOptionsArguments as it does not exist in deprecated.yaml. types = ', '.join(argument_type_str(a.argument.type) @@ -308,6 +315,7 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - method=python_sig.method, deprecated_args_names=tuple(args), deprecated_args_exprs=tuple(call_args), + returns=python_sig.returns, ), function=pair.function, )) @@ -320,31 +328,10 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# TODO: remove the copy of this method in 'tools/pyi/gen_pyi.py'. -@with_native_function -def namedtuple_fieldnames(f: NativeFunction) -> List[str]: - returns = f.func.returns - if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): - return [] - else: - if any(map(lambda r: r.name is None, returns)): - # When building on Windows, `PyStructSequence_UnnamedField` could not be - # resolved by the linker for some reason, which cause error in building: - # - # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol - # PyStructSequence_UnnamedField - # - # Thus, at this point in time, we do not support unnamed - # fields in namedtuple; you must either name all fields, - # or none of them. - raise ValueError("Unnamed field is not supported by codegen") - - return list(map(lambda r: str(r.name), returns)) - @with_native_function def gen_namedtuple_typename_key(f: NativeFunction) -> str: name = cpp.name(f.func) - fieldnames = namedtuple_fieldnames(f) + fieldnames = namedtuple_fieldnames(f.func.returns) return '_'.join([name] + fieldnames) def emit_namedtuple_typedefs( @@ -360,7 +347,7 @@ def emit_namedtuple_typedefs( typedefs: List[str] = [] # typedef declarations and init code for overload in overloads: - fieldnames = namedtuple_fieldnames(overload.function) + fieldnames = namedtuple_fieldnames(overload.function.func.returns) if not fieldnames: continue @@ -651,7 +638,9 @@ def method_def( # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # def group_overloads( - overloads: Sequence[PythonSignatureNativeFunctionPair] + overloads: Sequence[PythonSignatureNativeFunctionPair], + *, + sort: bool = True, ) -> Sequence[PythonSignatureGroup]: bases: Dict[str, PythonSignatureNativeFunctionPair] = {} outplaces: Dict[str, PythonSignatureNativeFunctionPair] = {} @@ -700,7 +689,9 @@ def group_overloads( outplace=outplace.function if outplace is not None else None, )) - return sort_overloads(grouped) + # TODO: unconditionally sort + # maintaining byte-for-byte compatibility for pyi codegen for now + return grouped if not sort else sort_overloads(grouped) # This function declares a partial order on declarations, and sorts them according # to its linear extension. 
This is necessary, because there's some ambiguity in the diff --git a/tools/autograd/utils.py b/tools/autograd/utils.py index 5c0fcccc4c78..b4889d219e9c 100644 --- a/tools/autograd/utils.py +++ b/tools/autograd/utils.py @@ -1,8 +1,8 @@ import re import os import yaml -from collections import defaultdict from .nested_dict import nested_dict +from typing import Dict, List __all__ = [ @@ -52,7 +52,7 @@ def uninplace_api_name(api_name): return api_name -def write(dirname, name, template, env): +def write(dirname: str, name: str, template: CodeTemplate, env: Dict[str, List[str]]) -> None: env['generated_comment'] = GENERATED_COMMENT.substitute(filename=template.filename) path = os.path.join(dirname, name) # See Note [Unchanging results for ninja] @@ -69,12 +69,6 @@ def write(dirname, name, template, env): else: print("Skipped writing {}".format(path)) -def is_tensor_method(declaration): - return 'Tensor' in declaration['method_of'] - -def is_torch_function(declaration): - return 'namespace' in declaration['method_of'] - def is_out_variant(decl): return decl['name'].endswith('_out') @@ -92,12 +86,6 @@ def load_op_list_and_strip_overload(op_list, op_list_path): # strip out the overload part return {opname.split('.', 1)[0] for opname in op_list} -def group_declarations_by_op_name(declarations): - groups = defaultdict(list) - for d in declarations: - groups[op_name(d)].append(d) - return groups - def is_output(arg): return arg.get('output', False) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 4b407d45553a..26b0f8eb8076 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -173,6 +173,53 @@ # } # +# TODO: stick this more firmly in the data model somewhere? +def namedtuple_fieldnames(returns: Tuple[Return, ...]) -> List[str]: + if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): + return [] + else: + if any(map(lambda r: r.name is None, returns)): + # When building on Windows, `PyStructSequence_UnnamedField` could not be + # resolved by the linker for some reason, which cause error in building: + # + # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol + # PyStructSequence_UnnamedField + # + # Thus, at this point in time, we do not support unnamed + # fields in namedtuple; you must either name all fields, + # or none of them. + raise ValueError("Unnamed field is not supported by codegen") + + return list(map(lambda r: str(r.name), returns)) + +@dataclass(frozen=True) +class PythonReturns: + returns: Tuple[Return, ...] 
+ + def named_tuple_pyi(self) -> Optional[Tuple[str, str]]: + python_returns = [argument_type_str_pyi(r.type) for r in self.returns] + field_names = namedtuple_fieldnames(self.returns) + if field_names: + namedtuple_name = '_'.join(['namedtuple'] + field_names) + tuple_args = [f'("{name}", {typ})' for name, typ in zip(field_names, python_returns)] + namedtuple_def = f'NamedTuple("{namedtuple_name}", [{", ".join(tuple_args)}])' + return namedtuple_name, namedtuple_def + return None + + def returns_str_pyi(self) -> str: + named_tuple = self.named_tuple_pyi() + if named_tuple is not None: + namedtuple_name, _ = named_tuple + return namedtuple_name + + python_returns = [argument_type_str_pyi(r.type) for r in self.returns] + if len(python_returns) > 1: + return 'Tuple[' + ', '.join(python_returns) + ']' + if len(python_returns) == 1: + return python_returns[0] + return 'None' + + @dataclass(frozen=True) class PythonArgument: name: str @@ -189,26 +236,56 @@ class PythonArgument: # Compute argument formal for python argument parsing. # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. - def argument_str(self, *, method: bool = False) -> str: - type_str = argument_type_str(self.type) + def argument_str(self, *, method: bool = False, pyi: bool = False, deprecated: bool = False) -> str: + type_str = argument_type_str_pyi(self.type, pyi_out_arg=pyi and isinstance(self, PythonOutArgument)) \ + if pyi else argument_type_str(self.type) + name = self.name # s/self/input/ outside method bindings # [old codegen] TODO: remove this? doesn't rename in codegen, it's just # for the parse string - name = self.name - if name == 'self' and type_str == 'Tensor' and not method: + if name == 'self' and type_str == 'Tensor' and not method and not deprecated: name = 'input' + if pyi: + if name == 'from': # from is a Python keyword... + name += '_' + # pyi merges the _out and functional variants into the same signature, with an optional out arg + if name == 'out' and type_str == 'Tensor' and not deprecated: + type_str = 'Optional[' + type_str + ']' + + # TODO: remove diff. 
pyi deprecated signatures don't get defaults for their out arg + treat_as_no_default = pyi and deprecated and isinstance(self, PythonOutArgument) and self.default == 'None' + # add default - if self.default is not None: - default = { - 'nullptr': 'None', - 'c10::nullopt': 'None', - '{}': 'None', - }.get(self.default, self.default) - return f'{type_str} {name}={default}' + if self.default is not None and not treat_as_no_default: + if pyi: + if isinstance(self.type, ListType) and self.type.elem == BaseType(BaseTy.int) and \ + self.default.startswith('{') and self.default.endswith('}'): + default = '(' + self.default[1:-1] + ')' + else: + default = { + 'nullptr': 'None', + 'c10::nullopt': 'None', + '{}': 'None', + 'MemoryFormat::Contiguous': 'contiguous_format', + 'QScheme::PER_TENSOR_AFFINE': 'per_tensor_affine', + }.get(self.default, self.default) + # TODO: remove requires_grad special case (byte-for-byte compat) + return f'{name}:{type_str}={default}' if name == 'requires_grad' else f'{name}: {type_str}={default}' + else: + default = { + 'nullptr': 'None', + 'c10::nullopt': 'None', + '{}': 'None', + }.get(self.default, self.default) + return f'{type_str} {name}={default}' else: - return f'{type_str} {name}' + if pyi: + # TODO: remove requires_grad special case (byte-for-byte compat) + return f'{name}:{type_str}' if name == 'requires_grad' else f'{name}: {type_str}' + else: + return f'{type_str} {name}' @dataclass(frozen=True) class PythonOutArgument(PythonArgument): @@ -238,6 +315,7 @@ def from_outputs(outputs: Tuple[PythonArgument, ...]) -> Optional['PythonOutArgu raise RuntimeError(f'Unsupported output type: {outputs}') return PythonOutArgument( name='out', + # TODO: shouldn't this be OptionalType[ListType[...]], since it defaults to None? type=ListType(BaseType(BaseTy.Tensor), size), default='None', default_init=None, @@ -260,6 +338,9 @@ class PythonSignature: output_args: Optional[PythonOutArgument] + # Return types, which are only used by pyi + returns: PythonReturns + # These are scattered kwargs arguments belonging to TensorOptions. # When binding to C++, they are packed into a TensorOptions object 'options'. # It's possible that the C++ signature doesn't take TensorOptions object (e.g. @@ -276,13 +357,23 @@ def deprecated(self) -> bool: return False def arguments( - self, *, skip_outputs: bool = False, skip_tensor_options: bool = False + self, *, skip_outputs: bool = False, skip_tensor_options: bool = False, hacky_add_output: bool = False ) -> Tuple[Union[PythonArgument, PythonOutArgument], ...]: result: List[Union[PythonArgument, PythonOutArgument]] = [] result.extend(self.input_args) result.extend(self.input_kwargs) if self.output_args is not None and not skip_outputs: result.append(self.output_args) + # TODO: remove HACK + # in the existing pyi codegen, we tack on an optional out argument to every operator overload + # if there exists at least one overload with an out variant. This seems wrong. + elif hacky_add_output: + result.extend([PythonOutArgument( + name='out', + type=OptionalType(BaseType(BaseTy.Tensor)), + default='None', + default_init=None, + outputs=())]) if not skip_tensor_options: result.extend(self.tensor_options_args) return tuple(result) @@ -301,18 +392,57 @@ def output_idx(self) -> int: # for error parsing. # # For a translation to mypy-valid type signatures, see - # tools/gen_pyi.py. If you change any logic here, please + # signature_str_pyi. If you change any logic here, please # check that file too. 
def signature_str(self, *, skip_outputs: bool = False) -> str: - schema_formals: List[str] = \ - list(map(lambda a: a.argument_str(method=self.method), - self.arguments(skip_outputs=skip_outputs))) + args = self.arguments(skip_outputs=skip_outputs) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method), args)) positional_argc = len(self.input_args) if len(schema_formals) > positional_argc: schema_formals.insert(positional_argc, '*') return f'{self.name}({", ".join(schema_formals)})' + def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True), args)) + positional_argc = len(self.input_args) + if len(schema_formals) > positional_argc: + schema_formals.insert(positional_argc, '*') + + # only pyi signatures include returns + returns_str = self.returns.returns_str_pyi() + # pyi also includes self (with no typing/defaults) for methods + if self.method: + schema_formals.insert(0, "self") + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' + + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> Optional[str]: + # only pyi uses vararg signatures + args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True), args)) + # vararg only applies to pyi signatures. vararg variants are not generated for all signatures + num_args = self.arguments_count() + num_positionalargs = len(self.input_args) + + have_vararg_version = False + if num_args > 0: + vararg_type = args[0].type + if isinstance(vararg_type, ListType) and str(vararg_type.elem) == 'int' and num_positionalargs == 1: + have_vararg_version = True + + if not have_vararg_version: + return None + # Below are the major changes in vararg vs. regular pyi signatures + # vararg signatures also omit the asterix + schema_formals[0] = '*' + args[0].name + ': _int' + + returns_str = self.returns.returns_str_pyi() + # pyi also includes self (with no typing/defaults) for methods + if self.method: + schema_formals.insert(0, "self") + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' + # The deprecated python signature involves some special logic, so create a # dedicated data model to store these extra properties. @dataclass(frozen=True) @@ -340,6 +470,20 @@ def deprecated(self) -> bool: def signature_str(self, *, skip_outputs: bool = False) -> str: return PythonSignature.signature_str(self, skip_outputs=skip_outputs) + '|deprecated' + def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True, deprecated=True), args)) + positional_argc = len(self.input_args) + if len(schema_formals) > positional_argc: + schema_formals.insert(positional_argc, '*') + + returns_str = self.returns.returns_str_pyi() + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' 
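# To make the three renderings above concrete (an illustration based on the svd
# example from the PR summary; the exact strings are produced by the methods
# above and may differ in detail): for torch.svd, signature_str() yields the
# arg-parser schema string
#
#   svd(Tensor input, bool some=True, bool compute_uv=True, *, TensorList[3] out=None)
#
# while signature_str_pyi() yields a stub line such as
#
#   def svd(input: Tensor, some: _bool=True, compute_uv: _bool=True, *, out: Optional[Tensor]=None) -> namedtuple_U_S_V: ...
#
# and, for an op whose only positional argument is an int[] (e.g. Tensor.permute),
# signature_str_pyi_vararg() additionally emits a starred variant along the lines of
#
#   def permute(self, *dims: _int) -> Tensor: ...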
+ + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> Optional[str]: + # the codegen doesn't include vararg variants for deprecated signatures + return None + # This struct is used to hold the PythonSignature and its corresponding # NativeFunction BEFORE grouping base and out-variant functions. # Why not store NativeFunction in PythonSignature or construct PythonSignature @@ -520,12 +664,75 @@ def argument(a: Argument) -> PythonArgument: default_init=None, ) -def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: +def argument_type_str_pyi(t: Type, *, pyi_out_arg: bool = False) -> str: + add_optional = False + if isinstance(t, OptionalType): + t = t.elem + add_optional = True + + if isinstance(t, BaseType): + if t.name == BaseTy.int: + ret = '_int' + elif t.name == BaseTy.float: + ret = '_float' + elif t.name == BaseTy.str: + ret = 'str' + elif t.name == BaseTy.Scalar: + ret = 'Number' + elif t.name == BaseTy.ScalarType: + ret = '_dtype' + elif t.name == BaseTy.bool: + ret = '_bool' + elif t.name == BaseTy.QScheme: + ret = '_qscheme' + elif t.name == BaseTy.Layout: + ret = '_layout' + elif t.name == BaseTy.Device: + ret = 'Union[_device, str, None]' + elif t.name == BaseTy.MemoryFormat: + ret = 'memory_format' + elif t.name == BaseTy.Dimname: + ret = 'Union[str, ellipsis, None]' + elif t.name in [BaseTy.Tensor, BaseTy.Generator, + BaseTy.Storage, BaseTy.Stream, BaseTy.str]: + # These python schema type names line up with their function schema names + ret = t.name.name + + elif isinstance(t, ListType): + if pyi_out_arg and t.is_tensor_like(): + # TODO remove HACK + # pyi blindly treats all tensor-like out args as having type Tensor + return 'Tensor' + if str(t.elem) == 'int': + ret = 'Union[_int, _size]' if t.size is not None else '_size' + elif t.is_tensor_like(): + # TODO: this doesn't seem right... + # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]] + # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]] + if isinstance(t.elem, OptionalType): + add_optional = True + ret = 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]' if t.size is not None else \ + 'Union[Tuple[Tensor, ...], List[Tensor]]' + elif str(t.elem) == 'float': + ret = 'Sequence[float]' + else: + elem = argument_type_str_pyi(t.elem) + ret = f'Sequence[{elem}]' + + if add_optional: + ret = 'Optional[' + ret + ']' + return ret + + raise RuntimeError(f'unrecognized type {repr(t)}') + +# Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen +def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> PythonSignature: # Use cpp api to gather TensorOptions fields from kwargs. - # Skip ThisArgument if this is method signature. + # Skip SelfArgument if this is method. # Skip TensorOptionsArguments in C++ signature. Python side TensorOptions # arguments are created based on different rules - see below. 
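A few concrete translations implied by `argument_type_str_pyi` above, collected here as a reading aid (a sketch, not an exhaustive table):

```python
# JIT schema type -> annotation used in the generated .pyi, per the branches above
pyi_type_examples = {
    'int':      '_int',
    'Scalar':   'Number',
    'Tensor?':  'Optional[Tensor]',
    'int[]':    '_size',
    'int[2]':   'Union[_int, _size]',
    'float[]':  'Sequence[float]',
    'Tensor[]': 'Union[Tuple[Tensor, ...], List[Tensor]]',
}
```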
- args = tuple(a for a in cpp.group_arguments(f.func, method=method) if isinstance(a, Argument)) + cpp_args = cpp.group_arguments(f.func, method=method) + args = tuple(a for a in cpp_args if isinstance(a, Argument)) input_arg_set = set(a.name for a in f.func.arguments.positional) kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) @@ -561,13 +768,15 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: tensor_options_args.append(PythonArgument( name='dtype', type=BaseType(BaseTy.ScalarType), - default=_dtype_default_type_hack(name), + default=_dtype_default_type_hack(name, pyi=pyi), default_init='self.scalar_type()' if is_like_or_new_function else None, )) + # TODO: probably a bug, kill this diff? + # pyi signatures have a slightly different type/default for layout tensor_options_args.append(PythonArgument( name='layout', - type=OptionalType(BaseType(BaseTy.Layout)), - default='torch.strided', + type=BaseType(BaseTy.Layout) if pyi else OptionalType(BaseType(BaseTy.Layout)), + default='strided' if pyi else 'torch.strided', default_init='layout_from_backend(self.options().backend())' if is_like_or_new_function else None, )) tensor_options_args.append(PythonArgument( @@ -576,12 +785,15 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: default='None', default_init='self.device()' if is_like_or_new_function else None, )) - tensor_options_args.append(PythonArgument( - name='pin_memory', - type=BaseType(BaseTy.bool), - default='False', - default_init=None, - )) + # TODO: probably a bug, kill this diff? + # pyi signatures don't include pin memory + if not pyi: + tensor_options_args.append(PythonArgument( + name='pin_memory', + type=BaseType(BaseTy.bool), + default='False', + default_init=None, + )) tensor_options_args.append(PythonArgument( name='requires_grad', type=BaseType(BaseTy.bool), @@ -589,18 +801,21 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: default_init=None, )) + returns = PythonReturns(returns=f.func.returns) + return PythonSignature( name=str(f.func.name.name), input_args=input_args, input_kwargs=input_kwargs, output_args=PythonOutArgument.from_outputs(outputs), tensor_options_args=tuple(tensor_options_args), + returns=returns, method=method, ) # TODO blowtorch -def _dtype_default_type_hack(name: str) -> str: - if name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices': +def _dtype_default_type_hack(name: str, *, pyi: bool) -> str: + if not pyi and (name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices'): return 'torch.int64' else: return 'None' diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 617f997a8d76..ee5c38a4cf1c 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -3,13 +3,14 @@ import collections from pprint import pformat -import yaml -import re import argparse -from ..autograd.utils import YamlLoader, CodeTemplate, write, group_declarations_by_op_name, is_tensor_method, is_torch_function -from ..autograd.gen_python_functions import SKIP_PYTHON_BINDINGS, SKIP_PYTHON_BINDINGS_SIGNATURES -from ..autograd.gen_autograd import load_aten_declarations +from tools.codegen.model import * +from tools.codegen.api.python import * +from typing import Sequence, List, Mapping, Dict + +from ..autograd.utils import CodeTemplate, write +from ..autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads """ This module implements generation of type stubs for PyTorch, @@ 
-28,60 +29,48 @@ (the latter case should be pretty rare). - We go through automatically bound functions based on the - type information recorded in Declarations.yaml and + type information recorded in native_functions.yaml and generate type hints for them (generate_type_hints) There are a number of type hints which we've special-cased; read gen_pyi for the gory details. """ -# TODO: remove after migrating entire codegen to the new data model. -def should_generate_python_binding(declaration): - name = declaration['name'] - for pattern in SKIP_PYTHON_BINDINGS: - if re.match('^' + pattern + '$', name): - return False - - simple_types = [arg['simple_type'] for arg in declaration['arguments']] - signature = '{}({})'.format(name, ', '.join(simple_types)) - for pattern in SKIP_PYTHON_BINDINGS_SIGNATURES: - if pattern == signature: - return False - - return True - - -def get_py_variable_methods(declarations): +# TODO: consider waiting to group by base name until we actually need to +# (after computing type hint signatures, when adding @overload directives) +def group_by_base_name(python_funcs: Sequence[PythonSignatureNativeFunctionPair]) -> Mapping[str, List[PythonSignatureGroup]]: + groups = group_overloads(python_funcs, sort=False) + d = collections.defaultdict(list) + for g in groups: + name = g.signature.name + d[name].append(g) + return d + +def get_py_torch_functions( + python_funcs: Sequence[PythonSignatureNativeFunctionPair], + method: bool = False, +) -> Mapping[str, Sequence[PythonSignatureGroup]]: """ Get declarations (grouped by name) which should be generated - as methods on Tensor. + as either functions in the "torch" module or methods on Tensor. """ - def should_bind(declaration): - return (should_generate_python_binding(declaration) and - not declaration.get('python_module') and - is_tensor_method(declaration)) + def should_bind_function(python_func: PythonSignatureNativeFunctionPair) -> bool: + return (should_generate_py_binding(python_func.function) and + not python_func.function.python_module and + Variant.function in python_func.function.variants) - return group_declarations_by_op_name([d for d in declarations if should_bind(d)]) + def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: + return (should_generate_py_binding(python_func.function) and + not python_func.function.python_module and + Variant.method in python_func.function.variants) - -def get_py_torch_functions(declarations): - """ - Get declarations (grouped by name) which should be generated - as functions in the "torch" module. - """ - def should_bind(declaration): - return (should_generate_python_binding(declaration) and - not declaration.get('python_module') and - is_torch_function(declaration)) - - return group_declarations_by_op_name([d for d in declarations if should_bind(d)]) + should_bind = should_bind_method if method else should_bind_function + return group_by_base_name([f for f in python_funcs if should_bind(f)]) # TODO: Consider defining some aliases for our Union[...] types, to make # the stubs to read on the human eye. -needed_modules = set() - DEVICE_PARAM = "device: Union[_device, str, None]=None" FACTORY_PARAMS = f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False" @@ -144,90 +133,6 @@ def should_bind(declaration): ] -def type_to_python(typename, size=None): - """type_to_python(typename: str, size: str) -> str - - Transforms a Declarations.yaml type name into a Python type specification - as used for type hints. 
- """ - typename = typename.replace(' ', '') # normalize spaces, e.g., 'Generator *' - - # Disambiguate explicitly sized int/tensor lists from implicitly - # sized ones. These permit non-list inputs too. (IntArrayRef[] and - # TensorList[] are not real types; this is just for convenience.) - if typename in {'IntArrayRef', 'TensorList'} and size is not None: - typename += '[]' - - typename = { - 'Device': 'Device', - 'Generator': 'Generator', - 'IntegerTensor': 'Tensor', - 'Scalar': 'Number', - 'ScalarType': '_dtype', - 'Storage': 'Storage', - 'BoolTensor': 'Tensor', - 'IndexTensor': 'Tensor', - 'Tensor': 'Tensor', - 'MemoryFormat': 'memory_format', - 'IntArrayRef': '_size', - 'IntArrayRef[]': 'Union[_int, _size]', - 'TensorList': 'Union[Tuple[Tensor, ...], List[Tensor]]', - 'TensorList[]': 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]', - 'bool': '_bool', - 'double': '_float', - 'int64_t': '_int', - 'accreal': 'Number', - 'real': 'Number', - 'void*': '_int', # data_ptr - 'void': 'None', - 'std::string': 'str', - 'Dimname': 'Union[str, ellipsis, None]', - 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', - 'QScheme': '_qscheme', - 'ArrayRef' : 'Sequence[float]', - 'Stream': 'Stream', - }[typename] - - return typename - - -def arg_to_type_hint(arg): - """arg_to_type_hint(arg) -> str - - This takes one argument in a Declarations and returns a string - representing this argument in a type hint signature. - """ - name = arg['name'] - if name == 'from': # from is a Python keyword... - name += '_' - typename = type_to_python(arg['dynamic_type'], arg.get('size')) - if arg.get('is_nullable'): - typename = 'Optional[' + typename + ']' - if 'default' in arg: - default = arg['default'] - if default == 'nullptr': - default = None - elif default == 'c10::nullopt': - default = None - elif isinstance(default, str) and default.startswith('{') and default.endswith('}'): - if arg['dynamic_type'] == 'Tensor' and default == '{}': - default = None - elif arg['dynamic_type'] == 'Generator' and default == '{}': - default = None - elif arg['dynamic_type'] == 'IntArrayRef': - default = '(' + default[1:-1] + ')' - else: - raise Exception("Unexpected default constructor argument of type {}".format(arg['dynamic_type'])) - elif default == 'MemoryFormat::Contiguous': - default = 'contiguous_format' - elif default == 'QScheme::PER_TENSOR_AFFINE': - default = 'per_tensor_affine' - default = '={}'.format(default) - else: - default = '' - return name + ': ' + typename + default - - binary_ops = ('add', 'sub', 'mul', 'div', 'pow', 'lshift', 'rshift', 'mod', 'truediv', 'matmul', 'floordiv', 'radd', 'rsub', 'rmul', 'rtruediv', 'rfloordiv', 'rpow', # reverse arithmetic @@ -241,7 +146,7 @@ def arg_to_type_hint(arg): all_ops = binary_ops + comparison_ops + unary_ops + to_py_type_ops -def sig_for_ops(opname): +def sig_for_ops(opname: str) -> List[str]: """sig_for_ops(opname : str) -> List[str] Returns signatures for operator special functions (__add__ etc.)""" @@ -271,146 +176,66 @@ def sig_for_ops(opname): else: raise Exception("unknown op", opname) - -# Copied from 'gen_python_functions.py' -# TODO: consolidate after migrating to the new codegen model in 'tools/codegen'. 
-def namedtuple_fieldnames(declaration): - returns = declaration['returns'] - if len(returns) <= 1 or all(['field_name' not in x for x in returns]): - return [] - else: - def get_field_name(x): - # See Note [field_name versus name] - if 'field_name' not in x: - # When building on Windows, `PyStructSequence_UnnamedField` could not be - # resolved by the linker for some reason, which cause error in building: - # - # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol - # PyStructSequence_UnnamedField - # - # Thus, at this point in time, we do not support unnamed - # fields in namedtuple; you must either name all fields, - # or none of them. - raise ValueError("Unnamed field is not supported by codegen") +def generate_named_tuples(funcs: Sequence[PythonSignatureGroup]) -> Dict[str, str]: + namedtuples: Dict[str, str] = {} + for sig_group in funcs: + named_tuple = sig_group.signature.returns.named_tuple_pyi() + if named_tuple is not None: + tuple_name, tuple_def = named_tuple + if tuple_name in namedtuples: + assert namedtuples[tuple_name] == tuple_def else: - return x['field_name'] - return [get_field_name(x) for x in returns] - + namedtuples[tuple_name] = tuple_def + return namedtuples -def generate_type_hints(fname, decls, namedtuples, is_tensor=False): - """generate_type_hints(fname, decls, is_tensor=False) +def generate_type_hints(funcs: Sequence[PythonSignatureGroup], is_tensor: bool = False) -> List[str]: + """generate_type_hints(funcs, is_tensor=False) Generates type hints for the declarations pertaining to the function - :attr:`fname`. attr:`decls` are the declarations from the parsed - Declarations.yaml. - :attr:`namedtuples` is a dictionary for accumulating NamedTuple definitions. + :attr:`funcs` are the func from the parsed native_functions.yaml. The :attr:`is_tensor` flag indicates whether we are parsing members of the Tensor class (true) or functions in the `torch` namespace (default, false). - - This function currently encodes quite a bit about the semantics of - the translation C++ -> Python. 
""" - if fname in blocklist: - return [] type_hints = [] - dnames = ([d['name'] for d in decls]) - has_out = fname + '_out' in dnames - - if has_out: - decls = [d for d in decls if d['name'] != fname + '_out'] - - for decl in decls: - render_kw_only_separator = True # whether we add a '*' if we see a keyword only argument - python_args = [] - - has_tensor_options = 'TensorOptions' in (a['dynamic_type'] for a in decl['arguments']) - - for a in decl['arguments']: - if a['dynamic_type'] != 'TensorOptions': - if a.get('kwarg_only', False) and render_kw_only_separator: - python_args.append('*') - render_kw_only_separator = False - try: - python_args.append(arg_to_type_hint(a)) - except Exception: - print("Error while processing function {}".format(fname)) - raise - - if 'self: Tensor' in python_args: - self_index = python_args.index('self: Tensor') - python_args.remove('self: Tensor') - if is_tensor: - python_args = ['self'] + python_args - else: - python_args.insert(self_index, 'input: Tensor') - else: - if is_tensor: - raise Exception("method without self is unexpected") - - if has_out: - if render_kw_only_separator: - python_args.append('*') - render_kw_only_separator = False - python_args.append('out: Optional[Tensor]=None') - - if has_tensor_options: - if render_kw_only_separator: - python_args.append('*') - render_kw_only_separator = False - python_args += ["dtype: _dtype=None", - "layout: _layout=strided", - "device: Union[_device, str, None]=None", - "requires_grad:_bool=False"] - - python_args_s = ', '.join(python_args) - python_returns = [type_to_python(r['dynamic_type']) for r in decl['returns']] - field_names = namedtuple_fieldnames(decl) - - if field_names: - namedtuple_name = '_'.join(['namedtuple'] + field_names) - tuple_args = ['("{}", {})'.format(name, typ) for name, typ in zip(field_names, python_returns)] - namedtuple_def = 'NamedTuple("{}", [{}])'.format(namedtuple_name, ', '.join(tuple_args)) - if namedtuple_name in namedtuples: - assert namedtuples[namedtuple_name] == namedtuple_def - else: - namedtuples[namedtuple_name] = namedtuple_def - python_returns_s = namedtuple_name - elif len(python_returns) > 1: - python_returns_s = 'Tuple[' + ', '.join(python_returns) + ']' - elif len(python_returns) == 1: - python_returns_s = python_returns[0] - else: - python_returns_s = 'None' - - type_hint = "def {}({}) -> {}: ...".format(fname, python_args_s, python_returns_s) - numargs = len(decl['arguments']) - vararg_pos = int(is_tensor) - have_vararg_version = (numargs > vararg_pos and - decl['arguments'][vararg_pos]['dynamic_type'] in {'IntArrayRef'} and - (numargs == vararg_pos + 1 or python_args[vararg_pos + 1] == '*') and - (not is_tensor or decl['arguments'][0]['name'] == 'self')) + any_out = any([g for g in funcs if g.outplace is not None]) + + for sig_group in funcs: + # Some deprecated ops that are on the blocklist are still included in pyi + if sig_group.signature.name in blocklist and not sig_group.signature.deprecated: + continue + + # deprecated signatures have separate entries for their functional and out variants + # (as opposed to the native ops, which fuse the two into a single signature). + # generate the functional variant here, if an out variant exists. 
+ if sig_group.signature.deprecated and sig_group.outplace is not None: + type_hint = sig_group.signature.signature_str_pyi(skip_outputs=True) + type_hints.append(type_hint) + # TODO: remove HACK + # the pyi codegen currently adds an optional out param in cases where the current op does NOT have an out variant, + # but an overload of the op DOES have an out variant. + # TODO: After that, we should consider killing this method entirely and operating per PythonSignatureGroup + # rather than grouping their overloads together + # (since there isn't much else semantically meaningful about grouping overloads) + # this hack also doesn't apply to deprecated ops + hacky_add_output = any_out and sig_group.outplace is None and not sig_group.signature.deprecated + # PythonSignatureGroups that have both a functional + out variant get a single signature, with an optional out argument + # Generates the out variant if one exists. Otherwise, generate the functional variant + type_hint = sig_group.signature.signature_str_pyi( + skip_outputs=sig_group.outplace is None, hacky_add_output=hacky_add_output) type_hints.append(type_hint) - if have_vararg_version: - # Two things come into play here: PyTorch has the "magic" that if the first and only positional argument - # is an IntArrayRef, it will be used as a vararg variant. - # The following outputs the vararg variant, the "pass a list variant" is output above. - # The other thing is that in Python, the varargs are annotated with the element type, not the list type. - typelist = decl['arguments'][vararg_pos]['dynamic_type'] - vararg_type = '_int' - # replace first argument and eliminate '*' if present - python_args = ((['self'] if is_tensor else []) + ['*' + decl['arguments'][vararg_pos]['name'] + - ': ' + vararg_type] + python_args[vararg_pos + 2:]) - python_args_s = ', '.join(python_args) - type_hint = "def {}({}) -> {}: ...".format(fname, python_args_s, python_returns_s) - type_hints.append(type_hint) + # Some operators also additionally have a vararg variant of their signature + type_hint_vararg = sig_group.signature.signature_str_pyi_vararg( + skip_outputs=sig_group.outplace is None, hacky_add_output=hacky_add_output) + if type_hint_vararg: + type_hints.append(type_hint_vararg) return type_hints -def gen_nn_functional(out): +def gen_nn_functional(out: str) -> None: # Functions imported into `torch.nn.functional` from `torch`, perhaps being filtered # through an `_add_docstr` call imports = [ @@ -475,10 +300,10 @@ def gen_nn_functional(out): stubs = CodeTemplate.from_file(os.path.join('torch', '_C', '_nn.pyi.in')) write(out, 'torch/_C/_nn.pyi', stubs, env) -def gen_nn_pyi(out): +def gen_nn_pyi(out: str) -> None: gen_nn_functional(out) -def gen_pyi(declarations_path, out): +def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, out: str) -> None: """gen_pyi() This function generates a pyi file for torch. @@ -491,16 +316,13 @@ def gen_pyi(declarations_path, out): # checking. If you are update this, consider if your change # also needs to update the other file. 
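For orientation, after this patch the stub generator is driven directly from the two YAML files; a minimal invocation might look like the sketch below (it assumes the working directory is the repository root, matching the defaults added to `main()` later in this patch):

```python
# Sketch: regenerate the .pyi stubs from a PyTorch checkout.
from tools.pyi.gen_pyi import gen_pyi

gen_pyi(
    native_yaml_path='aten/src/ATen/native/native_functions.yaml',
    deprecated_yaml_path='tools/autograd/deprecated.yaml',
    out='.',
)
```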
- # Load information from YAML - declarations = load_aten_declarations(declarations_path) - # Dictionary for NamedTuple definitions - namedtuples = {} + namedtuples: Dict[str, str] = {} # Generate type signatures for top-level functions # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - unsorted_function_hints = collections.defaultdict(list) + unsorted_function_hints: Dict[str, List[str]] = collections.defaultdict(list) unsorted_function_hints.update({ 'set_flush_denormal': ['def set_flush_denormal(mode: _bool) -> _bool: ...'], 'get_default_dtype': ['def get_default_dtype() -> _dtype: ...'], @@ -560,21 +382,13 @@ def gen_pyi(declarations_path, out): ' other: Union[Tensor, Number],' ' *, alpha: Optional[Number]=1, out: Optional[Tensor]=None) -> Tensor: ...'.format(binop)) - function_declarations = get_py_torch_functions(declarations) - for name in sorted(function_declarations.keys()): - unsorted_function_hints[name] += generate_type_hints(name, function_declarations[name], namedtuples) - - # Generate type signatures for deprecated functions - - # TODO: Maybe we shouldn't generate type hints for deprecated - # functions :) However, examples like those addcdiv rely on these. - with open('tools/autograd/deprecated.yaml', 'r') as f: - deprecated = yaml.load(f, Loader=YamlLoader) - for d in deprecated: - name, sig = re.match(r"^([^\(]+)\(([^\)]*)", d['name']).groups() - sig = ['*' if p.strip() == '*' else p.split() for p in sig.split(',')] - sig = ['*' if p == '*' else (p[1] + ': ' + type_to_python(p[0])) for p in sig] - unsorted_function_hints[name].append("def {}({}) -> Tensor: ...".format(name, ', '.join(sig))) + function_signatures = load_signatures(native_yaml_path, deprecated_yaml_path, method=False, pyi=True) + sig_groups = get_py_torch_functions(function_signatures) + for name in sorted(sig_groups.keys()): + unsorted_function_hints[name] += generate_type_hints(sig_groups[name]) + # deprecated signatures are not used when computing named tuples + native_groups = [g for g in sig_groups[name] if not g.signature.deprecated] + namedtuples.update(generate_named_tuples(native_groups)) function_hints = [] for name, hints in sorted(unsorted_function_hints.items()): @@ -585,26 +399,26 @@ def gen_pyi(declarations_path, out): # Generate type signatures for Tensor methods # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - unsorted_tensor_method_hints = collections.defaultdict(list) + unsorted_tensor_method_hints: Dict[str, List[str]] = collections.defaultdict(list) unsorted_tensor_method_hints.update({ 'size': ['def size(self) -> Size: ...', 'def size(self, _int) -> _int: ...'], 'stride': ['def stride(self) -> Tuple[_int]: ...', 'def stride(self, _int) -> _int: ...'], - 'new_ones': ['def new_ones(self, size: {}, {}) -> Tensor: ...'. - format(type_to_python('IntArrayRef'), FACTORY_PARAMS)], + 'new_ones': ['def new_ones(self, size: _size, {}) -> Tensor: ...'. 
+ format(FACTORY_PARAMS)], 'new_tensor': ["def new_tensor(self, data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)], # new and __init__ have the same signatures differ only in return type # Adapted from legacy_tensor_ctor and legacy_tensor_new 'new': ['def new(self, *args: Any, {}) ->Tensor: ...'.format(DEVICE_PARAM), 'def new(self, storage: Storage) -> Tensor: ...', 'def new(self, other: Tensor) -> Tensor: ...', - 'def new(self, size: {}, *, {}) -> Tensor: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), + 'def new(self, size: _size, *, {}) -> Tensor: ...'.format(DEVICE_PARAM), ], '__init__': ['def __init__(self, *args: Any, {}) -> None: ...'.format(DEVICE_PARAM), 'def __init__(self, storage: Storage) -> None: ...', 'def __init__(self, other: Tensor) -> None: ...', - 'def __init__(self, size: {}, *, {}) -> None: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), + 'def __init__(self, size: _size, *, {}) -> None: ...'.format(DEVICE_PARAM), ], 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], # clamp has no default values in the Declarations @@ -679,10 +493,14 @@ def gen_pyi(declarations_path, out): for name in simple_conversions: unsorted_tensor_method_hints[name].append('def {}(self) -> Tensor: ...'.format(name)) - tensor_method_declarations = get_py_variable_methods(declarations) - for name in sorted(tensor_method_declarations.keys()): - unsorted_tensor_method_hints[name] += \ - generate_type_hints(name, tensor_method_declarations[name], namedtuples, is_tensor=True) + # pyi tensor methods don't currently include deprecated signatures for some reason + # TODO: we should probably add them in + tensor_method_signatures = load_signatures(native_yaml_path, deprecated_yaml_path, method=True, skip_deprecated=True, pyi=True) + tensor_method_sig_groups = get_py_torch_functions(tensor_method_signatures, method=True) + + for name in sorted(tensor_method_sig_groups.keys()): + unsorted_tensor_method_hints[name] += generate_type_hints(tensor_method_sig_groups[name], is_tensor=True) + namedtuples.update(generate_named_tuples(tensor_method_sig_groups[name])) for op in all_ops: name = '__{}__'.format(op) @@ -764,17 +582,20 @@ def gen_pyi(declarations_path, out): gen_nn_pyi(out) -def main(): +def main() -> None: parser = argparse.ArgumentParser( description='Generate type stubs for PyTorch') - parser.add_argument('--declarations-path', metavar='DECL', - default='torch/share/ATen/Declarations.yaml', - help='path to Declarations.yaml') + parser.add_argument('--native-functions-path', metavar='NATIVE', + default='aten/src/ATen/native/native_functions.yaml', + help='path to native_functions.yaml') + parser.add_argument('--deprecated-functions-path', metavar='DEPRECATED', + default='tools/autograd/deprecated.yaml', + help='path to deprecated.yaml') parser.add_argument('--out', metavar='OUT', default='.', help='path to output directory') args = parser.parse_args() - gen_pyi(args.declarations_path, args.out) + gen_pyi(args.native_functions_path, args.deprecated_functions_path, args.out) if __name__ == '__main__': diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index bcc847e825ad..9b1d6fd4a55f 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -234,9 +234,9 @@ add_custom_command( "${TORCH_SRC_DIR}/nn/functional.pyi" COMMAND "${PYTHON_EXECUTABLE}" -mtools.pyi.gen_pyi - --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + --native-functions-path "aten/src/ATen/native/native_functions.yaml" + --deprecated-functions-path 
"tools/autograd/deprecated.yaml" DEPENDS - "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" "${TORCH_SRC_DIR}/_C/__init__.pyi.in" "${TORCH_SRC_DIR}/_C/_VariableFunctions.pyi.in" "${TORCH_SRC_DIR}/nn/functional.pyi.in" From dad74e58fcbe35a4409ecec5d816ce54c6986358 Mon Sep 17 00:00:00 2001 From: Iurii Zdebskyi Date: Mon, 7 Dec 2020 10:43:56 -0800 Subject: [PATCH 106/132] [WIP] Added foreach_trunc, foreahc_reciprocal, foreach_sigmoid APIs (#47385) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47385 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D24737051 Pulled By: izdeby fbshipit-source-id: ed259d9184b2b784d8cc1983a8b85cc6cbf930ba --- aten/src/ATen/native/ForeachOpsKernels.cpp | 3 + aten/src/ATen/native/cuda/ForeachUnaryOp.cu | 180 +++++++++++++++++++- aten/src/ATen/native/native_functions.yaml | 48 ++++++ test/test_foreach.py | 18 +- tools/codegen/model.py | 3 + 5 files changed, 239 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 5fbc1506bfaa..24ab10b25f84 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -188,6 +188,9 @@ FOREACH_UNARY_OP(sinh); FOREACH_UNARY_OP(round); FOREACH_UNARY_OP(lgamma); FOREACH_UNARY_OP(frac); +FOREACH_UNARY_OP(trunc); +FOREACH_UNARY_OP(reciprocal); +FOREACH_UNARY_OP(sigmoid); FOREACH_POINTWISE_OP_SCALAR(addcdiv); FOREACH_POINTWISE_OP_SCALAR(addcmul); diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index 2cd01d80bdca..88b952fe1d95 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -417,9 +417,9 @@ std::vector foreach_tensor_frac_cuda(TensorList tensors) { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<2>(tensor_lists, UnaryOpFunctor(), + /* depth */ 2, + /* r_args_depth */ 1, + /* res_arg_index */ 1>(), Trunc()); }); return tensor_lists[1]; @@ -439,10 +439,178 @@ void foreach_tensor_frac_cuda_(TensorList tensors) { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor(), + /* depth */ 1, + /* r_args_depth */ 1, + /* res_arg_index */ 0>(), Trunc()); }); } + +template +struct Sigmoid { + T one = T(1); + __device__ T operator()(T t) const { return (one / (one + std::exp(-t))); } +}; + +std::vector foreach_tensor_sigmoid_cuda(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_sigmoid_slow(tensors); + } + + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(std::move(vec_res)); + + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Sigmoid()); + }); + return tensor_lists[1]; +} + +void foreach_tensor_sigmoid_cuda_(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_sigmoid_slow_(tensors); + } + + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, 
tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor(), + Sigmoid()); + }); +} + +template +struct Reciprocal { + T one = T(1); + __device__ T operator()(T t) const { return (one / t); } +}; + +std::vector foreach_tensor_reciprocal_cuda(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_reciprocal_slow(tensors); + } + + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(std::move(vec_res)); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Reciprocal()); + }); + return tensor_lists[1]; +} + +void foreach_tensor_reciprocal_cuda_(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_reciprocal_slow_(tensors); + } + + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor(), + Reciprocal()); + }); +} + +template +struct Truncf { + __device__ T operator()(T t) const { return std::trunc(t); } +}; + +std::vector foreach_tensor_trunc_cuda(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_trunc_slow(tensors); + } + + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(std::move(vec_res)); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Truncf()); + }); + return tensor_lists[1]; +} + +void foreach_tensor_trunc_cuda_(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_trunc_slow_(tensors); + } + + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor(), + Truncf()); + }); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4d8ea72761af..e7ac20599214 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7610,6 +7610,54 @@ CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ +- func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_reciprocal_slow + CUDA: foreach_tensor_reciprocal_cuda + +- func: 
_foreach_reciprocal_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_reciprocal_slow_ + CUDA: foreach_tensor_reciprocal_cuda_ + +- func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sigmoid_slow + CUDA: foreach_tensor_sigmoid_cuda + +- func: _foreach_sigmoid_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sigmoid_slow_ + CUDA: foreach_tensor_sigmoid_cuda_ + +- func: _foreach_trunc(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_trunc_slow + CUDA: foreach_tensor_trunc_cuda + +- func: _foreach_trunc_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_trunc_slow_ + CUDA: foreach_tensor_trunc_cuda_ + - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () use_c10_dispatcher: full device_guard: False diff --git a/test/test_foreach.py b/test/test_foreach.py index eff6d969c5e5..c55c4e71dab0 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -53,6 +53,9 @@ class TestForeach(TestCase): (torch._foreach_log1p, torch._foreach_log1p_, torch.log1p, True, False), (torch._foreach_round, torch._foreach_round_, torch.round, False, False), (torch._foreach_frac, torch._foreach_frac_, torch.frac, False, False), + (torch._foreach_reciprocal, torch._foreach_reciprocal_, torch.reciprocal, True, True), + (torch._foreach_sigmoid, torch._foreach_sigmoid_, torch.sigmoid, True, False), + (torch._foreach_trunc, torch._foreach_trunc_, torch.trunc, False, False), # See test_abs # (torch._foreach_abs, torch._foreach_abs_, torch.abs, True, True), @@ -173,7 +176,7 @@ def test_unary_ops(self, device, dtype): control_dtype = torch.float32 if (self.device_type == 'cuda' and (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype - if self.device_type == 'cpu' and dtype == torch.half and torch_op not in [torch.neg, torch.frac]: + if self.device_type == 'cpu' and dtype == torch.half and torch_op not in [torch.neg, torch.frac, torch.reciprocal]: with self.assertRaisesRegex(RuntimeError, r"not implemented for \'Half\'"): expected = [torch_op(tensors1[i]) for i in range(N)] @@ -191,13 +194,14 @@ def test_unary_ops(self, device, dtype): break if dtype in [torch.complex64, torch.complex128] and not support_complex: - # not using assertRaisesRegex due to different error messages - with self.assertRaises(RuntimeError): - expected = [torch_op(tensors1[i]) for i in range(N)] + if not (self.device_type == 'cpu' and torch_op in [torch.sigmoid]): + # not using assertRaisesRegex due to different error messages + with self.assertRaises(RuntimeError): + expected = [torch_op(tensors1[i]) for i in range(N)] - with self.assertRaises(RuntimeError): - res = fe_op(tensors1) - break + with self.assertRaises(RuntimeError): + res = fe_op(tensors1) + break expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] res = fe_op(tensors1) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index f270d0737ade..87cd3ab8e302 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -466,6 +466,9 @@ def __post_init__(self) -> None: '_foreach_round_', '_foreach_lgamma_', '_foreach_frac_', + 
'_foreach_reciprocal_', + '_foreach_sigmoid_', + '_foreach_trunc_', '_foreach_addcmul_.Scalar', '_foreach_addcdiv_.Scalar', '_foreach_addcmul_.ScalarList', From 924b001b71555cfd58b31249a2eb7963627f2fc8 Mon Sep 17 00:00:00 2001 From: Hui Guo Date: Mon, 7 Dec 2020 11:12:08 -0800 Subject: [PATCH 107/132] #48733 added logging statements to LLVM codegen using JIT logging (#48758) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48758 Test Plan: PYTORCH_JIT_LOG_LEVEL=">>llvm_codegen" python test/test_jit_fuser_te.py -k test_lerp Reviewed By: ZolotukhinM Differential Revision: D25295995 Pulled By: huiguoo fbshipit-source-id: 8927808932ef3657da26508d0f6574c9e5fbbb25 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 35929a61266f..509015f7ffa5 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -30,6 +30,7 @@ #include #include +#include #define DEBUG_PRINT 0 using namespace torch::jit::tensorexpr; @@ -518,6 +519,13 @@ void LLVMCodeGenImpl::emitKernel( if (llvm::verifyFunction(*fn_, &llvm::outs())) { throw std::runtime_error("Function verification failed"); } + + // print graph debug info. + std::string fnstr; + llvm::raw_string_ostream FS(fnstr); + fn_->print(FS); + GRAPH_DEBUG("LLVM Function:\n", FS.str(), "\n"); + optimize(*module_); #if DEBUG_PRINT From d307601365c3b848072b8b8381208aedc1a0aca5 Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Mon, 7 Dec 2020 11:46:58 -0800 Subject: [PATCH 108/132] Revert D24923679: Fixed einsum compatibility/performance issues (#46398) Test Plan: revert-hammer Differential Revision: D24923679 (https://github.com/pytorch/pytorch/commit/ea2a568cca71aaf690051782c225ca9dd2e5e1f9) Original commit changeset: 47e48822cd67 fbshipit-source-id: 52f17b66a4aa075d0159bdf1c98616e6098091b8 --- aten/src/ATen/native/Linear.cpp | 501 +++++++++++++------------------- test/test_linalg.py | 219 +++++--------- torch/functional.py | 171 +++++------ 3 files changed, 348 insertions(+), 543 deletions(-) diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index bac2f80e8a7c..c9e03aaa3b6b 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -136,334 +136,241 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra return result; } -// There are roughly three parts to compute einsum: -// 1. Parse equation to extract the labels for each input operand and output -// 2. Unsqueeze missing dimensions from input operands and permute to align them -// 3. Compute result by multiplying input operands and summing contraction -// dimensions We do the last part by reducing to bmm. -Tensor einsum(std::string equation, TensorList operands) { - TORCH_CHECK(!operands.empty(), "einsum() must provide at least one operand"); - checkDeviceType("einsum()", operands, operands[0].device().type()); - - // Code for encoding ellipsis ("...") with labels - constexpr int ELLIPSIS = '.'; - - // Find arrow (->) to split equation into lhs and rhs - const auto arrow_pos = equation.find("->"); - const auto lhs = equation.substr(0, arrow_pos); - - // Convert labels for input operands into an index in [0, 25] and store - // them in op_labels for each operand along with ELLIPSIS. 
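Whichever of the two einsum implementations ends up compiled in, the user-facing contract is the same `torch.einsum` API; a few representative calls that both versions are expected to handle (shapes chosen arbitrarily for illustration):

```python
import torch

a = torch.randn(2, 3)
b = torch.randn(3, 4)

mm = torch.einsum('ij,jk->ik', a, b)          # matrix multiply with an explicit output
tr = torch.einsum('ii', torch.randn(4, 4))    # repeated subscript: sum over the diagonal (trace)
bm = torch.einsum('...ij,...jk',              # ellipsis broadcasting with an implicit '...ik' output
                  a.expand(5, 2, 3), b.expand(5, 3, 4))
```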
- std::vector> op_labels(operands.size()); - bool found_ell = false; - std::string::size_type curr_op = 0; - for (auto i = decltype(lhs.length()){0}; i < lhs.length(); ++i) { - switch (lhs[i]) { - case ' ': - // Ignore spaces - break; - - case '.': - TORCH_CHECK( - // Only one ellipsis per operand can be given - !found_ell, - "einsum() found \'.\' for operand ", - curr_op, - " for which an ellipsis was already found"); - TORCH_CHECK( - // Ensure it's a valid ellipsis - i + 2 < lhs.length() && lhs[++i] == '.' && lhs[++i] == '.', - "einsum() found \'.\' for operand ", - curr_op, - " that is not part of any ellipsis"); - op_labels[curr_op].push_back(ELLIPSIS); - found_ell = true; - break; - - case ',': - // Move onto next operand - ++curr_op; - TORCH_CHECK( - curr_op < operands.size(), - "einsum() fewer operands were provided than specified in the equation"); - found_ell = false; - break; - - default: - // Parse label - TORCH_CHECK( - lhs[i] >= 'a' && lhs[i] <= 'z', - "einsum() operand subscript must be in range [a, z] but found ", - lhs[i], - " for operand ", - curr_op); - // Convert label to index in [0, 25] and store - op_labels[curr_op].push_back(lhs[i] - 'a'); - } +Tensor einsum(std::string eqn, TensorList tensors) { + constexpr size_t number_of_letters = 26; + std::string in_eqn; + size_t pos; + // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis. + // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter + // mapped to an index and the ellipsis ('...') being mapped to a number of consequtive indices. + // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that + // the letter has not been assigned an index yet (because it has not been seen). + // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices). + // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet. + // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below. + + std::array letter_mapping; // map letter to internal (numerical) label + letter_mapping.fill(-1); + int64_t num_ell_idxes = -1; + int64_t first_ell_idx = 0; + + // The internal representation of the left hand side fo the equation (with ellipsis expanded) is stored in input_op_idxes. + // For each operand, we have a vector mapping each dimension to an internal index. + // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and + // of the last occurrence of each index. + std::vector> input_op_idxes; // the parsed operand indices + std::array num_letter_occurrences; // number of occurrence in the equation of this letter + num_letter_occurrences.fill(0); + std::vector last_idx_occurrence; // the last operator (left to right) using this index + + if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side + in_eqn = eqn.substr(0, pos); + } else { + in_eqn = eqn; } - - TORCH_CHECK( - curr_op == operands.size() - 1, - "einsum() more operands were provided than specified in the equation"); - - // Labels must be within [a, z]. 
- constexpr int TOTAL_LABELS = 'z' - 'a' + 1; - std::vector label_count(TOTAL_LABELS, 0); - - // The maximum number of dimensions covered by any ellipsis, needed when - // unsqueezing missing dimensions from operands to permute and broadcast - int64_t ell_num_dim = 0; - - // Compute label frequency and number of dimensions covered by ellipsis - // We do this after parsing labels to make it more readable and simpler - // to compute the number of dimensions covered by ellipsis. - for (std::size_t i = 0; i < operands.size(); ++i) { - const Tensor operand = operands[i]; - std::vector labels = op_labels[i]; - int64_t nlabels = labels.size(); - int64_t ndims = operand.dim(); - bool has_ellipsis = false; - - for (int label : labels) { - if (label == ELLIPSIS) { - --nlabels; - has_ellipsis = true; - ell_num_dim = std::max(ell_num_dim, ndims - nlabels); - } else { - ++label_count[label]; + // remove spaces for einsum compatibility (#9929) + in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); + + // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index + int64_t operand = 0; + std::stringstream eqn_stream(in_eqn); + std::string term; + int64_t num_total_idxes = 0; + while (! eqn_stream.eof()) { + std::getline(eqn_stream, term, ','); // term = string with indices of current term + TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension + + int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' + // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions + int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; + int64_t dims_in_term = 0; // dimensions we have seen + std::vector current_op_idxes; // mapping of operand dimensions to indices for current term + for (auto &c : term) { // c = character with a single letter or '.' + if (c == '.') { + ell_char_count++; + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in term ", operand, " of the equation"); + if (ell_char_count == 3) { // this completes the ellipsis + if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size + first_ell_idx = num_total_idxes; + num_ell_idxes = candidate_num_ell_idxes; + num_total_idxes += num_ell_idxes; + } + else { // we have seen an ellipsis before, so we check compatibility + TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, + "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); + } + for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices + current_op_idxes.push_back(first_ell_idx + i); + last_idx_occurrence.push_back(operand); + } + dims_in_term += num_ell_idxes; // keep track of dimensions + } + } else { // a letter (hopefully) + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis, operand ", operand); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; // letter_num = position in letter_mapping + if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping + letter_mapping[letter_num] = num_total_idxes; + num_total_idxes++; + last_idx_occurrence.push_back(operand); + } else { // letter we have already seen + last_idx_occurrence[letter_mapping[letter_num]] = operand; + } + num_letter_occurrences[letter_num]++; + current_op_idxes.push_back(letter_mapping[letter_num]); + dims_in_term++; } } - - TORCH_CHECK( - has_ellipsis ? nlabels <= ndims : nlabels == ndims, - "einsum() the number of subscripts in the equation (", - nlabels, - has_ellipsis ? ") is more than the number of dimensions (" - : ") does not match the number of dimensions (", - ndims, - ") for operand ", - i, - has_ellipsis ? "" : " and no ellipsis was given"); + TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); + input_op_idxes.push_back(std::move(current_op_idxes)); + operand++; } - - // Mapping of label to index in the permuted tensors (out_dims + sum_dims) - // This will be used for aligning the dimensions of all input operands - std::vector label_perm_index(TOTAL_LABELS, -1); - - // Current index in the permuted shape - int perm_index = 0; - - // Start index of ellipsis dimensions in the permuted shape - int64_t ell_index = 0; - found_ell = false; - - if (arrow_pos == std::string::npos) { - // Implicit output is ellipsis (...) + labels seen only once - perm_index = ell_num_dim; - found_ell = true; - for (int label = 0; label < TOTAL_LABELS; ++label) { - if (label_count[label] == 1) { - label_perm_index[label] = perm_index++; + // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. + TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); + + // the following parses or infers output (right hand side) + // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the output indices. -1 means that the index has not been assigned a dimension yet + std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions + int64_t num_output_dims = 0; + if (pos != std::string::npos) { // parse the user provided right hand side + int64_t ell_char_count = 0; + for (auto &c : eqn.substr(pos+2)) { + if (c == '.') { // '.' as part of ellipsis + ell_char_count++; + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); + if (ell_char_count == 3) { // ellipsis complete + TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + } else if (! isspace(c)) { // letter (hopefully) + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis in the right hand side"); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; + TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, " occurs twice in output"); + idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; + num_output_dims++; } } - } else { - // Parse explicit output - const std::string rhs = equation.substr(arrow_pos + 2); - for (std::size_t i = 0; i < rhs.length(); ++i) { - switch (rhs[i]) { - case ' ': - // Ignore spaces - break; - - case '.': - TORCH_CHECK( - // There can only be one ellipsis in the output - !found_ell, - "einsum() found \'.\' for output but an ellipsis (...) was already found"); - TORCH_CHECK( - // Ensure ellipsis is correct - i + 2 < rhs.length() && rhs[++i] == '.' && rhs[++i] == '.', - "einsum() found \'.\' for output that is not part of any ellipsis (...)"); - ell_index = perm_index; - perm_index += ell_num_dim; - found_ell = true; - break; - - default: - TORCH_CHECK( - rhs[i] >= 'a' && rhs[i] <= 'z', - "einsum() subscripts must be in range [a, z] but found ", - rhs[i], - " for the output"); - TORCH_CHECK( - // Ensure label appeared at least once for some input operand and at - // most once for the output - label_count[rhs[i] - 'a'] > 0, - "einsum() output subscript ", - rhs[i], - label_count[rhs[i] - 'a'] == -1 - ? " appears more than once in the output" - : " does not appear in the equation for any input operand"); - label_perm_index[rhs[i] - 'a'] = perm_index++; - - // Set to -1 to mark that this label already appeared in the output - label_count[rhs[i] - 'a'] = -1; + } else { // create an inferred right hand side + // the ellipsis (if in the lhs) comes first + if (num_ell_idxes >= 0) { + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + // then the indices that occur exactly once in alphabetic order + for (size_t idx = 0; idx < number_of_letters; idx++) { + if (num_letter_occurrences[idx] == 1) { + idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; + num_output_dims++; } } } - - // Save output size before adding sum dims - const int out_size = perm_index; - - // If ellipsis is not part of the output, add to contraction dimensions - if (ell_num_dim > 0 && !found_ell) { - ell_index = perm_index; - perm_index += ell_num_dim; - } - - // Add contraction labels (labels not present in output) - for (int label = 0; label < TOTAL_LABELS; ++label) { - if (label_count[label] > 0 && label_perm_index[label] == -1) { - label_perm_index[label] = perm_index++; + // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the non-output indices - those that are eventually summed over + int64_t position = num_output_dims; + for (int64_t i = 0; i < num_total_idxes; i++) { + if (idxes_to_preprocessed_dims[i]==-1) { + idxes_to_preprocessed_dims[i] = position; + position++; } } - // Here we unsqueeze missing dimensions to make all operands have the same - // number of dimensions. We take diagonals for repeated labels within the - // same operand. Finally we permute the operands to align dimensions as - // per the perm_out_index we computed above. 
- std::vector permuted_operands; - for (std::size_t i = 0; i < operands.size(); ++i) { - std::vector perm_shape(perm_index, -1); - std::vector label_dim(TOTAL_LABELS, -1); - const std::vector labels = op_labels[i]; - Tensor operand = operands[i]; - const auto sizes = operand.sizes(); - std::size_t j = 0; - - for (int label : labels) { - if (label == ELLIPSIS) { - // Add missing dimensions under ellipsis - int64_t num_dim_diff = - ell_num_dim - (operand.dim() - labels.size() + 1); - for (int64_t k = 0; k < num_dim_diff; ++k) { - operand = operand.unsqueeze(j); + // we now "homogenize the dimensions", i.e. + // - take diagonals for duplicated indices + // - permute the dimensions to match the order given by idxes_to_preprocessed_dims + // - unsqueeze to create all dimensions for each index in each tensor where they are missing + // we also check that sizes match + // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) + std::vector preprocessed_operands; + std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet + for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { + auto preprocessed_op = tensors[op]; + std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear + std::vector& current_op_input_idxes = input_op_idxes[op]; + int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input + for (size_t i = 0; i < current_op_input_idxes.size(); i++) { + auto idx = current_op_input_idxes[i]; + auto dim_out = idxes_to_preprocessed_dims[idx]; + if (idx_to_dim[dim_out] == -1) { // first appearance + idx_to_dim[dim_out] = dim; + if (size_of_dims[idx] == -1) { // keep track of sizes + size_of_dims[idx] = preprocessed_op.size(dim); + } + else { + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); } - for (int64_t k = 0; k < ell_num_dim; ++k) { - perm_shape[ell_index + k] = j++; + dim++; + } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); + // diagonal moves the diagonal dimension to the back + // now we permute the last dim back to idx_to_dim[dim_out] + std::vector perm(preprocessed_op.dim(), 0); + for (int64_t d = 0; d < preprocessed_op.dim(); d++) { + if (d == idx_to_dim[dim_out]) { + perm[d] = preprocessed_op.dim() - 1; + } else { + perm[d] = d - (d > idx_to_dim[dim_out]); + } } - } else if (label_dim[label] != -1) { - // Repeated label, take diagonal - int64_t dim = label_dim[label]; - TORCH_CHECK( - sizes[j] == sizes[dim], - "einsum() subscript ", - char(label + 'a'), - " is repeated for operand ", - i, - " but the sizes don't match, ", - sizes[j], - " != ", - sizes[dim]); - operand = operand.diagonal(0, j, dim).movedim(-1, dim); - } else { - // Lookup output index for label - label_dim[label] = j; - perm_shape[label_perm_index[label]] = j++; + preprocessed_op = preprocessed_op.permute(perm); } } - - // Add dimensions for missing labels - for (int64_t& index : perm_shape) { - if (index == -1) { - operand = operand.unsqueeze(-1); - index = j++; + // now we permute the 
dimensions in the right order + std::vector permutation; // permutation for this tensor + for (auto &d : idx_to_dim) { + if (d > -1) { + permutation.push_back(d); } } - - permuted_operands.push_back(operand.permute(perm_shape)); - } - - // Check if operands broadcast and keep track of last operand with - // dimension size != 1 for optimizing reductions - std::vector dim_last_op(perm_index, 0); - bool has_zero_size_dim = false; - for (int dim = 0; dim < perm_index; ++dim) { - int64_t broadcast_size = permuted_operands[0].size(dim); - for (std::size_t i = 1; i < permuted_operands.size(); ++i) { - int64_t dim_size = permuted_operands[i].size(dim); - if (broadcast_size != dim_size && broadcast_size != 1 && dim_size != 1) { - std::ostringstream msg; - msg << "einsum() operands do not broadcast with remapped shapes [original->remapped]:"; - for (std::size_t j = 0; j < operands.size(); ++j) { - msg << " " << operands[j].sizes() << "->" - << permuted_operands[j].sizes(); - } - TORCH_CHECK(false, msg.str()); - } - if (dim_size != 1) { - broadcast_size = dim_size; - dim_last_op[dim] = i; + preprocessed_op = preprocessed_op.permute(permutation); + // finally, we insert dimensions for idxes not in the operand + for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { + if (idx_to_dim[dim] == -1) { + preprocessed_op = preprocessed_op.unsqueeze(dim); } } - has_zero_size_dim |= broadcast_size == 0; - } - - // Compute result - Tensor result = permuted_operands[0]; - // Fast path for when an operand has zero sized dim - if (has_zero_size_dim) { - std::vector out_shape(out_size); - for (int i = 0; i < out_size; ++i) { - out_shape[i] = permuted_operands[dim_last_op[i]].size(i); - } - return at::zeros(out_shape, result.options()); + preprocessed_operands.push_back(std::move(preprocessed_op)); } - // Sum out or squeeze dimensions that are size 1 for all later operands - int dim = out_size; - for (int i = dim; i < perm_index; ++i, ++dim) { - if (dim_last_op[i] == 0) { - if (result.size(dim) == 1) { - result = result.squeeze(dim--); - } else { - result = result.sum(dim--); - } + // now we reduce the indices from left to right + // numpy allows to optimize the path using various + // algorithms (see eigen_path in numpy docs) + // we start with the leftmost operator and reduce indices that + // appear only there + Tensor result = std::move(preprocessed_operands[0]); + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == 0) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + result = result.sum(idxes_to_preprocessed_dims[idx], true); } } - for (std::size_t i = 1; i < permuted_operands.size(); ++i) { - Tensor operand = permuted_operands[i]; + // now we process each tensor using sumproduct_pair + for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { std::vector sum_dims; - - // Sum out or squeeze dimensions that are size 1 for all later operands - dim = out_size; - for (int j = dim; j < perm_index; ++j, ++dim) { - if (dim_last_op[j] < i) { - operand = operand.squeeze(dim); - --dim; - } else if (dim_last_op[j] == i) { - if (result.size(dim) == 1) { - operand = operand.sum(dim); - result = result.squeeze(dim); - --dim; - } else { - sum_dims.push_back(dim); - } + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == i) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + sum_dims.push_back(idxes_to_preprocessed_dims[idx]); } } - - // Multiply tensors and sum out dimensions in sum_dims - if (sum_dims.empty()) { - result 
= result.mul(operand); - } else if (sum_dims.size() == result.sizes().size()) { - result = result.flatten().dot(operand.flatten()); - } else { - result = sumproduct_pair(result, operand, sum_dims, false); - } + result = at::native::sumproduct_pair(result, std::move(preprocessed_operands[i]), sum_dims, true); + } + // finally, we squeeze out all non-result dimensions + auto sizes = result.sizes().vec(); + for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--) { + sizes.erase(sizes.begin() + dim); } + result = result.view(sizes); return result; } diff --git a/test/test_linalg.py b/test/test_linalg.py index 3fa677d2b1de..b6ff817a59fa 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2588,151 +2588,6 @@ def test_old_matrix_rank(self, device, dtype): self.assertEqual(torch.matrix_rank(aaT, True), np.linalg.matrix_rank(aaT.cpu().numpy(), True)) self.assertEqual(torch.matrix_rank(aaT, 0.01, True), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01, True)) - @dtypes(torch.double) - def test_einsum(self, device, dtype): - def check(equation, *operands): - ref = np.einsum(equation, *[operand.cpu().numpy() for operand in operands]) - res = torch.einsum(equation, operands) - self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) - - # Check autograd - ops = [op.detach().requires_grad_() for op in operands] - self.assertTrue(torch.autograd.gradcheck(lambda *ops: torch.einsum(equation, ops), ops)) - for op in ops: - self.assertTrue(op._version == 0) - - # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = torch.rand(5, device=device, dtype=dtype) - y = torch.rand(7, device=device, dtype=dtype) - A = torch.randn(3, 5, device=device, dtype=dtype) - B = torch.randn(2, 5, device=device, dtype=dtype) - C = torch.randn(2, 3, 5, device=device, dtype=dtype) - D = torch.randn(2, 5, 7, device=device, dtype=dtype) - E = torch.randn(7, 9, device=device, dtype=dtype) - F = torch.randn(2, 3, 3, 5, device=device, dtype=dtype) - G = torch.randn(5, 4, 6, device=device, dtype=dtype) - H = torch.randn(4, 4, device=device, dtype=dtype) - I = torch.rand(2, 3, 2, device=device, dtype=dtype) - - # Note: gradcheck fails if the same input is given multiple times which is why the - # calls to clone below. 
(see https://github.com/pytorch/pytorch/issues/9282) - - # Vector operations - check('i->', x) # sum - check('i,i->', x, x.clone()) # dot - check('i,i->i', x, x.clone()) # vector element-wisem mul - check('i,j->ij', x, y) # outer - - # Matrix operations - check("ij->ji", A) # transpose - check("ij->j", A) # row sum - check("ij->i", A) # col sum - check("ij,ij->ij", A, A.clone()) # matrix element-wise mul - check("ij,j->i", A, x) # matrix vector multiplication - check("ij,kj->ik", A, B) # matmul - check("ij,ab->ijab", A, E) # matrix outer product - - # Tensor operations - check("aij,ajk->aik", C, D) # batch matmul - check("ijk,jk->i", C, A) # tensor matrix contraction - check("aij,jk->aik", D, E) # tensor matrix contraction - check("abcd,dfg->abcfg", F, G) # tensor tensor contraction - check("ijk,jk->ik", C, A) # tensor matrix contraction with double indices - check("ijk,jk->ij", C, A) # tensor matrix contraction with double indices - check("ijk,ik->j", C, B) # non contiguous - check("ijk,ik->jk", C, B) # non contiguous with double indices - - # Test diagonals - check("ii", H) # trace - check("ii->i", H) # diagonal - check('iji->j', I) # non-contiguous trace - - # Test ellipsis - check("i...->...", H) - check("ki,...k->i...", A.t(), B) - check("k...,jk->...", A.t(), B) - check('...ik, ...j -> ...ij', C, x) - check('bik,k...j->i...j', C, torch.rand(5, 3, device=device, dtype=dtype)) - check('i...j, ij... -> ...ij', C, torch.rand(2, 5, 2, 3, device=device, dtype=dtype)) - - # torch.bilinear with discontiguous tensors - l = torch.randn(10, 5, device=device, dtype=dtype).transpose(0, 1) - r = torch.randn(20, 5, device=device, dtype=dtype).transpose(0, 1) - w = torch.randn(15, 10, 20, device=device, dtype=dtype) - check("bn,anm,bm->ba", l, w, r) - - # with strided tensors - check("bn,anm,bm->ba", l[:, ::2], w[:, ::2, ::2], r[:, ::2]) - - def test_einsum_corner_cases(self, device): - def check(equation, *operands, expected_output): - tensors = [torch.tensor(operand, dtype=torch.float32, device=device) if not isinstance(operand, tuple) - else torch.rand(operand, dtype=torch.float32, device=device) for operand in operands] - output = torch.einsum(equation, tensors) - self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) - - # Test equation variantions - check(' ', 1, expected_output=1) - check(' -> ', 1, expected_output=1) - check(' , ', 2, 2, expected_output=4) - check(' , , ', 2, 2, 2, expected_output=8) - check(' , -> ', 2, 2, expected_output=4) - check(' i ', [1], expected_output=[1]) - check(' i -> ', [1], expected_output=1) - check(' i -> i ', [1], expected_output=[1]) - check(' i , i ', [2], [2], expected_output=4) - check(' i , i -> i ', [2], [2], expected_output=[4]) - - # Test tensors with 0 size dimensions - check('i', [], expected_output=[]) - check(' i j -> j', [[], []], expected_output=[]) - check('ij->i', [[], []], expected_output=[0., 0.]) - check(' i j k , k -> i j ', (3, 0, 6), (6,), expected_output=[[], [], []]) - - # Test broadcasting - check('i,j', [2], [1, 2], expected_output=[[2, 4]]) - check('i,ij->ij', [1, 2], [[1, 2, 3], [2, 3, 4]], expected_output=[[1, 2, 3], [4, 6, 8]]) - - # Test ellipsis broadcasting - check('...', 1, expected_output=1) - check('...->', 1, expected_output=1) - check('...->...', 1, expected_output=1) - check('...', [1], expected_output=[1]) - check('...->', [1], expected_output=1) - check('i...->i', [1], expected_output=[1]) - check('i...->...i', [1], expected_output=[1]) - check('...a->', [[2], [4]], 
expected_output=6) - check('a...b->ab', [[[1], [2]], [[3], [4]]], expected_output=[[3], [7]]) - - def test_einsum_error_cases(self, device): - def check(equation, operands, regex, exception=RuntimeError): - with self.assertRaisesRegex(exception, r'einsum\(\) ' + regex): - torch.einsum(equation, operands) - - x = torch.rand(2) - y = torch.rand(2, 3) - - check('', [], r'must provide at least one operand') - check('. ..', [x], r'found \'.\' for operand 0 that is not part of any ellipsis') - check('... ...', [x], r'found \'.\' for operand 0 for which an ellipsis was already found') - check('A', [x], r'operand subscript must be in range \[a, z\] but found A for operand 0') - check(',', [x], r'fewer operands were provided than specified in the equation') - check('', [x, x], r'more operands were provided than specified in the equation') - check('', [x], r'the number of subscripts in the equation \(0\) does not match the number ' - r'of dimensions \(1\) for operand 0 and no ellipsis was given') - check('ai', [x], r'the number of subscripts in the equation \(2\) does not match the number ' - r'of dimensions \(1\) for operand 0 and no ellipsis was given') - check('ai...', [x], r'the number of subscripts in the equation \(2\) is more than the number ' - r'of dimensions \(1\) for operand 0') - check('a->... .', [x], r'found \'.\' for output but an ellipsis \(...\) was already found') - check('a->..', [x], r'found \'.\' for output that is not part of any ellipsis \(...\)') - check('a->A', [x], r'subscripts must be in range \[a, z\] but found A for the output') - check('a->aa', [x], r'output subscript a appears more than once in the output') - check('a->i', [x], r'output subscript i does not appear in the equation for any input operand') - check('aa', [y], r'subscript a is repeated for operand 0 but the sizes don\'t match, 3 != 2') - check('a, ba', [x, y], r'operands do not broadcast with remapped shapes \[original->remapped\]: ' - r'\[2\]->\[1, 2\] \[2, 3\]->\[2, 3\]') - def triangular_solve_test_helper(self, A_dims, b_dims, upper, unitriangular, device, dtype): triangle_function = torch.triu if upper else torch.tril @@ -3385,6 +3240,80 @@ def run_test(pivot): if self.device_type == 'cuda': run_test(False) + @onlyCPU + @slowTest + @dtypes(torch.double) + def test_einsum(self, device: torch.device, dtype: torch.dtype) -> None: + # test cases taken from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f + x = torch.randn(5, dtype=dtype, device=device) + y = torch.randn(7, dtype=dtype, device=device) + A = torch.randn(3, 5, dtype=dtype, device=device) + B = torch.randn(2, 5, dtype=dtype, device=device) + C = torch.randn(2, 3, 5, dtype=dtype, device=device) + D = torch.randn(2, 5, 7, dtype=dtype, device=device) + E = torch.randn(7, 9, dtype=dtype, device=device) + F = torch.randn(2, 3, 5, 7, dtype=dtype, device=device) + G = torch.randn(7, 11, 13, dtype=dtype, device=device) + H = torch.randn(4, 4, dtype=dtype, device=device) + I = torch.randn(3, 4, 4, dtype=dtype, device=device) + l = torch.randn(5, 10, dtype=dtype, device=device) + r = torch.randn(5, 20, dtype=dtype, device=device) + w = torch.randn(30, 10, 20, dtype=dtype, device=device) + test_list: List[Union[Tuple[str, torch.Tensor], + Tuple[str, torch.Tensor, torch.Tensor], + Tuple[str, torch.Tensor, torch.Tensor, torch.Tensor]]] = [ + # -- Vector + ("i->", x), # sum + ("i,i->", x, x), # dot + ("i,i->i", x, x), # vector element-wise mul + ("i,j->ij", x, y), # outer + # -- Matrix + ("ij->ji", A), # transpose + ("ij->j", A), # row sum + 
("ij->i", A), # col sum + ("ij,ij->ij", A, A), # matrix element-wise mul + ("ij,j->i", A, x), # matrix vector multiplication + ("ij,kj->ik", A, B), # matmul + ("ij,ab->ijab", A, E), # matrix outer product + # -- Tensor + ("aij,ajk->aik", C, D), # batch matmul + ("ijk,jk->i", C, A), # tensor matrix contraction + ("aij,jk->aik", D, E), # tensor matrix contraction + ("abcd,dfg->abcfg", F, G), # tensor tensor contraction + ("ijk,jk->ik", C, A), # tensor matrix contraction with double indices + ("ijk,jk->ij", C, A), # tensor matrix contraction with double indices + ("ijk,ik->j", C, B), # non contiguous + ("ijk,ik->jk", C, B), # non contiguous with double indices + # -- Diagonal + ("ii", H), # trace + ("ii->i", H), # diagonal + # -- Ellipsis + ("i...->...", H), + ("ki,...k->i...", A.t(), B), + ("k...,jk", A.t(), B), + ("...ii->...i", I), # batch diagonal + # -- Other + ("bn,anm,bm->ba", l, w, r), # as torch.bilinear + ("... ii->...i ", I), # batch diagonal with spaces + ] + for test in test_list: + actual = torch.einsum(test[0], test[1:]) + expected = np.einsum(test[0], *[t.numpy() for t in test[1:]]) + self.assertEqual(expected.shape, actual.shape, msg=test[0]) + self.assertEqual(expected, actual, msg=test[0]) + # test vararg + actual2 = torch.einsum(test[0], *test[1:]) + self.assertEqual(expected.shape, actual2.shape, msg=test[0]) + self.assertEqual(expected, actual2, msg=test[0]) + + def do_einsum(*args): + return torch.einsum(test[0], args) + # FIXME: following test cases fail gradcheck + if test[0] not in {"i,i->", "i,i->i", "ij,ij->ij"}: + gradcheck_inps = tuple(t.detach().requires_grad_() for t in test[1:]) + self.assertTrue(torch.autograd.gradcheck(do_einsum, gradcheck_inps)) + self.assertTrue(A._version == 0) # check that we do not use inplace ops + @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) diff --git a/torch/functional.py b/torch/functional.py index 72739018889c..62076a9dc29a 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -296,107 +296,76 @@ def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor - Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation - based on the Einstein summation convention. - - Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them - in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of - this format are described below, but the general idea is to label every dimension of the input :attr:`operands` - with some subscript and define which subscripts are part of the output. The output is then computed by summing - the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the - output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`. - Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why). - - Equation: - - The :attr:`equation` string specifies the subscripts (lower case letters `['a', 'z']`) for each dimension of - the input :attr:`operands` in the same order as the dimensions, separating subcripts for each operand by a - comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript - must be broadcastable, that is, their size must either match or be `1`. 
The exception is if a subscript is - repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand - must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that - appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order. - The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based - on the subscripts, and then summing out the dimensions whose subscripts are not part of the output. - - Optionally, the output subscripts can be explicitly defined by adding an arrow ('->') at the end of the equation - followed by the subscripts for the output. For instance, the following equation computes the transpose of a - matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and - at most once for the output. - - Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis. - Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts, - e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` cover the third and fourth - dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the - 'shape' of the ellipsis (the size of the dimensions covered by them) must broadcast together. If the output is not - explicitly defined with the arrow ('->') notation, the ellipsis will come first in the output (left-most dimensions), - before the subscript labels that appear exactly once for the input operands. e.g. the following equation implements - batch matrix multiplication `'...ij,...jk'`. - - A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis, - arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands. - - .. note:: - - ``torch.einsum`` handles ellipsis ('...') differently from NumPy in that it allows dimensions - covered by the ellipsis to be summed over, that is, ellipsis are not required to be part of the output. - - .. note:: - - This function does not optimize the given expression, so a different formula for the same computation may - run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) - can optimize the formula for you. - - Args: - equation (string): The subscripts for the Einstein summation. - operands (Tensor): The operands to compute the Einstein sum of. 
- - Examples:: - - # trace - >>> torch.einsum('ii', torch.randn(4, 4)) - tensor(-1.2104) - - # diagonal - >>> torch.einsum('ii->i', torch.randn(4, 4)) - tensor([-0.1034, 0.7952, -0.2433, 0.4545]) - - # outer product - >>> x = torch.randn(5) - >>> y = torch.randn(4) - >>> torch.einsum('i,j->ij', x, y) - tensor([[ 0.1156, -0.2897, -0.3918, 0.4963], - [-0.3744, 0.9381, 1.2685, -1.6070], - [ 0.7208, -1.8058, -2.4419, 3.0936], - [ 0.1713, -0.4291, -0.5802, 0.7350], - [ 0.5704, -1.4290, -1.9323, 2.4480]]) - - # batch matrix multiplication - >>> As = torch.randn(3,2,5) - >>> Bs = torch.randn(3,5,4) - >>> torch.einsum('bij,bjk->bik', As, Bs) - tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], - [-1.6706, -0.8097, -0.8025, -2.1183]], - - [[ 4.2239, 0.3107, -0.5756, -0.2354], - [-1.4558, -0.3460, 1.5087, -0.8530]], - - [[ 2.8153, 1.8787, -4.3839, -1.2112], - [ 0.3728, -2.1131, 0.0921, 0.8305]]]) - - # batch permute - >>> A = torch.randn(2, 3, 4, 5) - >>> torch.einsum('...ij->...ji', A).shape - torch.Size([2, 3, 5, 4]) - - # equivalent to torch.nn.functional.bilinear - >>> A = torch.randn(3,5,4) - >>> l = torch.randn(2,5) - >>> r = torch.randn(2,4) - >>> torch.einsum('bn,anm,bm->ba', l, A, r) - tensor([[-0.3430, -5.2405, 0.4494], - [ 0.3311, 5.5201, -3.0356]]) - """ +This function provides a way of computing multilinear expressions (i.e. sums of products) using the +Einstein summation convention. + +Args: + equation (string): The equation is given in terms of lower case letters (indices) to be associated + with each dimension of the operands and result. The left hand side lists the operands + dimensions, separated by commas. There should be one index letter per tensor dimension. + The right hand side follows after `->` and gives the indices for the output. + If the `->` and right hand side are omitted, it implicitly defined as the alphabetically + sorted list of all indices appearing exactly once in the left hand side. + The indices not apprearing in the output are summed over after multiplying the operands + entries. + If an index appears several times for the same operand, a diagonal is taken. + Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, + the ellipsis dimensions are at the beginning of the output. + operands (Tensor): The operands to compute the Einstein sum of. + +.. note:: + + This function does not optimize the given expression, so a different formula for the same computation may + run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) + can optimize the formula for you. 
+ +Examples:: + + >>> x = torch.randn(5) + >>> y = torch.randn(4) + >>> torch.einsum('i,j->ij', x, y) # outer product + tensor([[-0.0570, -0.0286, -0.0231, 0.0197], + [ 1.2616, 0.6335, 0.5113, -0.4351], + [ 1.4452, 0.7257, 0.5857, -0.4984], + [-0.4647, -0.2333, -0.1883, 0.1603], + [-1.1130, -0.5588, -0.4510, 0.3838]]) + + + >>> A = torch.randn(3,5,4) + >>> l = torch.randn(2,5) + >>> r = torch.randn(2,4) + >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear + tensor([[-0.3430, -5.2405, 0.4494], + [ 0.3311, 5.5201, -3.0356]]) + + + >>> As = torch.randn(3,2,5) + >>> Bs = torch.randn(3,5,4) + >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + >>> A = torch.randn(3, 3) + >>> torch.einsum('ii->i', A) # diagonal + tensor([-0.7825, 0.8291, -0.1936]) + + >>> A = torch.randn(4, 3, 3) + >>> torch.einsum('...ii->...i', A) # batch diagonal + tensor([[-1.0864, 0.7292, 0.0569], + [-0.9725, -1.0270, 0.6493], + [ 0.5832, -1.1716, -1.5084], + [ 0.4041, -1.1690, 0.8570]]) + + >>> A = torch.randn(2, 3, 4, 5) + >>> torch.einsum('...ij->...ji', A).shape # batch permute + torch.Size([2, 3, 5, 4]) +""" if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) From 88ebf6f894a61039d2ac0077438b0ad3637c7a71 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 7 Dec 2020 11:54:03 -0800 Subject: [PATCH 109/132] Revert D25304229: [pytorch][PR] Add type annotations to torch.onnx.* modules Test Plan: revert-hammer Differential Revision: D25304229 (https://github.com/pytorch/pytorch/commit/8bc6023d7a822ea6936b7460027f29558149008d) Original commit changeset: b01b21ddbf86 fbshipit-source-id: bc3308176e2c70423f29f694e9db94828213e7d6 --- mypy.ini | 24 ++++++++++++ torch/_C/__init__.pyi.in | 68 +-------------------------------- torch/_C/_onnx.pyi | 1 - torch/onnx/symbolic_helper.py | 23 +++++------ torch/onnx/symbolic_opset8.py | 2 +- torch/onnx/symbolic_opset9.py | 9 ++--- torch/onnx/symbolic_registry.py | 5 +-- torch/onnx/utils.py | 24 +++++------- 8 files changed, 51 insertions(+), 105 deletions(-) diff --git a/mypy.ini b/mypy.ini index 0b9f5497162c..f4b37f15a820 100644 --- a/mypy.ini +++ b/mypy.ini @@ -143,6 +143,30 @@ ignore_errors = True [mypy-torch.nn.intrinsic.qat.modules.conv_fused] ignore_errors = True +[mypy-torch.onnx.operators] +ignore_errors = True + +[mypy-torch.onnx.symbolic_opset8] +ignore_errors = True + +[mypy-torch.onnx.symbolic_opset9] +ignore_errors = True + +[mypy-torch.onnx.symbolic_opset11] +ignore_errors = True + +[mypy-torch.onnx.symbolic_caffe2] +ignore_errors = True + +[mypy-torch.onnx.symbolic_helper] +ignore_errors = True + +[mypy-torch.onnx.symbolic_registry] +ignore_errors = True + +[mypy-torch.onnx.utils] +ignore_errors = True + [mypy-torch.multiprocessing.pool] ignore_errors = True diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 1452718ed793..cbb5b2452e21 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -165,7 +165,7 @@ def wait(fut: Future) -> Any: ... def _collect_all(futures: List[Future]) -> Future: ... def unify_type_list(types: List[JitType]) -> JitType: ... 
-def _freeze_module(module: ScriptModule, preserved_attrs: List[str] = [], freeze_interfaces: _bool = True) -> ScriptModule: ... +def _freeze_module(module: ScriptModule, preserved_attrs: List[str], freeze_interfaces: _bool = True) -> ScriptModule: ... def _is_tracing() -> _bool: ... def _jit_init() -> _bool: ... def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... @@ -217,8 +217,6 @@ def _jit_get_trigger_value(trigger_name: str) -> _int: ... # Defined in torch/csrc/jit/python/script_init.cpp ResolutionCallback = Callable[[str], Callable[..., Any]] -# Defined in torch/csrc/jit/python/script_init.cpp -# and torch/csrc/jit/python/init.cpp def _create_function_from_graph(qualname: str, graph: Graph) -> Graph: ... def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... @@ -248,54 +246,6 @@ def _resolve_type_from_object(obj: Any, range: SourceRange, rcb: ResolutionCallb def _create_module_with_type(ty: JitType) -> ScriptModule: ... def _run_emit_module_hook(m: ScriptModule): ... def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def, new_name: str) -> Def: ... - -def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... -def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... -def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, opset_version: _int) -> None: ... -def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], onnx_shape_inference: _bool = False) -> None: ... -def _jit_pass_fixup_onnx_loop_node_inputs(n: Node) -> None: ... -def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph) -> None: ... -def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... -def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... -def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... -def _jit_pass_fuse_addmm(graph: Graph) -> None: ... -def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... -def _jit_pass_onnx_prepare_inplace_ops_for_onnx(graph: Graph) -> None: ... -def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... -def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... -def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... -def _jit_pass_onnx_unpack_quantized_weights( - graph: Graph, - paramsDict: Dict[str, IValue] -) -> Dict[str, IValue]: ... -def _jit_pass_onnx_quantization_insert_permutes( - graph: Graph, - paramsDict: Dict[str, IValue] -) -> Dict[str, IValue]: ... -def _jit_pass_custom_pattern_based_rewrite_graph(pattern: str, fused_node_name: str, graph: Graph) -> None: ... -def _jit_pass_erase_number_types(graph: Graph) -> None: ... -def _jit_pass_onnx(graph: Graph, _jit_pass_onnx: _onnx.OperatorExportTypes) -> Graph: ... -def _jit_pass_onnx_scalar_type_analysis(graph: Graph) -> None: ... -def _jit_pass_onnx_peephole(graph: Graph, opset_version: _int, fixed_batch_size: _bool) -> None: ... -def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... -def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... -def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... -def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... -def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... 
-def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... -def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... -def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... -def _jit_pass_filter_non_tensor_arguments(params: Dict[str, IValue]) -> Dict[str, Tensor]: ... -def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... -def _jit_pass_onnx_node_shape_type_inference(n: Node, opset_version: _int) -> None: ... -def _jit_pass_onnx_block( - old_block: Block, - new_block: Block, - operator_export_type: _onnx.OperatorExportTypes, - env: Dict[Value, Value] -) -> None: ... -def _jit_pass_fixup_onnx_controlflow_node(n: Node, opset_version: _int) -> Node: ... - def _jit_script_interface_compile(name: str, class_def: ClassDef, rcb: ResolutionCallback, is_module: _bool): ... def _jit_script_compile_overload( qualname: str, @@ -331,18 +281,8 @@ def import_ir_module_from_buffer( extra_files: Dict[str, Any] ) -> ScriptModule: ... -def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... -def _check_onnx_proto(proto: str) -> None: ... -def _propagate_and_assign_input_shapes( - graph: Graph, - inputs: Tuple[Tensor, ...], - with_grad: _bool, - propagate: _bool -) -> Graph: ... - # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: - def eraseInput(self, i: _int) -> None: ... ... # Defined in torch/csrc/jit/ir/ir.h @@ -426,8 +366,8 @@ class ScriptFunction: def qualified_name(self) -> str: ... class ScriptMethod: - graph: Graph ... + class ModuleDict: def __init__(self, mod: ScriptModule) -> None: ... def items(self) -> List[Tuple[str, Any]]: ... @@ -438,10 +378,6 @@ class ParameterDict: class BufferDict: def __init__(self, mod: ScriptModule) -> None: ... -# Defined in torch/csrc/jit/api/module.h -class Module: - ... - # Defined in torch/csrc/Module.cpp def _initExtension(shm_manager_path: str) -> None: ... # THPModule_initExtension def _autograd_init() -> _bool: ... # THPAutograd_initExtension diff --git a/torch/_C/_onnx.pyi b/torch/_C/_onnx.pyi index 7ab3cd9c567d..51f16566ce6c 100644 --- a/torch/_C/_onnx.pyi +++ b/torch/_C/_onnx.pyi @@ -29,7 +29,6 @@ class OperatorExportTypes(Enum): ONNX_ATEN = ... ONNX_ATEN_FALLBACK = ... RAW = ... - ONNX_FALLTHROUGH = ... class TrainingMode(Enum): EVAL = ... 
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 8fd8ce3ea760..5e9430f995f8 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -2,7 +2,6 @@ import torch import warnings from sys import maxsize as maxsize -from typing import Set import torch.onnx # This import monkey-patches graph manipulation methods on Graph, used for the @@ -126,7 +125,7 @@ def decorator(fn): def wrapper(g, *args, **kwargs): # some args may be optional, so the length may be smaller assert len(arg_descriptors) >= len(args) - args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] # type: ignore + args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] # only support _outputs in kwargs assert len(kwargs) <= 1 if len(kwargs) == 1: @@ -233,11 +232,11 @@ def _select_helper(g, self, dim, index, apply_reshape=True): def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if _export_onnx_opset_version <= 9: - from torch.onnx.symbolic_opset9 import _slice as _slice9 - return _slice9(g, input, axes, starts, ends) + from torch.onnx.symbolic_opset9 import _slice + return _slice(g, input, axes, starts, ends) else: - from torch.onnx.symbolic_opset10 import _slice as _slice10 - return _slice10(g, input, axes, starts, ends, steps, dynamic_slice) + from torch.onnx.symbolic_opset10 import _slice + return _slice(g, input, axes, starts, ends, steps, dynamic_slice) def _hardtanh_helper(g, input, min_val, max_val): if _export_onnx_opset_version <= 10: @@ -381,7 +380,7 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ size = g.op("Concat", *size, axis_i=0) scale_factor = _interpolate_size_to_scales(g, input, size, dim) else: - return _unimplemented("interpolate", "Both size and scales are None in __interpolate") + return _unimplemented("Both size and scales are None in __interpolate") return scale_factor, mode @@ -389,7 +388,7 @@ def _unbind_helper(g, self, dim, _outputs): if _export_onnx_opset_version <= 9: from torch.onnx.symbolic_opset9 import unbind else: - from torch.onnx.symbolic_opset11 import unbind # type: ignore[no-redef] + from torch.onnx.symbolic_opset11 import unbind return unbind(g, self, dim, _outputs) @@ -397,8 +396,7 @@ def _scatter_helper(g, self, dim, index, src): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - # for mypy, scatter was imported two lines above - from torch.onnx.symbolic_opset11 import scatter # type: ignore + from torch.onnx.symbolic_opset11 import scatter return scatter(g, self, dim, index, src) @@ -446,8 +444,7 @@ def _index_fill_reshape_helper(g, self, dim, index): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - # for mypy, scatter was imported two lines above - from torch.onnx.symbolic_opset11 import scatter # type: ignore + from torch.onnx.symbolic_opset11 import scatter if self.type().dim() is None: return _unimplemented("index_fill", "input rank not accesible") @@ -635,4 +632,4 @@ def _cast_func_template(to_i, g, input, non_blocking): # Global set to store the list of quantized operators in the network. # This is currently only used in the conversion of quantized ops from PT -> C2 via ONNX. 
-_quantized_ops: Set[int] = set() +_quantized_ops = set() diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py index e4023dab2320..c0c1d48ebec0 100644 --- a/torch/onnx/symbolic_opset8.py +++ b/torch/onnx/symbolic_opset8.py @@ -4,7 +4,7 @@ import torch.onnx.symbolic_opset9 as sym_opset9 from torch.onnx.symbolic_helper import parse_args, _unimplemented, _block_list_in_opset, _try_get_scalar_type -from torch.onnx.symbolic_opset9 import _cast_Float # type: ignore +from torch.onnx.symbolic_opset9 import _cast_Float import warnings diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 8630f48a62ad..e395ce5c703f 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -13,8 +13,6 @@ import torch.onnx.symbolic_helper as sym_help from torch.onnx.symbolic_helper import parse_args, _parse_arg, _unimplemented -from typing import Optional - import numpy import math import warnings @@ -313,7 +311,7 @@ def _maybe_cast_reduce_op_input(g, self): if dtype is not None: # pytorch reduce-ops cast all other integral types to int64 if not sym_help._is_fp(self) and not (dtype == 'Long'): - self = _cast_Long(g, self, False) # type: ignore + self = _cast_Long(g, self, False) return self @@ -2094,7 +2092,7 @@ def _pack_padded_sequence(g, input, lengths, batch_first): # It's really only necessary because those operators expand to something that # only works with int32 types in Caffe2... if lengths.type().scalarType() != 'Int': - lengths = _cast_Int(g, lengths, False) # type: ignore + lengths = _cast_Int(g, lengths, False) return g.op("prim::PackPadded", input, lengths, outputs=2) @@ -2438,7 +2436,7 @@ def _get_arange_dtype(dtype): def masked_fill(g, self, mask, value): - mask = _cast_Bool(g, mask, False) # type: ignore + mask = _cast_Bool(g, mask, False) value = sym_help._maybe_get_scalar(value) return g.op('Where', mask, sym_help._if_scalar_type_as(g, value, self), self) @@ -2736,7 +2734,6 @@ def as_strided(g, self, sizes, strides, offset=None): sizes = sym_help._maybe_get_const(sizes, 'is') rank = len(strides) self_1d = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) - ind: Optional[torch.Tensor] if not sym_help._is_value(sizes): ind = torch.tensor([0], dtype=torch.long) for i, (size, stride) in enumerate(zip(sizes, strides)): diff --git a/torch/onnx/symbolic_registry.py b/torch/onnx/symbolic_registry.py index c059e8f2eb31..48114d6c472b 100644 --- a/torch/onnx/symbolic_registry.py +++ b/torch/onnx/symbolic_registry.py @@ -1,7 +1,6 @@ import warnings import importlib from inspect import getmembers, isfunction -from typing import Dict, Tuple, Any, Union # The symbolic registry "_registry" is a dictionary that maps operators # (for a specific domain and opset version) to their symbolic functions. @@ -9,9 +8,9 @@ # The keys are tuples (domain, version), (where domain is a string, and version is an int), # and the operator's name (string). 
# The map's entries are as follows : _registry[(domain, version)][op_name] = op_symbolic -_registry: Dict[Tuple[str, int], Dict] = {} +_registry = {} -_symbolic_versions: Dict[Union[int, str], Any] = {} +_symbolic_versions = {} from torch.onnx.symbolic_helper import _onnx_stable_opsets for opset_version in _onnx_stable_opsets: module = importlib.import_module('torch.onnx.symbolic_opset{}'.format(opset_version)) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 3fe19a56c124..5c41306b9ee2 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -18,7 +18,6 @@ from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes, TrainingMode from torch._C import ListType, OptionalType, _propagate_and_assign_input_shapes, _check_onnx_proto -from typing import Union, Tuple, List # the flag to tell the user whether it's in the middle of ONNX export or not @@ -77,7 +76,7 @@ def export(model, args, f, export_params=True, verbose=False, training=None, if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK @@ -352,7 +351,6 @@ def _trace_and_get_graph_from_model(model, args): def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): torch_out = None - params: Union[List, Tuple] if isinstance(model, torch.jit.ScriptModule): try: graph = model.forward.graph @@ -444,7 +442,7 @@ def _model_to_graph(model, args, verbose=False, param_names = input_and_param_names[len(input_and_param_names) - len(params):] params_dict = dict(zip(param_names, params)) - if training is None or training == TrainingMode.EVAL: + if training is None or training == TrainingMode.EVAL or (training == TrainingMode.PRESERVE and not is_originally_training): params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: @@ -478,7 +476,7 @@ def export_to_pretty_string(model, args, f, export_params=True, verbose=False, t if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: operator_export_type = OperatorExportTypes.ONNX return _export_to_pretty_string(model, args, f, export_params, verbose, training, @@ -1053,10 +1051,6 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): dims = [1] isscalar = True type = type.lower() - tensor: Union[torch.CharTensor, torch.ShortTensor, - torch.IntTensor, torch.LongTensor, - torch.HalfTensor, torch.FloatTensor, - torch.DoubleTensor] if type == "char": tensor = torch.CharTensor(*dims) elif type == "short": @@ -1074,7 +1068,7 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): else: raise ValueError("Unknown type, type should be one of the following strings: " "char, short, int, long, half, float, double") - tensor.fill_(value) # type: ignore + tensor.fill_(value) if isscalar: return g.op("Constant", *args, value_z=tensor, **kwargs) return g.op("Constant", *args, value_t=tensor, **kwargs) @@ 
-1147,8 +1141,8 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): dynamic_axes[key] = value_dict -torch._C.Graph.op = _graph_op # type: ignore -torch._C.Graph.at = _graph_at # type: ignore -torch._C.Block.op = _block_op # type: ignore -torch._C.Graph.constant = _graph_constant # type: ignore -torch._C.Node.__getitem__ = _node_getitem # type: ignore +torch._C.Graph.op = _graph_op +torch._C.Graph.at = _graph_at +torch._C.Block.op = _block_op +torch._C.Graph.constant = _graph_constant +torch._C.Node.__getitem__ = _node_getitem From d6b5f3ad98a883941be88029a36956e7b879a605 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Mon, 7 Dec 2020 14:28:15 -0800 Subject: [PATCH 110/132] Add object-based collective APIs to public docs (#48909) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48909 Adds these new APIs to the documentation ghstack-source-id: 117965961 Test Plan: CI Reviewed By: mrshenli Differential Revision: D25363279 fbshipit-source-id: af6889d377f7b5f50a1a77a36ab2f700e5040150 --- docs/source/distributed.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index f5bce396054b..b35a34fc0265 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -384,16 +384,24 @@ Collective functions .. autofunction:: broadcast +.. autofunction:: broadcast_object_list + .. autofunction:: all_reduce .. autofunction:: reduce .. autofunction:: all_gather +.. autofunction:: all_gather_object + .. autofunction:: gather +.. autofunction:: gather_object + .. autofunction:: scatter +.. autofunction:: scatter_object_list + .. autofunction:: reduce_scatter .. autofunction:: all_to_all From b77ca9e829a7f919c06446ee8de1ca6dd540a134 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Mon, 7 Dec 2020 14:28:15 -0800 Subject: [PATCH 111/132] [Docs] Add examples for new object-based c10d APIs (#43932) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43932 Adds some basic examples to the documentation for each of the newly added object-based collectibves. ghstack-source-id: 117965966 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23441838 fbshipit-source-id: 91344612952cfcaa71f08ccf2a2c9ed162ca9c89 --- torch/distributed/distributed_c10d.py | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 13a950024af9..1081c6ee0e44 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1393,6 +1393,16 @@ def all_gather_object(object_list, obj, group=group.WORLD): known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> # Assumes world_size of 3. + >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object + >>> output = [None for _ in gather_objects] + >>> dist.all_gather_object(output, gather_objects[dist.get_rank()]) + >>> output + ['foo', 12, {1: 2}] """ if _rank_not_in_group(group): return @@ -1467,6 +1477,21 @@ def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. 
+ + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> # Assumes world_size of 3. + >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object + >>> output = [None for _ in gather_objects] + >>> dist.gather_object( + gather_objects[dist.get_rank()], + output if dist.get_rank() == 0 else None, + dst=0 + ) + >>> # On rank 0 + >>> output + ['foo', 12, {1: 2}] """ if _rank_not_in_group(group): return @@ -1556,6 +1581,18 @@ def broadcast_object_list(object_list, src, group=group.WORLD): is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> if dist.get_rank() == 0: + >>> # Assumes world_size of 3. + >>> objects = ["foo", 12, {1: 2}] # any picklable object + >>> else: + >>> objects = [None, None, None] + >>> dist.broadcast_object_list(objects, src=0) + >>> broadcast_objects + ['foo', 12, {1: 2}] """ if _rank_not_in_group(group): return @@ -1634,6 +1671,21 @@ def scatter_object_list( is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> if dist.get_rank() == 0: + >>> # Assumes world_size of 3. + >>> objects = ["foo", 12, {1: 2}] # any picklable object + >>> else: + >>> # Can be any list on non-src ranks, elements are not used. + >>> objects = [None, None, None] + >>> output_list = [None] + >>> dist.scatter_object_list(output_list, objects, src=0) + >>> # Rank i gets objects[i]. For example, on rank 2: + >>> output_list + [{1: 2}] """ if _rank_not_in_group(group): return From f67259fe897bda05a69db54a6b184c5f20bb1368 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 7 Dec 2020 15:24:33 -0800 Subject: [PATCH 112/132] Fix CI by removing gen_pyi from mypy-stirct.ini (#48961) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48961 Reviewed By: janeyx99 Differential Revision: D25383152 Pulled By: malfet fbshipit-source-id: ce0226398522342256d0d701edc13955d1095a0d --- mypy-strict.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/mypy-strict.ini b/mypy-strict.ini index ddd369ebe621..42fc73abf1cc 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -35,7 +35,6 @@ files = tools/codegen/gen.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, tools/autograd/load_derivatives.py, - tools/pyi/gen_pyi.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, torch/utils/benchmark/utils/valgrind_wrapper/*.py, From 7629612f9f5a2ad63e67a723a82273b318cf28a7 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 7 Dec 2020 16:10:18 -0800 Subject: [PATCH 113/132] Update torch.randint documentation to include missing note (#48787) Summary: Fixes https://github.com/pytorch/pytorch/issues/46497 Includes note about returning dtype torch.int64. 
Current documentation: https://pytorch.org/docs/stable/generated/torch.randint.html?highlight=randint#torch.randint New documentation: ![image](https://user-images.githubusercontent.com/14858254/101196939-48977d00-3616-11eb-90a5-a7b706e8505f.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/48787 Test Plan: Built documentation and checked generated docs Reviewed By: ailzhang Differential Revision: D25339421 Pulled By: H-Huang fbshipit-source-id: c2ecaacaeb57971fe7fba0d9d54f3c61b0fd04ce --- torch/_torch_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 56aec4668b0d..51200dc6b406 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -6786,7 +6786,7 @@ def merge_dicts(*dicts): The shape of the tensor is defined by the variable argument :attr:`size`. -.. note: +.. note:: With the global dtype default (``torch.float32``), this function returns a tensor with dtype ``torch.int64``. From e3893b867fd39cf4f10a129ba9f689eebf10f82b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 7 Dec 2020 16:13:41 -0800 Subject: [PATCH 114/132] Reenable some BF16 tests on CUDA (#48805) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48805 Reviewed By: agolynski Differential Revision: D25375885 Pulled By: ailzhang fbshipit-source-id: 2e19fe725ae9450bd1a2bc4e2d308c59b9f94fac --- test/test_tensor_creation_ops.py | 3 +-- test/test_torch.py | 44 +++++++++++++++++++------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index b355005b1c69..9be3e6db5bf0 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -14,7 +14,7 @@ IS_WINDOWS) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, - onlyCPU, skipCUDAIfNotRocm, largeTensorTest, precisionOverride, dtypes, + onlyCPU, largeTensorTest, precisionOverride, dtypes, onlyCUDA, skipCPUIf, dtypesIfCUDA, dtypesIfCPU) # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests @@ -2581,7 +2581,6 @@ def test_arange_device_vs_cpu(self, device, dtype): self.assertEqual(cpu_tensor, device_tensor) @onlyCUDA - @skipCUDAIfNotRocm def test_arange_bfloat16(self, device): ref_tensor = torch.tensor([0, 1, 2, 3], dtype=torch.bfloat16, device=device) bfloat16_tensor = torch.arange(0, 4, dtype=torch.bfloat16, device=device) diff --git a/test/test_torch.py b/test/test_torch.py index 2d181c3b9400..ad88128617c9 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6316,10 +6316,6 @@ def test_copy_broadcast(self, device) -> None: torch.uint8 ] -# _types2 adds bfloat16 type to _types only on ROCm. Should eventually be unified -# with _types when bfloat16 bringup is complete on all platforms. 
-_types2 = _types + [torch.bfloat16] if TEST_WITH_ROCM else _types - _float_types = [torch.half, torch.float, torch.double] _complex_types = [torch.cfloat, torch.cdouble] @@ -6601,10 +6597,14 @@ def inner(self, device, dtype): ('dot', '', _medium_1d, lambda t, d: [_medium_1d(t, d)], 1e-2, 1e-5, 1e-5, _float_types + _complex_types, _cpu_types, False), ('element_size', '', _medium_1d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), - ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('equal', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('equal', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -6618,10 +6618,14 @@ def inner(self, device, dtype): ('lcm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 0, 0, 0, [torch.int16, torch.int32, torch.int64], [torch.int16, torch.int32, torch.int64], True, [onlyOnCPUAndCUDA]), - ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('is_contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), # TODO: can't check negative case - cross-device copy is contiguous ('is_same_size', 'negative', _medium_2d, lambda t, d: [_small_3d(t, d)], @@ -6705,12 +6709,16 @@ def inner(self, device, dtype): torch.LongTensor([[1], [2]]).to(dtype=_convert_t(t, d), device=d), True], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), - lambda t, d: [], 1e-2, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'neg_dim', _small_3d, 
lambda t, d: [-1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False),
-    ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False),
-    ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False),
+    ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), lambda t, d: [], 1e-2, 1e-1, 1e-5,
+        torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False),
+    ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5,
+        torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False),
+    ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5,
+        torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False),
+    ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5,
+        torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False),
+    ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5,
+        torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False),
     ('sum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, 1e-5, _types, _cpu_types, False),
     ('sum', 'complex', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False),
     ('sum', 'complex_dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False),

From adbb74ded9a2a15bff7c9cbca93cbbf930341354 Mon Sep 17 00:00:00 2001
From: Zachary DeVito
Date: Mon, 7 Dec 2020 17:10:12 -0800
Subject: [PATCH 115/132] [package] pre-emptively install submodules (#48799)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48799

Python's IMPORT_FROM bytecode will bypass the import infrastructure when a package being loaded as part of a circular dependency is being accessed from its parent module _before_ that package has finished loading and is installed on the parent module.

Since we cannot override the lookup on sys.modules, this PR pre-emptively does the module assignment before running the submodule's initialization code.

Note: this appears to work, but it is not clear to me why python doesn't do this by default. It is possible that the logic for creating modules is flexible enough in generic python that this interception between creating the module and running its code is not always possible.
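To make the failure mode concrete, here is an editorial sketch (not part of the patch) of roughly what the IMPORT_FROM bytecode does for `from pkg import sub` once the `pkg` module object exists. The helper name `import_from` is made up for illustration; CPython can also fall back to `sys.modules`, which is exactly the lookup the summary above says cannot be overridden by an importer that keeps modules in its own table.

```python
# Rough, simplified model of the IMPORT_FROM lookup for "from pkg import sub".
# Illustration only -- this is not code from this PR.
def import_from(parent_module, name):
    try:
        # Only succeeds if "name" has already been set as an attribute on the
        # parent module, which is why the submodule is now installed on its
        # parent before its source is executed.
        return getattr(parent_module, name)
    except AttributeError:
        raise ImportError(
            f"cannot import name {name!r} from {parent_module.__name__!r}"
        ) from None
```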
Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D25312467 Pulled By: zdevito fbshipit-source-id: 6fe3132af29364ccb2b3cabdd2b847d0a09eb515 --- torch/package/importer.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/torch/package/importer.py b/torch/package/importer.py index 455119d18a1e..ffd474733021 100644 --- a/torch/package/importer.py +++ b/torch/package/importer.py @@ -140,7 +140,7 @@ def load_pickle(self, package: str, resource: str, map_location=None) -> Any: def _read_extern(self): return self.zip_reader.get_record('extern_modules').decode('utf-8').splitlines(keepends=False) - def _make_module(self, name: str, filename: Optional[str], is_package: bool): + def _make_module(self, name: str, filename: Optional[str], is_package: bool, parent: str): spec = importlib.machinery.ModuleSpec(name, self, is_package=is_package) # type: ignore module = importlib.util.module_from_spec(spec) self.modules[name] = module @@ -150,12 +150,18 @@ def _make_module(self, name: str, filename: Optional[str], is_package: bool): ns['__file__'] = filename ns['__cached__'] = None ns['__builtins__'] = self.patched_builtins + + # pre-emptively install on the parent to prevent IMPORT_FROM from trying to + # access sys.modules + self._install_on_parent(parent, name, module) + if filename is not None: code = self._compile_source(filename) exec(code, ns) + return module - def _load_module(self, name: str): + def _load_module(self, name: str, parent: str): cur : _PathNode = self.root for atom in name.split('.'): if not isinstance(cur, _PackageNode) or atom not in cur.children: @@ -166,7 +172,7 @@ def _load_module(self, name: str): if isinstance(cur, _ExternNode): module = self.modules[name] = importlib.import_module(name) return module - return self._make_module(name, cur.source_file, isinstance(cur, _PackageNode)) # type: ignore + return self._make_module(name, cur.source_file, isinstance(cur, _PackageNode), parent) # type: ignore def _compile_source(self, fullpath): source = self.zip_reader.get_record(fullpath) @@ -179,6 +185,14 @@ def get_source(self, module_name) -> str: module = self.import_module(module_name) return self.zip_reader.get_record(module.__file__).decode('utf-8') + def _install_on_parent(self, parent: str, name: str, module: types.ModuleType): + if not parent: + return + # Set the module as an attribute on its parent. + parent_module = self.modules[parent] + if parent_module.__loader__ is self: # type: ignore + setattr(parent_module, name.rpartition('.')[2], module) + # note: copied from cpython's import code, with call to create module replaced with _make_module def _do_find_and_load(self, name): path = None @@ -196,13 +210,10 @@ def _do_find_and_load(self, name): msg = (_ERR_MSG + '; {!r} is not a package').format(name, parent) raise ModuleNotFoundError(msg, name=name) from None - module = self._load_module(name) + module = self._load_module(name, parent) + + self._install_on_parent(parent, name, module) - if parent: - # Set the module as an attribute on its parent. 
- parent_module = self.modules[parent] - if parent_module.__loader__ is self: # type: ignore - setattr(parent_module, name.rpartition('.')[2], module) return module # note: copied from cpython's import code From 533c837833dd5cec712e1c32dff5c389ed9465cf Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 7 Dec 2020 17:16:41 -0800 Subject: [PATCH 116/132] Register OpInfos for torch.fft transforms (#48427) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48427 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25266218 Pulled By: mruberry fbshipit-source-id: 406e7ed5956bc7445daf8c027c9b4d2c8ff88fa1 --- test/test_jit.py | 2 +- test/test_ops.py | 56 +++-- torch/testing/_internal/common_device_type.py | 2 +- .../_internal/common_methods_invocations.py | 191 ++++++++++++++++-- .../_internal/jit_metaprogramming_utils.py | 15 +- 5 files changed, 216 insertions(+), 50 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index c85fcbd19747..65b9c110f64f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -15733,7 +15733,7 @@ def fn(*inputs, **kwargs): # alias annotation testing if not is_magic_method and test_name not in EXCLUDE_SCRIPT and not exclude_tensor_method(name, test_name): - check_alias_annotation(name, (self_variable,) + args_variable, kwargs_variable) + check_alias_annotation(name, (self_variable,) + args_variable, kwargs_variable, aten_name=name) check(name) inplace_name = name + '_' diff --git a/test/test_ops.py b/test/test_ops.py index 1be90f2555f8..090232360309 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -231,69 +231,63 @@ def test_variant_consistency_jit(self, device, dtype, op): for sample in samples: # Acquires variants to test + func = op.get_op() method = op.get_method() inplace = op.get_inplace() - variants = (v for v in (method, inplace) if v is not None) - - # Adds function variant to variant list - # TODO: inplace tests currently fail - # variants = (v for v in (op, method, inplace) if v is not None) - variants = (v for v in (op, method) if v is not None) + variants = { + 'function': func, 'method': method, + # TODO: inplace tests currently fail + # 'inplace': inplace, + } # Test traced and scripted consistency - for variant in variants: + for func_type, variant in variants.items(): + if variant is None: + continue + # Create accessor for script function variant - if variant is op: - name = op.name - func_type = 'function' - elif variant is method: - name = op.name - func_type = 'method' - else: # variant is inplace - assert variant is inplace - name = op.name + "_" - func_type = 'inplace' + name = op.name + '_' if func_type == 'inplace' else op.name # run with disable_autodiff_subgraph_inlining(True) to test # autodiff support. 
Context manager forces the graph to contain # DifferentiableGraph nodes if they are present with disable_autodiff_subgraph_inlining(): def fn(*inputs, **kwargs): - attr = getattr(inputs[0], name) - output = attr(*inputs[1:], **kwargs) + output = func(*inputs, **kwargs) return op.output_func(output) # bfloat16 grad doesn't work for some operators dtypes_to_grad_check = floating_and_complex_types_and(torch.half) \ - if op.skip_bfloat16_grad else floating_and_complex_types_and(torch.half, torch.bfloat16) + if op.skip_bfloat16_grad else floating_and_complex_types_and(torch.half, torch.bfloat16) # Check scripted forward, grad, and grad grad script_fn = create_script_fn(self, name, func_type, op.output_func) - check_against_reference(self, + check_against_reference(self, script_fn, - fn, - (*sample.input,) + sample.args, - sample.kwargs, + fn, + (*sample.input,) + sample.args, + sample.kwargs, no_grad=(dtype not in dtypes_to_grad_check)) # Check traced forward, grad, and grad grad traced_fn = create_traced_fn(self, variant) - check_against_reference(self, + check_against_reference(self, traced_fn, - fn, - (*sample.input,) + sample.args, - sample.kwargs, + fn, + (*sample.input,) + sample.args, + sample.kwargs, no_grad=(dtype not in dtypes_to_grad_check)) # Check alias annotation schema for correctness (make # sure inputs that aren't supposed to be modified aren't) - # Note: only runs in float32 and int64 because schema isn't affected by dtype, + # Note: only runs in float32 and int64 because schema isn't affected by dtype, # so running it on all dtypes is would be excessive if dtype in [torch.float32, torch.int32]: - check_alias_annotation(name, (*sample.input,) + sample.args, sample.kwargs) + check_alias_annotation(name, (*sample.input,) + sample.args, sample.kwargs, + func_type=func_type, aten_name=op.aten_name) - # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample + # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample if dtype is torch.float32: # Sandcastle doesn't fuse nodes if IS_SANDCASTLE: diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 0126b1dd0a93..36f02eff0c0f 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -171,7 +171,7 @@ def _construct_test_name(test_name, op, device_type, dtype): if op is not None: - test_name += "_" + op.name + test_name += "_" + op.name.replace('.', '_') test_name += "_" + device_type diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 96d1cd03557e..26be9c9fde3a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -10,11 +10,11 @@ from typing import List, Tuple, Dict, Any from torch.testing import \ - (make_non_contiguous, _dispatch_dtypes, - floating_types, floating_types_and, floating_and_complex_types, - floating_and_complex_types_and, all_types_and_complex_and, all_types_and) + (make_non_contiguous, _dispatch_dtypes, floating_types, floating_types_and, + floating_and_complex_types, floating_and_complex_types_and, + all_types_and_complex_and, all_types_and) from torch.testing._internal.common_device_type import \ - (skipCUDAIfNoMagma, skipCPUIfNoLapack, + (skipCUDAIfNoMagma, skipCPUIfNoLapack, skipCPUIfNoMkl, skipCUDAIfRocm, expectedAlertNondeterministic, precisionOverride) from 
torch.testing._internal.common_utils import \ (prod_single_zero, random_square_matrix_of_rank, @@ -22,7 +22,7 @@ random_symmetric_pd_matrix, make_nonzero_det, random_fullrank_matrix_distinct_singular_value, set_rng_seed, TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, make_tensor, TEST_SCIPY, - torch_to_numpy_dtype_dict) + torch_to_numpy_dtype_dict, TEST_WITH_SLOW) if TEST_SCIPY: import scipy.special @@ -54,6 +54,23 @@ def __init__(self, input, *, args=tuple(), kwargs=None): self.kwargs = kwargs if kwargs is not None else {} +_NOTHING = object() # Unique value to distinguish default from anything else + + +# Extension of getattr to support qualified names +# e.g. _getattr_qual(torch, 'linalg.norm') -> torch.linalg.norm +def _getattr_qual(obj, name, default=_NOTHING): + try: + for path in name.split('.'): + obj = getattr(obj, path) + return obj + except AttributeError: + if default is not _NOTHING: + return default + else: + raise + + # Classes and methods for the operator database class OpInfo(object): """Operator information and helper functions for acquiring it.""" @@ -84,13 +101,16 @@ def __init__(self, skips=tuple(), # information about which tests to skip decorators=None, # decorators to apply to generated tests promotes_integers_to_float=False, # whether op promotes unary output to float or not - sample_inputs_func=None): # function to generate sample inputs + sample_inputs_func=None, # function to generate sample inputs + aten_name=None, # name of the corresponding aten:: operator + ): # Validates the dtypes are generated from the dispatch-related functions for dtype_list in (dtypes, dtypesIfCPU, dtypesIfCUDA, dtypesIfROCM): assert isinstance(dtype_list, (_dispatch_dtypes, type(None))) self.name = name + self.aten_name = aten_name if aten_name is not None else name self.dtypes = set(dtypes) self.dtypesIfCPU = set(dtypesIfCPU) if dtypesIfCPU is not None else self.dtypes @@ -99,12 +119,10 @@ def __init__(self, self._default_test_dtypes = set(default_test_dtypes) if default_test_dtypes is not None else None # NOTE: if the op is unspecified it is assumed to be under the torch namespace - if op is None: - assert hasattr(torch, self.name), f"Can't find torch.{self.name}" - self.op = op if op else getattr(torch, self.name) - self.method_variant = getattr(torch.Tensor, name) if hasattr(torch.Tensor, name) else None + self.op = op if op else _getattr_qual(torch, self.name) + self.method_variant = getattr(torch.Tensor, name, None) inplace_name = name + "_" - self.inplace_variant = getattr(torch.Tensor, inplace_name) if hasattr(torch.Tensor, name) else None + self.inplace_variant = getattr(torch.Tensor, inplace_name, None) self.skip_bfloat16_grad = skip_bfloat16_grad self.test_inplace_grad = test_inplace_grad @@ -289,8 +307,71 @@ def wrapped_fn(x): return wrapped_fn + +# Metadata class for Fast Fourier Transforms in torch.fft. +class SpectralFuncInfo(OpInfo): + """Operator information for torch.fft transforms. 
""" + + def __init__(self, + name, # the string name of the function + *, + ref=None, # Reference implementation (probably in np.fft namespace) + dtypes=floating_and_complex_types(), + dtypesIfCPU=None, + dtypesIfCUDA=None, + dtypesIfROCM=None, + ndimensional: bool, # Whether dim argument can be a tuple + skips=None, + decorators=None, + **kwargs): + dtypesIfCPU = dtypesIfCPU if dtypesIfCPU is not None else dtypes + dtypesIfCUDA = dtypesIfCUDA if dtypesIfCUDA is not None else dtypes + dtypesIfROCM = dtypesIfROCM if dtypesIfROCM is not None else dtypes + + # gradgrad is quite slow + if not TEST_WITH_SLOW: + skips = skips if skips is not None else [] + skips.append(SkipInfo('TestGradients', 'test_fn_gradgrad')) + + decorators = decorators if decorators is not None else [] + decorators += [skipCPUIfNoMkl, skipCUDAIfRocm] + + super().__init__(name=name, + dtypes=dtypes, + dtypesIfCPU=dtypesIfCPU, + dtypesIfCUDA=dtypesIfCUDA, + dtypesIfROCM=dtypesIfROCM, + skips=skips, + decorators=decorators, + **kwargs) + self.ref = ref if ref is not None else _getattr_qual(np, name) + self.ndimensional = ndimensional + + + def sample_inputs(self, device, dtype, requires_grad=False): + tensor = make_tensor((L, M), device, dtype, + low=None, high=None, + requires_grad=requires_grad) + if self.ndimensional: + return [ + SampleInput(tensor), + SampleInput(tensor, kwargs=dict(dim=(-2,))), + SampleInput(tensor, kwargs=dict(norm='ortho')), + SampleInput(tensor, kwargs=dict(s=(10, 15))), + SampleInput(tensor, kwargs=dict(s=10, dim=1, norm='ortho')), + ] + else: + return [ + SampleInput(tensor), + SampleInput(tensor, kwargs=dict(dim=-2)), + SampleInput(tensor, kwargs=dict(norm='ortho')), + SampleInput(tensor, kwargs=dict(n=15)), + SampleInput(tensor, kwargs=dict(n=10, dim=1, norm='ortho')), + ] + + # Operator database (sorted alphabetically) -op_db: List[Any] = [ +op_db: List[OpInfo] = [ # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) UnaryUfuncInfo('acos', ref=np.arccos, @@ -448,6 +529,89 @@ def wrapped_fn(x): SkipInfo('TestCommon', 'test_variant_consistency_jit', device_type='cuda', dtypes=[torch.float16]), )), + SpectralFuncInfo('fft.fft', + aten_name='fft_fft', + ref=np.fft.fft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.fftn', + aten_name='fft_fftn', + ref=np.fft.fftn, + ndimensional=True, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False, + decorators=[precisionOverride( + {torch.float: 1e-4, torch.cfloat: 1e-4})],), + SpectralFuncInfo('fft.hfft', + aten_name='fft_hfft', + ref=np.fft.hfft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.rfft', + aten_name='fft_rfft', + ref=np.fft.rfft, + ndimensional=False, + dtypes=all_types_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.rfftn', + aten_name='fft_rfftn', + ref=np.fft.rfftn, + ndimensional=True, + dtypes=all_types_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False, + decorators=[precisionOverride({torch.float: 1e-4})],), 
+ SpectralFuncInfo('fft.ifft', + aten_name='fft_ifft', + ref=np.fft.ifft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.ifftn', + aten_name='fft_ifftn', + ref=np.fft.ifftn, + ndimensional=True, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.ihfft', + aten_name='fft_ihfft', + ref=np.fft.ihfft, + ndimensional=False, + dtypes=all_types_and(torch.bool), + default_test_dtypes=floating_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.irfft', + aten_name='fft_irfft', + ref=np.fft.irfft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.irfftn', + aten_name='fft_irfftn', + ref=np.fft.irfftn, + ndimensional=True, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), @@ -644,7 +808,7 @@ def reference_sigmoid(x): return (1 / (1 + np.exp(-x))) return scipy.special.expit(x) - op_db_scipy_reference = [ + op_db_scipy_reference: List[OpInfo] = [ UnaryUfuncInfo('sigmoid', ref=reference_sigmoid, decorators=(precisionOverride({torch.float16: 1e-2, @@ -695,6 +859,7 @@ def reference_sigmoid(x): # Common operator groupings unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo)] +spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] def index_variable(shape, max_indices): if not isinstance(shape, tuple): diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index 8036c73a6330..4a91394d53c5 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -229,8 +229,15 @@ def the_method({}): return {} ''' +def value_to_literal(value): + if isinstance(value, str): + # Quotes string and escapes special characters + return ascii(value) + else: + return str(value) + def get_call(method_name, func_type, args, kwargs): - kwargs_str = ', '.join([k + '=' + str(v) for k, v in kwargs.items()]) + kwargs_str = ', '.join([k + '=' + value_to_literal(v) for k, v in kwargs.items()]) self_arg = args[0] if(func_type == 'method'): args = args[1:] @@ -461,12 +468,12 @@ def make_module(script): return module return script_module -def check_alias_annotation(method_name, args, kwargs): +def check_alias_annotation(method_name, args, kwargs, *, aten_name, func_type='method'): formals, tensors, actuals = get_script_args(args) - call = get_call(method_name, 'method', actuals, kwargs) + call = get_call(method_name, func_type, actuals, kwargs) script = script_template.format(', '.join(formals), call) CU = torch.jit.CompilationUnit(script) - torch._C._jit_check_alias_annotation(CU.the_method.graph, tuple(tensors), method_name) + torch._C._jit_check_alias_annotation(CU.the_method.graph, tuple(tensors), aten_name) def get_nn_module_name_from_kwargs(**kwargs): if 'module_name' in kwargs: From c876d4f477fdebfa8acbc3ebd8042ea8f5ed36dc Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 7 Dec 2020 17:37:59 -0800 Subject: [PATCH 117/132] [Gradient 
Compression] Let the dtype of created low-rank tensors P and Q be the same type as the input tensor (#48902) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48902 Previously if the dtype of input gradients is FP16, matrix multiplications will fail, because the created low-rank tensors P and Q use FP32 dtype. Now let the dtype of P and Q be the same as the input tensor. Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 117962078 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl Reviewed By: rohan-varma Differential Revision: D25362071 fbshipit-source-id: e68753ff23bb480605b02891e128202ed0f8a587 --- .../algorithms/ddp_comm_hooks/powerSGD_hook.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index e1d475a34425..81b876685a3c 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -162,11 +162,17 @@ def create_low_rank_tensor(fill_random_values, rng): # only fork on CPU and then move the generated tensor to the CUDA device. torch.manual_seed(rng.randint(1_000_000_000)) return torch.randn( - square_side_length, state.matrix_approximation_rank, device="cpu" + square_side_length, + state.matrix_approximation_rank, + device="cpu", + dtype=input_tensor.dtype, ).to(device) else: return torch.empty( - square_side_length, state.matrix_approximation_rank, device=device + square_side_length, + state.matrix_approximation_rank, + device=device, + dtype=input_tensor.dtype, ) p = create_low_rank_tensor(fill_random_values=False, rng=state.rng) From bea88ee1d0179e9cc3c29d105cc009e2027ee0d7 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Mon, 7 Dec 2020 18:59:38 -0800 Subject: [PATCH 118/132] Added entry for torch.linalg.cond to linalg.rst (#48941) Summary: This PR makes documentation for `cond` available at https://pytorch.org/docs/master/linalg.html I forgot to include this change in https://github.com/pytorch/pytorch/issues/45832. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48941 Reviewed By: ngimel Differential Revision: D25379244 Pulled By: mruberry fbshipit-source-id: c8c0a0b8a05c17025d6c3cea405b2add369e2019 --- docs/source/linalg.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index b5d78572c06b..a3bee886f062 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -13,6 +13,7 @@ Functions --------- .. autofunction:: cholesky +.. autofunction:: cond .. autofunction:: det .. autofunction:: eigh .. 
autofunction:: eigvalsh From 3aeb9cc85ddbed7516e34fa71475995af5b31812 Mon Sep 17 00:00:00 2001 From: Bharat123rox Date: Mon, 7 Dec 2020 19:33:47 -0800 Subject: [PATCH 119/132] [DOCS]Correct docs for torch.lu_solve (#47762) Summary: Fixes https://github.com/pytorch/pytorch/issues/43498 by correcting the function signature of `torch.lu_solve` Pull Request resolved: https://github.com/pytorch/pytorch/pull/47762 Reviewed By: ljk53 Differential Revision: D24900259 Pulled By: ailzhang fbshipit-source-id: 2a43170bde57e03d44025b23e3abcda169cfc9e2 --- torch/_torch_docs.py | 2 +- torch/overrides.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 51200dc6b406..d9f7e8018264 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4638,7 +4638,7 @@ def merge_dicts(*dicts): add_docstr(torch.lu_solve, r""" -lu_solve(input, LU_data, LU_pivots, *, out=None) -> Tensor +lu_solve(b, LU_data, LU_pivots, *, out=None) -> Tensor Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted LU factorization of A from :meth:`torch.lu`. diff --git a/torch/overrides.py b/torch/overrides.py index e8a3933a1954..2af6e36ea914 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -505,7 +505,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.lt: lambda input, other, out=None: -1, torch.less: lambda input, other, out=None: -1, torch.lu: lambda A, pivot=True, get_infos=False, out=None: -1, - torch.lu_solve: lambda input, LU_data, LU_pivots, out=None: -1, + torch.lu_solve: lambda b, LU_data, LU_pivots, out=None: -1, torch.margin_ranking_loss: lambda input1, input2, target, margin=0, size_average=None, reduce=None, reduction='mean': -1, torch.masked_fill: lambda input, mask, value: -1, torch.masked_scatter: lambda input, mask, source: -1, From 5533be5170c37561c486ade50c1697a2be50bbe0 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 7 Dec 2020 19:47:27 -0800 Subject: [PATCH 120/132] CUDA BF16 backwards (#48809) Summary: Looks like there's no test? Pull Request resolved: https://github.com/pytorch/pytorch/pull/48809 Reviewed By: mruberry Differential Revision: D25378998 Pulled By: ngimel fbshipit-source-id: d16789892902b5a20828e8c7b414b478de33c4a5 --- .../cuda/BinaryMiscBackwardOpsKernels.cu | 60 +++++++++---------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu index ed7e2190f75e..a385aa721522 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu @@ -16,10 +16,8 @@ namespace native { void sigmoid_backward_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "sigmoid_backward_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "sigmoid_backward_cuda", [&] { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * (scalar_t(1.) - b) * b; - }); + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a * (scalar_t(1.) 
- b) * b; }); }); } @@ -31,31 +29,29 @@ void logit_backward_kernel_cuda(TensorIterator& iter, Scalar eps_scalar) { iter.dtype(), "logit_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "logit_cuda", [&] { - using T_ACC = acc_type; - const T_ACC eps = eps_scalar.to(); - if (eps < T_ACC(0)) { - gpu_kernel( - iter, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - const T_ACC dy_acc = static_cast(dy); - const T_ACC x_acc = static_cast(x); - return (x_acc < T_ACC(0) || x_acc > T_ACC(1)) - ? std::numeric_limits::quiet_NaN() - : dy_acc / (x_acc * (T_ACC(1) - x_acc)); - }); - } else { - const T_ACC lo = eps; - const T_ACC hi = T_ACC(1) - eps; - gpu_kernel( - iter, [lo, hi] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - const T_ACC dy_acc = static_cast(dy); - const T_ACC x_acc = static_cast(x); - return (x_acc < lo || x_acc > hi) - ? T_ACC(0) - : dy_acc / (x_acc * (T_ACC(1) - x_acc)); - }); - } - }); + using T_ACC = acc_type; + const T_ACC eps = eps_scalar.to(); + if (eps < T_ACC(0)) { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + const T_ACC dy_acc = static_cast(dy); + const T_ACC x_acc = static_cast(x); + return (x_acc < T_ACC(0) || x_acc > T_ACC(1)) + ? std::numeric_limits::quiet_NaN() + : dy_acc / (x_acc * (T_ACC(1) - x_acc)); + }); + } else { + const T_ACC lo = eps; + const T_ACC hi = T_ACC(1) - eps; + gpu_kernel( + iter, [lo, hi] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + const T_ACC dy_acc = static_cast(dy); + const T_ACC x_acc = static_cast(x); + return (x_acc < lo || x_acc > hi) + ? T_ACC(0) + : dy_acc / (x_acc * (T_ACC(1) - x_acc)); + }); + } }); } @@ -68,10 +64,8 @@ void tanh_backward_kernel_cuda(TensorIterator& iter) { }); } else { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "tanh_backward_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "tanh_backward_cuda", [&] { - gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * (scalar_t{1.} - b * b); - }); + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a * (scalar_t{1.} - b * b); }); }); } From 881e9583b22891147560004b5c4ff594f7319291 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 7 Dec 2020 20:14:02 -0800 Subject: [PATCH 121/132] docker: Add make variable to add docker build args (#48942) Summary: Adds an extra make variable 'EXTRA_DOCKER_BUILD_FLAGS' that allows us to add extra docker build flags to the docker build command. 
Example: make -f docker.Makefile EXTRA_DOCKER_BUILD_FLAGS=--no-cache devel-image Signed-off-by: Eli Uriegas Pull Request resolved: https://github.com/pytorch/pytorch/pull/48942 Reviewed By: walterddr Differential Revision: D25376288 Pulled By: seemethere fbshipit-source-id: 9cf2c2a5e01d505fa54447604ecd653dcbdd42e1 --- docker.Makefile | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/docker.Makefile b/docker.Makefile index 3cd59f146e38..3af77ab9c7d1 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -1,31 +1,38 @@ -DOCKER_REGISTRY = docker.io -DOCKER_ORG = $(shell docker info 2>/dev/null | sed '/Username:/!d;s/.* //') -DOCKER_IMAGE = pytorch -DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE) +DOCKER_REGISTRY = docker.io +DOCKER_ORG = $(shell docker info 2>/dev/null | sed '/Username:/!d;s/.* //') +DOCKER_IMAGE = pytorch +DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE) ifeq ("$(DOCKER_ORG)","") $(warning WARNING: No docker user found using results from whoami) -DOCKER_ORG = $(shell whoami) +DOCKER_ORG = $(shell whoami) endif -CUDA_VERSION = 11.0 -CUDNN_VERSION = 8 -BASE_RUNTIME = ubuntu:18.04 -BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04 +CUDA_VERSION = 11.0 +CUDNN_VERSION = 8 +BASE_RUNTIME = ubuntu:18.04 +BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04 # The conda channel to use to install pytorch / torchvision -INSTALL_CHANNEL = pytorch +INSTALL_CHANNEL = pytorch -PYTHON_VERSION = 3.7 +PYTHON_VERSION = 3.7 # Can be either official / dev -BUILD_TYPE = dev -BUILD_PROGRESS = auto -BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \ - --build-arg PYTHON_VERSION=$(PYTHON_VERSION) \ - --build-arg CUDA_VERSION=$(CUDA_VERSION) \ - --build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) -DOCKER_BUILD = DOCKER_BUILDKIT=1 docker build --progress=$(BUILD_PROGRESS) --target $(BUILD_TYPE) -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) $(BUILD_ARGS) . -DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG) +BUILD_TYPE = dev +BUILD_PROGRESS = auto +BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + --build-arg PYTHON_VERSION=$(PYTHON_VERSION) \ + --build-arg CUDA_VERSION=$(CUDA_VERSION) \ + --build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) +EXTRA_DOCKER_BUILD_FLAGS ?= +DOCKER_BUILD = DOCKER_BUILDKIT=1 \ + docker build \ + --progress=$(BUILD_PROGRESS) \ + $(EXTRA_DOCKER_BUILD_FLAGS) \ + --target $(BUILD_TYPE) \ + -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \ + $(BUILD_ARGS) . 
+DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG)

 .PHONY: all
 all: devel-image

From c3a90bedd4312ee8a9ec673ee4fbe0ffca7fa28b Mon Sep 17 00:00:00 2001
From: Jiatong Zhou
Date: Mon, 7 Dec 2020 21:25:11 -0800
Subject: [PATCH 122/132] Move aten::__contains__.int_list for lite jit (#48950)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48950

Needed by noise suppression model

Test Plan: build

Reviewed By: linbinyu

Differential Revision: D25321582

fbshipit-source-id: fbc67fc35087c5f44b7ab68d1485b2b916747723
---
 torch/csrc/jit/runtime/register_prim_ops.cpp         | 5 +++++
 torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp | 4 ----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp
index f031d957449b..d9bffa7e4644 100644
--- a/torch/csrc/jit/runtime/register_prim_ops.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops.cpp
@@ -822,6 +822,11 @@ RegisterOperators reg(
           return 0;
         },
         aliasAnalysisFromSchema()),
+    OperatorGenerator(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::__contains__.int_list(int[] l, int item) -> bool"),
+        listContains,
+        aliasAnalysisFromSchema()),
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::__contains__.str_list(str[] l, str item) -> bool"),
diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
index 0be346246656..b63a2a228508 100644
--- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
@@ -711,10 +711,6 @@ RegisterOperators reg2({
     // `listContains` is not implemented for non-primitive types
     // TODO: Add List[bool] once .to> doesn't throw an error
-    Operator(
-        "aten::__contains__.int_list(int[] l, int item) -> bool",
-        listContains,
-        aliasAnalysisFromSchema()),
     Operator(
         "aten::__contains__.float_list(float[] l, float item) -> bool",
         listContains,

From cb6233aa538114fce55380a79978f3e576eb7cfe Mon Sep 17 00:00:00 2001
From: Richard Barnes
Date: Mon, 7 Dec 2020 22:46:56 -0800
Subject: [PATCH 123/132] Fix some convoluted(?) code (#48893)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48893

This simplifies some convoluted code in THCAsmUtils.cuh: the getLaneMask*
helpers placed their closing brace after the #endif, so the HIP
(__HIP_PLATFORM_HCC__) and CUDA branches of each #if/#else block shared a
single brace instead of each closing its own function body. It may be that
this was intentional, but I don't recognize the pattern being used.
Test Plan: N/A - Sandcastle Reviewed By: igorsugak Differential Revision: D25358283 fbshipit-source-id: 19bcf01cbb117843e08df0237e6a03ea77958078 --- aten/src/THC/THCAsmUtils.cuh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh index 6375891bd7f2..be0bf6ffa1ba 100644 --- a/aten/src/THC/THCAsmUtils.cuh +++ b/aten/src/THC/THCAsmUtils.cuh @@ -94,15 +94,16 @@ __device__ __forceinline__ int getLaneId() { #if defined(__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskLt() { - std::uint64_t m = (1ull << getLaneId()) - 1ull; + const std::uint64_t m = (1ull << getLaneId()) - 1ull; return m; +} #else __device__ __forceinline__ unsigned getLaneMaskLt() { unsigned mask; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); return mask; -#endif } +#endif #if defined (__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskLe() { @@ -119,27 +120,28 @@ __device__ __forceinline__ unsigned getLaneMaskLe() { #if defined(__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskGt() { - std::uint64_t m = getLaneMaskLe(); + const std::uint64_t m = getLaneMaskLe(); return m ? ~m : m; +} #else __device__ __forceinline__ unsigned getLaneMaskGt() { unsigned mask; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); return mask; -#endif } +#endif #if defined(__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskGe() { - std::uint64_t m = getLaneMaskLt(); + const std::uint64_t m = getLaneMaskLt(); return ~m; +} #else __device__ __forceinline__ unsigned getLaneMaskGe() { unsigned mask; asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); return mask; -#endif } - +#endif #endif // THC_ASM_UTILS_INC From 32b098baf936b63ee23017f6bba4f3e4c56f22a6 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 8 Dec 2020 00:35:23 -0800 Subject: [PATCH 124/132] Add and adjust kernel launch checks (#46727) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46727 This adds kernel launch safety checks to a number of kernels. See D24309971 (https://github.com/pytorch/pytorch/commit/353e7f940f548e0a0cb3b420b4190b4624ae9b41) for context. Test Plan: The existing pre-commit test rigs are used. 
Reviewed By: ngimel Differential Revision: D24334303 fbshipit-source-id: b6433f6be109fc8dbe789e91f3cbfbc31fd15951 --- aten/src/ATen/native/cuda/AveragePool3d.cu | 19 ++- aten/src/ATen/native/cuda/CUDALoops.cuh | 6 +- .../ATen/native/cuda/DistributionTemplates.h | 5 +- aten/src/ATen/native/cuda/EmbeddingBag.cu | 2 +- .../ATen/native/cuda/FractionalMaxPool2d.cu | 4 +- aten/src/ATen/native/cuda/Indexing.cu | 72 ++++----- .../src/ATen/native/cuda/MultinomialKernel.cu | 5 +- aten/src/ATen/native/cuda/Normalization.cuh | 18 ++- aten/src/ATen/native/cuda/ROCmLoops.cuh | 4 +- aten/src/ATen/native/cuda/RangeFactories.cu | 6 +- aten/src/ATen/native/cuda/Reduce.cuh | 7 +- aten/src/ATen/native/cuda/ReflectionPad.cu | 18 +-- .../ATen/native/cuda/ReplicationPadding.cu | 54 ++++--- aten/src/ATen/native/cuda/ScanKernels.cu | 9 +- .../ATen/native/cuda/ScatterGatherKernel.cu | 10 +- aten/src/ATen/native/cuda/Shape.cu | 8 +- aten/src/ATen/native/cuda/SoftMax.cu | 143 +++++++++--------- aten/src/ATen/native/cuda/Sorting.cu | 6 +- aten/src/ATen/native/cuda/WeightNorm.cu | 8 +- 19 files changed, 205 insertions(+), 199 deletions(-) diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index 4214b4dace19..388b04dba76a 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -317,16 +317,17 @@ __global__ void avg_pool3d_cuda_update_grad_input( } } -#define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ +#define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ avg_pool3d_cuda_update_output \ <<>>( \ - work_input.packed_accessor64(), \ - work_output.packed_accessor64(), \ + work_input.packed_accessor64(), \ + work_output.packed_accessor64(), \ kT, kH, \ dT, dH, dW, \ padT, padH, padW, \ count_include_pad, \ offsetZ, divisor); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ break void avg_pool3d_out_cuda_template( @@ -443,11 +444,10 @@ void avg_pool3d_out_cuda_template( padT, padH, padW, count_include_pad, offsetZ, divisor); - break; + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; } - AT_CUDA_CHECK(cudaGetLastError()); - totalZ -= 65535; offsetZ += 65535; } @@ -581,8 +581,7 @@ void avg_pool3d_backward_out_cuda_template( kT, kH, kW, 1.0f/divide_factor, offsetZ); - - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; @@ -614,6 +613,7 @@ void avg_pool3d_backward_out_cuda_template( padT, padH, padW, count_include_pad, offsetZ, divisor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { avg_pool3d_cuda_update_grad_input @@ -625,10 +625,9 @@ void avg_pool3d_backward_out_cuda_template( padT, padH, padW, count_include_pad, offsetZ, divisor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - AT_CUDA_CHECK(cudaGetLastError()); - totalZ -= 65535; offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 91401e994ebd..d11a5bb074c5 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -101,9 +101,11 @@ static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t switch (vec_size) { case 4: vectorized_elementwise_kernel<4, func_t, array_t><<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: vectorized_elementwise_kernel<2, func_t, array_t><<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 1: { auto input_calc = TrivialOffsetCalculator(); @@ -111,12 +113,12 @@ static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t auto loader = 
memory::LoadWithoutCast(); auto storer = memory::StoreWithoutCast(); unrolled_elementwise_kernel<<>>(N, f, data, input_calc, output_calc, loader, storer); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size"); } - AT_CUDA_CHECK(cudaGetLastError()); } template @@ -127,7 +129,7 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da int64_t grid = (N + block_work_size - 1) / block_work_size; auto stream = at::cuda::getCurrentCUDAStream(); unrolled_elementwise_kernel<<>>(N, f, data, ic, oc, l, s); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 8cfc6c10f1ba..1b4f228bf229 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -155,6 +155,7 @@ void distribution_nullary_kernel(at::TensorIterator& iter, *out = transform_func(rand); } ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto offset_calc = make_offset_calculator<1>(iter); distribution_elementwise_grid_stride_kernel<<>>( @@ -167,8 +168,8 @@ void distribution_nullary_kernel(at::TensorIterator& iter, *out = transform_func(rand); } ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - AT_CUDA_CHECK(cudaGetLastError()); } // Binary kernel @@ -260,10 +261,12 @@ void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_arg distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 5bed5532baee..651261cf6408 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -245,8 +245,8 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, max_indices.data_ptr(), grad.data_ptr(), grad_weight.data_ptr(), stride, numBags); C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); }); + }); return grad_weight; } diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index bee3cfa4d436..41fc2dea5856 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -273,8 +273,8 @@ void fractional_max_pool2d_backward_out_cuda_template( <<>>( devGradInput, devGradOutput, devIndices); C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - ); + } + ); } }// namespace diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 47527935fe73..4e88ee34a9b4 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -233,18 +233,18 @@ void index_put_accum_kernel(Tensor & self, TensorList indices, const Tensor & va AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, value_.scalar_type(), "indexing_backward", [&] { AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "indexing_backward", [&] { - indexing_backward_kernel<<>>( - sorted_indices.data_ptr(), - orig_indices.data_ptr(), - value_.data_ptr(), - src_.data_ptr(), - num_indices, - sliceSize, - 
strideBefore, - nElemBefore); - }); + indexing_backward_kernel<<>>( + sorted_indices.data_ptr(), + orig_indices.data_ptr(), + value_.data_ptr(), + src_.data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore); + }); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); - AT_CUDA_CHECK(cudaGetLastError()); if (permuted) self.copy_(src_.permute(inversePerm)); } @@ -476,21 +476,23 @@ Tensor& index_add_cuda_(Tensor & self, int64_t dim, const Tensor & index, const int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ indexAddSmallIndex \ - <<>>( \ - selfInfo, sourceInfo, indexInfo, \ - selfAddDim, sourceAddDim, sliceSize, selfAddDimSize); + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sliceSize, selfAddDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ - SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ indexAddLargeIndex \ - <<>>( \ - selfInfo, sourceInfo, indexInfo, \ - selfAddDim, sourceAddDim, sourceTotalSize, \ - (IDX_IS_MAJOR) ? sliceSize : numIndex, \ - selfAddDimSize); + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR> \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sourceTotalSize, \ + (IDX_IS_MAJOR) ? sliceSize : numIndex, \ + selfAddDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); @@ -725,22 +727,24 @@ void index_select_out_cuda_impl(Tensor& out, const Tensor& self, long dim, int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ indexSelectSmallIndex \ - <<>>( \ - outInfo, selfInfo, indicesInfo, \ - outSelectDim, selfSelectDim, static_cast(sliceSize), \ - selfSelectDimSize); + <<>>( \ + outInfo, selfInfo, indicesInfo, \ + outSelectDim, selfSelectDim, static_cast(sliceSize), \ + selfSelectDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ - DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ indexSelectLargeIndex \ - <<>>( \ - outInfo, selfInfo, indicesInfo, \ - outSelectDim, selfSelectDim, static_cast(outTotalSize), \ - static_cast((IDX_IS_MAJOR) ? sliceSize : numIndices), \ - selfSelectDimSize); + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR> \ + <<>>( \ + outInfo, selfInfo, indicesInfo, \ + outSelectDim, selfSelectDim, static_cast(outTotalSize), \ + static_cast((IDX_IS_MAJOR) ? 
sliceSize : numIndices), \ + selfSelectDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index a8779d3d97af..3d59617903b4 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -74,6 +74,7 @@ void renormRows(Tensor& t) { <<>>(t.data_ptr(), rows, cols); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -348,6 +349,7 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n self_v.stride(0), self_v.stride(1) ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // Generic, slow implementation with memory allocations @@ -399,12 +401,11 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n numDist, numCategories, prefixSum.data_ptr(), normDist.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } }); - AT_CUDA_CHECK(cudaGetLastError()); - if (inputSize == 1) { result.resize_({n_sample}); } diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index a0d37dd44be1..8355ac004308 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -558,6 +558,7 @@ void batch_norm_cuda_template(Tensor& output_, Tensor& save_mean_, Tensor& save_ if (!train) { batch_norm_transform_input_kernel <<>> (input, output, running_mean, running_var, weight, bias, epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // for the reduction, we cannot use blocks for the batch dim, but if we have few threads in // the feature dimension, we'll use some threads for blocks @@ -566,10 +567,11 @@ void batch_norm_cuda_template(Tensor& output_, Tensor& save_mean_, Tensor& save_ dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); batch_norm_collect_statistics_kernel <<>> (input, epsilon, momentum, running_mean, running_var, save_mean, save_invstd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); batch_norm_transform_input_kernel <<>> (input, output, save_mean, save_invstd, weight, bias, epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - AT_CUDA_CHECK(cudaGetLastError()); } template @@ -615,7 +617,7 @@ std::tuple batch_norm_backward_cuda_template(const Tenso batch_norm_backward_kernel <<>> (input, grad_output, grad_input, grad_weight, grad_bias, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(grad_input_, grad_weight_, grad_bias_); } @@ -654,7 +656,7 @@ std::tuple batch_norm_stats_cuda_template(const Tensor& input_, dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); batch_norm_collect_statistics_kernel <<>> (input, epsilon, 0.0, dummy_mean, dummy_invstd, mean, invstd); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(mean_, invstd_); } @@ -694,7 +696,7 @@ void batch_norm_elemt_cuda_template(Tensor& output_, const Tensor& input_, const dim3 threads_trans(tf, tb); batch_norm_transform_input_kernel <<>> (input, output, mean, invstd, weight, bias, epsilon); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -727,7 +729,7 @@ std::tuple batch_norm_gather_stats_cuda_template(const Tensor& m int grid = std::max(1, features/block); batch_norm_reduce_statistics_kernel <<>> (mean, invstd, save_mean, save_invstd, 
running_mean, running_var, epsilon, momentum, counts); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(save_mean_, save_invstd_); } @@ -777,7 +779,7 @@ std::tuple batch_norm_backward_reduce_cuda_templ batch_norm_backward_reduce_kernel <<>> (input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_); } @@ -819,7 +821,7 @@ Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Te dim3 threads_trans(tf, tb); batch_norm_backward_elemt_kernel <<>> (input, grad_output, mean, invstd, weight, mean_dy, mean_dy_xmu, grad_input); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return grad_input_reshaped.view(input_.sizes()); } @@ -853,7 +855,7 @@ std::tuple batch_norm_update_stats_cuda_template( // NB: epsilon is unused by the Var transform, so we set it to 0 batch_norm_collect_statistics_kernel <<>> (input, 0., momentum, running_mean, running_var, save_mean, save_var); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(save_mean_, save_var_); } diff --git a/aten/src/ATen/native/cuda/ROCmLoops.cuh b/aten/src/ATen/native/cuda/ROCmLoops.cuh index b5115c6dcdfb..c339364b5a02 100644 --- a/aten/src/ATen/native/cuda/ROCmLoops.cuh +++ b/aten/src/ATen/native/cuda/ROCmLoops.cuh @@ -134,7 +134,7 @@ static void launch_kernel(int64_t N, const func_t& f) { dim3 grid((N + block.x * vt - 1) / (block.x * vt)); auto stream = at::cuda::getCurrentCUDAStream(); elementwise_kernel<<>>(N, f); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -296,7 +296,7 @@ static void launch_kernel(int64_t N, const func_t& f, array_t data) { int64_t grid = (N + block_work_size - 1) / block_work_size; auto stream = at::cuda::getCurrentCUDAStream(); elementwise_kernel<<>>(N, f, data); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template::value, int> = 0> diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu index 4286f05111b6..107c3c28fdac 100644 --- a/aten/src/ATen/native/cuda/RangeFactories.cu +++ b/aten/src/ATen/native/cuda/RangeFactories.cu @@ -39,8 +39,10 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) { using scalar_t = typename function_traits::result_type; if (N <= std::numeric_limits::max()) { elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } @@ -105,7 +107,6 @@ Tensor& linspace_cuda_out(Tensor& result, Scalar start, Scalar end, c10::optiona result.copy_(r); } - AT_CUDA_CHECK(cudaGetLastError()); return result; } @@ -164,7 +165,6 @@ Tensor& logspace_cuda_out(Tensor& result, Scalar start, Scalar end, c10::optiona result.copy_(r); } - AT_CUDA_CHECK(cudaGetLastError()); return result; } @@ -201,7 +201,6 @@ Tensor& range_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { }); - AT_CUDA_CHECK(cudaGetLastError()); return result; } @@ -263,7 +262,6 @@ Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { } }); - AT_CUDA_CHECK(cudaGetLastError()); return result; } diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 618088cefb3a..ea797e6011af 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ 
b/aten/src/ATen/native/cuda/Reduce.cuh @@ -817,15 +817,16 @@ static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) switch(config.output_vec_size) { case 4: reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; default: reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - - AT_CUDA_CHECK(cudaGetLastError()); } class AccumulationBuffer { @@ -872,7 +873,7 @@ int get_output_vec_size(TensorIterator &iter) { vec_size /= 2; } }; - + uint64_t base_address = reinterpret_cast(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t); update_vec_size(base_address); diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index 2b182f32b5e7..95a6825d507f 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -200,10 +200,9 @@ void reflection_pad1d_out_template( grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>( input.data_ptr(), output.data_ptr(), input_w, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } void reflection_pad1d_backward_out_template( @@ -213,7 +212,7 @@ void reflection_pad1d_backward_out_template( if (grad_input.numel() == 0) { return; } - + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); @@ -252,15 +251,14 @@ void reflection_pad1d_backward_out_template( grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>( grad_input.data_ptr(), grad_output.data_ptr(), input_w, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } void reflection_pad2d_out_template( Tensor &output, const Tensor &input_, IntArrayRef padding) { - + TORCH_CHECK(canUse32BitIndexMath(input_), "input tensor must fit into 32-bit index math"); @@ -331,10 +329,9 @@ void reflection_pad2d_out_template( input.data_ptr(), output.data_ptr(), input_w, input_h, pad_t, pad_b, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } void reflection_pad2d_backward_out_template( @@ -344,7 +341,7 @@ void reflection_pad2d_backward_out_template( if (grad_input.numel() == 0) { return; } - + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); TORCH_CHECK(canUse32BitIndexMath(grad_output_), @@ -393,10 +390,9 @@ void reflection_pad2d_backward_out_template( grad_input.data_ptr(), grad_output.data_ptr(), input_w, input_h, pad_t, pad_b, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } } // namespace diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index b896a47afed9..c80a98ddf13b 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -222,7 +222,7 @@ void replication_pad1d_out_cuda_template( (numInputDims == 3 && input.size(1) != 0 && input.size(2) != 0), "Expected 2D or 3D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", input.sizes()); - + if (numInputDims == 3) { numBatch = input.size(0); planeDim++; @@ -238,17 +238,17 @@ void replication_pad1d_out_cuda_template( " Calculated output W: ", outputW); if (numInputDims == 2) { - output.resize_({numPlanes, outputW}); + output.resize_({numPlanes, outputW}); } else { output.resize_({numBatch, numPlanes, outputW}); } - + if (input.numel() 
== 0) { return; } AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad1d_cuda", [&] { + input.scalar_type(), "replication_pad1d_cuda", [&] { if (numInputDims == 2) { auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); @@ -263,6 +263,7 @@ void replication_pad1d_out_cuda_template( replication_pad_forward_kernel1d <<>>(devInput, devOutput, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -275,10 +276,10 @@ void replication_pad1d_out_cuda_template( replication_pad_forward_kernel1d <<>>(devInput, devOutput, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - } + } ); - AT_CUDA_CHECK(cudaGetLastError()); } void replication_pad1d_backward_out_cuda_template( @@ -323,8 +324,8 @@ void replication_pad1d_backward_out_cuda_template( auto gradInput_ = gradInput; auto gradOutput_ = gradOutput; if (numInputDims == 2) { - gradInput_ = gradInput.unsqueeze(0); - gradOutput_ = gradOutput.unsqueeze(0); + gradInput_ = gradInput.unsqueeze(0); + gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); auto devGradOutput = gradOutput_.packed_accessor64(); @@ -338,9 +339,8 @@ void replication_pad1d_backward_out_cuda_template( replication_pad_backward_kernel <<>>(devGradInput, devGradOutput, padL, padR); - } - ); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); } void replication_pad2d_out_cuda_template( @@ -387,19 +387,17 @@ void replication_pad2d_out_cuda_template( " Calculated output H: ", outputH, " W: ", outputW); if (numInputDims == 3) { - output.resize_({numPlanes, outputH, outputW}); + output.resize_({numPlanes, outputH, outputW}); } else { output.resize_({numBatch, numPlanes, outputH, outputW}); } - + if (input.numel() == 0) { return; } AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad2d_cuda", [&] { - - + input.scalar_type(), "replication_pad2d_cuda", [&] { if (numInputDims == 3) { auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); @@ -415,6 +413,7 @@ void replication_pad2d_out_cuda_template( replication_pad_forward_kernel2d <<>>( devInput, devOutput, padT, padB, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -428,10 +427,10 @@ void replication_pad2d_out_cuda_template( replication_pad_forward_kernel2d <<>>(devInput, devOutput, padT, padB, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - } + } ); - AT_CUDA_CHECK(cudaGetLastError()); } void replication_pad2d_backward_out_cuda_template( @@ -499,9 +498,9 @@ void replication_pad2d_backward_out_cuda_template( replication_pad_backward_kernel <<>>(devGradInput, devGradOutput, padT, padB, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - AT_CUDA_CHECK(cudaGetLastError()); } static inline void shapeCheck3d( @@ -650,10 +649,9 @@ void replication_pad3d_out_cuda_template( if (input.numel() == 0) { return; } - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad3d_cuda", [&] { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "replication_pad3d_cuda", [&] { if (numInputDims == 4) { auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); @@ -670,6 +668,7 @@ void replication_pad3d_out_cuda_template( replication_pad_forward_kernel3d <<>>( devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto devInput = 
input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -684,10 +683,10 @@ void replication_pad3d_out_cuda_template( replication_pad_forward_kernel3d <<>>( devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - } + } ); - AT_CUDA_CHECK(cudaGetLastError()); } void replication_pad3d_backward_out_cuda_template( @@ -726,8 +725,7 @@ void replication_pad3d_backward_out_cuda_template( gradInput.zero_(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad3d_backward_cuda", [&] { - + input.scalar_type(), "replication_pad3d_backward_cuda", [&] { auto gradInput_ = gradInput; auto gradOutput_ = gradOutput; if (numInputDims == 4) { @@ -747,9 +745,9 @@ void replication_pad3d_backward_out_cuda_template( replication_pad_backward_kernel <<>>( devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright); - } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } ); - AT_CUDA_CHECK(cudaGetLastError()); } } // namespace diff --git a/aten/src/ATen/native/cuda/ScanKernels.cu b/aten/src/ATen/native/cuda/ScanKernels.cu index 099512912203..384854505054 100644 --- a/aten/src/ATen/native/cuda/ScanKernels.cu +++ b/aten/src/ATen/native/cuda/ScanKernels.cu @@ -183,7 +183,7 @@ __host__ void scan_outer_dim_with_indices(const Tensor& self, Tensor& values, Te tensor_kernel_scan_outer_dim_with_indices<<>>( self.data_ptr(), values.data_ptr(), indices.data_ptr(), num_orows, num_irows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -199,7 +199,7 @@ __host__ void scan_innermost_dim_with_indices(const Tensor& self, Tensor& values tensor_kernel_scan_innermost_dim_with_indices<<>>( self.data_ptr(), values.data_ptr(), indices.data_ptr(), num_rows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -436,7 +436,7 @@ __host__ void scan_outer_dim(const Tensor& self, Tensor& result, tensor_kernel_scan_outer_dim<<>>( result.data_ptr(), self.data_ptr(), num_orows, num_irows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -456,7 +456,7 @@ void scan_innermost_dim(const Tensor& self, Tensor& result, scalar_t init, Binar tensor_kernel_scan_innermost_dim<<>>( result.data_ptr(), self.data_ptr(), num_rows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -485,6 +485,7 @@ void scan_cub(const Tensor& self, Tensor& result, scalar_t init, BinaryFunction result.data_ptr() + i - 1, self.data_ptr() + i, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } size_t temp_storage_bytes = 0; AT_CUDA_CHECK(cub::DeviceScan::InclusiveScan( diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 552384b45945..66ac81f5ecbf 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -72,11 +72,11 @@ static void _launch_scatter_gather_kernel(int64_t N, const func_t& f) { return; } - dim3 block(nt); - dim3 grid((N + block.x * vt - 1) / (block.x * vt)); - auto stream = at::cuda::getCurrentCUDAStream(); + const dim3 block(nt); + const dim3 grid((N + block.x * vt - 1) / (block.x * vt)); + const auto stream = at::cuda::getCurrentCUDAStream(); _scatter_gather_elementwise_kernel<<>>(N, f); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -494,5 +494,5 @@ 
REGISTER_DISPATCH(scatter_fill_stub, &scatter_fill_cuda_kernel); REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cuda_kernel); REGISTER_DISPATCH(scatter_reduce_stub, &scatter_reduce_cuda_kernel); REGISTER_DISPATCH(scatter_scalar_reduce_stub, &scatter_scalar_reduce_cuda_kernel); - + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 7632438ba523..2831292845ec 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -294,7 +294,8 @@ void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, #define HANDLE_CASE(DIMS) \ HIP_CatArrayBatchedCopy<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ - data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); + data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); switch (nDims) { case 1: HANDLE_CASE(1); @@ -310,7 +311,6 @@ void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, break; } #undef HANDLE_CASE - AT_CUDA_CHECK(cudaGetLastError()); } } @@ -404,7 +404,8 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, #define HANDLE_CASE(DIMS) \ CatArrayBatchedCopy<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ - data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]); + data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); switch (nDims) { case 1: HANDLE_CASE(1); @@ -420,7 +421,6 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, break; } #undef HANDLE_CASE - AT_CUDA_CHECK(cudaGetLastError()); } } } // namespace diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index ca00a3520f29..fb43dcb4c3c3 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -709,32 +709,32 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t if (inner_size == 1) { dim3 grid(outer_size); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { - using accscalar_t = acc_type; - if (!half_to_float) { - if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - dispatch_softmax_forward( - output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); - } else { - constexpr int ILP = sizeof(float4) / sizeof(scalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), dim_size - ); - } - } else { - if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - dispatch_softmax_forward( - output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); + using accscalar_t = acc_type; + if (!half_to_float) { + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + dispatch_softmax_forward( + output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); + } else { + constexpr int ILP = sizeof(float4) / sizeof(scalar_t); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), dim_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } } else { - constexpr int ILP = sizeof(float4) / sizeof(accscalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), dim_size - ); + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + 
dispatch_softmax_forward( + output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); + } else { + constexpr int ILP = sizeof(float4) / sizeof(accscalar_t); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), dim_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } } - } }); // This kernel runs in a 2D grid, where each application along y dimension has a fixed // outer_size, and runs in parallel over inner_size. Dimension x is parallel over outer_size. @@ -743,29 +743,28 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t uint32_t smem_size; dim3 grid, block; AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { - using accscalar_t = acc_type; - if (!half_to_float) { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxForward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - cunn_SpatialSoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size - ); - } else { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxForward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - cunn_SpatialSoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size - ); - } + using accscalar_t = acc_type; + if (!half_to_float) { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + cunn_SpatialSoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + cunn_SpatialSoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } - AT_CUDA_CHECK(cudaGetLastError()); } return output; } @@ -807,6 +806,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t <<>>( gI.data_ptr(), output.data_ptr(), grad.data_ptr(), dim_size ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { @@ -819,6 +819,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t <<>>( gI.data_ptr(), output.data_ptr(), grad.data_ptr(), dim_size ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } }); @@ -826,33 +827,35 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t uint32_t smem_size; dim3 grid, block; AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, gI.scalar_type(), "host_softmax_backward", [&] { - using accscalar_t = acc_type; - if (!half_to_float) { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxBackward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - - cunn_SpatialSoftMaxBackward - <<>>( - gI.data_ptr(), output.data_ptr(), grad.data_ptr(), - outer_size, dim_size, inner_size - ); - } else { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxBackward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - - cunn_SpatialSoftMaxBackward - <<>>( - gI.data_ptr(), output.data_ptr(), grad.data_ptr(), - outer_size, dim_size, inner_size - ); - } + using accscalar_t = acc_type; + if (!half_to_float) { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, 
smem_size); + + cunn_SpatialSoftMaxBackward + <<>>( + gI.data_ptr(), output.data_ptr(), grad.data_ptr(), + outer_size, dim_size, inner_size + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + + cunn_SpatialSoftMaxBackward + <<>>( + gI.data_ptr(), output.data_ptr(), grad.data_ptr(), + outer_size, dim_size, inner_size + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } - AT_CUDA_CHECK(cudaGetLastError()); + return gI; } } diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index 59b07653593e..33fc4a18bffa 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -204,6 +204,7 @@ struct KthValueLauncher { self_info.strides[collapse_self_dim], values_info, indices_info); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }; @@ -238,6 +239,7 @@ struct MedianLauncher { num_slices, self_info.strides[collapse_self_dim], ignore_nan); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }; @@ -290,8 +292,6 @@ void kthvalue_cuda_template( values.squeeze_(dim); indices.squeeze_(dim); } - - AT_CUDA_CHECK(cudaGetLastError()); } std::tuple kthvalue_out_impl_cuda( @@ -371,8 +371,6 @@ std::tuple median_with_indices_impl( vals, inds, in, dim, MedianLauncher(ignore_nan)); } }); - - AT_CUDA_CHECK(cudaGetLastError()); } guard.reset(); diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index d90dc03007fd..8261eda01a3c 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -394,14 +394,14 @@ std::tuple weight_norm_cuda g.data_ptr(), fast_dim_size, slower_dims_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught // until a later error check on a synchronizing CUDA call. Unfortunately, without manually - // synchronizing here, this is the best we can do. - AT_CUDA_CHECK(cudaGetLastError()); + // synchronizing here, the foregoing is the best we can do. return std::tuple{w, norms}; } @@ -486,14 +486,14 @@ std::tuple weight_norm_cuda_backward saved_norms.data_ptr(), fast_dim_size, slower_dims_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught // until a later error check on a synchronizing CUDA call. Unfortunately, without manually - // synchronizing here, this is the best we can do. - AT_CUDA_CHECK(cudaGetLastError()); + // synchronizing here, the foregoing is the best we can do. return std::tuple{grad_v, grad_g}; } From 046ea6696dde24253bea07a53c757738d4f96b43 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 8 Dec 2020 03:41:19 -0800 Subject: [PATCH 125/132] Enable faithful API for all ops (#47711) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47711 Seems we generated the declaration but the definition only for c10-full ops. We should also generate the definition for non-c10-full ops. This makes future migrations of ops from non-c10-full to c10-full have a lower impact on the C++ API. 
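
For context, a sketch of the two overload flavors this affects, using a factory op such as `at::zeros` as an example. This is illustrative only and not copied from the generated `Functions.h`; the exact optional types may differ. The convenience overload gathers `TensorOptions` into one argument, while the faithful overload follows the JIT schema order with the options scattered into optionals. Previously the faithful overload's definition was only generated for c10-full ops; with this change it is generated for all ops.

```cpp
namespace at {

// Convenience overload: TensorOptions gathered into a single argument.
Tensor zeros(IntArrayRef size, TensorOptions options = {});

// Faithful overload: arguments in JIT-schema order, with TensorOptions
// scattered into its components. Its definition is now generated for
// non-c10-full ops as well, so calling it no longer depends on whether
// the op has been migrated to c10-full.
Tensor zeros(
    IntArrayRef size,
    c10::optional<ScalarType> dtype,
    c10::optional<Layout> layout,
    c10::optional<Device> device,
    c10::optional<bool> pin_memory);

} // namespace at
```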
ghstack-source-id: 118064755 Test Plan: waitforsandcastle Reviewed By: bhosmer Differential Revision: D24835006 fbshipit-source-id: 8f5c3c0ffcdc9b479ca3785d57da16db508795f5 --- tools/codegen/gen.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 4db060acd401..bd76fc11f670 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -480,8 +480,7 @@ def generate_defn(sig: CppSignature) -> str: result = generate_defn(sig_group.signature) if sig_group.faithful_signature is not None: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - result += generate_defn(sig_group.faithful_signature) + result += generate_defn(sig_group.faithful_signature) return result From 3ef36dca8ec11540e35876c5e04c4f3ed63e585a Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 8 Dec 2020 03:41:19 -0800 Subject: [PATCH 126/132] Faithful out arguments (#47712) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47712 This adds a faithful API for ops with out arguments, as described in https://docs.google.com/document/d/1h7nBibRwkRLQ8rsPhfALlwWR0QbkdQm30u4ZBwmaps8/edit# . After this, an op will generate the following overloads for the C++ API: ```cpp // Generated from the aten::abs operator (NOT from aten::abs.out) Tensor at::abs(Tensor& self) // Generated from the aten::abs.out operator Tensor& at::abs(Tensor& self, Tensor& out) Tensor& at::abs_out(Tensor& out, Tensor& self) ``` This is an important step towards making those ops c10-full (it allows VariableType, XLA and other backends to ignore reordering and just call through with the same argument order), but this does not make any of those ops c10-full yet. It enables the faithful API independent from c10-fullness. That means the API is more consistent with the same API for all ops and making an op c10-full in the future will not trigger future C++ API changes. ghstack-source-id: 118068091 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D24835252 fbshipit-source-id: dedfabd07140fc8347bbf16ff219aad3b20f2870 --- tools/codegen/api/cpp.py | 15 ++++-- tools/codegen/api/dispatcher.py | 23 ++++++++-- tools/codegen/api/native.py | 3 +- tools/codegen/api/python.py | 4 +- tools/codegen/api/types.py | 81 ++++++++++++++------------------- tools/codegen/gen.py | 32 +++++++++---- 6 files changed, 92 insertions(+), 66 deletions(-) diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index b20497b5a82c..ea7179fdc599 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -23,10 +23,14 @@ # BTW: policy on name collisions: we try not to have types with # collisions, but functions are fair game to collide -def name(func: FunctionSchema) -> str: +def name(func: FunctionSchema, *, faithful_name_for_out_overloads: bool = False) -> str: name = str(func.name.name) if func.is_out_fn(): - name += '_out' + if faithful_name_for_out_overloads: + name += '_outf' + else: + name += '_out' + return name # Translation of "value types" in JIT schema to C++ API type. 
Value @@ -273,10 +277,11 @@ def argument_faithful( return argument(a) def group_arguments( - func: FunctionSchema, *, method: bool + func: FunctionSchema, *, method: bool, faithful: bool, ) -> Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]]: args: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] - args.extend(func.arguments.out) + if not faithful: + args.extend(func.arguments.out) args.extend(func.arguments.pre_self_positional) if func.arguments.self_arg is not None: if method: @@ -288,4 +293,6 @@ def group_arguments( if func.arguments.tensor_options is not None: args.append(func.arguments.tensor_options) args.extend(func.arguments.post_tensor_options_kwarg_only) + if faithful: + args.extend(func.arguments.out) return args diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 8f3925de0041..b95803ca4e81 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -68,7 +68,7 @@ def name(func: FunctionSchema) -> str: def arguments(func: FunctionSchema) -> Tuple[DispatcherArgument, ...]: if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return tuple(map(argument, itertools.chain(func.arguments.out, func.arguments.positional, func.arguments.kwarg_only))) + return tuple(map(argument, itertools.chain(func.arguments.positional, func.arguments.kwarg_only, func.arguments.out))) else: return tuple( DispatcherArgument(type=la.type, name=la.name, argument=la.argument) @@ -137,7 +137,22 @@ def cppargument_exprs( else: assert_never(a) -def cpparguments_exprs(args: Sequence[CppArgumentPack]) -> Sequence[DispatcherExpr]: +def cpparguments_exprs(func: FunctionSchema, * , method: bool, api_is_faithful: bool) -> Sequence[DispatcherExpr]: + dispatcher_calling_convention_is_faithful = local.use_c10_dispatcher().dispatcher_uses_new_style() + arguments = cpp.group_arguments(func, method=method, faithful=dispatcher_calling_convention_is_faithful) + + if api_is_faithful: + argument_packs = tuple( + cpp.argument_faithful(a) for a in arguments + ) + else: + argument_packs = tuple( + cpp.argument(a) for a in arguments + ) + + return _cpparguments_exprs(argument_packs) + +def _cpparguments_exprs(args: Sequence[CppArgumentPack]) -> Sequence[DispatcherExpr]: tensor_options = next( (a.this for a in args if isinstance(a, CppSingleArgumentPack) and isinstance(a.this.argument, TensorOptionsArguments)), @@ -148,13 +163,13 @@ def cpparguments_exprs(args: Sequence[CppArgumentPack]) -> Sequence[DispatcherEx # I don't think this is entirely sound, but it should be reasonably # close def nativearguments_exprs(args: Sequence[NativeArgument]) -> Sequence[DispatcherExpr]: - return cpparguments_exprs([ + return _cpparguments_exprs([ CppSingleArgumentPack(CppArgument(type=a.type, name=a.name, default=None, argument=a.argument)) for a in args ]) def exprs(args: Sequence[DispatcherArgument]) -> Sequence[DispatcherExpr]: - return cpparguments_exprs([ + return _cpparguments_exprs([ CppSingleArgumentPack(CppArgument(type=a.type, name=a.name, default=None, argument=a.argument)) for a in args ]) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index b9e5257aef85..620e1c8cbf8c 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -105,4 +105,5 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> Sequen assert_never(a) def arguments(func: FunctionSchema) -> Tuple[NativeArgument, ...]: - return tuple(i for arg in cpp.group_arguments(func, method=False) for i in argument(arg)) + 
args = cpp.group_arguments(func, method=False, faithful=local.use_c10_dispatcher() is UseC10Dispatcher.full) + return tuple(i for arg in args for i in argument(arg)) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 26b0f8eb8076..e7383d7cf76b 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -583,7 +583,7 @@ def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: def has_tensor_options(f: NativeFunction) -> bool: return any(filter(lambda a: isinstance(a, TensorOptionsArguments), - cpp.group_arguments(f.func, method=False))) + cpp.group_arguments(f.func, method=False, faithful=True))) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -731,7 +731,7 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> # Skip SelfArgument if this is method. # Skip TensorOptionsArguments in C++ signature. Python side TensorOptions # arguments are created based on different rules - see below. - cpp_args = cpp.group_arguments(f.func, method=method) + cpp_args = cpp.group_arguments(f.func, method=method, faithful=True) args = tuple(a for a in cpp_args if isinstance(a, Argument)) input_arg_set = set(a.name for a in f.func.arguments.positional) diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 32caf26f223f..796cd68f9233 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -68,7 +68,7 @@ class CppSingleArgumentPack(CppArgumentPackIface): this: CppArgument def no_default(self) -> 'CppSingleArgumentPack': - return CppSingleArgumentPack(self.this.no_default()) + return CppSingleArgumentPack(this=self.this.no_default()) @property def type(self) -> str: @@ -150,49 +150,25 @@ class CppSignature: # The schema this signature is derived from func: FunctionSchema - # Enough information about the C++ types to generate a full - # C++ type signature for this signature. I'm not too sure - # if these are the right representations, so for now this - # is intended to be more abstract. - _argument_packs: Tuple[CppArgumentPack, ...] - _returns_type: str + # Is this a C++ signature for a method, i.e. Tensor::my_op(...)? + method: bool + + # Is this a faithful C++ signature (i.e. following the JIT schema) or a convenience API + # (i.e. with a potential TensorOptions argument and out arguments in the front) + faithful: bool # Return the unpacked argument structure of this signature, # discarding information about which arguments are semantically # related to each other. def arguments(self) -> Sequence[CppArgument]: - return [sub_a for a in self._argument_packs for sub_a in a.explicit_arguments()] + return [sub_a for a in self.argument_packs() for sub_a in a.explicit_arguments()] # Return the packed argument structure of this signature. This preserves # high-level structure of the arguments so you may find it easier to do # translations working with this representation. 
def argument_packs(self) -> Sequence[CppArgumentPack]: - return self._argument_packs - - # Render the C++ declaration for this signature - def decl(self) -> str: - cpp_args_str = ', '.join(map(str, self.arguments())) - return f"{self._returns_type} {cpp.name(self.func)}({cpp_args_str})" - - # Render the C++ definition for this signature, not including - # the body (with curly braces) - def defn(self, name: Optional[str] = None, *, prefix: str = "") -> str: - cpp_args_str = ', '.join(a.str_no_default() for a in self.arguments()) - if name is None: - name = prefix + cpp.name(self.func) - return f"{self._returns_type} {name}({cpp_args_str})" - - # NB: This constructor knows how to disambiguate defaults when - # faithful is True. Ideally this would live as an external process - # see https://github.com/pytorch/pytorch/pull/45666 - @staticmethod - def _from_grouped_arguments( - func: FunctionSchema, - arguments: Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]], - *, - faithful: bool - ) -> 'CppSignature': - if faithful: + grouped_args = cpp.group_arguments(self.func, method=self.method, faithful=self.faithful) + if self.faithful: # Faithful signatures will ungroup arguments into argument # packs. # @@ -201,17 +177,31 @@ def _from_grouped_arguments( # principle, we should be able to do this at some later # point in time with other overload disambiguation argument_packs = tuple( - cpp.argument_faithful(a).no_default() for a in arguments + cpp.argument_faithful(a).no_default() for a in grouped_args ) else: argument_packs = tuple( - cpp.argument(a) for a in arguments + cpp.argument(a) for a in grouped_args ) - return CppSignature( - func=func, - _argument_packs=argument_packs, - _returns_type=cpp.returns_type(func.returns), - ) + return argument_packs + + def name(self) -> str: + return cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) + + # Render the C++ declaration for this signature + def decl(self) -> str: + returns_type = cpp.returns_type(self.func.returns) + cpp_args_str = ', '.join(map(str, self.arguments())) + return f"{returns_type} {self.name()}({cpp_args_str})" + + # Render the C++ definition for this signature, not including + # the body (with curly braces) + def defn(self, *, prefix: str = "") -> str: + returns_type = cpp.returns_type(self.func.returns) + cpp_args_str = ', '.join(a.str_no_default() for a in self.arguments()) + name = prefix + self.name() + return f"{returns_type} {name}({cpp_args_str})" + # Represents group of all CppSignatures associated with a # FunctionSchema. 
Right now, that's the regular, user-visible @@ -225,13 +215,12 @@ class CppSignatureGroup: @staticmethod def from_schema(func: FunctionSchema, *, method: bool) -> 'CppSignatureGroup': - grouped_arguments = cpp.group_arguments(func, method=method) faithful_signature: Optional[CppSignature] - if any(isinstance(a, TensorOptionsArguments) for a in grouped_arguments): - faithful_signature = CppSignature._from_grouped_arguments(func, grouped_arguments, faithful=True) + if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: + faithful_signature = CppSignature(func=func, faithful=True, method=method) else: faithful_signature = None - signature = CppSignature._from_grouped_arguments(func, grouped_arguments, faithful=False) + signature = CppSignature(func=func, faithful=False, method=method) return CppSignatureGroup( func=func, signature=signature, @@ -385,7 +374,7 @@ class MetaArgument: type: str name: str # structured kernels (for which MetaArgument matters) always will - # be use_c10_dispatcher full. That means JIT arguments and + # be use_c10_dispatcher full. That means JIT arguments and # meta arguments are always in 1:1 correspondence. If this is ever not true # we will have to do something more fancy here. argument: Argument diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index bd76fc11f670..3e5744207aff 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -462,10 +462,15 @@ def __call__(self, f: NativeFunction) -> Optional[str]: assert self.target is Target.DEFINITION - def generate_defn(sig: CppSignature) -> str: + def generate_defn(faithful: bool) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) - dispatcher_exprs = dispatcher.cpparguments_exprs(sig.argument_packs()) + if faithful and sig_group.faithful_signature is not None: + sig = sig_group.faithful_signature + else: + sig = sig_group.signature + + dispatcher_exprs = dispatcher.cpparguments_exprs(f.func, method=False, api_is_faithful=faithful) dispatcher_exprs_str = ', '.join(a.expr for a in dispatcher_exprs) return f""" @@ -478,9 +483,9 @@ def generate_defn(sig: CppSignature) -> str: }} """ - result = generate_defn(sig_group.signature) + result = generate_defn(sig_group.faithful_signature is None) if sig_group.faithful_signature is not None: - result += generate_defn(sig_group.faithful_signature) + result += generate_defn(True) return result @@ -512,10 +517,16 @@ def __call__(self, f: NativeFunction) -> Optional[str]: assert self.target is Target.DEFINITION - def generate_defn(sig: CppSignature) -> str: + def generate_defn(faithful: bool) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) - dispatcher_exprs = dispatcher.cpparguments_exprs(sig.argument_packs()) + if faithful: + sig = sig_group.faithful_signature + assert sig is not None + else: + sig = sig_group.signature + + dispatcher_exprs = dispatcher.cpparguments_exprs(f.func, method=True, api_is_faithful=faithful) dispatcher_exprs_str = ', '.join(a.expr for a in dispatcher_exprs) return f""" @@ -528,9 +539,9 @@ def generate_defn(sig: CppSignature) -> str: }} """ - result = generate_defn(sig_group.signature) + result = generate_defn(faithful=False) if sig_group.faithful_signature is not None: - result += generate_defn(sig_group.faithful_signature) + result += generate_defn(faithful=True) return result @@ -848,7 +859,10 @@ def compute_declaration_yaml(f: NativeFunction) -> object: for a in schema_order_jit_arguments ] - cpp_schema_order_types = [cpp.argument(a).type for a in schema_order_jit_arguments] + 
cpp_schema_order_types = [ + cpp.argument(a).type for a in schema_order_jit_arguments + ] + cpp_returns = cpp.returns_type(f.func.returns) schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})" From b643dbb8a4bf9850171b2de848b6b89206973972 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 8 Dec 2020 03:41:19 -0800 Subject: [PATCH 127/132] VariableType calls faithful C++ API for c10-full out ops (#47792) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47792 For operators with out arguments, VariableType previously called the out overload of the C++ API because that's all we had. We introduced a faithful C++ API that takes out arguments in schema-order in D24835252 and this PR changes VariableType to use that API instead. Note that this only applies to c10-full ops. Non-c10-full ops still call the unfaithful API. There aren't any c10-full out ops at the moment. So this PR can only be tested and evaluated together with PRs on top that make ops with out arguments c10-full. ghstack-source-id: 118068088 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D24901945 fbshipit-source-id: a99db7e4d96fcc421f9664504f87df68fe1c482f --- tools/autograd/gen_variable_type.py | 9 +++++++-- tools/autograd/utils.py | 8 +++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index f0ecd55a9b66..4948ac3af0dc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -23,7 +23,7 @@ # differentiable subcomponents. # -from .utils import CodeTemplate, nested_dict, write +from .utils import CodeTemplate, nested_dict, write, make_out_api_name_faithful from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT from .gen_autograd_functions import uses_single_grad @@ -614,8 +614,13 @@ def save_variables( def emit_dispatch_call(api_name, input_base, unpacked_args): """ Dispatch call via function in a namespace or method on Tensor.""" if 'namespace' in declaration['method_of']: + if declaration['use_c10_dispatcher'] in ['hacky_wrapper_for_legacy_signatures', 'full']: + dispatcher_api_name = make_out_api_name_faithful(api_name) + else: + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + dispatcher_api_name = api_name call = CALL_DISPATCH_VIA_NAMESPACE.substitute( - api_name=api_name, + api_name=dispatcher_api_name, unpacked_args=unpacked_args) else: call = CALL_DISPATCH_VIA_METHOD.substitute( diff --git a/tools/autograd/utils.py b/tools/autograd/utils.py index b4889d219e9c..86758b5b3ff3 100644 --- a/tools/autograd/utils.py +++ b/tools/autograd/utils.py @@ -47,8 +47,14 @@ def split_name_params(prototype): def uninplace_api_name(api_name): if api_name.endswith('_') and not api_name.endswith('__'): api_name = api_name[:-1] + return unout_api_name(api_name) + +def make_out_api_name_faithful(api_name): + # Variable kernel needs to call the _outf overload instead of the _out overload + # because the _outf overload matches the argument order as it's passed into + # the variable kernel if api_name.endswith('_out'): - api_name = api_name[:-4] + api_name = api_name + 'f' return api_name From 07978bd62e0c59b75bdcbf993ccdf9e127d7bf9a Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Tue, 8 Dec 2020 05:52:48 -0800 Subject: [PATCH 128/132] [static runtime] fuse inference ops (1) (#48948) Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/48948 Fuse inference ops for the following inside static runtime: ConcatAddMulReplaceNaNClip CastedBatchOneHotLengths ConcatBatchMatMulBatchGather TODO: 1. add unit tests 2. add more restrictions on the graph transform (e.g. check inputs, check outputs not used elsewhere) Test Plan: Run adindexer model with static runtime and fusion; check ops ``` MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 numactl -m 0 -C 3 ./buck-out/opt/gen/caffe2/caffe2/fb/predictor/ptvsc2_predictor_bench --scripted_model=/data/users/ansha/tmp/adindexer/traced_precomputation2.pt --pt_inputs=/data/users/ansha/tmp/adindexer/merge/container_precomputation_bs1.pt --iters=3000 --warmup_iters=10000 --num_threads=1 --pred_net=/data/users/ansha/tmp/adindexer/precomputation_merge_net.pb --c2_inputs=/data/users/ansha/tmp/adindexer/merge/c2_inputs_precomputation_bs1.pb --c2_sigrid_transforms_opt=1 --c2_use_memonger=1 --c2_weights=/data/users/ansha/tmp/adindexer/merge/c2_weights_precomputation.pb --pt_enable_static_runtime ``` transformed model graph contains the fused ops: P151559641 Results before fusion: P151567611 Results after fusion: P151566783 (8% speedup for bs=20, 14% speedup for bs=1) Reviewed By: hlu1 Differential Revision: D25224107 fbshipit-source-id: c8442e8ceb018879c61ce564367b1c1b9412601b --- tools/build_variables.bzl | 1 + torch/csrc/jit/runtime/static/impl.cpp | 3 + torch/csrc/jit/runtime/static/passes.cpp | 83 ++++++++++++++++++++++++ torch/csrc/jit/runtime/static/passes.h | 9 +++ 4 files changed, 96 insertions(+) create mode 100644 torch/csrc/jit/runtime/static/passes.cpp create mode 100644 torch/csrc/jit/runtime/static/passes.h diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 7e5a5e4e7f8a..146abca386eb 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -268,6 +268,7 @@ core_sources_full_mobile = [ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/runtime/static/impl.cpp", "torch/csrc/jit/runtime/static/ops.cpp", + "torch/csrc/jit/runtime/static/passes.cpp", ] libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 07d41fb1f642..dabc19dfc4fe 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include namespace torch { @@ -21,6 +22,8 @@ void OptimizeGraph(std::shared_ptr& graph) { ConstantPropagation(graph); RemoveTensorMutation(graph); ConstantPropagation(graph); + FuseInferenceOpsForSparseNN(graph); + ConstantPropagation(graph); } void CheckGraphEligibility(const std::shared_ptr& graph) { diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp new file mode 100644 index 000000000000..a75d187b2a49 --- /dev/null +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -0,0 +1,83 @@ +#include +#include + +namespace torch { +namespace jit { + +void ConcatAddMulReplaceNaNClip(std::shared_ptr& graph) { + // TODO:: check restrictions for inputs; outputs not used elsewhere + std::string pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g, %h, %i, %j): + %y0 = aten::cat(%a, %b) + %y1 = aten::add(%y0, %c, %d) + %y2 = aten::mul(%y1, %e) + %y3 = aten::nan_to_num(%y2, %f, %g, %h) + %res = aten::clamp(%y3, %i, %j) + return (%res))IR"; + std::string pattern2 = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g, %h, %i, %j): + %y0 = aten::cat(%a, %b) + %y1 = aten::add(%y0, %c, 
%d) + %y2 = aten::mul(%y1, %e) + %y3 = aten::nan_to_num_(%y2, %f, %g, %h) + %res = aten::clamp(%y3, %i, %j) + return (%res))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g, %h, %i, %j): + %res = fb::concat_add_mul_replacenan_clip(%c, %e, %a, %i, %j) + return (%res))IR"; + + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); + + fuse.RegisterRewritePattern(pattern2, fused_pattern); + fuse.runOnGraph(graph); +} + +void CastedBatchOneHotLengths(std::shared_ptr& graph) { + // TODO:: check restrictions for inputs; outputs not used elsewhere + std::string pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g): + %y0 : Tensor = aten::to(%a, %b, %c, %c, %d) + %y1 : Tensor = fb::batch_one_hot_lengths(%y0, %e, %f) + %res : Tensor = aten::to(%y1, %g, %c, %c, %d) + return (%res))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g): + %res : Tensor = fb::casted_batch_one_hot_lengths(%a, %e, %f) + return (%res))IR"; + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); +} + +void ConcatBatchMatMulBatchGather(std::shared_ptr& graph) { + // TODO:: check restrictions for inputs; outputs not used elsewhere + std::string pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f): + %y0 : Tensor = aten::stack(%a, %b) + %y1 : Tensor = aten::transpose(%y0, %b, %c) + %y2 : Tensor = aten::bmm(%y0, %y1) + %y3 : Tensor = aten::flatten(%y2, %d, %e) + %res : Tensor = aten::index_select(%y3, %b, %f) + return (%res))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f): + %res : Tensor = fb::concat_batch_matmul_batch_gather(%f, %a) + return (%res))IR"; + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); +} + +void FuseInferenceOpsForSparseNN(std::shared_ptr& graph) { +#ifdef FBCODE_CAFFE2 + ConcatAddMulReplaceNaNClip(graph); + CastedBatchOneHotLengths(graph); + ConcatBatchMatMulBatchGather(graph); +#endif +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h new file mode 100644 index 000000000000..7cc9c52f7696 --- /dev/null +++ b/torch/csrc/jit/runtime/static/passes.h @@ -0,0 +1,9 @@ +#include + +namespace torch { +namespace jit { + +void FuseInferenceOpsForSparseNN(std::shared_ptr& graph); + +} // namespace jit +} // namespace torch From 39445f718c7190ee1d703be8028c8bd5c7d80f85 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Tue, 8 Dec 2020 07:03:44 -0800 Subject: [PATCH 129/132] Revert D25375885: [pytorch][PR] Reenable some BF16 tests on CUDA Test Plan: revert-hammer Differential Revision: D25375885 (https://github.com/pytorch/pytorch/commit/e3893b867fd39cf4f10a129ba9f689eebf10f82b) Original commit changeset: 2e19fe725ae9 fbshipit-source-id: 69829f3fff4d4a2d1a71bb52e90d3c7f16b27fa3 --- test/test_tensor_creation_ops.py | 3 ++- test/test_torch.py | 44 +++++++++++++------------------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 9be3e6db5bf0..b355005b1c69 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -14,7 +14,7 @@ IS_WINDOWS) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, - onlyCPU, largeTensorTest, precisionOverride, dtypes, + onlyCPU, skipCUDAIfNotRocm, largeTensorTest, precisionOverride, dtypes, onlyCUDA, 
skipCPUIf, dtypesIfCUDA, dtypesIfCPU) # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests @@ -2581,6 +2581,7 @@ def test_arange_device_vs_cpu(self, device, dtype): self.assertEqual(cpu_tensor, device_tensor) @onlyCUDA + @skipCUDAIfNotRocm def test_arange_bfloat16(self, device): ref_tensor = torch.tensor([0, 1, 2, 3], dtype=torch.bfloat16, device=device) bfloat16_tensor = torch.arange(0, 4, dtype=torch.bfloat16, device=device) diff --git a/test/test_torch.py b/test/test_torch.py index ad88128617c9..2d181c3b9400 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6316,6 +6316,10 @@ def test_copy_broadcast(self, device) -> None: torch.uint8 ] +# _types2 adds bfloat16 type to _types only on ROCm. Should eventually be unified +# with _types when bfloat16 bringup is complete on all platforms. +_types2 = _types + [torch.bfloat16] if TEST_WITH_ROCM else _types + _float_types = [torch.half, torch.float, torch.double] _complex_types = [torch.cfloat, torch.cdouble] @@ -6597,14 +6601,10 @@ def inner(self, device, dtype): ('dot', '', _medium_1d, lambda t, d: [_medium_1d(t, d)], 1e-2, 1e-5, 1e-5, _float_types + _complex_types, _cpu_types, False), ('element_size', '', _medium_1d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), - ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), ('equal', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('equal', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -6618,14 +6618,10 @@ def inner(self, device, dtype): ('lcm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 0, 0, 0, [torch.int16, torch.int32, torch.int64], [torch.int16, torch.int32, torch.int64], True, [onlyOnCPUAndCUDA]), - ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + 
('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), ('is_contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), # TODO: can't check negative case - cross-device copy is contiguous ('is_same_size', 'negative', _medium_2d, lambda t, d: [_small_3d(t, d)], @@ -6709,16 +6705,12 @@ def inner(self, device, dtype): torch.LongTensor([[1], [2]]).to(dtype=_convert_t(t, d), device=d), True], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), lambda t, d: [], 1e-2, 1e-1, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), + lambda t, d: [], 1e-2, 1e-1, 1e-5, _types2, _cpu_types, False), + ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), + ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), + ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), + ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), ('sum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, 1e-5, _types, _cpu_types, False), ('sum', 'complex', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), ('sum', 'complex_dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), From e2befb84bc8a832ddd4ed4aff623862e5c396e5e Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 8 Dec 2020 07:47:01 -0800 Subject: [PATCH 130/132] minor README change to fix #25464 (#48970) Summary: Fixes https://github.com/pytorch/pytorch/issues/25464 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48970 Reviewed By: walterddr Differential Revision: D25396284 Pulled By: janeyx99 fbshipit-source-id: 8355c417b5c8b8865f208d7d8e8154048423afd9 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 195dffc09058..d29eacc28664 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_ex On Linux ```bash # Add LAPACK support for the GPU if needed -conda install -c pytorch magma-cuda102 # or [ magma-cuda101 | magma-cuda100 | magma-cuda92 ] depending on your cuda version +conda install -c pytorch magma-cuda110 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo ``` On MacOS From 58c13cf685969bec4d49848399cd72c8f6834857 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Tue, 8 Dec 2020 07:51:35 -0800 Subject: [PATCH 131/132] Back out "Revert D25375885: [pytorch][PR] Reenable some BF16 tests on CUDA" Summary: Revert D25397144 
69829f3fff4d4a2d1a71bb52e90d3c7f16b27fa3 Test Plan: Revert Hammer Reviewed By: janeyx99 Differential Revision: D25397572 fbshipit-source-id: 625ca2a32e4558ae4582a15697b6e1cc57cc1573 --- test/test_tensor_creation_ops.py | 3 +-- test/test_torch.py | 44 +++++++++++++++++++------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index b355005b1c69..9be3e6db5bf0 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -14,7 +14,7 @@ IS_WINDOWS) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, - onlyCPU, skipCUDAIfNotRocm, largeTensorTest, precisionOverride, dtypes, + onlyCPU, largeTensorTest, precisionOverride, dtypes, onlyCUDA, skipCPUIf, dtypesIfCUDA, dtypesIfCPU) # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests @@ -2581,7 +2581,6 @@ def test_arange_device_vs_cpu(self, device, dtype): self.assertEqual(cpu_tensor, device_tensor) @onlyCUDA - @skipCUDAIfNotRocm def test_arange_bfloat16(self, device): ref_tensor = torch.tensor([0, 1, 2, 3], dtype=torch.bfloat16, device=device) bfloat16_tensor = torch.arange(0, 4, dtype=torch.bfloat16, device=device) diff --git a/test/test_torch.py b/test/test_torch.py index 2d181c3b9400..ad88128617c9 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6316,10 +6316,6 @@ def test_copy_broadcast(self, device) -> None: torch.uint8 ] -# _types2 adds bfloat16 type to _types only on ROCm. Should eventually be unified -# with _types when bfloat16 bringup is complete on all platforms. -_types2 = _types + [torch.bfloat16] if TEST_WITH_ROCM else _types - _float_types = [torch.half, torch.float, torch.double] _complex_types = [torch.cfloat, torch.cdouble] @@ -6601,10 +6597,14 @@ def inner(self, device, dtype): ('dot', '', _medium_1d, lambda t, d: [_medium_1d(t, d)], 1e-2, 1e-5, 1e-5, _float_types + _complex_types, _cpu_types, False), ('element_size', '', _medium_1d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), - ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('equal', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('equal', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -6618,10 +6618,14 @@ def inner(self, device, dtype): ('lcm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 0, 0, 0, [torch.int16, torch.int32, torch.int64], [torch.int16, 
torch.int32, torch.int64], True, [onlyOnCPUAndCUDA]), - ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('is_contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), # TODO: can't check negative case - cross-device copy is contiguous ('is_same_size', 'negative', _medium_2d, lambda t, d: [_small_3d(t, d)], @@ -6705,12 +6709,16 @@ def inner(self, device, dtype): torch.LongTensor([[1], [2]]).to(dtype=_convert_t(t, d), device=d), True], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), - lambda t, d: [], 1e-2, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), - ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), - ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), + ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), lambda t, d: [], 1e-2, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('sum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, 1e-5, _types, _cpu_types, False), ('sum', 'complex', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), ('sum', 'complex_dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), From c29f51642ecc41da3a6a40e7685ef4decce8bbf3 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 8 Dec 2020 07:52:52 -0800 Subject: [PATCH 132/132] Modify NEON check for ARM64 on OS X (#48982) Summary: Use CMAKE_SYSTEM_PROCESSOR rather than run sysctl Fixes https://github.com/pytorch/pytorch/issues/48874 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48982 Reviewed By: walterddr Differential Revision: D25385883 Pulled By: malfet fbshipit-source-id: 
47b6dc5be8d75f6d4a66a11c564abdfe31ac90b4 --- cmake/Modules/FindARM.cmake | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cmake/Modules/FindARM.cmake b/cmake/Modules/FindARM.cmake index bd68f5f36735..acd00cfa6772 100644 --- a/cmake/Modules/FindARM.cmake +++ b/cmake/Modules/FindARM.cmake @@ -41,9 +41,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ENDIF (OMAP4_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") - EXEC_PROGRAM("/usr/sbin/sysctl -n hw.optional.arm64" OUTPUT_VARIABLE - IS_ARM64) - IF(IS_ARM64 STREQUAL "1") + IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") set(NEON_FOUND true CACHE BOOL "NEON available on ARM64") ENDIF() EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE