From f6f0fde8411882af712d53fc7f7c0bdffeb47683 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 5 Jan 2021 20:25:56 -0800 Subject: [PATCH 01/44] [reland][quant][graphmode][fx] Standalone module support {input/output}_quantized_idxs (#49754) (#50058) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50058 This PR adds the support for {input/output}_quantized_idxs for standalone module. if input_quantized_idxs = [] and output_quantized_idxs = [], the standalone module will be expecting float input and produce float output, and will quantize the input and dequantize output internally if input_quantized_idxs = [0] and otuput_qiuantized_idxs = [0], the standalone module will be expecting quantized input and produce quantized output, the input will be quantized in the parent module, and output will be dequantized in the parent module as well, this is similar to current quantized modules like nn.quantized.Conv2d For more details, please see the test case Test Plan: python test/test_quantization.py TestQuantizeFx.test_standalone_module Imported from OSS Imported from OSS Reviewed By: vkuzo Differential Revision: D25768910 fbshipit-source-id: 96c21a3456cf192c8f1400afa4e86273ee69197b --- test/quantization/test_quantize_fx.py | 126 ++++++++++++---- torch/quantization/fx/fuse.py | 11 +- torch/quantization/fx/fusion_patterns.py | 23 ++- torch/quantization/fx/observed_module.py | 10 +- .../quantization/fx/quantization_patterns.py | 4 +- torch/quantization/fx/quantize.py | 138 +++++++++++++----- torch/quantization/fx/utils.py | 6 +- torch/quantization/quantize_fx.py | 23 ++- 8 files changed, 253 insertions(+), 88 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index d014bd31f02e..7965b3cc88a4 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -573,7 +573,16 @@ def forward(self, x): m = convert_fx(m) m(tensor_input) - def test_standalone_module(self): + def _test_standalone_module( + self, + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check): + """ Test standalone module with different quantized input/quantized output + configurations + """ class StandaloneModule(torch.nn.Module): def __init__(self): super().__init__() @@ -613,45 +622,32 @@ def forward(self, x): original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) - qconfig_dict = {"": default_qconfig} - config_name = {"standalone_module_name": [("standalone", None, None)]} - config_class = {"standalone_module_class": [(StandaloneModule, None, None)]} - for prepare_config in [config_name, config_class]: + for is_name in [True, False]: + if is_name: + prepare_config = { + "standalone_module_name": [("standalone", None, interface_config)] + } + else: + prepare_config = { + "standalone_module_class": [(StandaloneModule, None, interface_config)] + } + original_m_copy = copy.deepcopy(original_m) original_ref_m_copy = copy.deepcopy(original_ref_m) + + qconfig_dict = {"": default_qconfig} # check prepared model m = prepare_fx( original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) # calibration m(data) - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - 
self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for input and output of conv in the standalone module - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=prepare_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_prepare_count_check) # check converted/quantized model m = convert_fx(m) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - count_check = { - # standalone module will take float as input and output - # so we'll see quantize and dequantize in the modoule - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d): 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=convert_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_convert_count_check) res = m(data) # quantize the reference model @@ -661,6 +657,76 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + def test_standalone_module_float_interface(self): + float_interface_config = { + "input_quantized_idxs": [], # float input + "output_quantized_idxs": [], # float output + } + interface_config = float_interface_config + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for input and output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + convert_count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # standalone module will take float as input and output + # so we'll see quantize and dequantize in the modoule + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d): 1, + ns.call_method("dequantize") : 1, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + + def test_standalone_module_quantized_interface(self): + quantized_interface_config = { + "input_quantized_idxs": [0], # quantized input + "output_quantized_idxs": [0], # quantized output + } + interface_config = quantized_interface_config + # observer for input and output of first conv + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + convert_count_check = { + # quantizing input for conv + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + # dequantizing output of standalone module + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + ns.call_module(nnq.Conv2d): 1, + # dequantization 
for output happens in parent module + ns.call_method("dequantize") : 0, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + @skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 5aabbd66c4b1..59e3851dcd57 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -21,7 +21,7 @@ from .quantization_types import Pattern -from typing import Callable, Tuple, Optional +from typing import Callable, Tuple class Fuser: @@ -59,11 +59,12 @@ def load_arg(a): model = GraphModule(input_root, self.fused_graph) return model - def _find_matches(self, root: GraphModule, graph: Graph, - patterns: Dict[Pattern, Callable] - ) -> Dict[str, Tuple[Node, Optional[Any]]]: + def _find_matches( + self, root: GraphModule, graph: Graph, + patterns: Dict[Pattern, Callable] + ) -> Dict[str, Tuple[Node, FuseHandler]]: modules = dict(root.named_modules()) - match_map = {} # node name -> (root_node, match_value?) + match_map : Dict[str, Tuple[Node, FuseHandler]] = {} # node name -> (root_node, match_value) def apply_match(pattern, node, match): if isinstance(pattern, tuple): diff --git a/torch/quantization/fx/fusion_patterns.py b/torch/quantization/fx/fusion_patterns.py index b7af6008b3f3..1749484fccec 100644 --- a/torch/quantization/fx/fusion_patterns.py +++ b/torch/quantization/fx/fusion_patterns.py @@ -6,12 +6,25 @@ from .utils import _parent_name from .quantization_types import QuantizerCls from ..fuser_method_mappings import get_fuser_method +from abc import ABC, abstractmethod from typing import Any, Callable, Dict # --------------------- -# Fusion Patterns +# Fusion Pattern Registrations # --------------------- +# Base Pattern Handler +class FuseHandler(ABC): + """ Base handler class for the fusion patterns + """ + def __init__(self, quantizer: QuantizerCls, node: Node): + pass + + @abstractmethod + def fuse(self, quantizer: QuantizerCls, load_arg: Callable, + fuse_custom_config_dict: Dict[str, Any] = None) -> Node: + pass + @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv1d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv3d)) @@ -27,9 +40,9 @@ @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -class ConvBNReLUFusion(): +class ConvBNReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = None self.bn_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ @@ -94,9 +107,9 @@ def fuse(self, quantizer: QuantizerCls, load_arg: Callable, @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm2d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm3d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm3d)) -class ModuleReLUFusion(): +class ModuleReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = node assert isinstance(node.args[0], Node) node = node.args[0] diff --git a/torch/quantization/fx/observed_module.py 
b/torch/quantization/fx/observed_module.py index a95bc184fa10..808a3b36fb4a 100644 --- a/torch/quantization/fx/observed_module.py +++ b/torch/quantization/fx/observed_module.py @@ -2,11 +2,11 @@ import copy from torch.fx import GraphModule # type: ignore from torch.fx.graph import Graph -from typing import Union, Dict, Any +from typing import Union, Dict, Any, List class ObservedGraphModule(GraphModule): - def get_preserved_attr_names(self): + def get_preserved_attr_names(self) -> List[str]: return ['_activation_post_process_map', '_patterns', '_qconfig_map', @@ -35,6 +35,12 @@ def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) class ObservedStandaloneGraphModule(ObservedGraphModule): + def get_preserved_attr_names(self) -> List[str] : + return super().get_preserved_attr_names() + [ + "_standalone_module_input_quantized_idxs", + "_standalone_module_output_quantized_idxs" + ] + def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 46fbed74bdc8..fb5bef0bd0ad 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -755,10 +755,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, qconfig = quantizer.qconfig_map[node.name] convert = torch.quantization.quantize_fx._convert_standalone_module_fx # type: ignore observed_standalone_module = quantizer.modules[node.target] + input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs.tolist() quantized_standalone_module = convert(observed_standalone_module, debug=debug) parent_name, name = _parent_name(node.target) # update the modules dict setattr(quantizer.modules[parent_name], name, quantized_standalone_module) quantizer.modules[node.target] = quantized_standalone_module - # standalone module takes float input - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index af9496a66a63..318295270b61 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -102,14 +102,15 @@ def insert_observer( 'call_module', observer_name, (load_arg(node),), {}) observed_node_names_set.add(node.name) -def insert_observer_for_special_module( +def maybe_insert_observer_for_special_module( quantize_handler: QuantizeHandler, modules: Dict[str, torch.nn.Module], - prepare_custom_config_dict: Any, qconfig: Any, node: Node): + prepare_custom_config_dict: Any, qconfig: Any, node: Node) -> Optional[List[int]]: """ Insert observer for custom module and standalone module Returns: standalone_module_input_idxs: the indexs for inputs that needs to be observed by parent module """ assert modules is not None + standalone_module_input_idxs = None if isinstance(quantize_handler, CustomModuleQuantizeHandler): custom_module = modules[node.target] # type: ignore custom_module_class_mapping = prepare_custom_config_dict.get( @@ -129,19 +130,22 @@ def insert_observer_for_special_module( class_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_class_configs} name_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_name_configs} config = class_config_map.get(type(standalone_module), (None, None)) - config = 
name_config_map.get(node.target, (None, None)) - standalone_module_qconfig_dict = {"": qconfig} if config[0] is None else config[0] - standalone_prepare_config_dict = {} if config[1] is None else config[1] + config = name_config_map.get(node.target, config) + sm_qconfig_dict = {"": qconfig} if config[0] is None else config[0] + sm_prepare_config_dict = {} if config[1] is None else config[1] prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ - prepare(standalone_module, standalone_module_qconfig_dict, standalone_prepare_config_dict) + prepare(standalone_module, sm_qconfig_dict, sm_prepare_config_dict) + standalone_module_input_idxs = observed_standalone_module.\ + _standalone_module_input_quantized_idxs.int().tolist() observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) setattr(modules[parent_name], name, observed_standalone_module) modules[node.target] = observed_standalone_module # type: ignore + return standalone_module_input_idxs def insert_observer_for_output_of_the_node( node: Node, @@ -155,7 +159,8 @@ def insert_observer_for_output_of_the_node( observed_graph: Graph, load_arg: Callable, observed_node_names_set: Set[str], - matched_nodes: Optional[List[Node]]): + matched_nodes: Optional[List[Node]], + standalone_module_input_idxs: Optional[List[int]]): """ Insert observer/fake_quantize module for output of the observed module if needed """ @@ -215,8 +220,13 @@ def input_is_observed(arg): observed_node_names_set.add(node.name) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - # output is observed in the standalone module - return + assert node.op == "call_module" + assert isinstance(node.target, str) + sm_out_qidxs = modules[node.target]._standalone_module_output_quantized_idxs.tolist() # type: ignore + output_is_quantized = 0 in sm_out_qidxs + + if output_is_quantized: + observed_node_names_set.add(node.name) elif (quantize_handler.all_node_args and input_output_observed(quantize_handler)): # observer for outputs @@ -226,6 +236,16 @@ def input_is_observed(arg): activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) + # insert observer for input of standalone module + if standalone_module_input_idxs is not None: + for idx in standalone_module_input_idxs: + if node.args[idx].name not in observed_node_names_set: # type: ignore + new_observer = qconfig.activation() + insert_observer( + node, new_observer, model, + activation_post_process_map, env, observed_graph, + load_arg, observed_node_names_set) + def insert_observer_for_input_arg_of_observed_node( node: Node, observed_node_names_set: Set[str], quants: Dict[str, Tuple[DefaultQuantizeHandler, Callable]], @@ -373,10 +393,19 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
- When we are preparing a standalone module: - both input and output are observed in prepared standalone module + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module Returns: model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexs for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} @@ -430,8 +459,6 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, def load_arg(a): return map_arg(a, lambda node: env[node.name]) - # indexes for the inputs that needs to be observed - standalone_module_observed_input_idxs: List[int] = [] graph_inputs = [] for node in model.graph.nodes: if node.op == 'placeholder': @@ -487,14 +514,15 @@ def load_arg(a): # parent if qconfig is not None: assert obj is not None - insert_observer_for_special_module( - obj, self.modules, prepare_custom_config_dict, qconfig, - node) + standalone_module_input_idxs = \ + maybe_insert_observer_for_special_module( + obj, self.modules, prepare_custom_config_dict, qconfig, + node) insert_observer_for_output_of_the_node( node, obj, qconfig, self.modules, model, pattern, self.activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set, - matched_nodes) + matched_nodes, standalone_module_input_idxs) else: env[node.name] = observed_graph.node_copy(node, load_arg) @@ -516,6 +544,21 @@ def load_arg(a): model = GraphModule(model, observed_graph) self.save_state(model) model = mark_observed_module(model) + if is_standalone_module: + assert result_node is not None + assert isinstance(result_node.args[0], Node), \ + "standalone module only supports returning simple value currently"\ + "(not tuple, dict etc.)" + # indicator for whether output is observed or not. + # This used for correctly quantize standalone modules + output_is_observed = \ + result_node.args[0].name in observed_node_names_set + # these inputs are observed in parent + # converting List[int] to Tensor since module attribute is + # Union[Tensor, Module] + model._standalone_module_input_quantized_idxs = \ + torch.Tensor(input_quantized_idxs) + model._standalone_module_output_quantized_idxs = torch.Tensor(output_quantized_idxs) return model def save_state(self, observed: GraphModule) -> None: @@ -569,8 +612,10 @@ def _convert(self, model: GraphModule, debug: bool = False, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Returns a quantized standalone module which accepts float input - and produces float output. 
+ Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ if convert_custom_config_dict is None: convert_custom_config_dict = {} @@ -627,36 +672,50 @@ def load_x(n: Node) -> Node: else: return env[n.name] - def load_arg(quantized: Optional[Union[List[Any], bool, Tuple[Any, ...]]] + def load_arg(quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] ) -> Callable[[Node], Argument]: """ Input: quantized, which can be None, list, boolean or tuple - - if quantized is a list or tuple, then arg should be a list and - the args with corresponding indexes will be quantized - - if quantized is a boolean, then all args will be - quantized/not quantized - if quantized is None, then we'll load the node as long as it exists + - if quantized is a boolean, then all args will be + quantized/not quantized + - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=False) + - if quantized is a list or tuple, then arg should be a list and + the args with corresponding indexes will be quantized Output: fn which takes arg_or_args, and loads them from the corresponding environment depending on the value of quantized. """ assert quantized is None or \ isinstance(quantized, (tuple, list, bool)), type(quantized) + if isinstance(quantized, (tuple, list)) and len(quantized) == 0: + # empty tuple or list means nothing is quantized + quantized = False def load_arg_impl(arg_or_args): - if quantized is None: + # we'll update the format of `quantized` + # to better match arg_or_args + updated_quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] = quantized + + if isinstance(quantized, (tuple, list)) and \ + len(quantized) == 1 and isinstance(arg_or_args, Node): + # when argument is one Node instead of tuple, we just need to check + # 0 is in the quantized list + updated_quantized = 0 in quantized + + if updated_quantized is None: return map_arg(arg_or_args, load_x) - if isinstance(quantized, bool): + if isinstance(updated_quantized, bool): return map_arg( arg_or_args, - load_quantized if quantized else load_non_quantized) - elif isinstance(quantized, (tuple, list)): + load_quantized if updated_quantized else load_non_quantized) + elif isinstance(updated_quantized, (tuple, list)): assert isinstance(arg_or_args, (tuple, list)), arg_or_args loaded_args = [] # for now, we only support quantizing positional arguments for i, a in enumerate(arg_or_args): - if i in quantized: + if i in updated_quantized: loaded_args.append(map_arg(a, load_quantized)) else: loaded_args.append(map_arg(a, load_non_quantized)) @@ -690,10 +749,10 @@ def node_arg_is_quantized(node_arg: Any) -> bool: def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None - # by default the output is expected to be quantized + # by default the output for a quantizable node is expected to be quantized quantized = True - # Need to get correct quantized/non-quantized state for the output + # Need to get correct quantized/non-quantized state forn the output # of CopyNode if type(obj) in [ CopyNode, @@ -750,7 +809,7 @@ def insert_quantize_node(node: Node) -> None: "output_quantized_idxs", []) for node in model.graph.nodes: - if node.op == 'output': + if node.op == "output": cur_output_node_idx = output_node_seen_cnt output_node_seen_cnt += 1 if cur_output_node_idx in 
output_quantized_idxs: @@ -775,12 +834,19 @@ def insert_quantize_node(node: Node) -> None: quantized = False else: assert obj is not None + # We will get whether the output is quantized or not before + # convert for standalone module and after convert + # for non-standalone module, since _standalone_module_output_quantized_idxs + # is only available in observed standalone module + if is_observed_standalone_module_node: + out_quant_idxs = self.modules[node.target]._standalone_module_output_quantized_idxs.tolist() # type: ignore + assert len(out_quant_idxs) <= 1, "Currently standalone only support one output" + quantized = 0 in out_quant_idxs + result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) - if is_observed_standalone_module_node: - quantized = False - else: + if not is_observed_standalone_module_node: quantized = is_output_quantized(node, obj) if quantized: @@ -929,7 +995,7 @@ def _find_matches( standalone_module_names = [] match_map: Dict[str, MatchResult] = {} - all_matched = set() + all_matched : Set[str] = set() def record_match(pattern, node, matched): if isinstance(pattern, tuple): diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index c1f849803342..8285e204b1ed 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -9,7 +9,7 @@ Node, ) -from typing import Callable, Optional, List, Dict, Any +from typing import Callable, Optional, List, Dict, Any, Set # turn foo.bar -> ['foo', 'bar'] def _parent_name(target): @@ -140,7 +140,7 @@ def get_next_qparams_idx(module, qparams): inputs.append(graph.create_node('get_attr', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) -def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key): +def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key) -> List[Any]: r""" Get all the unique custom module keys in the custom config dict e.g. Input: @@ -163,7 +163,7 @@ def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key): [CustomModule1, CustomModule2, CustomModule3] """ # using set to dedup - float_custom_module_classes = set() + float_custom_module_classes : Set[Any] = set() custom_module_mapping = custom_config_dict.get(custom_config_dict_key, {}) for quant_mode in ["static", "dynamic", "weight_only"]: quant_mode_custom_module_config = custom_module_mapping.get(quant_mode, {}) diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index cba104b8f783..89ba877ffe78 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -107,8 +107,20 @@ def _prepare_standalone_module_fx( standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Both input and output of the module are observed in the - standalone module. 
+ How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module + + Returns: + model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexs for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module """ return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) @@ -378,8 +390,9 @@ def _convert_standalone_module_fx( r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model - Return: - A quantized standalone module which accepts float input - and produces float output. + Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) From 57d489e43a5b915cdb4bd8a16112ac68eb792581 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 5 Jan 2021 22:34:19 -0800 Subject: [PATCH 02/44] Fix for possible RNG offset calculation bug in cuda vectorized dropout with VEC=2 (#50110) Summary: The [offset calculation](https://github.com/pytorch/pytorch/blob/e3c56ddde67ca1a49159ffa886d889b6e65c7033/aten/src/ATen/native/cuda/Dropout.cu#L328) (which gives an estimated ceiling on the most 32-bit values in the philox sequence any thread in the launch will use) uses the hardcoded UNROLL value of 4, and assumes the hungriest threads can use every value (.x, .y, .z, and .w) their curand_uniform4 calls provide. However, the way fused_dropout_kernel_vec is currently written, that assumption isn't true in the VEC=2 case: Each iteration of the `grid x VEC` stride loop, each thread calls curand_uniform4 once, uses rand.x and rand.y, and discards rand.z and rand.w. This means (I _think_) curand_uniform4 may be called twice as many times per thread in the VEC=2 case as for the VEC=4 case or the fully unrolled code path, which means the offset calculation (which is a good estimate for the latter two cases) is probably wrong for the `fused_dropout_kernel_vec<..., /*VEC=*/2>` code path. The present PR inserts some value-reuse in fused_dropout_kernel_vec to align the number of times curand_uniform4 is called for launches with the same totalElements in the VEC=2 and VEC=4 cases. The diff should - make the offset calculation valid for all code paths - provide a very small perf boost by reducing the number of curand_uniform4 calls in the VEC=2 path - ~~make results bitwise accurate for all code paths~~ nvm, tensor elements are assigned to threads differently in the unrolled, VEC 2 and VEC 4 cases, so we're screwed here no matter what. ngimel what do you think? 
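For intuition, here is a tiny host-side model of the per-thread accounting described above. This is an illustration only, not code from Dropout.cu; the element count and helper names are made up, and it only counts calls rather than generating anything:

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative model only: how many curand_uniform4 calls a single thread ends
// up making when it is responsible for `elems` tensor elements.
static int64_t calls_one_per_iteration(int64_t elems, int64_t vec) {
  // old VEC=2 behaviour (and the VEC=4 path): one curand_uniform4 call per
  // iteration of the grid x VEC stride loop
  return (elems + vec - 1) / vec;
}

static int64_t calls_four_values_per_call(int64_t elems) {
  // behaviour after this PR for every path: all four generated values are
  // consumed before drawing again
  return (elems + 3) / 4;
}

int main() {
  const int64_t elems = 1024;
  std::printf("VEC=4:            %lld calls\n", (long long)calls_one_per_iteration(elems, 4));
  std::printf("VEC=2 before fix: %lld calls\n", (long long)calls_one_per_iteration(elems, 2));
  std::printf("VEC=2 after fix:  %lld calls\n", (long long)calls_four_values_per_call(elems));
  // The per-thread philox offset is reserved using the UNROLL=4 figure, so the
  // pre-fix VEC=2 count (twice as large) could walk past that reservation.
  return 0;
}
```

With the reuse in place, the VEC=2 path draws the same number of float4s as the VEC=4 and fully unrolled paths, so the UNROLL=4-based offset estimate becomes a valid upper bound for all three code paths.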
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50110 Reviewed By: smessmer Differential Revision: D25790121 Pulled By: ngimel fbshipit-source-id: f8f533ad997268c6f323cf4d225de547144247a8 --- aten/src/ATen/native/cuda/Dropout.cu | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 67adbaabbb84..c3e456d97056 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -57,6 +57,12 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, accscalar_t pinv = accscalar_t(1)/p; + // Helps align the total number of times curand_uniform4 is called by each thread for the same totalElements + // in the vec=2 and vec=4 cases. + bool gridxvec_loop_state = 0; + + float4 rand; + // Note: Vectorized loads means we'll stride each thread by an additional VEC factor, as we'll load VEC elements at a time for (IndexType linearIndex = idx * VEC; linearIndex < totalElements; @@ -69,12 +75,21 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, //curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything // Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4) // sets of rand. - float4 rand = curand_uniform4(&state); + if ((VEC == 4) || (gridxvec_loop_state == 0)) { + rand = curand_uniform4(&state); + } else { + // sets up the last two values we generated last iteration to be used this iteration. + rand.x = rand.z; + rand.y = rand.w; + gridxvec_loop_state ^= 1; + } rand.x = rand.x < p; rand.y = rand.y < p; - rand.z = rand.z < p; - rand.w = rand.w < p; + if (VEC == 4) { + rand.z = rand.z < p; + rand.w = rand.w < p; + } // Note: We explicitly check for is_contiguous() before launching the vectorized kernel // and replace IndexToOffset call with linearIndex to allow vectorization of NHWC (or other) From 282552dde2415d3cb3e4b1f0b18356810cf1ecd4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 5 Jan 2021 22:57:12 -0800 Subject: [PATCH 03/44] [PyTorch] Reapply D25546409: Use .sizes() isntead of .size() in cat_serial_kernel_impl (#49762) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49762 This was reverted because it landed in a stack together with D25542799 (https://github.com/pytorch/pytorch/commit/9ce1df079f6ea90dd4b7f9aa12a1a78d51a8b204), which really was broken. 
ghstack-source-id: 119326870 Test Plan: CI Reviewed By: maratsubkhankulov Differential Revision: D25685905 fbshipit-source-id: f4ec9e114993f988d4af380677331c72dfe41c44 --- aten/src/ATen/native/cpu/CatKernel.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index 299850407da3..f86adb8e6318 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -15,18 +15,20 @@ struct InputMeta { InputMeta(const Tensor& t, int64_t dim, int64_t inner) : data_ptr(t.data_ptr()) - , inner_size(t.size(dim) * inner) {} + , inner_size(t.sizes()[dim] * inner) {} }; template void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { - int64_t outer = result.numel() / (result.size(dim) * result.stride(dim)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + dim >= 0 && dim < result.dim(), "dim out of range in cat_serial_kernel_impl"); + int64_t outer = result.numel() / (result.sizes()[dim] * result.strides()[dim]); scalar_t* result_data = result.data_ptr(); int64_t ninputs = tensors.size(); std::vector inputs; inputs.reserve(ninputs); for (auto const &tensor : tensors) { - inputs.emplace_back(tensor, dim, result.stride(dim)); + inputs.emplace_back(tensor, dim, result.strides()[dim]); } using Vec = vec256::Vec256; From ad7d208ba5f2c5614679a7999918b75ae74530e9 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Tue, 5 Jan 2021 23:20:42 -0800 Subject: [PATCH 04/44] Revert D25239967: [fx] Add matrix multiplication fusion pass Test Plan: revert-hammer Differential Revision: D25239967 (https://github.com/pytorch/pytorch/commit/9b7f3fa146d350628b295ab9b794d64173f17da1) Original commit changeset: fb99ad25b7d8 fbshipit-source-id: 370167b5ade8bf2b3a6cccdf4290ea07b8347c79 --- test/test_fx_experimental.py | 123 --------------- torch/fx/experimental/merge_matmul.py | 215 -------------------------- 2 files changed, 338 deletions(-) delete mode 100644 torch/fx/experimental/merge_matmul.py diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index ac71d6037591..6e9c877b8de6 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -21,7 +21,6 @@ PartitionMode ) from torch.fx.experimental.fuser import fuse -from torch.fx.experimental import merge_matmul try: from torchvision.models import resnet18 @@ -845,128 +844,6 @@ def forward(self, a): for p_name in para_list: assert p_name in node.attrs_for_lowering - def test_merge_matmuls(self): - """ - A collection of test cases for torch.fx.experimental.merge_matmul, - a graph transformation that merges matrix multiplication operations. - """ - # Utility function for counting matmuls for test assertions. - def _count_matmuls(mod): - gm = torch.fx.symbolic_trace(mod) - - num_matmuls = 0 - for node in gm.graph.nodes: - if node.target == torch.matmul: - num_matmuls += 1 - - return num_matmuls - - # Simple test case in which there are two matmuls of the same size to merge. - class SimpleMergeMatmulModule(torch.nn.Module): - def __init__(self, rhs): - super().__init__() - self.rhs = rhs - - def forward(self, x, y): - a = torch.matmul(x, self.rhs) - b = torch.matmul(y, self.rhs) - return a + b - - # Initialize inputs. - a = torch.randn(3, 3) - b = torch.randn(3, 3) - - # Initialize RHS for matmuls. - rhs = torch.randn(3, 4) - - # Construct SimpleMergeMatmulModule and call merge_matmul on it. 
- module = SimpleMergeMatmulModule(rhs) - opt_module = merge_matmul.merge_matmul(module) - - # Numerical correctness check. - before = module(a, b) - after = opt_module(a, b) - before.allclose(after) - - # Basic graph structure check; original module should have 2 matmuls - # and optimized module should have 1. - self.assertEqual(_count_matmuls(module), 2) - self.assertEqual(_count_matmuls(opt_module), 1) - - # Test case in which there are multiple matmuls of different sizes to merge. - class FiveMergeMatmulModule(torch.nn.Module): - def __init__(self, rhs): - super().__init__() - self.rhs = rhs - - def forward(self, a, b, c, d, e): - s = torch.Tensor((0)) - matmuls = [] - - # For some reason using a list comprehension or for-loop for this - # doesn't work. - matmuls.append(torch.matmul(a, self.rhs)) - matmuls.append(torch.matmul(b, self.rhs)) - matmuls.append(torch.matmul(c, self.rhs)) - matmuls.append(torch.matmul(d, self.rhs)) - matmuls.append(torch.matmul(e, self.rhs)) - - for m in matmuls: - s += torch.sum(m) - - return s - - # Initialize inputs. - inputs = [torch.randn(2 * i + 1, 5) for i in range(5)] - - # Initialize RHS. - rhs = torch.randn(5, 4) - - # Construct FiveMergeMatmulModule and call merge_matmul on it. - module = FiveMergeMatmulModule(rhs) - opt_module = merge_matmul.merge_matmul(module) - - # Numerical correctness check. - before = module(*inputs) - after = opt_module(*inputs) - before.allclose(after) - - # Basic graph structure check; original module should have len(inputs) matmuls - # and optimized module should have 1. - self.assertEqual(_count_matmuls(module), len(inputs)) - self.assertEqual(_count_matmuls(opt_module), 1) - - # Simple test case in which two matmuls cannot be merged due to a data dependency between - # the LHS operands. - class UnmergeableMatmulModule(torch.nn.Module): - def __init__(self, rhs): - super().__init__() - self.rhs = rhs - - def forward(self, x): - a = torch.matmul(x, self.rhs) - a_abs = torch.abs(a) - b = torch.matmul(a_abs.transpose(1, 0), self.rhs) - return b - - # Initialize inputs. - a = torch.randn(3, 3) - - # Initialize RHS for matmuls. - rhs = torch.randn(3, 4) - - # Construct UnmergeableMatmulModule and call merge_matmul on it. - module = UnmergeableMatmulModule(rhs) - opt_module = merge_matmul.merge_matmul(module) - - # Numerical correctness check. - before = module(a) - after = opt_module(a) - before.allclose(after) - - # Basic graph structure check; the number of matrix multiplcations should not have changed. - self.assertEqual(_count_matmuls(module), 2) - self.assertEqual(_count_matmuls(opt_module), 2) if __name__ == "__main__": run_tests() diff --git a/torch/fx/experimental/merge_matmul.py b/torch/fx/experimental/merge_matmul.py deleted file mode 100644 index a5bd24c84c12..000000000000 --- a/torch/fx/experimental/merge_matmul.py +++ /dev/null @@ -1,215 +0,0 @@ -import torch - -import itertools -import operator - -from typing import List - - -def get_first_dim(t: torch.Tensor) -> int: - """ - A free function primarily for use in the merge_matmul graph transformation below - that returns the first dimension of a Tensor. This is necessary because torch.Tensor.shape - is an attribute (and cannot be the target of a call_function node) and also helps save - a getitem op in the graph. - - Arguments: - t: The tensor to get the first dimension of. - - Returns: - The first dimension of t. 
- """ - return t.shape[0] - - -def legalize_graph(gm: torch.fx.GraphModule): - """ - Replace the graph of the given GraphModule with one that contains the same nodes as the - original, but in topologically sorted order. - - This is used by the merge_matmul transformation below, which disturbs the topologically sorted - order of its input GraphModule, so that this order is restored before further transformation. - - Arguments: - gm: The graph module to topologically sort. It is modified in-place. - - """ - # Build an adjacency list representation of node dependencies in the graph. This also - # serves as a list of nodes that still need to be inserted into the new, topologically - # sorted graph. - dependencies = {node: node.all_input_nodes.copy() for node in gm.graph.nodes} - - # Construct a new graph that will contain all nodes in topologically sorted order. - new_graph = torch.fx.Graph() - value_remap = {} - - # Copy over all nodes with no dependencies. - for node, deps in dependencies.items(): - if not deps: - value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) - - # Remove the copied over nodes from the adjacency list. - for copied_node in value_remap.keys(): - del dependencies[copied_node] - - # While there are still nodes to insert into the new graph: - while dependencies: - copied_this_round = [] - - # Copy over all nodes whose dependencies already exist in the new graph. - for node, deps in dependencies.items(): - all_deps_copied = True - for dep in deps: - if dep not in value_remap: - all_deps_copied = False - - if all_deps_copied: - value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) - copied_this_round.append(node) - - # Delete all nodes copied over in this iteration from dependencies. - for copied_node in copied_this_round: - del dependencies[copied_node] - - # Replace the old graph with the new, topologically sorted one. - gm.graph = new_graph - - -def may_depend_on(a: torch.fx.Node, b: torch.fx.Node, search_depth: int = 6): - """ - Determine if one node depends on another in a torch.fx.Graph. - - Arguments: - a: The node that may have a dependency on b. - b: The node that a may have a dependency on. - search_depth: In the case of an indirect dependency, this function - searches upto this many nodes away in search of a - data dependency. If none is found, the function - makes the conservative assumption that there is a - dependency. - - Returns: - True if a may depend on b, False if it definitely does not. - """ - # Equivalence is defined as dependence. - if a == b: - return True - - # If a has no inputs, it cannot depend on b. - if len(a.all_input_nodes) == 0: - return False - - # If the search depth has been exhausted and no conclusion has been - # reached, assume that there is a data dependency. - if search_depth == 0: - return True - - # Recursively check all inputs of a. - for inp in a.all_input_nodes: - if may_depend_on(inp, b, search_depth - 1): - return True - - return False - - -def are_nodes_independent(nodes: List[torch.fx.Node]): - """ - Check if all of the given nodes are pairwise-data independent. - - Arguments: - nodes: The nodes to check for data dependencies. - - Returns: - True if any pair in nodes has a data dependency. 
- """ - # For each pair in nodes: - for i, j in itertools.combinations(nodes, 2): - if may_depend_on(i, j) or may_depend_on(j, i): - return False - - return True - - -def merge_matmul(in_mod: torch.nn.Module): - """ - A graph transformation that merges matrix multiplication operations that share the same right-hand - side operand into one large matrix multiplication. - ____ _________ _________ - ---- | | | | M| A * C | - M| A | T| B | * K| C | = |---------| - ---- , | | | | T| B * C | - K ---- --------- --------- - K R R - """ - gm = torch.fx.symbolic_trace(in_mod) - - rhs_users = {} - lhs_users = {} - - # Populate rhs_users and lhs_users - maps from LHS/RHS matrix multiply operands to - # the matmul of which they are the LHS/RHS. - for node in gm.graph.nodes: - if node.op != "call_function" or node.target is not torch.matmul: - continue - - lhs, rhs = node.args - - # TODO: Properly handle aliasing caused by get_attr. For now, - # use the attribute name as the operand if the node is a - # get_attr. - lhs = lhs.target if lhs.op == "get_attr" else lhs - rhs = rhs.target if rhs.op == "get_attr" else rhs - - lhs_users.setdefault(lhs, []).append(node) - rhs_users.setdefault(rhs, []).append(node) - - for rhs, mms in rhs_users.items(): - # There must be at least matmuls for a merge to make sense. - if len(mms) < 2: - continue - - # All matmuls must not depend on each other directly or indirectly - # in order for the merge to be possible. - if not are_nodes_independent(mms): - continue - - lhs_vals = [mm.args[0] for mm in mms] - - # Merge the matmul. - # Collect a list of LHS operands and the single RHS operand. - lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals] - rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs - - # Concatenate all the LHS operands. - merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {}) - - # Multiply the concatenated LHS operands with the one RHS. This will produce - # the same results as all the individual matmuls involving rhs in the original graph, - # but they will all be concatenated together. - merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {}) - - # Split the result of the merged matmul using the shapes of the LHS operands - # to ascertain how large each chunk should be. - merge_mm_sizes = [ - gm.graph.call_function(get_first_dim, (l,), {}) for l in lhs - ] - merge_mm_split = gm.graph.call_function( - torch.split, (merge_mm, merge_mm_sizes), {} - ) - merge_mm_res = [ - gm.graph.call_function(operator.getitem, (merge_mm_split, out), {}) - for out in range(len(lhs)) - ] - - # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul. - for old, new in zip(mms, merge_mm_res): - old.replace_all_uses_with(new) - gm.graph.erase_node(old) - - # All of the new nodes created above were inserted at the end, so we need to sort - # the nodes topologically to make sure all definitions precede uses. 
- legalize_graph(gm) - - gm.recompile() - gm.graph.lint(in_mod) - return gm From 0ad6f066843537d6cf86e57910f4bbf8faa60f9e Mon Sep 17 00:00:00 2001 From: cyy Date: Wed, 6 Jan 2021 06:50:56 -0800 Subject: [PATCH 05/44] drop a unneeded comma from cmakelist.txt (#50091) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50091 Reviewed By: smessmer Differential Revision: D25782083 Pulled By: ezyang fbshipit-source-id: f90f57c6c9fc0c1e68ab30dd3b56dfe971798df2 --- aten/src/ATen/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index fd3c95f2573b..6fedef185b21 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -72,7 +72,7 @@ file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") file(GLOB_RECURSE native_metal_h "native/metal/*.h") file(GLOB metal_test_srcs "native/metal/mpscnn/tests/*.mm") -file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm", "native/metal/*.cpp") +file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm" "native/metal/*.cpp") EXCLUDE(native_metal_srcs "${native_metal_srcs}" ${metal_test_srcs}) file(GLOB metal_prepack_h "native/metal/MetalPrepackOpContext.h") file(GLOB metal_prepack_cpp "native/metal/MetalPrepackOpRegister.cpp") From 45ec35827ed73c27c114ba0444517baa5b3cdbee Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 6 Jan 2021 06:55:10 -0800 Subject: [PATCH 06/44] Set USE_RCCL cmake option (dependent on USE_NCCL) [REDUX] (#34683) Summary: Refiled duplicate of https://github.com/pytorch/pytorch/issues/31341 which was reverted in commit 63964175b52197a75e03b73c59bd2573df66b398. This PR enables RCCL support when building Gloo as part of PyTorch for ROCm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/34683 Reviewed By: glaringlee Differential Revision: D25540578 Pulled By: ezyang fbshipit-source-id: fcb02e5745d62e1b7d2e02048160e9e7a4b4df2d --- CMakeLists.txt | 2 ++ tools/amd_build/build_amd.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e346087c0cdb..3df73f8a3041 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option( USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON + USE_NCCL OFF) cmake_dependent_option( USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 026293a9281a..9d4fa54c93b3 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -131,6 +131,20 @@ def is_hip_clang(): sources.write(line) print("%s updated" % gloo_cmake_file) +gloo_cmake_file = "third_party/gloo/cmake/Modules/Findrccl.cmake" +if os.path.exists(gloo_cmake_file): + do_write = False + with open(gloo_cmake_file, "r") as sources: + lines = sources.readlines() + newlines = [line.replace('RCCL_LIBRARY', 'RCCL_LIBRARY_PATH') for line in lines] + if lines == newlines: + print("%s skipped" % gloo_cmake_file) + else: + with open(gloo_cmake_file, "w") as sources: + for line in newlines: + sources.write(line) + print("%s updated" % gloo_cmake_file) + hipify_python.hipify( project_directory=proj_dir, output_directory=out_dir, From 2ac180a5dddf04178068dba7cbced33df250eb60 Mon Sep 17 00:00:00 2001 From: Chester Liu Date: Wed, 6 Jan 2021 07:08:16 -0800 Subject: [PATCH 07/44] Fix cl.exe detection in cpu/fused_kernel.cpp (#50085) Summary: The 
command used here is essentially `where cl.exe`. By using `system()` we will not be able to find cl.exe unless we are using VS Developer Prompt, which makes `activate()` meaningless. Change `system()` to `run()` fixes this. Found during https://github.com/pytorch/pytorch/issues/49781. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50085 Reviewed By: smessmer Differential Revision: D25782054 Pulled By: ezyang fbshipit-source-id: e8e3cac903a73f3bd78def667ebe0e93201814c8 --- torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index 4e76dc23e55d..4f4aa0d1536b 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -45,11 +45,17 @@ constexpr int so_suffix_len = 3; constexpr int cpp_suffix_len = 4; #endif +intptr_t run(const std::string& cmd); + static bool programExists(const std::string& program) { TemplateEnv env; env.s("program", program); std::string cmd = format(check_exists_string, env); +#ifdef _MSC_VER + return (run(cmd.c_str()) == 0); +#else return (system(cmd.c_str()) == 0); +#endif } #ifdef _MSC_VER From c517e15d79b8ae672ee2a94581fc57fa62155adf Mon Sep 17 00:00:00 2001 From: Nathan Howell Date: Wed, 6 Jan 2021 07:36:12 -0800 Subject: [PATCH 08/44] Add support for converting sparse bool tensors to dense (#50019) Summary: Fixes https://github.com/pytorch/pytorch/issues/49977 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50019 Reviewed By: smessmer Differential Revision: D25782045 Pulled By: ezyang fbshipit-source-id: a8389cbecb7e79099292a423a6fd8ac28631905b --- aten/src/ATen/native/sparse/SparseTensorMath.cpp | 2 +- aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu | 4 ++-- test/test_sparse.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 9bb679beb3d0..6c3298b72e75 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -650,7 +650,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen dstBuffer.add_(srcBuffer, value); } } else { - AT_DISPATCH_ALL_TYPES( + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, commonDtype, "add_dense_sparse", [&] { add_dense_sparse_worker_cpu(resultBuffer, value, sparse, indices, valuesBuffer); }); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index c8366f71618e..fce3446816e7 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -338,8 +338,8 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT if (sparse.dense_dim() == 0) { TORCH_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); - AT_DISPATCH_ALL_TYPES_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { apply::sparseElementwiseKernelScalar, uint64_t, scalar_t> <<>>( TensorCAddOp(value.to()), diff --git a/test/test_sparse.py b/test/test_sparse.py index 4e982b8333d9..228c66aa403e 
100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -356,6 +356,11 @@ def test_to_sparse(self): sp, _, _ = self._gen_sparse(2, 10, [3, 3, 3]) self.assertRaises(RuntimeError, lambda: sp.to_sparse()) + def test_sparse_bool(self): + a = self.value_tensor([True, False]).to(torch.bool) + b = a.to_sparse().to_dense() + self.assertEqual(a, b) + def test_scalar(self): # tensor with value a = self.sparse_tensor(self.index_tensor([]).unsqueeze(1), 12.3, []) From 5f2ec6293d6a443b8acca1d3ff7d57f9121afcc7 Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Wed, 6 Jan 2021 08:15:08 -0800 Subject: [PATCH 09/44] Unused variables in neural net classes and functions (#50100) Summary: These unused variables were identified by [pyflakes](https://pypi.org/project/pyflakes/). They can be safely removed to simplify the code and possibly improve performance. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50100 Reviewed By: ezyang Differential Revision: D25797764 Pulled By: smessmer fbshipit-source-id: ced341aee692f429d2dcc3a4ef5c46c8ee99cabb --- torch/nn/modules/module.py | 1 - torch/nn/parallel/replicate.py | 1 - torch/nn/quantized/dynamic/modules/rnn.py | 2 -- torch/nn/quantized/modules/embedding_ops.py | 1 - torch/nn/quantized/modules/normalization.py | 5 ----- torch/nn/utils/prune.py | 1 - 6 files changed, 11 deletions(-) diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 297a4edf15bf..f054590da66a 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -843,7 +843,6 @@ def _slow_forward(self, *input, **kwargs): if recording_scopes: name = torch.jit._trace._trace_module_map[self] if self in torch.jit._trace._trace_module_map else None if name: - cur_scope_name = tracing_state.current_scope() tracing_state.push_scope(name) else: recording_scopes = False diff --git a/torch/nn/parallel/replicate.py b/torch/nn/parallel/replicate.py index a069c6c6f939..8effeece5908 100644 --- a/torch/nn/parallel/replicate.py +++ b/torch/nn/parallel/replicate.py @@ -108,7 +108,6 @@ def replicate(network, devices, detach=False): modules = list(network.modules()) module_copies = [[] for device in devices] module_indices = {} - scriptmodule_skip_attr = {"_parameters", "_buffers", "_modules", "forward", "_c"} for i, module in enumerate(modules): module_indices[module] = i diff --git a/torch/nn/quantized/dynamic/modules/rnn.py b/torch/nn/quantized/dynamic/modules/rnn.py index df88169471ca..59c0195d7858 100644 --- a/torch/nn/quantized/dynamic/modules/rnn.py +++ b/torch/nn/quantized/dynamic/modules/rnn.py @@ -239,8 +239,6 @@ def from_float(cls, mod): _all_weight_values = [] for layer in range(qRNNBase.num_layers): for direction in range(num_directions): - layer_input_size = qRNNBase.input_size if layer == 0 else qRNNBase.hidden_size * num_directions - suffix = '_reverse' if direction == 1 else '' def retrieve_weight_bias(ihhh): diff --git a/torch/nn/quantized/modules/embedding_ops.py b/torch/nn/quantized/modules/embedding_ops.py index d16748b3baf7..e41d55347741 100644 --- a/torch/nn/quantized/modules/embedding_ops.py +++ b/torch/nn/quantized/modules/embedding_ops.py @@ -52,7 +52,6 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) self.dtype = state_dict[prefix + 'dtype'] state_dict.pop(prefix + 'dtype') diff --git a/torch/nn/quantized/modules/normalization.py 
b/torch/nn/quantized/modules/normalization.py index 4664120ec8b5..c12f74374863 100644 --- a/torch/nn/quantized/modules/normalization.py +++ b/torch/nn/quantized/modules/normalization.py @@ -29,7 +29,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.normalized_shape, mod.weight, mod.bias, float(scale), @@ -63,7 +62,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_groups, mod.num_channels, mod.weight, mod.bias, float(scale), int(zero_point), @@ -98,7 +96,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -133,7 +130,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -168,7 +164,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), diff --git a/torch/nn/utils/prune.py b/torch/nn/utils/prune.py index 84fa30021ed1..851a551da0d8 100644 --- a/torch/nn/utils/prune.py +++ b/torch/nn/utils/prune.py @@ -587,7 +587,6 @@ def compute_mask(self, t, default_mask): # Compute number of units to prune: amount if int, # else amount * tensor_size nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) - nparams_tokeep = tensor_size - nparams_toprune # This should raise an error if the number of units to prune is larger # than the number of units in the tensor _validate_pruning_amount(nparams_toprune, tensor_size) From 688992c775e2eeef53f3184b2e3428ef2f3a2967 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 08:33:26 -0800 Subject: [PATCH 10/44] [PyTorch] Additional IValue tests (#49718) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49718 Improving test coverage in preparation for updating the implementation of IValue. ghstack-source-id: 119327373 Test Plan: ivalue_test Reviewed By: hlu1 Differential Revision: D25674605 fbshipit-source-id: 37a82bb135f75ec52d2d8bd929c4329e8dcc4d25 --- aten/src/ATen/test/ivalue_test.cpp | 217 +++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index 14e75205aa66..a0e2648758ff 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -51,6 +51,91 @@ TEST(IValueTest, Basic) { ASSERT_EQ(tv.use_count(), 2); } +static std::array makeSampleIValues() { + return { at::rand({3, 4}), "hello", 42, true, 1.5 }; +} + +static std::array makeMoreSampleIValues() { + return { at::rand({3, 4}), "goodbye", 23, false, 0.5 }; +} + +// IValue::operator== doesn't seem to work on Tensors. 
+#define EXPECT_IVALUE_EQ(a, b) \ + EXPECT_EQ((a).isTensor(), (b).isTensor()); \ + if ((a).isTensor()) { \ + EXPECT_TRUE(a.toTensor().equal(b.toTensor())); \ + } else { \ + EXPECT_EQ(a, b); \ + } + +TEST(IValueTest, Swap) { + // swap() has the following 3 cases: tensor, intrusive_ptr, or + // neither. Exercise all pairs of the three. + + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + for (const auto& input: sampleInputs) { + for (const auto& target: sampleTargets) { + IValue a(input); + IValue b(target); + EXPECT_IVALUE_EQ(a, input); + EXPECT_IVALUE_EQ(b, target); + a.swap(b); + EXPECT_IVALUE_EQ(a, target); + EXPECT_IVALUE_EQ(b, input); + } + } +} + +TEST(IValueTest, CopyConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue copy(v); + EXPECT_IVALUE_EQ(copy, v); + } +} + +TEST(IValueTest, MoveConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue source(v); + IValue target(std::move(source)); + EXPECT_IVALUE_EQ(target, v); + EXPECT_TRUE(source.isNone()); + } +} + +TEST(IValueTest, CopyAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue copyTo(target); + IValue copyFrom(input); + copyTo = copyFrom; + EXPECT_IVALUE_EQ(copyTo, input); + EXPECT_IVALUE_EQ(copyFrom, input); + EXPECT_IVALUE_EQ(copyTo, copyFrom); + } + } +} + +TEST(IValueTest, MoveAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue moveTo(target); + IValue moveFrom(input); + moveTo = std::move(moveFrom); + EXPECT_IVALUE_EQ(moveTo, input); + EXPECT_TRUE(moveFrom.isNone()); + } + } +} + TEST(IValueTest, Tuple) { std::tuple t = std::make_tuple(123, at::randn({1})); auto iv = IValue(t); @@ -318,5 +403,137 @@ TEST(IValueTest, EnumEquality) { ); } +TEST(IValueTest, isPtrType) { + IValue tensor(at::rand({3, 4})); + IValue undefinedTensor((at::Tensor())); + IValue integer(42); + IValue str("hello"); + + EXPECT_TRUE(tensor.isPtrType()); + EXPECT_FALSE(undefinedTensor.isPtrType()); + EXPECT_FALSE(integer.isPtrType()); + EXPECT_TRUE(str.isPtrType()); +} + +TEST(IValueTest, isAliasOf) { + auto sampleIValues = makeSampleIValues(); + for (auto& iv: sampleIValues) { + for (auto& iv2: sampleIValues) { + if (&iv == &iv2 && iv.isPtrType()) { + EXPECT_TRUE(iv.isAliasOf(iv2)); + } else { + EXPECT_FALSE(iv.isAliasOf(iv2)); + } + } + } +} + +TEST(IValueTest, internalToPointer) { + IValue tensor(at::rand({3, 4})); + IValue str("hello"); + + EXPECT_EQ(tensor.internalToPointer(), tensor.unsafeToTensorImpl()); + EXPECT_NE(str.internalToPointer(), nullptr); + + IValue nullStr((c10::intrusive_ptr())); + ASSERT_TRUE(nullStr.isString()); + EXPECT_EQ(nullStr.internalToPointer(), nullptr); +} + +TEST(IValueTest, IdentityComparisonAndHashing) { + at::Tensor t1 = at::rand({3, 4}); + at::Tensor t2 = at::rand({3, 4}); + IValue tv1(t1), tv2(t2); + IValue tv1b(t1); + + EXPECT_EQ(tv1.hash(), tv1b.hash()); + EXPECT_NE(tv1.hash(), tv2.hash()); + + EXPECT_TRUE(tv1.is(tv1)); + EXPECT_TRUE(tv1.is(tv1b)); + EXPECT_TRUE(tv1b.is(tv1)); + EXPECT_TRUE(tv2.is(tv2)); + + EXPECT_FALSE(tv1.is(tv2)); + EXPECT_FALSE(tv2.is(tv1)); + + IValue none; + IValue undefinedTensor((at::Tensor())); + + EXPECT_TRUE(none.is(undefinedTensor)); + 
EXPECT_TRUE(undefinedTensor.is(none)); + + // Is this a bug? We should probably have a is b => a.hash() == b.hash() + EXPECT_NE(none.hash(), undefinedTensor.hash()); + + auto sampleIValues = makeSampleIValues(); + auto sampleIValues2 = makeSampleIValues(); + auto moreSampleIValues = makeMoreSampleIValues(); + + ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size()); + for (int ii = 0; ii < sampleIValues.size(); ++ii) { + // Constant strings will have the same pointer value. + if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) { + EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } else { + EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } + EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash()); + } +} + +TEST(IValueTest, getSubValues) { + // Scalars have no subvalues. + IValue integer(42), float_(1.5); + + IValue::HashAliasedIValues subvalues; + + integer.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + float_.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + at::Tensor t1(at::rand({3, 4})), t2(at::rand({3, 4})); + IValue tv1(t1), tv2(t2); + IValue list(std::vector{t1, t2}); + IValue tuple(ivalue::Tuple::create({tv1, tv2})); + + std::unordered_map m; + m[1] = t1; + m[2] = t2; + + IValue dict(std::move(m)); + + auto objType = ClassType::create(nullopt, {}); + objType->addAttribute("t1", tv1.type()); + objType->addAttribute("t2", tv2.type()); + + auto o = ivalue::Object::create(StrongTypePtr(nullptr, objType), 2); + o->setSlot(0, tv1); + o->setSlot(1, tv2); + + IValue object(o); + tv1.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + + subvalues.clear(); + + for (auto& container: {list, tuple, dict, object}) { + container.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 3); + EXPECT_EQ(subvalues.count(container), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + EXPECT_EQ(subvalues.count(tv2), 1); + + subvalues.clear(); + } +} + // TODO(gmagogsfm): Add type conversion test? } // namespace c10 From 1b31e1353903eb52140aedef04c6edff5bb7b7e6 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 08:33:26 -0800 Subject: [PATCH 11/44] [PyTorch] Store Tensor explicitly in IValue (#48824) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48824 Enables following diff, which will make toTensor() return `const Tensor&` and allow callers to avoid refcounting overhead. ghstack-source-id: 119327370 Test Plan: ivalue_test Internal benchmark to ensure perf parity. Some interesting steps during the debugging process: - First version was about a 5% regression - Directly implementing move construction instead of using swap lowered the regression to 2-3% - Directly implementing move assign was maybe an 0.5% improvement - Adding C10_ALWAYS_INLINE on move assign got our regression to negligible - Fixing toTensor() to actually be correct regressed us again, but omitting the explicit dtor call as exhaustively spelled out in a comment fixed it. 
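The payload layout change is easier to see in a minimal stand-in sketch. Below, std::string plays the role of the non-trivially-copyable at::Tensor member; the class name and helpers are simplified assumptions, not the real IValue, but the technique is the same one this diff uses: a tagged union whose non-trivial member needs placement new, a tag-driven destroy(), and a moveFrom() that leaves the source in the None state.

#include <cstdint>
#include <memory>
#include <new>
#include <string>
#include <utility>

// Stand-in for IValue: stores either a trivially copyable payload or a
// non-trivially-copyable object (std::string here, at::Tensor in the diff)
// directly inside a nested union, selected by a tag.
class SmallValue {
 public:
  SmallValue() noexcept : tag_(Tag::None) {}
  explicit SmallValue(int64_t i) : tag_(Tag::Int) { payload_.u.as_int = i; }
  explicit SmallValue(std::string s) : tag_(Tag::Str) {
    new (&payload_.as_str) std::string(std::move(s));  // placement new for the non-trivial member
  }
  SmallValue(SmallValue&& rhs) noexcept : tag_(rhs.tag_) { moveFrom(std::move(rhs)); }
  SmallValue& operator=(SmallValue&& rhs) noexcept {
    if (this != &rhs) {
      destroy();
      tag_ = rhs.tag_;
      moveFrom(std::move(rhs));
    }
    return *this;
  }
  ~SmallValue() { destroy(); }

  bool isNone() const { return tag_ == Tag::None; }
  bool isInt() const { return tag_ == Tag::Int; }
  bool isStr() const { return tag_ == Tag::Str; }
  int64_t toInt() const { return payload_.u.as_int; }
  const std::string& toStr() const& { return payload_.as_str; }

 private:
  enum class Tag { None, Int, Str };

  void destroy() {
    // Only the non-trivial member needs real cleanup (cf. IValue::destroy()).
    if (tag_ == Tag::Str) {
      std::destroy_at(&payload_.as_str);
    }
  }
  void moveFrom(SmallValue&& rhs) noexcept {
    if (rhs.tag_ == Tag::Str) {
      new (&payload_.as_str) std::string(std::move(rhs.payload_.as_str));
      std::destroy_at(&rhs.payload_.as_str);
    } else {
      payload_.u = rhs.payload_.u;  // trivially copyable path: one plain assignment
    }
    // Leave the source in the None state, like moveFrom/clearToNone in the diff.
    rhs.tag_ = Tag::None;
    rhs.payload_.u.as_int = 0;
  }

  union Payload {
    union Trivial { int64_t as_int; double as_double; bool as_bool; } u;
    std::string as_str;   // non-trivial member lives inline, like at::Tensor
    Payload() : u{0} {}
    ~Payload() {}         // members are destroyed manually, driven by the tag
  } payload_;
  Tag tag_;
};

The nested Trivial sub-union mirrors the comment added in the diff: the common, trivially copyable cases can be copied or moved with a single assignment and no switch on the tag, while only the Tensor-like member pays for placement new and an explicit destructor call.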
Reviewed By: bwasti Differential Revision: D25324617 fbshipit-source-id: 7518c1c67f6f2661f151b43310aaddf4fb6e511a --- aten/src/ATen/core/ivalue.cpp | 12 +- aten/src/ATen/core/ivalue.h | 279 +++++++++++++++++++++++--------- aten/src/ATen/core/ivalue_inl.h | 95 +++++++---- aten/src/ATen/core/jit_type.h | 8 +- c10/util/intrusive_ptr.h | 4 +- 5 files changed, 275 insertions(+), 123 deletions(-) diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 320fa6294638..1223577c59c6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -265,7 +265,7 @@ bool IValue::ptrEqual(const IValue& lhs, const IValue& rhs) { TORCH_INTERNAL_ASSERT(lhs.is_intrusive_ptr); TORCH_INTERNAL_ASSERT(rhs.is_intrusive_ptr); return lhs.tag == rhs.tag && - lhs.payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + lhs.payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } IValue IValue::equals(const IValue& rhs) const { @@ -325,17 +325,17 @@ size_t IValue::hash(const IValue& v) { case Tag::None: return 0; case Tag::Bool: - return c10::get_hash(v.payload.as_bool); + return c10::get_hash(v.payload.u.as_bool); case Tag::Double: - return c10::get_hash(v.payload.as_double); + return c10::get_hash(v.payload.u.as_double); case Tag::Tensor: // Tensor __hash__ is equivalent to `id()`, so take the pointer value of // the tensor to emulate it - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.as_tensor.unsafeGetTensorImpl()); case Tag::Storage: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::Int: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::String: return c10::get_hash(v.toStringRef()); case Tag::Tuple: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 4a7e15c4008b..5370294b2f2c 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -131,10 +131,15 @@ struct Capsule { // they are marked `@private`, which hides them on the doxygen documentation for // this page. -/// IValue (Interpreter Value) is a tagged union over the types supported by the -/// TorchScript interpreter. IValues contain their values as an -/// `IValue::Payload`, which holds primitive types (`int64_t`, `bool`, `double`, -/// `Device`), as values and all other types as a `c10::intrusive_ptr`. +/// IValue (Interpreter Value) is a tagged union over the types +/// supported by the TorchScript interpreter. IValues contain their +/// values as an `IValue::Payload`, which holds primitive types +/// (`int64_t`, `bool`, `double`, `Device`) and `Tensor` as values, +/// and all other types as a `c10::intrusive_ptr`. In order to +/// optimize performance of the destructor and related operations by +/// making the `Tensor` and `c10::intrusive_ptr` paths generate the +/// same code, we represent a null `c10::intrusive_ptr` as +/// `UndefinedTensorImpl::singleton()`, *not* `nullptr`. /// /// IValues are used as inputs to and outputs from the TorchScript interpreter. 
/// To retrieve the value contained within an IValue, use the `.toX()` methods, @@ -160,27 +165,35 @@ struct Capsule { struct TORCH_API IValue final { IValue(const IValue& rhs) : IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + if (is_intrusive_ptr && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr); } } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); + + IValue(IValue&& rhs) noexcept : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + moveFrom(std::move(rhs)); } + /// @private [doxygen private] ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } + destroy(); } - IValue& operator=(IValue&& rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + + C10_ALWAYS_INLINE IValue& operator=(IValue&& rhs) & noexcept { + if (&rhs == this) { + return *this; + } + + destroy(); + moveFrom(std::move(rhs)); return *this; } + IValue& operator=(IValue const& rhs) & { IValue(rhs).swap(*this); return *this; } + void dump() const; /** @@ -260,13 +273,6 @@ struct TORCH_API IValue final { return false; } - if (!this->is_intrusive_ptr) { - // Primitive types don't alias anything - return false; - } - - AT_ASSERT(rhs.is_intrusive_ptr); - // Tensors should be compared based on internal storage if (this->isTensor()) { const auto thisTensor = this->toTensor(); @@ -274,22 +280,56 @@ struct TORCH_API IValue final { return thisTensor.is_alias_of(rhsTensor); } + if (!this->is_intrusive_ptr) { + // Primitive types don't alias anything + return false; + } + + AT_ASSERT(rhs.is_intrusive_ptr); + // Other types can be compared by their ptr value - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } /// @private [doxygen private] size_t use_count() const noexcept { + if (isTensor()) { + return payload.as_tensor.use_count(); + } + if (!is_intrusive_ptr) { return 1; } - return c10::raw::intrusive_ptr::use_count(payload.as_intrusive_ptr); + if (payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()) { + return 0; + } + return c10::raw::intrusive_ptr::use_count(payload.u.as_intrusive_ptr); } /// @private [doxygen private] void swap(IValue& rhs) noexcept { - std::swap(payload, rhs.payload); + if (isTensor() && rhs.isTensor()) { + std::swap(payload.as_tensor, rhs.payload.as_tensor); + } else if (isTensor()) { + at::Tensor t = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + payload.u = rhs.payload.u; + new (&rhs.payload.as_tensor) at::Tensor(std::move(t)); + } else if (rhs.isTensor()) { + rhs.swap(*this); + return; + } else { + std::swap(payload.u, rhs.payload.u); + } std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); std::swap(tag, rhs.tag); } @@ -298,13 +338,8 @@ struct TORCH_API IValue final { // While some of these accessors could be generated through templates, // we prefer to write them manually for clarity - IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. - payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(false) { + new (&payload.as_tensor) at::Tensor(std::move(t)); } bool isTensor() const { return Tag::Tensor == tag; @@ -312,7 +347,7 @@ struct TORCH_API IValue final { at::Tensor toTensor() &&; at::Tensor toTensor() const&; at::TensorImpl* unsafeToTensorImpl() const { - return static_cast(payload.as_intrusive_ptr); + return payload.as_tensor.unsafeGetTensorImpl(); } IValue(at::Storage s) : tag(Tag::Storage), is_intrusive_ptr(static_cast(s)) { @@ -321,7 +356,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. - payload.as_intrusive_ptr = s.unsafeReleaseStorageImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(s.unsafeReleaseStorageImpl()); } bool isStorage() const { return Tag::Storage == tag; @@ -341,7 +376,7 @@ struct TORCH_API IValue final { : tag(Tag::Blob), is_intrusive_ptr(true) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and store it as a Tensor instead. 
- payload.as_intrusive_ptr = blob.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); } /// @private [doxygen private] @@ -397,14 +432,14 @@ struct TORCH_API IValue final { // Double IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; + payload.u.as_double = d; } bool isDouble() const { return Tag::Double == tag; } double toDouble() const { AT_ASSERT(isDouble()); - return payload.as_double; + return payload.u.as_double; } // Future @@ -433,7 +468,7 @@ struct TORCH_API IValue final { // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; + payload.u.as_int = i; } // allow you to pass literals (3, 4) without ambiguity @@ -445,7 +480,7 @@ struct TORCH_API IValue final { int64_t toInt() const { AT_ASSERT(isInt()); - return payload.as_int; + return payload.u.as_int; } // Bool @@ -454,9 +489,9 @@ struct TORCH_API IValue final { // Initializing entire payload stops valgrind's from reporting // "jump or move depends on uninitialised value" in IValue copy constructor // See https://github.com/pytorch/pytorch/issues/37117 - payload.as_int = b; + payload.u.as_int = b; #else - payload.as_bool = b; + payload.u.as_bool = b; #endif } bool isBool() const { @@ -464,7 +499,7 @@ struct TORCH_API IValue final { } bool toBool() const { AT_ASSERT(isBool()); - return payload.as_bool; + return payload.u.as_bool; } // IntList @@ -580,7 +615,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : payload{0}, tag(Tag::None), is_intrusive_ptr(false) {} + IValue() : tag(Tag::None), is_intrusive_ptr(false) {} bool isNone() const { return Tag::None == tag; } @@ -616,21 +651,21 @@ struct TORCH_API IValue final { // Device IValue(c10::Device d) : tag(Tag::Device), is_intrusive_ptr(false) { - payload.as_device.type = d.type(); - payload.as_device.index = d.index(); + payload.u.as_device.type = d.type(); + payload.u.as_device.index = d.index(); } bool isDevice() const { return Tag::Device == tag; } c10::Device toDevice() const { AT_ASSERT(isDevice()); - return c10::Device(payload.as_device.type, payload.as_device.index); + return c10::Device(payload.u.as_device.type, payload.u.as_device.index); } //Stream IValue(c10::Stream stream) : tag(Tag::Stream), is_intrusive_ptr(false) { - payload.as_int = stream.pack(); + payload.u.as_int = stream.pack(); } c10::Stream toStream() &&; c10::Stream toStream() const &; @@ -659,7 +694,7 @@ struct TORCH_API IValue final { // QScheme IValue(at::QScheme qscheme) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = static_cast(qscheme); + payload.u.as_int = static_cast(qscheme); } at::QScheme toQScheme() const { @@ -680,7 +715,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined generator. 
- payload.as_intrusive_ptr = g.unsafeReleaseGeneratorImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(g.unsafeReleaseGeneratorImpl()); } bool isGenerator() const { return Tag::Generator == tag; @@ -749,14 +784,19 @@ struct TORCH_API IValue final { const IValue& v); bool isPtrType() const { - return is_intrusive_ptr; + return (isTensor() && payload.as_tensor.defined()) || is_intrusive_ptr; } /// @private [doxygen private] const void* internalToPointer() const { TORCH_INTERNAL_ASSERT( isPtrType(), "Can only call internalToPointer() for pointer types"); - return payload.as_intrusive_ptr; + if (isTensor()) { + return payload.as_tensor.unsafeGetTensorImpl(); + } else { + return payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton() + ? payload.u.as_intrusive_ptr : nullptr; + } } TypePtr type() const; @@ -770,7 +810,7 @@ struct TORCH_API IValue final { } // If it is not a Tensor, then two mutable IValues alias each other only // if they are the same pointer. - return val.payload.as_int; + return val.payload.u.as_int; } }; @@ -800,6 +840,10 @@ struct TORCH_API IValue final { IValue deepcopy(HashAliasedIValueMap& memo) const; private: + static c10::intrusive_ptr_target* null_to_undefined_tensor(c10::intrusive_ptr_target* p) { + return p ? p : static_cast(c10::UndefinedTensorImpl::singleton()); + } + static bool ptrEqual(const IValue& lhs, const IValue& rhs); // NOTE: IValue tags are intentionally private. In the future we may encode // this value different (e.g. using NaN boxing), and this would make it more @@ -822,24 +866,77 @@ struct TORCH_API IValue final { class NullType = c10::detail::intrusive_target_default_null_type> c10::intrusive_ptr toIntrusivePtr() const; - void clearToNone() { - payload.as_int = 0; + void destroy() { + // We carefully construct this call to both 1) avoid UB by using + // the "wrong" one of as_tensor and as_intrusive_ptr and 2) enable + // the compiler to generate the same code for each case. It is + // surprisingly difficult to get this right. + if (isTensor() || is_intrusive_ptr) { + c10::intrusive_ptr_target* p = isTensor() ? payload.as_tensor.unsafeGetTensorImpl() : payload.u.as_intrusive_ptr; + c10::intrusive_ptr::reclaim(p); + // No need to make this destructor call! + // payload.as_tensor.~Tensor(); + } + } + + C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept { + if (rhs.isTensor()) { + new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor)); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. + // + // rhs.payload.as_tensor.~Tensor(); + } else { + payload.u = rhs.payload.u; + } + tag = rhs.tag; + is_intrusive_ptr = rhs.is_intrusive_ptr; + rhs.clearToNone(); + } + + void clearToNone() noexcept { + payload.u.as_int = 0; tag = Tag::None; is_intrusive_ptr = false; } union Payload { - int64_t as_int; - double as_double; - bool as_bool; - c10::intrusive_ptr_target* as_intrusive_ptr; - struct { - DeviceType type; - DeviceIndex index; - } as_device; + // We use a nested union here so that we can make the copy easy + // and efficient in the non-tensor (i.e., trivially copyable) + // case. 
Specifically, we do not have to do a switch-on-tag to + // figure out which union member to assign; we can just use + // TriviallyCopyablePayload::operator=. + union TriviallyCopyablePayload { + TriviallyCopyablePayload() : as_int(0) {} + int64_t as_int; + double as_double; + bool as_bool; + // Invariant: never nullptr; null state is represented as + // c10::UndefinedTensorImpl::singleton() for consistency of + // representation with Tensor. + c10::intrusive_ptr_target* as_intrusive_ptr; + struct { + DeviceType type; + DeviceIndex index; + } as_device; + } u; + at::Tensor as_tensor; + Payload() : u() {} + ~Payload() {} }; - IValue(Payload p, Tag t, bool i) : payload(p), tag(t), is_intrusive_ptr(i) {} + IValue(const Payload& p, Tag t, bool i) : tag(t), is_intrusive_ptr(i) { + if (isTensor()) { + new (&payload.as_tensor) at::Tensor(p.as_tensor); + } else { + payload.u = p.u; + } + } Payload payload; Tag tag; @@ -848,29 +945,36 @@ struct TORCH_API IValue final { }; struct TORCH_API WeakIValue final { - WeakIValue() : payload{0}, tag(IValue::Tag::None), is_intrusive_ptr(false) {} + WeakIValue() : tag(IValue::Tag::None), is_intrusive_ptr(false) {} WeakIValue(const WeakIValue& rhs) : payload(rhs.payload), tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); } } WeakIValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), + : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (rhs.isTensor()) { + payload.as_intrusive_ptr = rhs.unsafeToTensorImpl(); + is_intrusive_ptr = true; + } else { + payload = rhs.payload.u; + } if (is_intrusive_ptr) { - c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + if (payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + } } } WeakIValue(WeakIValue&& rhs) noexcept : WeakIValue() { swap(rhs); } ~WeakIValue() { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::decref(payload.as_intrusive_ptr); } } @@ -895,17 +999,33 @@ struct TORCH_API WeakIValue final { IValue lock() const { if (!is_intrusive_ptr) { - return IValue(payload, tag, false); + IValue::Payload newPayload; + newPayload.u = payload; + return IValue(newPayload, tag, false); } - auto temp = c10::weak_intrusive_ptr::reclaim( - payload.as_intrusive_ptr); - IValue::Payload pl; - pl.as_intrusive_ptr = temp.lock().release(); - temp.release(); - if (!pl.as_intrusive_ptr) { - return IValue(); + if (IValue::Tag::Tensor == tag) { + auto temp = c10::weak_intrusive_ptr::reclaim( + static_cast(payload.as_intrusive_ptr)); + c10::intrusive_ptr ip(temp.lock()); + temp.release(); + if (!ip) { + return IValue(); + } else { + return IValue(at::Tensor(std::move(ip))); + } } else { - return IValue(pl, tag, true); + auto temp = c10::weak_intrusive_ptr::reclaim( + payload.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? 
nullptr + : payload.as_intrusive_ptr); + IValue::Payload pl; + pl.u.as_intrusive_ptr = temp.lock().release(); + temp.release(); + if (!pl.u.as_intrusive_ptr) { + return IValue(); + } else { + return IValue(pl, tag, true); + } } } @@ -913,7 +1033,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.use_count(); temp.release(); @@ -924,7 +1044,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.weak_use_count(); temp.release(); @@ -935,7 +1055,8 @@ struct TORCH_API WeakIValue final { } private: - IValue::Payload payload; + using Payload = IValue::Payload::TriviallyCopyablePayload; + Payload payload; IValue::Tag tag; bool is_intrusive_ptr; }; diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 89c8e669c138..fe55d783e780 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -48,14 +48,18 @@ struct tagged_capsule { template c10::intrusive_ptr IValue::moveToIntrusivePtr() { auto t = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); clearToNone(); return t; } template c10::intrusive_ptr IValue::toIntrusivePtr() const { auto r = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); auto p = r; r.release(); return p; @@ -131,12 +135,22 @@ inline c10::intrusive_ptr IValue::toEnumHolder() const& { } inline at::Tensor IValue::toTensor() && { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor( - moveToIntrusivePtr()); + auto result = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + clearToNone(); + return result; } inline at::Tensor IValue::toTensor() const& { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor(toIntrusivePtr()); + return payload.as_tensor; } inline c10::Storage IValue::toStorage() && { AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind()); @@ -148,10 +162,10 @@ inline c10::Storage IValue::toStorage() const& { return c10::Storage(toIntrusivePtr()); } inline c10::Stream IValue::toStream() && { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::Stream IValue::toStream() const& { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::intrusive_ptr IValue::toBlob() && { AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind()); @@ -713,7 +727,8 @@ using _guarded_unsigned_long = std::conditional_t< inline const ivalue::Object& IValue::toObjectRef() const { AT_ASSERT(isObject(), "Expected Object but got ", tagKind()); - return *static_cast(payload.as_intrusive_ptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "Attempted to create null reference"); + return *static_cast(payload.u.as_intrusive_ptr); } // note: when adding a DEFINE_TO case here you should also add a @@ -980,8 +995,11 @@ inline c10::List IValue::toIntList() const& { } inline std::vector IValue::toIntVector() const { AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toIntVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toDoubleList() && { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); @@ -993,8 +1011,11 @@ inline c10::List IValue::toDoubleList() const& { } inline std::vector IValue::toDoubleVector() const { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toDoubleVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toBoolList() && { AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind()); @@ -1014,8 +1035,11 @@ inline c10::List IValue::toTensorList() const& { } inline std::vector IValue::toTensorVector() const { AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toTensorVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toList() && { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); @@ -1027,7 +1051,10 @@ inline c10::List IValue::toList() const& { } inline c10::ArrayRef IValue::toListRef() const { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); - return static_cast(payload.as_intrusive_ptr) + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toListRef on null intrusive_ptr IValue"); + return static_cast(payload.u.as_intrusive_ptr) ->list; } 
inline c10::Dict IValue::toGenericDict() && { @@ -1049,7 +1076,7 @@ inline c10::intrusive_ptr IValue::toTuple() const& { inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template < typename... Args, @@ -1065,14 +1092,14 @@ inline IValue::IValue(const std::tuple& t) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(std::string v) : IValue(ivalue::ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::impl::GenericList v) : tag(Tag::GenericList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template > @@ -1104,7 +1131,7 @@ inline IValue::IValue(std::array v) : IValue(c10::List()) { inline IValue::IValue(c10::impl::GenericDict v) : tag(Tag::GenericDict), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template inline IValue::IValue(c10::Dict v) @@ -1131,17 +1158,17 @@ inline IValue::IValue(c10::nullopt_t) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::PyObject), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Enum), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue IValue::make_capsule( @@ -1149,7 +1176,7 @@ inline IValue IValue::make_capsule( IValue iv; iv.tag = Tag::Capsule; iv.is_intrusive_ptr = true; - iv.payload.as_intrusive_ptr = blob.release(); + iv.payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); return iv; } @@ -1170,30 +1197,33 @@ IValue::IValue(c10::intrusive_ptr custom_class) { auto ivalue_obj = c10::ivalue::Object::create( c10::StrongTypePtr(nullptr, classType), /*num_slots=*/1); ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class))); - payload.as_intrusive_ptr = ivalue_obj.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(ivalue_obj.release()); tag = Tag::Object; is_intrusive_ptr = true; } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Future), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::RRef), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Quantizer), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline const std::string& IValue::toStringRef() const { AT_ASSERT(isString(), "Expected String but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called 
toStringRef on null intrusive_ptr IValue"); return static_cast( - payload.as_intrusive_ptr) + payload.u.as_intrusive_ptr) ->string(); } inline c10::optional> IValue:: @@ -1202,8 +1232,11 @@ inline c10::optional> IValue:: return c10::nullopt; } AT_ASSERT(isString(), "Expected optional but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toOptionalStringRef on null intrusive_ptr IValue"); return std::reference_wrapper( - static_cast(payload.as_intrusive_ptr) + static_cast(payload.u.as_intrusive_ptr) ->string()); } @@ -1241,15 +1274,13 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for bool type, do equality check return this->toBool() == rhs.toBool(); } else if (this->isTensor() && rhs.isTensor()) { - // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr - // is false for undefined tensor - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.as_tensor.is_same(rhs.payload.as_tensor); } else if (this->isTensor() && rhs.isNone()) { // special case: undefined tensor and None are the same identity - return !this->is_intrusive_ptr; + return !this->payload.as_tensor.defined(); } else if (this->isNone() && rhs.isTensor()) { // special case: undefined tensor and None are the same identity - return !rhs.is_intrusive_ptr; + return !rhs.payload.as_tensor.defined(); } else if (this->isInt() && rhs.isInt()) { return this->toInt() == rhs.toInt(); } else if (this->isDouble() && rhs.isDouble()) { @@ -1260,7 +1291,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for objects holding in IValue, do shallow compare on pointer address to // testify the identity return this->is_intrusive_ptr && rhs.is_intrusive_ptr && - this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } } diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index a3ae813616e0..7d3890f582b8 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -2370,19 +2370,19 @@ struct TORCH_API AnyClassType : public Type { inline bool IValue::isDoubleList() const { // note: avoids calling type() to avoid extra referencing counting for the returned type. 
- return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; } inline bool IValue::isTensorList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; } inline bool IValue::isIntList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == IntType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == IntType::Kind; } inline bool IValue::isBoolList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; } template<> diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 637db95991f2..790d97ee3994 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -206,7 +206,7 @@ class intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; @@ -509,7 +509,7 @@ class weak_intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; From 480a756194f27580753a63d908393dfda3baeb25 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 08:33:26 -0800 Subject: [PATCH 12/44] [PyTorch] IValue::toTensor can now return const Tensor& (#48868) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48868 Building on the previous diff, we can make `toTensor()` return a `const Tensor&`, which should make it easier to avoid reference counting. ghstack-source-id: 119327372 Test Plan: internal benchmarks. 
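The caller-side payoff is just reference binding. Here is a minimal stand-in sketch of the accessor pattern (std::shared_ptr<int> plays the role of the refcounted at::Tensor; Holder and its members are illustrative assumptions, not PyTorch API): ref-qualified overloads let lvalue access hand out a const reference with no refcount bump, while rvalue access moves the value out.

#include <memory>
#include <utility>

// Stand-in for the accessor split: const& on lvalues, move-out on rvalues.
class Holder {
 public:
  explicit Holder(std::shared_ptr<int> v) : value_(std::move(v)) {}

  // Lvalue access: no copy, so no reference-count traffic.
  const std::shared_ptr<int>& value() const& { return value_; }
  // Rvalue access: steal the payload instead of bumping and dropping the count.
  std::shared_ptr<int> value() && { return std::move(value_); }

 private:
  std::shared_ptr<int> value_;
};

int main() {
  Holder h(std::make_shared<int>(7));
  const auto& v = h.value();                               // use_count stays at 1
  auto owned = Holder(std::make_shared<int>(9)).value();   // && overload: moved out
  return (*v + *owned == 16) ? 0 : 1;
}

This is the same reason the diff below rewrites call sites from `auto t = ... .toTensor()` to `auto& t = ... .toTensor()`: binding a reference instead of a value is what removes the atomic incref/decref pair at each use.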
Reviewed By: bwasti Differential Revision: D25325379 fbshipit-source-id: ca699632901691bcee432f595f75b0a4416d55dd --- aten/src/ATen/core/ivalue.h | 7 +- aten/src/ATen/core/ivalue_inl.h | 7 +- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 2 +- torch/csrc/jit/frontend/tracer.cpp | 6 +- torch/csrc/jit/passes/freeze_module.cpp | 8 +- torch/csrc/jit/runtime/argument_spec.h | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 4 +- torch/csrc/jit/runtime/profiling_record.cpp | 2 +- torch/csrc/jit/runtime/static/ops.cpp | 82 +++++++++---------- torch/csrc/jit/serialization/pickler.cpp | 2 +- torch/csrc/jit/serialization/python_print.cpp | 4 +- torch/csrc/jit/serialization/unpickler.cpp | 2 +- 12 files changed, 67 insertions(+), 61 deletions(-) diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 5370294b2f2c..ca68a8df46e1 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -275,8 +275,8 @@ struct TORCH_API IValue final { // Tensors should be compared based on internal storage if (this->isTensor()) { - const auto thisTensor = this->toTensor(); - const auto rhsTensor = rhs.toTensor(); + const auto& thisTensor = this->toTensor(); + const auto& rhsTensor = rhs.toTensor(); return thisTensor.is_alias_of(rhsTensor); } @@ -345,7 +345,8 @@ struct TORCH_API IValue final { return Tag::Tensor == tag; } at::Tensor toTensor() &&; - at::Tensor toTensor() const&; + at::Tensor& toTensor() &; + const at::Tensor& toTensor() const&; at::TensorImpl* unsafeToTensorImpl() const { return payload.as_tensor.unsafeGetTensorImpl(); } diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index fe55d783e780..b96f4b834989 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -148,7 +148,11 @@ inline at::Tensor IValue::toTensor() && { clearToNone(); return result; } -inline at::Tensor IValue::toTensor() const& { +inline at::Tensor& IValue::toTensor() & { + AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); + return payload.as_tensor; +} +inline const at::Tensor& IValue::toTensor() const& { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); return payload.as_tensor; } @@ -744,6 +748,7 @@ inline const ivalue::Object& IValue::toObjectRef() const { inline type IValue::to() const& { \ return this->method_name(); \ } + DEFINE_TO(at::Tensor, toTensor) DEFINE_TO(at::Storage, toStorage) DEFINE_TO(c10::Stream, toStream) diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index f1a0a634727a..5bddc510fe56 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -209,7 +209,7 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( std::stringstream encoded_inputs; for (const auto& input : inputs) { if (input.isTensor()) { - auto input_tensor = input.toTensor(); + auto& input_tensor = input.toTensor(); encoded_inputs << ";"; auto sep = ""; diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 1bab391bd393..0c88371399de 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -137,7 +137,7 @@ Value* TracingState::getValue(const IValue& var) { return graph->insertNode(dict_node)->output(); } if (var.isTensor()) { - auto ten = var.toTensor(); + auto& ten = var.toTensor(); if (!ten.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -237,7 +237,7 @@ bool TracingState::hasValue(const IValue& var) const { Value* 
TracingState::getOutput(const IValue& iv, size_t i) { bool tracing_mode_strict = getTracingState()->strict; if (iv.isTensor()) { - at::Tensor var = iv.toTensor(); + const at::Tensor& var = iv.toTensor(); if (!var.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -506,7 +506,7 @@ void setValueTrace(const IValue& v, Value* value) { } void TracingState::setValue(const IValue& v, Value* value) { if (v.isTensor()) { - auto var = v.toTensor(); + auto& var = v.toTensor(); AT_ASSERT(var.defined()); env_stack.back()[v] = value; } else if (v.isTensorList()) { diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 2778c7712f23..f66f54eeb567 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -289,11 +289,11 @@ class AttributePropagator { IValue overrideGradient(IValue attr) { if (attr.isTensor()) { - auto t = attr.toTensor(); + auto& t = attr.toTensor(); if (t.requires_grad()) { - t = t.detach(); - t.set_requires_grad(false); - attr = IValue(t); + auto detached = t.detach(); + detached.set_requires_grad(false); + attr = IValue(std::move(detached)); } } else if (attr.isTuple()) { auto tuple = std::move(attr).toTuple(); diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 401933c6d67e..a0e60e879146 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -237,7 +237,7 @@ struct CompleteArgumentSpec { for (int32_t i = 0; i < num_inputs; i++) { if (!inputs[i].isTensor()) continue; - auto tensor = inputs[i].toTensor(); + auto& tensor = inputs[i].toTensor(); all_dims += tensor.defined() ? tensor.ndimension() : 0; } // allocate enough room for all TensorPODs and dimensions diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 24ca9dbf9793..ce4718becaf7 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1418,7 +1418,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // Check every input's shape against profiled (expected) shape. 
for (i = 0; i < num_inputs; i++) { auto& input = peek(stack, i, num_inputs); - auto t = input.toTensor(); + auto& t = input.toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X + i]; auto expected_type = expected->cast(); if (t.defined() && !expected_type->matchTensor(t)) { @@ -1439,7 +1439,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // so it's safe to pass this guard check push(stack, true); } else { - auto t = stack.back().toTensor(); + auto& t = stack.back().toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X]; auto expected_type = expected->cast(); if (t.defined() && diff --git a/torch/csrc/jit/runtime/profiling_record.cpp b/torch/csrc/jit/runtime/profiling_record.cpp index 8d276dd58b50..d233f089f187 100644 --- a/torch/csrc/jit/runtime/profiling_record.cpp +++ b/torch/csrc/jit/runtime/profiling_record.cpp @@ -165,7 +165,7 @@ void ProfilingRecord::insertShapeProfile(Node* n, size_t offset) { if (v.isTensor()) { std::lock_guard lock(this->mutex_); auto& profiled_types = profiled_types_per_frame_[frame_id]; - auto t = v.toTensor(); + auto& t = v.toTensor(); if (t.defined()) { auto pttp = tensorTypeInCurrentExecutionContext(t); GRAPH_DEBUG( diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 5c118f513565..89519d3765b5 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -79,13 +79,13 @@ struct static_add final : public at::native::structured_add_out { REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); static_add op{out_t}; op.meta(in0_t, in1_t, in2_s); op.impl(in0_t, in1_t, in2_s, out_t); @@ -94,12 +94,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::mul_out(out_t, in0_t, in1_t); }; @@ -107,15 +107,15 @@ REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); - auto in2_t = p_node->Input(2, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); + auto& in2_t = p_node->Input(2, reg).toTensor(); auto in3_s = p_node->Input(3, reg).toScalar(); auto in4_s = p_node->Input(4, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, 
reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::addmm_cpu_out(out_t, in0_t, in1_t, in2_t, in3_s, in4_s); }; @@ -123,13 +123,13 @@ REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::clamp_out(out_t, in0_t, in1_s, in2_s); }; @@ -137,12 +137,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::bmm, aten_bmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::bmm_out_cpu(out_t, in0_t, in1_t); }; @@ -154,7 +154,7 @@ REGISTER_OPERATOR_FUNCTOR( [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { auto input_size = p_node->input_regs().size(); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = input_size > 1 ? p_node->Input(1, reg).toDouble() : 0; double in2_d = input_size > 2 ? 
p_node->Input(2, reg).toDouble() : std::numeric_limits::infinity(); @@ -164,7 +164,7 @@ REGISTER_OPERATOR_FUNCTOR( if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::nan_to_num_out(out_t, in0_t, in1_d, in2_d, in3_d); }; @@ -176,18 +176,18 @@ REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator { if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_tl[0]); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, in0_tl, in1_i); }; }); REGISTER_OPERATOR_FUNCTOR(aten::tanh, aten_tanh, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::tanh_out(out_t, in0_t); }; @@ -217,7 +217,7 @@ SROperator aten_stack(Node* n) { for (auto i = 0; i < inputs.size(); i++) { inputs[i] = inputs[i].unsqueeze(dim); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, inputs, dim); }; @@ -230,11 +230,11 @@ REGISTER_OPERATOR_FUNCTOR( aten_sigmoid, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::sigmoid_out(out_t, in0_t); }; @@ -247,57 +247,57 @@ REGISTER_OPERATOR_FUNCTOR( if (in1) { auto in1_s = in1->toScalar(); return [=](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } else { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } }); REGISTER_OPERATOR_FUNCTOR(aten::relu, aten_relu, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::threshold_out(out_t, in0_t, 0, 0); }; }); 
REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = p_node->input_regs().size() > 1 ? p_node->Input(1, reg).toDouble() : -1.0; if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::logit_out(out_t, in0_t, in1_d); }; }); REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::resize_as_(out_t, in0_t, c10::nullopt); at::native::copy_(out_t, in0_t, false); }; @@ -317,14 +317,14 @@ std::function&)> getNativeOperation(Node* n) { if (n->kind() == c10::Symbol::fromQualString("aten::transpose")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::transpose(in0_t, in1_i, in2_i); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::flatten")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::flatten(in0_t, in1_i, in2_i); @@ -386,19 +386,19 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::permute")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::permute(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::reshape")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::reshape(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::slice")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); auto in3_i = p_node->Input(3, reg).toInt(); @@ -408,13 +408,13 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::narrow")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto self = p_node->Input(0, reg).toTensor(); // self + auto& self = p_node->Input(0, reg).toTensor(); // self auto dim = p_node->Input(1, reg).toInt(); // dim int64_t start = 0; if (p_node->Input(2, reg).isScalar()) { start = p_node->Input(2, reg).toInt(); } else { - auto t = p_node->Input(2, 
reg).toTensor(); + auto& t = p_node->Input(2, reg).toTensor(); start = t.item(); } auto length = p_node->Input(3, reg).toInt(); // length @@ -440,7 +440,7 @@ getNativeOperation(Node* n) { } else if (n->kind() == c10::Symbol::fromQualString("aten::to")) { return [](const ProcessedNode* p_node, std::vector& reg) { DCHECK(p_node->input_regs().size() == 5); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toScalarType(); auto in2_i = p_node->Input(2, reg).toBool(); auto in3_i = p_node->Input(3, reg).toBool(); diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6e5c3b927c38..811569485888 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -354,7 +354,7 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) { // // The format here is the same one used by `torch.save()`. The code for the // format can be found in `torch/serialization.py`. - auto tensor = ivalue.toTensor(); + auto& tensor = ivalue.toTensor(); bool quantized = tensor.is_quantized(); // The arguments to this function are: // storage, storage_offset, size, stride, requires_grad, backward_hooks diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index c86cbc460c9c..18d656c98f32 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -309,12 +309,12 @@ struct PythonPrintImpl { // because it doesn't hash any information about the tensors. // We will probably need to optimize this at some point using hashing. if (val.isTensor()) { - auto t = val.toTensor(); + auto& t = val.toTensor(); for (size_t i = 0; i < constant_table_.size(); ++i) { if (!constant_table_[i].isTensor()) { continue; } - auto t2 = constant_table_[i].toTensor(); + auto& t2 = constant_table_[i].toTensor(); if (t.options().type_equal(t2.options()) && t.equal(t2)) { return i; } diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 3ff5da29fe1f..841e87592be9 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -632,7 +632,7 @@ void Unpickler::rebuildTensor(bool quantized) { auto tup = pop(stack_).toTuple(); const auto& elements = tup->elements(); size_t idx = 0; - auto storage_tensor = elements.at(idx++).toTensor(); + auto& storage_tensor = elements.at(idx++).toTensor(); int64_t storage_offset = elements.at(idx++).toInt(); std::vector size = tupleToIntList(elements.at(idx++)); std::vector stride = tupleToIntList(elements.at(idx++)); From 68a6e4637903dba279c60daae5cff24e191ff9b4 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 6 Jan 2021 08:39:11 -0800 Subject: [PATCH 13/44] Push anonymous namespace into codegen, not template (#49498) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49498 In the near future, I want to code generate some functions that are visible externally to this compilation unit. I cannot easily do this if all the codegen code is wrapped in a global anonymous namespace, so push the namespace in. Registration has to stay in an anonymous namespace to avoid name conflicts. This could also have been solved by making the wrapper functions have more unique names but I didn't do this in the end. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: albanD, smessmer Differential Revision: D25616104 Pulled By: ezyang fbshipit-source-id: 323c0dda05a081502aab702f359a08dfac8c41a4 --- aten/src/ATen/templates/RegisterDispatchKey.cpp | 7 +++++-- tools/codegen/gen.py | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index e923f6d73bd0..ed4359c6883e 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -37,10 +37,13 @@ namespace at { -namespace { - ${dispatch_definitions} +// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid +// ambiguity with conflicting identifiers that may have been defined in +// at namespace already. +namespace { + TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) { ${dispatch_registrations} } diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 8f521e6651bc..4768670b6f26 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -435,6 +435,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: # For an overview of what this template code looks like, see # https://github.com/pytorch/rfcs/pull/9 return f"""\ +namespace {{ + {self.gen_structured_class( f, k, class_name=class_name, @@ -448,6 +450,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: {impl_call} return {ret_expr}; }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: @@ -540,9 +544,13 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: """ return f"""\ +namespace {{ + {returns_type} {name}({args_str}) {{ {cuda_guard}{return_kw}{impl_name}({args_exprs_str}); }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: From 74c055b24065d0202aecdf4bc837d3698d1639e1 Mon Sep 17 00:00:00 2001 From: Loi Ly Date: Wed, 6 Jan 2021 09:45:15 -0800 Subject: [PATCH 14/44] Fix mypy type hint for AdaptiveAvgPool2,3d, AdaptiveMaxPool2,3d (#49963) Summary: Fixes https://github.com/pytorch/pytorch/issues/49918 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49963 Reviewed By: mrshenli, heitorschueroff Differential Revision: D25760110 Pulled By: ezyang fbshipit-source-id: aeb655b784689544000ea3b948f7d6d025aee441 --- test/type_hint_tests/opt_size.py | 6 ++++++ torch/nn/common_types.py | 7 ++++++- torch/nn/functional.pyi.in | 10 +++++----- torch/nn/modules/pooling.py | 15 ++++++++------- 4 files changed, 25 insertions(+), 13 deletions(-) create mode 100644 test/type_hint_tests/opt_size.py diff --git a/test/type_hint_tests/opt_size.py b/test/type_hint_tests/opt_size.py new file mode 100644 index 000000000000..f24e57e6e56f --- /dev/null +++ b/test/type_hint_tests/opt_size.py @@ -0,0 +1,6 @@ +import torch.nn as nn + +avg_pool1 = nn.AdaptiveAvgPool2d((1, None)) +avg_pool2 = nn.AdaptiveAvgPool2d((None, 1)) +max_pool1 = nn.AdaptiveMaxPool2d((1, None)) +max_pool2 = nn.AdaptiveMaxPool2d((None, 1)) diff --git a/torch/nn/common_types.py b/torch/nn/common_types.py index fa9d5bb1eb00..884f739e2781 100644 --- a/torch/nn/common_types.py +++ b/torch/nn/common_types.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Union, Tuple +from typing import TypeVar, Union, Tuple, Optional from .. 
import Tensor # Create some useful type aliases @@ -24,6 +24,11 @@ _size_5_t = _scalar_or_tuple_5_t[int] _size_6_t = _scalar_or_tuple_6_t[int] +# For arguments which represent optional size parameters (eg, adaptive pool parameters) +_size_any_opt_t = _scalar_or_tuple_any_t[Optional[int]] +_size_2_opt_t = _scalar_or_tuple_2_t[Optional[int]] +_size_3_opt_t = _scalar_or_tuple_3_t[Optional[int]] + # For arguments that represent a ratio to adjust each dimension of an input with (eg, upsampling parameters) _ratio_2_t = _scalar_or_tuple_2_t[float] _ratio_3_t = _scalar_or_tuple_3_t[float] diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index 94071556e144..208dc7c2df40 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -1,7 +1,7 @@ from torch import Tensor from torch.types import _size from typing import Any, Optional, Tuple, Dict, List, Callable, Sequence, Union -from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t +from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t, _size_2_opt_t, _size_3_opt_t # 'TypedDict' is a new accepted type that represents a dictionary with a fixed set of allowed keys. # It is standards-track but not in `typing` yet. We leave this hear to be uncommented once the feature @@ -75,21 +75,21 @@ def adaptive_max_pool1d_with_indices(input: Tensor, output_size: _size, return_i Tensor, Tensor]: ... -def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size_2_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... -def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size_3_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... def adaptive_avg_pool1d(input: Tensor, output_size: _size_1_t) -> Tensor: ... -def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_t) -> Tensor: ... +def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_opt_t) -> Tensor: ... -def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_t) -> Tensor: ... +def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_opt_t) -> Tensor: ... def dropout(input: Tensor, p: float = ..., training: bool = ..., inplace: bool = ...) -> Tensor: ... diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index e8f68307f230..78aae504083b 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -5,7 +5,8 @@ from .utils import _single, _pair, _triple from .. 
import functional as F -from ..common_types import _size_any_t, _size_1_t, _size_2_t, _size_3_t, _ratio_3_t, _ratio_2_t +from ..common_types import (_size_any_t, _size_1_t, _size_2_t, _size_3_t, + _ratio_3_t, _ratio_2_t, _size_any_opt_t, _size_2_opt_t, _size_3_opt_t) class _MaxPoolNd(Module): @@ -953,7 +954,7 @@ class _AdaptiveMaxPoolNd(Module): __constants__ = ['output_size', 'return_indices'] return_indices: bool - def __init__(self, output_size: _size_any_t, return_indices: bool = False) -> None: + def __init__(self, output_size: _size_any_opt_t, return_indices: bool = False) -> None: super(_AdaptiveMaxPoolNd, self).__init__() self.output_size = output_size self.return_indices = return_indices @@ -1020,7 +1021,7 @@ class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) @@ -1057,7 +1058,7 @@ class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool3d(input, self.output_size, self.return_indices) @@ -1066,7 +1067,7 @@ def forward(self, input: Tensor) -> Tensor: class _AdaptiveAvgPoolNd(Module): __constants__ = ['output_size'] - def __init__(self, output_size: _size_any_t) -> None: + def __init__(self, output_size: _size_any_opt_t) -> None: super(_AdaptiveAvgPoolNd, self).__init__() self.output_size = output_size @@ -1125,7 +1126,7 @@ class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool2d(input, self.output_size) @@ -1159,7 +1160,7 @@ class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool3d(input, self.output_size) From efe0533a24796c7402e1e8eba2317eb5424d90e3 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 10:39:15 -0800 Subject: [PATCH 15/44] Clean up some type annotations in torch/testing/_internal (#50078) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50078 Upgrades type annotations from Python2 to Python3 Test Plan: Sandcastle tests Reviewed By: pritamdamania87 Differential Revision: D25717560 fbshipit-source-id: cec631f3121ef9ab87ff8b3b00f1fae6df9a2155 --- .../_internal/distributed/rpc/dist_autograd_test.py | 7 +++---- .../distributed/rpc/jit/dist_autograd_test.py | 3 +-- .../_internal/distributed/rpc/jit/rpc_test_faulty.py | 10 ++++------ 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index c7fdbe536061..15d5cfeca214 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -11,6 +11,7 @@ import torch.testing._internal.dist_utils from torch.autograd import Function from torch.autograd.function import once_differentiable +from torch.distributed.rpc import RRef from torch.testing._internal.common_utils import IS_MACOS from torch.testing._internal.dist_utils import ( dist_init, @@ -70,8 +71,7 @@ def create_tensor(): @torch.jit.script -def create_torchscript_tensor(): - # type: () -> Tensor +def create_torchscript_tensor() -> torch.Tensor: return torch.ones((3, 3)).requires_grad_() @@ -94,8 +94,7 @@ def 
my_script_add(t1, t2): @torch.jit.script -def my_script_ref_add(ref_t1, t2): - # type: (RRef[Tensor], Tensor) -> Tensor +def my_script_ref_add(ref_t1: RRef[torch.Tensor], t2: torch.Tensor) -> torch.Tensor: t1 = ref_t1.to_here() return torch.add(t1, t2) diff --git a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py index ee3ebdb33eff..5ae40cdea065 100644 --- a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py @@ -34,8 +34,7 @@ def test_get_gradients(self): dst_rank = self.rank @torch.jit.script - def dist_get_gradients(context_id): - # type: (int) -> (Dict[Tensor, Tensor]) + def dist_get_gradients(context_id: int) -> (Dict[Tensor, Tensor]): return dist_autograd.get_gradients(context_id) FileCheck().check("get_gradients").run(str(dist_get_gradients.graph)) diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py index 656f25322274..96ede7231a97 100644 --- a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py +++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py @@ -3,6 +3,7 @@ import torch import torch.distributed.rpc as rpc from torch import Tensor +from torch.distributed.rpc import RRef from torch.testing._internal.dist_utils import ( dist_init, worker_name, @@ -63,18 +64,15 @@ def rpc_async_call_future_ret( return fut @torch.jit.script -def rref_to_here(rref_var): - # type: (RRef[Tensor]) -> Tensor +def rref_to_here(rref_var: RRef[Tensor]) -> Tensor: return rref_var.to_here() @torch.jit.script -def rref_to_here_with_timeout(rref_var, timeout): - # type: (RRef[Tensor], float) -> Tensor +def rref_to_here_with_timeout(rref_var: RRef[Tensor], timeout: float) -> Tensor: return rref_var.to_here(timeout) @torch.jit.script -def rpc_async_with_rref_arg(dst_worker_name, args): - # type: (str, Tuple[RRef[Tensor]]) -> Tensor +def rpc_async_with_rref_arg(dst_worker_name: str, args: Tuple[RRef[Tensor]]) -> Tensor: fut = rpc.rpc_async(dst_worker_name, rref_to_here, args) ret = fut.wait() return ret From e606e603312cf874127b560bcbb8b78b9574ac84 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 10:41:07 -0800 Subject: [PATCH 16/44] [Needs Review] Convert some files to Python3 (#49351) Summary: Uses the Python standard library 2to3 script to convert a number of Python 2 files to Python 3. This facilitates code maintenance such as dropping unused imports in D25500422. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49351 Test Plan: Standard sandcastle tests Reviewed By: xush6528 Differential Revision: D25499576 fbshipit-source-id: 0c44718ac734771ce0758b1cb30676cc3d76ac10 --- docs/caffe2/process.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/caffe2/process.py b/docs/caffe2/process.py index 9fa37e5fbb5a..3b94b9d38502 100644 --- a/docs/caffe2/process.py +++ b/docs/caffe2/process.py @@ -1,20 +1,21 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 ## @package process # Module doxygen.process # Script to insert preamble for doxygen and regen API docs -import glob, os, shutil +import os +import shutil # Module caffe2...caffe2.python.control_test -def insert(originalfile,first_line,description): - with open(originalfile,'r') as f: +def insert(originalfile, first_line, description): + with open(originalfile, 'r') as f: f1 = f.readline() - if(f1.find(first_line)<0): + if(f1.find(first_line) < 0): docs = first_line + description + f1 - with open('newfile.txt','w') as f2: + with open('newfile.txt', 'w') as f2: f2.write(docs) f2.write(f.read()) - os.rename('newfile.txt',originalfile) + os.rename('newfile.txt', originalfile) else: print('already inserted') @@ -29,15 +30,15 @@ def insert(originalfile,first_line,description): for file in files: if (file.endswith(".py") and not file.endswith("_test.py") and not file.endswith("__.py")): filepath = os.path.join(root, file) - print("filepath: " + filepath) + print(("filepath: " + filepath)) directory = os.path.dirname(filepath)[2:] - directory = directory.replace("/",".") - print "directory: " + directory + directory = directory.replace("/", ".") + print("directory: " + directory) name = os.path.splitext(file)[0] first_line = "## @package " + name description = "\n# Module " + directory + "." + name + "\n" - print first_line,description - insert(filepath,first_line,description) + print(first_line, description) + insert(filepath, first_line, description) if os.path.exists("doxygen/doxygen-python"): print("Looks like you ran this before, so we need to cleanup those old files...") From 7d9eb6c6802e85193cdd6139833ff66dc0be228f Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Wed, 6 Jan 2021 10:49:00 -0800 Subject: [PATCH 17/44] Implementation of torch::cuda::synchronize (#50072) Summary: Adding `torch::cuda::synchronize()` to libtorch. Note that the implementation here adds a new method to the `CUDAHooksInterface`. An alternative that was suggested to me is to add a method to the `DeviceGuard` interface. 
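For readers who want to see the new libtorch entry point in action, here is a minimal, hypothetical usage sketch (not taken from this patch; the tensor workload is purely illustrative, and only `torch::cuda::synchronize()` itself is what this change adds):

```
// Illustrative only: queue asynchronous GPU work, then block the host thread
// until every kernel queued on the device has completed.
#include <torch/torch.h>

int main() {
  if (!torch::cuda::is_available()) {
    return 0;  // nothing to synchronize without a CUDA device
  }
  auto a = torch::randn({1024, 1024}, torch::kCUDA);
  auto b = torch::mm(a, a);      // enqueued asynchronously on the current stream
  torch::cuda::synchronize();    // device_index defaults to -1; pass an explicit
                                 // index, e.g. torch::cuda::synchronize(0), to
                                 // wait on a specific GPU
  return 0;
}
```

This mirrors the existing Python-side `torch.cuda.synchronize()`.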
Fixes https://github.com/pytorch/pytorch/issues/47722 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50072 Reviewed By: H-Huang Differential Revision: D25804342 Pulled By: jbschlosser fbshipit-source-id: 45aa61d7c6fbfd3178caf2eb5ec053d6c01b5a43 --- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 5 +++++ aten/src/ATen/cuda/detail/CUDAHooks.h | 1 + aten/src/ATen/detail/CUDAHooksInterface.h | 4 ++++ torch/csrc/api/include/torch/cuda.h | 3 +++ torch/csrc/api/src/cuda.cpp | 9 +++++++++ 5 files changed, 22 insertions(+) diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index f38860e8ef13..b75ef8219b1c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -369,6 +369,11 @@ int CUDAHooks::getNumGPUs() const { return at::cuda::device_count(); } +void CUDAHooks::deviceSynchronize(int64_t device_index) const { + at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index)); + c10::cuda::device_synchronize(); +} + // Sigh, the registry doesn't support namespaces :( using at::CUDAHooksRegistry; using at::RegistererCUDAHooksRegistry; diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index dff8913b153f..abef2e7ff835 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -38,6 +38,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { int64_t cuFFTGetPlanCacheSize(int64_t device_index) const override; void cuFFTClearPlanCache(int64_t device_index) const override; int getNumGPUs() const override; + void deviceSynchronize(int64_t device_index) const override; }; }}} // at::cuda::detail diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index af4eb6fd0739..afe88761d88f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -181,6 +181,10 @@ struct TORCH_API CUDAHooksInterface { virtual int getNumGPUs() const { return 0; } + + virtual void deviceSynchronize(int64_t device_index) const { + TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP); + } }; // NB: dummy argument to suppress "ISO C++11 requires at least one argument diff --git a/torch/csrc/api/include/torch/cuda.h b/torch/csrc/api/include/torch/cuda.h index 5f6f2a9eb8a9..a7e063b90af9 100644 --- a/torch/csrc/api/include/torch/cuda.h +++ b/torch/csrc/api/include/torch/cuda.h @@ -23,5 +23,8 @@ void TORCH_API manual_seed(uint64_t seed); /// Sets the seed for all available GPUs. void TORCH_API manual_seed_all(uint64_t seed); +/// Waits for all kernels in all streams on a CUDA device to complete. 
+void TORCH_API synchronize(int64_t device_index = -1); + } // namespace cuda } // namespace torch diff --git a/torch/csrc/api/src/cuda.cpp b/torch/csrc/api/src/cuda.cpp index d40cd8611c42..b8f3ffa0ee0a 100644 --- a/torch/csrc/api/src/cuda.cpp +++ b/torch/csrc/api/src/cuda.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -49,5 +50,13 @@ void manual_seed_all(uint64_t seed) { } } +void synchronize(int64_t device_index) { + TORCH_CHECK(is_available(), "No CUDA GPUs are available"); + int64_t num_gpus = cuda::device_count(); + TORCH_CHECK(device_index == -1 || device_index < num_gpus, + "Device index out of range: ", device_index); + at::detail::getCUDAHooks().deviceSynchronize(device_index); +} + } // namespace cuda } // namespace torch From 638086950d3f339de49c6b5393733aea2fee6a55 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 11:01:27 -0800 Subject: [PATCH 18/44] Clean up type annotations in torch/nn/quantized/modules (#49941) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49941 Test Plan: Sandcastle Reviewed By: jerryzh168 Differential Revision: D25718715 fbshipit-source-id: bbe450d937cf7ef634e003c09146e308180d1d58 --- torch/nn/quantized/modules/conv.py | 21 +++----- torch/nn/quantized/modules/embedding_ops.py | 6 +-- .../quantized/modules/functional_modules.py | 54 +++++++------------ 3 files changed, 27 insertions(+), 54 deletions(-) diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index a9ba3293630d..00ceba7ab367 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -240,8 +240,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv1d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -327,8 +326,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv2d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -412,8 +410,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv3d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -466,8 +463,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode) - def _input_padding(self, kernel_size, dilation, padding): - # type: (List[int], List[int], List[int]) -> List[int] + def _input_padding(self, kernel_size: List[int], dilation: List[int], padding: List[int]) -> List[int]: res = torch.jit.annotate(List[int], []) for kdx in range(len(kernel_size)): pad = (dilation[kdx] * (kernel_size[kdx] - 1) - padding[kdx]) @@ -561,8 +557,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 
'QuantizedConvTranpose1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose1d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -645,8 +640,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose2d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -730,8 +724,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose3d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) diff --git a/torch/nn/quantized/modules/embedding_ops.py b/torch/nn/quantized/modules/embedding_ops.py index e41d55347741..523994b364c8 100644 --- a/torch/nn/quantized/modules/embedding_ops.py +++ b/torch/nn/quantized/modules/embedding_ops.py @@ -22,8 +22,7 @@ def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8): raise NotImplementedError('Unsupported dtype on quantized embedding! Supports quint8 and quint4x2.') @torch.jit.export - def set_weight(self, weight): - # type: (torch.Tensor) -> None + def set_weight(self, weight: torch.Tensor) -> None: if self.dtype in [torch.quint8, torch.quint4x2]: self._packed_weight = torch.ops.quantized.embedding_bag_prepack(weight) else: @@ -125,8 +124,7 @@ def extra_repr(self): return extra_repr_str - def set_weight(self, w): - # type: (torch.Tensor) -> None + def set_weight(self, w: torch.Tensor) -> None: self._packed_params.set_weight(w) def weight(self): diff --git a/torch/nn/quantized/modules/functional_modules.py b/torch/nn/quantized/modules/functional_modules.py index b9fab962d563..08b5447bb925 100644 --- a/torch/nn/quantized/modules/functional_modules.py +++ b/torch/nn/quantized/modules/functional_modules.py @@ -40,45 +40,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. 
return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) r = self.activation_post_process(r) @@ -101,38 +95,32 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) return r @@ -195,45 +183,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.ops.quantized.add``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.add_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. 
return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.mul(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.mul_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.ops.quantized.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = ops.quantized.cat(x, scale=self.scale, zero_point=self.zero_point, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add_relu``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add_relu(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r From 3ce539881a12df901538d6cd93f752469583f65b Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 6 Jan 2021 11:26:03 -0800 Subject: [PATCH 19/44] Back out "Revert D25757721: [pytorch][PR] Run mypy on more test files" (#50142) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50142 Original commit changeset: 58437d719285 Test Plan: OSS CI Reviewed By: walterddr, ngimel Differential Revision: D25803866 fbshipit-source-id: d6b83a5211e430c0451994391876103f1ad96315 --- mypy.ini | 11 +++++++++++ test/test_bundled_inputs.py | 4 +++- test/test_expecttest.py | 3 ++- test/test_numpy_interop.py | 18 +++++++++--------- torch/testing/_internal/expecttest.py | 4 +++- torch/utils/bundled_inputs.py | 4 ++-- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/mypy.ini b/mypy.ini index 7d6161bddd17..bab4ce5dfd42 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,8 +17,13 @@ check_untyped_defs = True files = torch, caffe2, + test/test_bundled_images.py, + test/test_bundled_inputs.py, test/test_complex.py, + test/test_dataset.py, + test/test_expecttest.py, test/test_futures.py, + test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, test/test_type_info.py @@ -119,6 +124,12 @@ ignore_errors = True [mypy-torch.overrides] ignore_errors = True +# +# Adding type annotations to caffe2 is probably not worth the effort +# only work on this if you have a specific reason for it, otherwise +# leave these ignores as they are. +# + [mypy-caffe2.python.*] ignore_errors = True diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py index f57407c9b1d1..e12339f3acea 100644 --- a/test/test_bundled_inputs.py +++ b/test/test_bundled_inputs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 import io +from typing import List + import torch import torch.utils.bundled_inputs from torch.testing._internal.common_utils import TestCase, run_tests @@ -27,7 +29,7 @@ def forward(self, arg): sm = torch.jit.script(SingleTensorModel()) original_size = model_size(sm) - get_expr = [] + get_expr : List[str] = [] samples = [ # Tensor with small numel and small storage. 
(torch.tensor([1]),), diff --git a/test/test_expecttest.py b/test/test_expecttest.py index 652a33c41869..5e2461797705 100644 --- a/test/test_expecttest.py +++ b/test/test_expecttest.py @@ -4,6 +4,7 @@ import string import textwrap import doctest +from typing import Dict, Any import hypothesis from hypothesis.strategies import text, integers, composite, sampled_from, booleans @@ -38,7 +39,7 @@ def test_replace_string_literal_roundtrip(self, t, raw, quote): r3 = {r}{quote}placeholder3{quote} """.format(r='r' if raw else '', quote=quote * 3) new_prog = expecttest.replace_string_literal(textwrap.dedent(prog), 2, t)[0] - ns = {} + ns : Dict[str, Any] = {} exec(new_prog, ns) msg = "program was:\n{}".format(new_prog) self.assertEqual(ns['r'], 'placeholder', msg=msg) # noqa: F821 diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 35ac4eb94889..81c385ae90a2 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -47,10 +47,8 @@ def get_castable_tensor(shape, dtype): else: # can't directly use min and max, because for int64_t, max - min # is greater than int64_t range and triggers UB. - dtype_info = torch.iinfo(dtype) - low = max(dtype_info.min, int(-1e10)) - high = min(dtype_info.max, int(1e10)) - dtype_info = torch.iinfo(dtype) + low = max(torch.iinfo(dtype).min, int(-1e10)) + high = min(torch.iinfo(dtype).max, int(1e10)) t = torch.empty(shape, dtype=torch.int64).random_(low, high) return t.to(dtype) @@ -272,10 +270,12 @@ def test_numpy_array_interface(self, device): ] for tp, dtype in zip(types, dtypes): if np.dtype(dtype).kind == 'u': - x = torch.Tensor([1, 2, 3, 4]).type(tp) + # .type expects a XxxTensor, which have no type hints on + # purpose, so ignore during mypy type checking + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) else: - x = torch.Tensor([1, -2, 3, -4]).type(tp) + x = torch.Tensor([1, -2, 3, -4]).type(tp) # type: ignore array = np.array([1, -2, 3, -4], dtype=dtype) # Test __array__ w/o dtype argument @@ -309,7 +309,7 @@ def test_numpy_array_interface(self, device): float_types = [torch.DoubleTensor, torch.FloatTensor] float_dtypes = [np.float64, np.float32] for tp, dtype in zip(float_types, float_dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) for func in ['sin', 'sqrt', 'ceil']: ufunc = getattr(np, func) @@ -321,7 +321,7 @@ def test_numpy_array_interface(self, device): # Test functions with boolean return value for tp, dtype in zip(types, dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) geq2_x = np.greater_equal(x, 2) geq2_array = np.greater_equal(array, 2).astype('uint8') @@ -360,7 +360,7 @@ def test_parse_numpy_int(self, device): self.assertEqual(torch.ones([2, 2, 2, 2]).mean(scalar), torch.ones([2, 2, 2, 2]).mean(np_val)) # numpy integral type parses like a python int in custom python bindings: - self.assertEqual(torch.Storage(np_val).size(), scalar) + self.assertEqual(torch.Storage(np_val).size(), scalar) # type: ignore tensor = torch.tensor([2], dtype=torch.int) tensor[0] = np_val diff --git a/torch/testing/_internal/expecttest.py b/torch/testing/_internal/expecttest.py index 9e46a9a84a37..4dae7ebf03dc 100644 --- a/torch/testing/_internal/expecttest.py +++ b/torch/testing/_internal/expecttest.py @@ -3,6 +3,7 @@ import traceback import os import string +from typing import 
Tuple # This file implements expect tests (also known as "golden" tests). @@ -139,7 +140,8 @@ def ok_for_raw_triple_quoted_string(s, quote): r"(?Pr?)", re.DOTALL) -def replace_string_literal(src, lineno, new_string): +def replace_string_literal(src : str, lineno : int, + new_string : str) -> Tuple[str, int]: r""" Replace a triple quoted string literal with new contents. Only handles printable ASCII correctly at the moment. This diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py index c5d603885e4a..741c0841778a 100644 --- a/torch/utils/bundled_inputs.py +++ b/torch/utils/bundled_inputs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union +from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union, Sequence import textwrap import torch from torch._C import TupleType, OptionalType, ListType @@ -17,7 +17,7 @@ class InflatableArg(NamedTuple): def augment_model_with_bundled_inputs( model: torch.jit.ScriptModule, - inputs: Optional[List[Tuple[Any, ...]]] = None, + inputs: Optional[Sequence[Tuple[Any, ...]]] = None, _receive_inflate_expr: Optional[List[str]] = None, # For debugging. ) -> None: """Add bundled sample inputs to a model. From 6eee2a0a9f3545eb3d923408eedf2a2136bf4d14 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Wed, 6 Jan 2021 11:34:50 -0800 Subject: [PATCH 20/44] [JIT] disable masked fill (#50147) Summary: There is an internal user who is experiencing a bug with masked_fill. While I am almost certain this corresponds to an old pytorch version with the bug, the model that is breaking is important and time-sensitive and we are covering all bases to try to get it to work again. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50147 Reviewed By: nhsoukai Differential Revision: D25806541 Pulled By: eellison fbshipit-source-id: 131bd71b5db9717a8a9cb97973d0b4f0e96455d6 --- test/test_jit_fuser_te.py | 1 + torch/csrc/jit/passes/tensorexpr_fuser.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 2143b4e19020..4886abc58758 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1281,6 +1281,7 @@ def forward(self, x): self.assertEqual(ref, mod.forward(x)) self.assertLastGraphAllFused() + @unittest.skip("Temporarily disabled") def test_masked_fill(self): dtypes = [ torch.int8, diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index e8091957ba65..166238cebe17 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -127,7 +127,7 @@ bool isSupported(Node* node) { "aten::round(Tensor self) -> Tensor", "aten::trunc(Tensor self) -> Tensor", "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", - "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", + // "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", // "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", TODO: requires 0-dim Tensor "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor", From ba691e1a428eb133e9c7b8e2f73ae723b8182f65 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Wed, 6 Jan 2021 11:47:35 -0800 Subject: [PATCH 21/44] Remove incorrect links to zdevito/ATen (#50065) Summary: Similar to https://github.com/pytorch/pytorch/issues/49028, this PR removes a few 
more references to https://github.com/zdevito/ATen. - The links for Functions.h, Tensor.h, and Type.h are simply broken, probably because they refer to `master` rather than a specific commit (cf. https://github.com/pytorch/pytorch/issues/47066) - I'm unsure about the change to the `about` section of `aten/conda/meta.yaml`; can someone comment on whether I am understanding that field correctly? - The reference to https://github.com/zdevito/ATen/issues/163 remains [in `tools/autograd/derivatives.yaml`](https://github.com/pytorch/pytorch/blob/cd608fe59b70fa7cafb07110096b2e023a8b6e9c/tools/autograd/derivatives.yaml#L91), because the contents of that issue discussion don't seem to be mirrored anywhere else. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50065 Reviewed By: ezyang, walterddr Differential Revision: D25767353 Pulled By: samestep fbshipit-source-id: 265f46f058bc54ef6d1a77f112cdfa1f115b3247 --- aten/conda/meta.yaml | 2 +- caffe2/contrib/aten/README.md | 6 +++--- caffe2/contrib/aten/docs/pytorch_to_caffe2.md | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/aten/conda/meta.yaml b/aten/conda/meta.yaml index d8096fc73a0f..a502690a5447 100644 --- a/aten/conda/meta.yaml +++ b/aten/conda/meta.yaml @@ -24,7 +24,7 @@ requirements: - mkl # [not osx] about: - home: https://github.com/zdevito/ATen + home: https://github.com/pytorch/pytorch license: BSD summary: A TENsor library for C++14 diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md index 377a1f780271..593079ef1393 100644 --- a/caffe2/contrib/aten/README.md +++ b/caffe2/contrib/aten/README.md @@ -1,6 +1,6 @@ # An ATen operator for Caffe2 -[ATen](https://github.com/zdevito/aten) is a simple tensor library thats exposes the Tensor operations in Torch +ATen is a simple tensor library thats exposes the Tensor operations in Torch and PyTorch directly in C++14. This library provides a generated wrapper around the ATen API that makes these functions available in Caffe2 as an operator. It also makes it accessible using the ToffeeIR. @@ -8,8 +8,8 @@ ToffeeIR. ### Example Usage in Caffe2 -First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). +First identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. We will call the `pow` operator: diff --git a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md index 85c275bb5178..c3f615ee37b9 100644 --- a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md +++ b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md @@ -6,7 +6,7 @@ operators that haven't been standardized yet, or custom `torch.autograd.Function are specific to a network. To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library. -[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten) +[ATen](https://github.com/pytorch/pytorch/tree/master/aten) is the underlying C++ library that PyTorch uses to do tensor operations. 
Caffe2 has an [ATen operator](https://github.com/pytorch/pytorch/tree/master/caffe2/contrib/aten) that can run these tensor functions in a Caffe2 network after importing them through ONNX. This guide explains how to configure Caffe2 and modify your PyTorch program to use @@ -61,8 +61,8 @@ We can add a `symbolic` method to it like so: The function `graph.at` adds a new ATen op the computation graph. You can call any ATen function using this facility. To do so, -first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). +first identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. As an example, we might want to call the `pow` operator: @@ -86,9 +86,9 @@ To call methods of ATen's `Type` objects, you provide an additional string attri that determines the type. For instance, `ones` creates a new constant tensor of all ones: ``` class Type { - ... - virtual Tensor ones(IntArrayRef size) const; - ... + ... + virtual Tensor ones(IntArrayRef size) const; + ... }; ``` From 9b519b4a3f101fc799f1a9fcec79b21a31e3af2e Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Wed, 6 Jan 2021 12:15:11 -0800 Subject: [PATCH 22/44] [PyTorch Mobile] Generate Kernel dtype selection code in selected_mobile_ops.h during the build (#49279) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49279 Now that the YAML files for tracing based selective build optionally have the information regarding the selected kernel function dtypes, we can start generating constexpr selection code in the include file (`selected_mobile_ops.h`) to make the inclusion of code for specific dtypes selective based on compile time decisions. The way this is done is that if we detect that the code for a specific dtype should not be in the binary, we add an exception (throw) statement just before the method is called (see the first diff in this stack) and allow the compiler to optimize away the rest of the function's body. This has the advantage of allowing the compiler to know the lambda's return type (since it's inferred from the `return` statements in the body of the method, and if we compile out all the cases, then the compiler won't know the return type and it will result in a compilation error). The generated `selected_mobile_ops.h` is being used (included) in `Dispatch.h`. In case `XPLAT_MOBILE_BUILD` is not defined, then we should include code for all kernel dtypes (non-selective build). When merging, we need to handle the case of both older and newer (tracing based) operator lists. If we detect any operator that includes all overloads, it indicates that an old style operator list is part of the build, and we need to set `include_all_kernel_dtypes` for this build. ghstack-source-id: 119439497 Test Plan: For Segmentation v220, here is one of the intermediate generated YAML files (selected_operators.yaml): {P154480509} and here is the generated `selected_mobile_ops.h` file: {P159808798} Here is the `selected_mobile_ops.h` file for lite_predictor (which includes all ops and all dtypes): {P159806443} Continuous build for ~8 checked-in models validates that the selection code works as expected when we build based on dtype selection.
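To make the generated selection concrete, the following is a rough sketch of the kind of constexpr code `selected_mobile_ops.h` could contain; the kernel tag names (`add_stub`, `mul_stub`), the chosen dtypes, and the `same_tag()` helper are made up for illustration and are not the actual generator output. The non-mobile fallback in `Dispatch.h` simply returns `true` for every tag/dtype pair.

```
// Hypothetical sketch of a generated selected_mobile_ops.h for a build that
// keeps only Float for the "add_stub" kernel tag and Float/Int for "mul_stub".
// (at::ScalarType is an alias of c10::ScalarType, so the signature matches the
// fallback declared in Dispatch.h.)
#include <c10/core/ScalarType.h>

namespace at {

// Minimal constexpr string equality; the real generator may emit a different helper.
constexpr bool same_tag(const char* a, const char* b) {
  return (*a == *b) && (*a == '\0' || same_tag(a + 1, b + 1));
}

inline constexpr bool should_include_kernel_dtype(
    const char* kernel_tag_str,
    c10::ScalarType scalar_type) {
  return (same_tag(kernel_tag_str, "add_stub") &&
          scalar_type == c10::ScalarType::Float) ||
         (same_tag(kernel_tag_str, "mul_stub") &&
          (scalar_type == c10::ScalarType::Float ||
           scalar_type == c10::ScalarType::Int));
}

} // namespace at
```

Keeping the check `constexpr` is what lets the guard fold away at compile time, so the dead dtype branches inside the dispatch macros can be eliminated.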
Reviewed By: iseeyuan Differential Revision: D25388949 fbshipit-source-id: 1c182a4831a7f94f7b152f02dbd3bc01c0d22443 --- aten/src/ATen/Dispatch.h | 4 ++ tools/codegen/selective_build/selector.py | 71 ++++++++++++++++++++++- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 2e663b4f48dd..341e20cab1f3 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -10,6 +10,9 @@ #include #include +#ifdef XPLAT_MOBILE_BUILD +#include +#else namespace at { /** * The method should_include_kernel_dtype() returns true/false @@ -25,6 +28,7 @@ inline constexpr bool should_include_kernel_dtype( return true; } } +#endif /** * In the Facebook internal build (using BUCK), this macro is enabled by diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py index 3e80e168d31c..eeb15049075e 100644 --- a/tools/codegen/selective_build/selector.py +++ b/tools/codegen/selective_build/selector.py @@ -1,4 +1,4 @@ -from typing import Dict, Set, Optional, Tuple +from typing import Dict, Set, Optional, Tuple, List import yaml from dataclasses import dataclass @@ -26,6 +26,20 @@ class SelectiveBuilder: # A dictionary of operator -> operator metadata. operators: Dict[str, SelectiveBuildOperator] + # A dictionary of selected kernel tags and dtypes. Typically a + # PyTorch Operator Kernel (function) may have many code paths + # that are specialized for many many Tensor dtypes, so it's not + # one per kernel function, but there could be many per kernel + # function. The tag isn't a kernel function name, but some fragment + # of the kernel function implementation itself. + kernel_metadata: Dict[str, List[str]] + + # If true, then fragments for all dtypes for all kernel functions + # are included. This is typically set when any one of the + # operator lists is generated from a mechanism other than + # tracing based selective build. 
+ include_all_kernel_dtypes: bool + @staticmethod def get_nop_selector() -> 'SelectiveBuilder': return SelectiveBuilder.from_yaml_dict({'include_all_operators': True}) @@ -33,9 +47,11 @@ def get_nop_selector() -> 'SelectiveBuilder': @staticmethod def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': valid_top_level_keys = { + 'include_all_kernel_dtypes', 'include_all_operators', 'debug_info', 'operators', + 'kernel_metadata', } top_level_keys = set(data.keys()) if len(top_level_keys - valid_top_level_keys) > 0: @@ -58,7 +74,24 @@ def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': for (k, v) in operators_dict.items(): operators[k] = SelectiveBuildOperator.from_yaml_dict(k, v) - return SelectiveBuilder(include_all_operators, debug_info, operators) + + kernel_metadata = {} + kernel_metadata_dict = data.get('kernel_metadata', {}) + assert isinstance(kernel_metadata_dict, dict) + + for (k, v) in kernel_metadata_dict.items(): + kernel_metadata[str(k)] = list(map(lambda dtype: str(dtype), v)) + + include_all_kernel_dtypes = data.get('include_all_kernel_dtypes', False) + assert isinstance(include_all_kernel_dtypes, bool) + + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + ) @staticmethod def from_yaml_str(config_contents: str) -> 'SelectiveBuilder': @@ -86,6 +119,7 @@ def from_legacy_op_registration_allow_list( } return SelectiveBuilder.from_yaml_dict({ 'operators': operators, + 'include_all_kernel_dtypes': True, }) def is_operator_selected(self, name: str) -> bool: @@ -147,8 +181,15 @@ def is_root_operator(self, name: str) -> bool: base_op: SelectiveBuildOperator = self.operators[name] return base_op.include_all_overloads and base_op.is_root_operator + def is_kernel_dtype_selected(self, kernel_tag: str, dtype: str) -> bool: + if self.include_all_operators or self.include_all_kernel_dtypes: + return True + + return kernel_tag in self.kernel_metadata and dtype in self.kernel_metadata[kernel_tag] + def to_dict(self) -> Dict[str, object]: ret: Dict[str, object] = { + 'include_all_kernel_dtypes': self.include_all_kernel_dtypes, 'include_all_operators': self.include_all_operators, } operators = {} @@ -159,14 +200,38 @@ def to_dict(self) -> Dict[str, object]: if self._debug_info is not None: ret['debug_info'] = self._debug_info + ret['kernel_metadata'] = {k: list(v) for (k, v) in self.kernel_metadata.items()} + return ret +def merge_kernel_metadata( + lhs: Dict[str, List[str]], + rhs: Dict[str, List[str]], +) -> Dict[str, List[str]]: + kernel_metadata: Dict[str, List[str]] = {} + for (tag_name, dtypes) in list(lhs.items()) + list(rhs.items()): + dtypes_copy = set(dtypes) + if tag_name in kernel_metadata: + dtypes_copy |= set(kernel_metadata[tag_name]) + + kernel_metadata[tag_name] = list(dtypes_copy) + + return kernel_metadata + def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> SelectiveBuilder: include_all_operators = lhs.include_all_operators or rhs.include_all_operators debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) operators = merge_operator_dicts(lhs.operators, rhs.operators) - return SelectiveBuilder(include_all_operators, debug_info, operators) + kernel_metadata = merge_kernel_metadata(lhs.kernel_metadata, rhs.kernel_metadata) + include_all_kernel_dtypes = lhs.include_all_kernel_dtypes or rhs.include_all_kernel_dtypes + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + 
) def op_name_from_native_function(f: NativeFunction) -> str: From 09eb468398763fe2914fa4e28eb04dcbf1b3e615 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Wed, 6 Jan 2021 12:35:09 -0800 Subject: [PATCH 23/44] [vulkan] 2D prepacking for conv2d (#48816) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48816 Test Plan: Imported from OSS Reviewed By: AshkanAliabadi Differential Revision: D25786280 Pulled By: SS-JIA fbshipit-source-id: b41bf55dcff8f3dfbbf1994171e2ef62f16ff29a --- aten/src/ATen/native/vulkan/glsl/conv2d.glsl | 24 +- .../ATen/native/vulkan/glsl/conv2d_dw.glsl | 5 +- .../ATen/native/vulkan/glsl/conv2d_pw.glsl | 13 +- aten/src/ATen/native/vulkan/ops/Common.h | 2 +- .../ATen/native/vulkan/ops/Convolution.cpp | 355 ++++++++++-------- 5 files changed, 221 insertions(+), 178 deletions(-) diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index bb2508aefe65..547eec7fafef 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -17,7 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; - int stacks_per_tower; + ivec3 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -28,9 +28,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.z + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -42,16 +39,15 @@ void main() { vec4 sum = uBias.data[pos.z]; for (int z = 0; z < uBlock.kernel.z; z+=4) { - const ivec4 kz = block + z; - - for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { - const vec4 In = texelFetch(uInput, ivec3(x, y, z/4), 0); - - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.w), 0), sum); + const int z4 = z/4; + for (int y = start.y, ky = kstart.y + pos.z * uBlock.src_kernel.y; y < end.y; y += uBlock.dilate.y, ++ky) { + for (int x = start.x, kx = 4*kstart.x + z4*uBlock.src_kernel.z; x < end.x; x += uBlock.dilate.x, kx+=4) { + const vec4 In = texelFetch(uInput, ivec3(x, y, z4), 0); + + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0 + kx, ky, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(1 + kx, ky, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(2 + kx, ky, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(3 + kx, ky, 0), 0), sum); } } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index 0f49515718b2..f8f929461ce7 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -17,6 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; + ivec2 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, 
local_size_z_id = 2) in; @@ -38,10 +39,10 @@ void main() { vec4 sum = uBias.data[pos.z]; for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { + for (int x = start.x, kx = kstart.x + ky*uBlock.src_kernel.x; x < end.x; x += uBlock.dilate.x, ++kx) { sum = fma( texelFetch(uInput, ivec3(x, y, pos.z), 0), - texelFetch(uKernel, ivec3(kx, ky, pos.z), 0), + texelFetch(uKernel, ivec3(kx, pos.z, 0), 0), sum); } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 1355b2c09b05..b28f0550132f 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -16,7 +16,6 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 stride; ivec2 padding; vec2 clamp; - int stacks_per_tower; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -27,9 +26,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.x + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -38,12 +34,11 @@ void main() { for (int z = 0; z < uBlock.kernel.x; z+=4) { const vec4 In = texelFetch(uInput, ivec3(ipos.x, ipos.y, z/4), 0); - const ivec4 kz = block + z; - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0, tower, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(0, tower, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(0, tower, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(0, tower, kz.w), 0), sum); + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(z+0, pos.z, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(z+1, pos.z, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(z+2, pos.z, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(z+3, pos.z, 0), 0), sum); } imageStore( diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h index 6f7080f71a80..b0bbeeaf34f1 100644 --- a/aten/src/ATen/native/vulkan/ops/Common.h +++ b/aten/src/ATen/native/vulkan/ops/Common.h @@ -36,7 +36,7 @@ struct Layout final { }; struct Experimentation { - static constexpr bool kUseConv2dOldApi = true; + static constexpr bool kUseConv2dOldApi = false; }; struct ConvPrepackLimits final { diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 5af2c14b80cb..c757f6cdac7a 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -25,7 +25,7 @@ inline bool is_pointwise(const IntArrayRef filter) { (1 == filter[Layout::Filter::width]); } -vTensor pack_weights( +vTensor pack_weights_dw( api::Resource::Pool& pool, const Tensor& weight_arg, const int64_t groups) { @@ -39,161 +39,201 @@ vTensor pack_weights( const IntArrayRef src_filter = weight.sizes(); const float* const src_weight_ptr = weight.data_ptr(); - // - // Depthwise - // + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; + const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); + vTensor v_weight{ + api::context(), + 
&pool, + { + 4, + num_stacks, + src_kw_sz * src_kh_sz, + }, + weight.options(), + }; - if (is_depthwise(src_filter, groups)) { - vTensor v_weight{ - api::context(), - &pool, - src_filter, - weight.options(), - }; + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + /* Source */ + const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; + const int64_t src_block_sz = + src_kernel_sz * src_filter[Layout::Filter::input]; - memcpy( - v_weight_payload.get(), - src_weight_ptr, - std::min(weight.nbytes(), v_weight.nbytes())); + /* Destination */ + const int64_t dst_kw_sz = src_kw_sz * src_kh_sz; + const int64_t dst_kh_sz = num_stacks; + const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - return v_weight; - } + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); - // - // General - // + for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { + /* Source */ + const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; - if (Experimentation::kUseConv2dOldApi) { - const uint32_t OC = src_filter[Layout::Filter::output]; - const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); - const uint32_t C = src_filter[Layout::Filter::input]; - const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); - const uint32_t KH = src_filter[Layout::Filter::height]; - const uint32_t KW = src_filter[Layout::Filter::width]; - - vTensor v_weight{ - api::context(), - &pool, - { - 1, - 4 * KH * KW, - OC_4, - 4 * C_4 - }, - weight.options(), - }; + /* Destination */ + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; - float* const dst_weight_ptr = v_weight_payload.get(); - memset(dst_weight_ptr, 0, v_weight.nbytes()); + for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) { + memcpy( + dst_weight_c_ptr + dst_oh * dst_kw_sz + src_ih * src_kw_sz, + src_weight_oc_ptr + src_ih * src_kw_sz, + sizeof(float) * src_kw_sz); + } + } - const float* src = src_weight_ptr; - float* const dst = dst_weight_ptr; + return v_weight; +} +vTensor pack_weights_old( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); + } + + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const uint32_t OC = src_filter[Layout::Filter::output]; + const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); + const uint32_t C = src_filter[Layout::Filter::input]; + const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); + const uint32_t KH = src_filter[Layout::Filter::height]; + const uint32_t KW = src_filter[Layout::Filter::width]; + + vTensor v_weight{ + api::context(), + &pool, { - uint32_t ridx = 0; - const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; - for (uint32_t oc = 0; oc < OC; ++oc) { - int oc_4 = oc / 4; - int oc_4_i = oc % 4; - float* dst_oc = dst + oc_4 * oc_4SizeNumel; - for (uint32_t ic = 0; ic < C; ++ic) { - int ic_4 = ic / 4; - int 
ic_4_i = ic % 4; - float* dst_ic = dst_oc + ic_4 * KW * KH * 16; - for (uint32_t ky = 0; ky < KH; ++ky) { - float* dst_ky = dst_ic + ky * KW * 16; - for (uint32_t kx = 0; kx < KW; ++kx) { - float* dst_kx = dst_ky + kx * 16; - dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; - } + 1, + 4 * KH * KW, + OC_4, + 4 * C_4 + }, + weight.options(), + }; + + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); + + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); + + const float* src = src_weight_ptr; + float* const dst = dst_weight_ptr; + + { + uint32_t ridx = 0; + const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; + for (uint32_t oc = 0; oc < OC; ++oc) { + int oc_4 = oc / 4; + int oc_4_i = oc % 4; + float* dst_oc = dst + oc_4 * oc_4SizeNumel; + for (uint32_t ic = 0; ic < C; ++ic) { + int ic_4 = ic / 4; + int ic_4_i = ic % 4; + float* dst_ic = dst_oc + ic_4 * KW * KH * 16; + for (uint32_t ky = 0; ky < KH; ++ky) { + float* dst_ky = dst_ic + ky * KW * 16; + for (uint32_t kx = 0; kx < KW; ++kx) { + float* dst_kx = dst_ky + kx * 16; + dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; } } } + } - // shader KO4C4HW_to_image - struct Image3D { - float* data_; - uint32_t dim0_, dim1_, dim2_; - - Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { - dim0_ = dim0; - dim1_ = dim1; - dim2_ = dim2; - data_ = new float[dim0 * dim1 * dim2 * 4]; - memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); - } + // shader KO4C4HW_to_image + struct Image3D { + float* data_; + uint32_t dim0_, dim1_, dim2_; + + Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { + dim0_ = dim0; + dim1_ = dim1; + dim2_ = dim2; + data_ = new float[dim0 * dim1 * dim2 * 4]; + memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); + } - inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; - } + inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; + } - void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { - data_[idx(i0, i1, i2, i3)] = value; - } + void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { + data_[idx(i0, i1, i2, i3)] = value; + } - float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return data_[idx(i0, i1, i2, i3)]; - } - } image{4 * C_4, OC_4, KH * KW}; - - for (uint32_t sx = 0; sx < C_4; ++sx) { - for (uint32_t sy = 0; sy < OC_4; ++sy) { - for (uint32_t sz = 0; sz < (KH * KW); ++sz) { - for (uint32_t vi = 0; vi < 4; ++vi) { - int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; - image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); - image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); - image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); - image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); - } + float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return data_[idx(i0, i1, i2, i3)]; + } + } image{4 * C_4, OC_4, KH * KW}; + + for (uint32_t sx = 0; sx < C_4; ++sx) { + for (uint32_t sy = 0; sy < OC_4; ++sy) { + for (uint32_t sz = 0; sz < (KH * KW); ++sz) { + for (uint32_t vi = 0; vi < 4; ++vi) { + int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; + image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); + image.set(4 * sx + 1, sy, sz, vi, dst[4 * 
(bufferVIdx + 1) + vi]); + image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); + image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); } } } + } - // inverse function of nchw_to_image - const uint32_t W = 4 * C_4; - const uint32_t H = OC_4; - const uint32_t D = KH * KW; - for (uint32_t sx = 0; sx < W; ++sx) { - for (uint32_t sy = 0; sy < H; ++sy) { - for (uint32_t sz = 0; sz < D; ++sz) { - for (uint32_t szvi = 0; szvi < 4; ++szvi) { - dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); - } + // inverse function of nchw_to_image + const uint32_t W = 4 * C_4; + const uint32_t H = OC_4; + const uint32_t D = KH * KW; + for (uint32_t sx = 0; sx < W; ++sx) { + for (uint32_t sy = 0; sy < H; ++sy) { + for (uint32_t sz = 0; sz < D; ++sz) { + for (uint32_t szvi = 0; szvi < 4; ++szvi) { + dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); } } } } + } - return v_weight; + return v_weight; +} + +vTensor pack_weights_2d( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); } + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); - const int64_t stack_depth = - 4 * api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); - const int64_t max_stacks_per_tower = - ConvPrepackLimits::maxStackDepth / stack_depth; - const int64_t num_towers = div_up(num_stacks, max_stacks_per_tower); - int64_t stacks_per_tower = num_stacks; - if (num_towers > 1) { - stacks_per_tower = div_up(num_stacks, num_towers); - } + const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); vTensor v_weight{ api::context(), &pool, { - stacks_per_tower, - stack_depth, - src_filter[Layout::Filter::height] * num_towers, - src_filter[Layout::Filter::width], + 4, + src_kh_sz * num_stacks, + src_kw_sz * stack_depth, }, weight.options(), }; @@ -203,53 +243,59 @@ vTensor pack_weights( Future::Payload v_weight_payload = v_weight_future.wait(); /* Source */ - const int64_t src_kw_sz = src_filter[Layout::Filter::width]; - const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input]; /* Destination */ - const IntArrayRef dst_filter = v_weight.sizes(); - const int64_t dst_kw_sz = src_filter[Layout::Filter::width]; - const int64_t dst_kh_sz = src_filter[Layout::Filter::height] * num_towers; + const int64_t dst_kw_sz = src_kw_sz * stack_depth; + const int64_t dst_kh_sz = src_kh_sz * num_stacks; const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - const int64_t dst_block_sz = - dst_kernel_sz * dst_filter[Layout::Filter::input]; - - TORCH_INTERNAL_ASSERT(src_kernel_sz*num_towers == dst_kernel_sz, "Internal error!"); float* const dst_weight_ptr = v_weight_payload.get(); memset(dst_weight_ptr, 0, v_weight.nbytes()); for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { - const int64_t i_tower = src_oc / (stacks_per_tower * 4); /* Source */ - const float* const src_weight_oc_ptr = - src_weight_ptr + src_oc * src_block_sz; + const float* const 
src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; /* Destination */ - const int64_t local_oc = src_oc % (stacks_per_tower * 4); - const int64_t dst_oc = local_oc / 4; - const int64_t dst_oc_offset = local_oc % 4; + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - float* const dst_weight_oc_ptr = dst_weight_ptr + dst_oc * dst_block_sz + - dst_oc_offset * dst_kernel_sz; + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) { - const int64_t dst_ic = 4 * src_ic; - - memcpy( - dst_weight_oc_ptr + dst_ic * dst_kernel_sz + - (i_tower * src_kernel_sz), - src_weight_oc_ptr + src_ic * src_kernel_sz, - sizeof(float) * src_kernel_sz); + const int64_t dst_ic4 = src_ic/4; + for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) { + for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) { + memcpy( + dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz + + dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4, + src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw, + sizeof(float)); + } + } } } return v_weight; } +vTensor pack_weights( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (is_depthwise(weight_arg.sizes(), groups)) { + return pack_weights_dw(pool, weight_arg, groups); + } + + if (Experimentation::kUseConv2dOldApi) { + return pack_weights_old(pool, weight_arg, groups); + } + return pack_weights_2d(pool, weight_arg, groups); +} + vTensor pack_biases( api::Resource::Pool& pool, const c10::optional& bias, @@ -394,6 +440,7 @@ void conv2d_depthwise( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -406,6 +453,7 @@ void conv2d_depthwise( int32_t padding_x, padding_y; int32_t dilate_x, dilate_y; float clamp_x, clamp_y; + int32_t src_filter_w, src_filter_h; } block { safe_downcast(filter[Layout::Filter::width]), safe_downcast(filter[Layout::Filter::height]), @@ -417,6 +465,8 @@ void conv2d_depthwise( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), }; context->dispatch( @@ -473,14 +523,12 @@ void conv2d_pointwise( const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; } block { safe_downcast(filter[Layout::Filter::input]), safe_downcast(filter[Layout::Filter::output]), @@ -490,7 +538,6 @@ void conv2d_pointwise( safe_downcast(padding[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), }; context->dispatch( @@ -542,20 +589,20 @@ void conv2d( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_x, kernel_y, kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; int32_t 
dilate_x, dilate_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; + int32_t src_filter_w, src_filter_h, src_filter_w4; } block { safe_downcast(filter[Layout::Filter::width]), safe_downcast(filter[Layout::Filter::height]), @@ -569,7 +616,9 @@ void conv2d( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), + safe_downcast(src_filter[Layout::Filter::width]*4), }; context->dispatch( @@ -859,6 +908,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, @@ -904,6 +954,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, From 473e78c0faac1d14f6a02799dfc4940bcfe6e07d Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Wed, 6 Jan 2021 12:36:12 -0800 Subject: [PATCH 24/44] Remove redundant code for unsupported Python versions (#49486) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49486 Remove code for Python 3.5 and lower. There's more that can be removed/modernised, but sticking mainly to redundant version checks here, to keep the diff/PR smaller. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46579 Reviewed By: zou3519 Differential Revision: D24453571 Pulled By: ezyang fbshipit-source-id: c2cfcf05d6c5f65df64d89c331692c9aec09248e --- .jenkins/pytorch/macos-test.sh | 5 -- .../win-test-helpers/setup_pytorch_env.bat | 2 - caffe2/python/compatibility.py | 8 --- caffe2/python/onnx/backend.py | 3 +- caffe2/python/onnx/frontend.py | 5 +- caffe2/python/pybind_state.h | 12 ---- caffe2/python/utils.py | 4 +- .../caffe2/jenkins/common/install_python.sh | 5 -- test/test_dataloader.py | 67 +++++++------------ test/test_jit_profiling.py | 7 +- test/test_jit_simple.py | 7 +- tools/shared/module_loader.py | 2 - torch/_six.py | 1 - torch/csrc/utils/python_compat.h | 15 ----- torch/csrc/utils/six.h | 8 --- torch/cuda/__init__.py | 10 +-- torch/multiprocessing/__init__.py | 2 +- torch/multiprocessing/spawn.py | 17 ----- torch/serialization.py | 2 +- torch/testing/_internal/common_nn.py | 2 +- torch/utils/data/dataloader.py | 4 -- 21 files changed, 39 insertions(+), 149 deletions(-) delete mode 100644 caffe2/python/compatibility.py diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 0c34ddcc6179..24ec02c76df5 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -9,11 +9,6 @@ pip install -q hypothesis "librosa>=0.6.2" "numba<=0.49.1" psutil # TODO move this to docker pip install unittest-xml-reporting pytest -# faulthandler become built-in since 3.3 -if [[ ! 
$(python -c "import sys; print(int(sys.version_info >= (3, 3)))") == "1" ]]; then - pip install -q faulthandler -fi - if [ -z "${IN_CI}" ]; then rm -rf ${WORKSPACE_DIR}/miniconda3/lib/python3.6/site-packages/torch* fi diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index a052a1b67d59..ed6482890993 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -41,8 +41,6 @@ popd :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest coverage if %errorlevel% neq 0 ( exit /b %errorlevel% ) -:: No need to install faulthandler since we only test Python >= 3.6 on Windows -:: faulthandler is builtin since Python 3.3 set DISTUTILS_USE_SDK=1 diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py deleted file mode 100644 index 9d615a308333..000000000000 --- a/caffe2/python/compatibility.py +++ /dev/null @@ -1,8 +0,0 @@ -from six import PY2, PY3 - -if PY2: - import collections - container_abcs = collections -elif PY3: - import collections.abc - container_abcs = collections.abc diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 2c80fadafaee..193a6f217f93 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -17,7 +17,6 @@ # system protobuf. import onnx.backend from caffe2.python import core, workspace, rnn_cell, gru_cell -from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils @@ -771,7 +770,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, container_abcs.Iterable): + if not isinstance(ops, collections.abc.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index bb2778d1a991..b5121602aff5 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -10,13 +10,12 @@ - +import collections import itertools import logging import re from caffe2.python import core as caffe2_core -from caffe2.python.compatibility import container_abcs from onnx import (checker, helper, numpy_helper, mapping, GraphProto, NodeProto, TensorProto, OperatorSetIdProto) from onnx.helper import make_tensor_value_info, make_model @@ -153,7 +152,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, container_abcs.Iterable): + if not isinstance(nodes, collections.abc.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index b3926e941194..6513f216a9be 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -232,7 +232,6 @@ class TensorFeeder : public BlobFeederBase { for (int i = 0; i < tensor.numel(); ++i) { char* str; Py_ssize_t strSize; -#if PY_MAJOR_VERSION > 2 if (PyBytes_Check(input[i])) { CAFFE_ENFORCE( PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, @@ -246,11 +245,6 @@ class TensorFeeder : public BlobFeederBase { } else { CAFFE_THROW("Unsupported 
python object type passed into ndarray."); } -#else - CAFFE_ENFORCE( - PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, - "Unsupported python object type passed into ndarray."); -#endif // PY_MAJOR_VERSION > 2 outPtr[i] = std::string(str, strSize); } break; @@ -342,18 +336,12 @@ class PythonOpBase : public Operator { try { builder_call = loads(py::bytes(pickled)).cast(); } catch (const py::error_already_set& e) { -#if PY_MAJOR_VERSION >= 3 LOG(INFO) << "Cannot unpickle python operator: " << e.what(); LOG(INFO) << "Try latin1 encoding for python3 run"; // to use the `_a` literal for arguments using namespace pybind11::literals; builder_call = loads(py::bytes(pickled), "encoding"_a = "latin1") .template cast(); -#else - // for py2, simply re-throw the exception, as there is no encoding - // argument for pickle.loads - throw; -#endif } CAFFE_ENFORCE(builder_call); CAFFE_ENFORCE_EQ(py::len(builder_call), 3); diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 947dd9bf296d..289d107303fa 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,12 +6,12 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys +import collections import copy import functools import numpy as np @@ -126,7 +126,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name = key - iterable = isinstance(value, container_abcs.Iterable) + iterable = isinstance(value, collections.abc.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. The entire array is guaranteed to have the same diff --git a/docker/caffe2/jenkins/common/install_python.sh b/docker/caffe2/jenkins/common/install_python.sh index 48a47b271107..19633d451ab3 100755 --- a/docker/caffe2/jenkins/common/install_python.sh +++ b/docker/caffe2/jenkins/common/install_python.sh @@ -135,11 +135,6 @@ if [ -z "${INSTALL_SETUPTOOLS}" ]; then pip install -U pip setuptools!=38.5.2 fi -# tornado 5.0 requires Python 2.7.9+ or 3.4+ -if [[ $($PYTHON -c 'import sys; print(int(sys.version_info <= (2, 7, 9) or sys.version_info <= (3, 4)))' == 1) ]]; then - pip install 'tornado<5' -fi - # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by # defaults installs the most recent networkx version, so we install this lower # version explicitly before scikit-image pulls it in as a dependency diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 047297c438b7..c257dd8a2fd7 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3,6 +3,7 @@ import errno import os import ctypes +import faulthandler import torch import gc import time @@ -34,18 +35,6 @@ else: warnings.warn(err_msg) -try: - import faulthandler - HAS_FAULTHANDLER = True -except ImportError: - HAS_FAULTHANDLER = False - err_msg = ("faulthandler not found. Some data loader tests use it for error " - "reporting (e.g., TestDataLoader.test_proper_exit).") - if IS_PYTORCH_CI: - raise ImportError(err_msg) from None - else: - warnings.warn(err_msg) - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings @@ -86,9 +75,7 @@ JOIN_TIMEOUT = 60.0 # seconds -supported_multiprocessing_contexts = [None] -if torch.multiprocessing._supports_context: - supported_multiprocessing_contexts += list(torch.multiprocessing.get_all_start_methods()) +supported_multiprocessing_contexts = [None] + list(torch.multiprocessing.get_all_start_methods()) @unittest.skipIf( @@ -312,29 +299,25 @@ def test_iterable_dataset_err(self): # takes in dummy var so this can also be used as a `worker_init_fn` def set_faulthander_if_available(_=None): - if HAS_FAULTHANDLER: - faulthandler.enable(sys.__stderr__) - if not IS_WINDOWS: - # windows does not have faulthandler.register - # chain=False prevents the default behavior of killing the process - faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) + faulthandler.enable(sys.__stderr__) + if not IS_WINDOWS: + # windows does not have faulthandler.register + # chain=False prevents the default behavior of killing the process + faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) set_faulthander_if_available() # Process `pid` must have called `set_faulthander_if_available` def print_traces_of_all_threads(pid): - if HAS_FAULTHANDLER: - if not IS_WINDOWS: - # use the custom signal if available - os.kill(pid, signal.SIGUSR1) - else: - # otherwise we can still use the handler given by faulthandler.enable() - # at the cost of killing the process. - os.kill(pid, signal.SIGSEGV) + if not IS_WINDOWS: + # use the custom signal if available + os.kill(pid, signal.SIGUSR1) else: - # if there is no faulthandler, use SIGINT otherwise and hope for the best - os.kill(pid, signal.SIGINT) + # otherwise we can still use the handler given by faulthandler.enable() + # at the cost of killing the process. 
+ os.kill(pid, signal.SIGSEGV) + # wait in parent process to give subprocess some time to print time.sleep(5) @@ -1037,17 +1020,13 @@ def test_invalid_ctor_args_combinations(self): "batch_size=None option disables auto-batching and is mutually exclusive"): self._get_data_loader(self.dataset, batch_size=None, drop_last=True) - if torch.multiprocessing._supports_context: - valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] - with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): - self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) - with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') - with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) - else: - with self.assertRaisesRegex(ValueError, "multiprocessing_context relies on Python >= 3.4"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='fork') + valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] + with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): + self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) + with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') + with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) # map-style sampler = torch.utils.data.SequentialSampler(self.dataset) @@ -1504,7 +1483,7 @@ def _test_sampler(self, **kwargs): def test_sampler(self): self._test_sampler() self._test_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') def _test_batch_sampler(self, **kwargs): @@ -1529,7 +1508,7 @@ def _test_batch_sampler(self, **kwargs): def test_batch_sampler(self): self._test_batch_sampler() self._test_batch_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index dc6bb2fbf878..1cf67f87ded9 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 23da6602c572..23c7f3b4b6f6 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git 
a/tools/shared/module_loader.py b/tools/shared/module_loader.py index c24a19678c39..51c57aa161c9 100644 --- a/tools/shared/module_loader.py +++ b/tools/shared/module_loader.py @@ -1,5 +1,3 @@ - - def import_module(name, path): import importlib.util spec = importlib.util.spec_from_file_location(name, path) diff --git a/torch/_six.py b/torch/_six.py index c53feed94cce..00f9fa6b7f95 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -33,7 +33,6 @@ FileNotFoundError = builtins.FileNotFoundError StringIO = io.StringIO container_abcs = collections.abc -PY3 = sys.version_info[0] == 3 PY37 = sys.version_info[0] == 3 and sys.version_info[1] >= 7 def with_metaclass(meta: type, *bases) -> type: diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h index 28d990c64c42..7e1cb0c4f92d 100644 --- a/torch/csrc/utils/python_compat.h +++ b/torch/csrc/utils/python_compat.h @@ -63,20 +63,5 @@ __PySlice_Unpack(PyObject *_r, (PySlice_Unpack(SLICE, START, STOP, STEP) == 0) #endif -// https://bugsfiles.kde.org/attachment.cgi?id=61186 -#if PY_VERSION_HEX >= 0x03020000 #define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ (PySlice_GetIndicesEx(SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#else -#define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ - (PySlice_GetIndicesEx((PySliceObject*)SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#endif - -// This function was introduced in Python 3.4 -#if PY_VERSION_HEX < 0x03040000 -inline int -PyGILState_Check() { - PyThreadState * tstate = _PyThreadState_Current; - return tstate && (tstate == PyGILState_GetThisThreadState()); -} -#endif diff --git a/torch/csrc/utils/six.h b/torch/csrc/utils/six.h index 932f0bf61a29..b83e60c77cf3 100644 --- a/torch/csrc/utils/six.h +++ b/torch/csrc/utils/six.h @@ -23,11 +23,7 @@ inline bool isTuple(pybind11::handle input) { if (PyTuple_Check(input.ptr())) { return true; } -#if PY_MAJOR_VERSION == 2 - return isStructSeq(input); -#else return false; -#endif } inline bool isTuple(PyObject* obj) { @@ -40,12 +36,8 @@ inline bool isTuple(PyObject* obj) { // But on Python 2, structseq is not a subtype of tuple, so we need to manually create a // new tuple object from structseq. inline THPObjectPtr maybeAsTuple(PyStructSequence *obj) { -#if PY_MAJOR_VERSION == 2 - return THPObjectPtr(torch::utils::structseq_slice(obj, 0, Py_SIZE(obj))); -#else Py_INCREF(obj); return THPObjectPtr((PyObject *)obj); -#endif } inline THPObjectPtr maybeAsTuple(PyObject *obj) { diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 5535cef78395..e59c798a59be 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -153,15 +153,9 @@ def _lazy_init(): # immediately, while we are still guaranteed to have the GIL, because some # of the C calls we make below will release the GIL if _is_in_bad_fork(): - from sys import version_info - if version_info < (3, 4): - msg = ("To use CUDA with multiprocessing, you must use Python " - "3.4+ and the 'spawn' start method") - else: - msg = ("To use CUDA with multiprocessing, you must use the " - "'spawn' start method") raise RuntimeError( - "Cannot re-initialize CUDA in forked subprocess. " + msg) + "Cannot re-initialize CUDA in forked subprocess. 
To use CUDA with " + "multiprocessing, you must use the 'spawn' start method") if not hasattr(torch._C, '_cuda_getDeviceCount'): raise AssertionError("Torch not compiled with CUDA enabled") if _cudart is None: diff --git a/torch/multiprocessing/__init__.py b/torch/multiprocessing/__init__.py index 561eddfb02a2..039ddf2a1b09 100644 --- a/torch/multiprocessing/__init__.py +++ b/torch/multiprocessing/__init__.py @@ -35,7 +35,7 @@ """Add helper function to spawn N processes and wait for completion of any of them. This depends `mp.get_context` which was added in Python 3.4.""" -from .spawn import spawn, SpawnContext, _supports_context, start_processes, ProcessContext, \ +from .spawn import spawn, SpawnContext, start_processes, ProcessContext, \ ProcessRaisedException, ProcessExitedException diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py index b2008912dbb5..9ad17c94ccf8 100644 --- a/torch/multiprocessing/spawn.py +++ b/torch/multiprocessing/spawn.py @@ -66,24 +66,8 @@ def _wrap(fn, i, args, error_queue): sys.exit(1) -# Multiprocessing contexts are introduced at Python 3.4 -_supports_context = sys.version_info >= (3, 4) - - -def _python_version_check(): - if not _supports_context: - raise RuntimeError("Requires python 3.4 or higher to use " - "torch.multiprocessing.spawn and " - "torch.multiprocessing.ProcessContext helper " - "to launch multiple processes. If you are using " - "this for distributed training and have a lower " - "version of python, please use " - "torch.distributed.launch instead.") - - class ProcessContext: def __init__(self, processes, error_queues): - _python_version_check() self.error_queues = error_queues self.processes = processes self.sentinels = { @@ -182,7 +166,6 @@ def __init__(self, processes, error_queues): # Currently we only add this API first, we can consider adding it to documentation as # needed in the future. 
def start_processes(fn, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): - _python_version_check() mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] diff --git a/torch/serialization.py b/torch/serialization.py index ebc5d0a08541..3b6f5828d858 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -192,7 +192,7 @@ def storage_to_tensor_type(storage): def _is_path(name_or_buffer): return isinstance(name_or_buffer, str) or \ - (sys.version_info[0] == 3 and isinstance(name_or_buffer, pathlib.Path)) + isinstance(name_or_buffer, pathlib.Path) class _opener(object): diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 714361497d94..022255a5298b 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -4866,7 +4866,7 @@ def __call__(self, test_case): if self.should_test_pickle: # TODO: do this with in-memory files as soon as torch.save will support it - with TemporaryFile() as f: + with tempfile.TemporaryFile() as f: test_case._forward(module, input) torch.save(module, f) f.seek(0) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index f75e4cca195e..d4ef1a99a2df 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -308,10 +308,6 @@ def multiprocessing_context(self): def multiprocessing_context(self, multiprocessing_context): if multiprocessing_context is not None: if self.num_workers > 0: - if not multiprocessing._supports_context: - raise ValueError('multiprocessing_context relies on Python >= 3.4, with ' - 'support for different start methods') - if isinstance(multiprocessing_context, string_classes): valid_start_methods = multiprocessing.get_all_start_methods() if multiprocessing_context not in valid_start_methods: From fcb69d2ebaede960e7708706436d372b68807921 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Wed, 6 Jan 2021 12:56:59 -0800 Subject: [PATCH 25/44] Add android.permission.INTERNET permission to Android test_app. (#49996) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49996 According to section 5.2.1 of Snapdragon Profiler User Guide (https://developer.qualcomm.com/qfile/30580/snapdragon_profiler_user_guide_reva.pdf) OpenGL ES, Vulkan, and OpenCL apps must include android.permission.INTERNET in the app's AndroidManifest.xml to enable API tracing and GPU metrics. Test Plan: Imported from OSS Reviewed By: SS-JIA Differential Revision: D25809555 Pulled By: AshkanAliabadi fbshipit-source-id: c4d88a7ea98d9166efbc4157df7d822d99ba0df9 --- android/test_app/app/src/main/AndroidManifest.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/android/test_app/app/src/main/AndroidManifest.xml b/android/test_app/app/src/main/AndroidManifest.xml index a83bf223bdaf..abdd9a8d986a 100644 --- a/android/test_app/app/src/main/AndroidManifest.xml +++ b/android/test_app/app/src/main/AndroidManifest.xml @@ -18,4 +18,10 @@ + + + + From e4c41b6936ed433aff8e60735eba938ba66334e8 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 26/44] Remove codegen logic to support non-c10-full ops (#49164) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49164 This PR removes the logic paths in codegen that were responsible for handling non-c10-full ops. This only goes through our basic codegen. It does not simplify C++ code yet and it does not remove the codegen for generated unboxing wrappers yet. 
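For reviewers, the visible effect on the generated TraceType/VariableType files is that the two registration templates collapse into one. A minimal sketch of the emitted code, using a hypothetical operator name my_op and an illustrative dispatch key (neither is taken from this PR):

    TORCH_LIBRARY_IMPL(aten, Tracer, m) {
      // Old template, emitted only for ops that were not `use_c10_dispatcher: full`
      // (UNBOXEDONLY_WRAPPER_REGISTRATION, deleted below):
      //   m.impl_UNBOXED("my_op", &TraceType::my_op);

      // The single form the codegen emits after this change, for every op:
      m.impl("my_op",
             TORCH_FN(TraceType::my_op)
      );
    }
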
ghstack-source-id: 119450487 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25462977 fbshipit-source-id: 7e70d14bea96948f5056d98125f3e6ba6bd78285 --- tools/autograd/gen_trace_type.py | 48 ++++++----------------- tools/autograd/gen_variable_type.py | 61 ++++++++--------------------- tools/codegen/api/cpp.py | 11 +----- tools/codegen/api/dispatcher.py | 39 ++++++------------ tools/codegen/api/native.py | 3 +- tools/codegen/api/python.py | 13 ++---- tools/codegen/gen.py | 33 ++++------------ tools/codegen/model.py | 14 ++----- tools/jit/gen_unboxing_wrappers.py | 38 ------------------ 9 files changed, 56 insertions(+), 204 deletions(-) diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index d8d42762e4fb..d8e68606e6ba 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -117,13 +117,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] - args: List[Union[Argument, TensorOptionsArguments]] = [] - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - args = list(f.func.schema_order_arguments()) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - args = [cpp_args.argument for cpp_args in sig_group.signature.arguments() - if not isinstance(cpp_args.argument, SelfArgument)] + args: List[Union[Argument, TensorOptionsArguments]] = list(f.func.schema_order_arguments()) if f.func.is_out_fn(): # *_out functions take the result as a separate argument, but we don't want to @@ -131,12 +125,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # So first, we need to remove the out argument from the list of arguments to trace. # TODO: byte-for-byte compatible with old codegen behavior - it's incorrect to assume # there is only one output argument. - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - # for c10-full ops, the out argument is in the end - args = args[:-1] - else: - # for legacy ops, the out argument is in the beginning. 
- args = args[1:] + args = args[:-1] trace_inputs = itertools.chain.from_iterable(dispatch_trace_input(arg) for arg in args) @@ -374,14 +363,10 @@ def method_definition(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - formals = ', '.join( - f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments() - ) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) + formals = ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) return METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns), @@ -396,27 +381,16 @@ def method_definition(f: NativeFunction) -> Optional[str]: ); """) -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${name}", &${class_type}::${type_wrapper_name}); -""") - @with_native_function def method_registration(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - return WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) - else: - return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) + return WRAPPER_REGISTRATION.substitute( + name=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='TraceType', + ) def gen_trace_type_shard( fm: FileManager, native_functions: Sequence[NativeFunction], suffix: str diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index f49f5e15845b..c78e1e5f66cc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -191,19 +191,6 @@ } """) -# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once -# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because -# ops that are `use_c10_dispatcher: full` need different c++ code than ops -# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants -# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates -# can be deleted once all ops are `use_c10_dispatcher: full`. -# If you update one of the templates, you likely also have to update the other. 
- -# See NOTE[UnboxedOnly] -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name}); -""") - WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}) @@ -349,30 +336,18 @@ def gen_variable_type( @with_native_function def gen_formals(f: NativeFunction) -> str: - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - formals = ', '.join( - f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments() - ) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) - return formals + return ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) @with_native_function def gen_wrapper_registration(f: NativeFunction) -> str: - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - return WRAPPER_REGISTRATION.substitute( - unqual_operator_name_with_overload=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='VariableType', - ) - else: - return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - unqual_operator_name_with_overload=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='VariableType', - ) + return WRAPPER_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) def gen_variable_type_shard( fm: FileManager, @@ -669,7 +644,7 @@ def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequen call = CALL_DISPATCH_VIA_NAMESPACE.substitute( api_name=cpp.name( f.func, - faithful_name_for_out_overloads=f.use_c10_dispatcher.dispatcher_uses_new_style(), + faithful_name_for_out_overloads=True, ), unpacked_args=unpacked_args) else: @@ -887,16 +862,12 @@ def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: body: List[str] = [] unpacked_bindings: List[Binding] = [] - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - bindings = [r for a in f.func.schema_order_arguments() - for r in cpp.argument(a, - method=False, - cpp_no_default_args=set(), - faithful=False, - has_tensor_options=False)] - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - bindings = list(sig_group.signature.arguments()) + bindings = [r for a in f.func.schema_order_arguments() + for r in cpp.argument(a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False)] for i, binding in enumerate(bindings): assert not isinstance(binding.argument, SelfArgument) diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index 8a1d2a5272f5..0debd52ca896 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,6 +1,5 @@ from tools.codegen.model import * from tools.codegen.api.types import * -import tools.codegen.local as local from typing import Optional, Sequence, Union, List, Set # This file describes the translation of JIT schema to the public C++ @@ -88,10 +87,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: if mutable: return MutRefCType(BaseCType('Tensor', binds)) # TODO: fix this discrepancy else: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) - else: - return ConstRefCType(BaseCType('Tensor', 
binds)) + return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) return OptionalCType(elem) elif isinstance(t, ListType): @@ -105,10 +101,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: elif str(t.elem) == 'Dimname': return BaseCType("DimnameList", binds) elif str(t.elem) == 'Tensor?': - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return ConstRefCType(BaseCType("c10::List>", binds)) - else: - return BaseCType("TensorList", binds) + return ConstRefCType(BaseCType("c10::List>", binds)) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) # TODO: explicitly qualify namespace here return BaseCType(f"ArrayRef<{elem.cpp_type()}>", binds) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 3adc2465b607..bb65bc386e64 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -2,8 +2,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.api.native as native -import tools.codegen.local as local import itertools from typing import Sequence, List, Union @@ -31,17 +29,11 @@ def name(func: FunctionSchema) -> str: return cpp.name(func) def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - # This is a faux amis. If it makes sense in the future to add - # more special cases here, or invert things so cpp.argument_type - # calls this, or just completely inline the function, please do - # it. - return cpp.argumenttype_type(t, mutable=mutable, binds=binds) - else: - # This is real sharing. If you're modifying this path, ask - # yourself why you are changing the native functions protocol - # here and not in native. - return native.argumenttype_type(t, mutable=mutable, binds=binds) + # This is a faux amis. If it makes sense in the future to add + # more special cases here, or invert things so cpp.argument_type + # calls this, or just completely inline the function, please do + # it. 
+ return cpp.argumenttype_type(t, mutable=mutable, binds=binds) def argument_type(a: Argument, *, binds: ArgName) -> CType: return argumenttype_type(a.type, mutable=a.is_write, binds=binds) @@ -53,10 +45,6 @@ def returns_type(rs: Sequence[Return]) -> str: def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument] ) -> List[Binding]: - # We could forward to native.argument but it is a bit suspect because - # the grouping may not be set correctly - assert local.use_c10_dispatcher().dispatcher_uses_new_style() - if isinstance(a, Argument): return [Binding( ctype=argument_type(a, binds=a.name), @@ -71,13 +59,10 @@ def argument( assert_never(a) def arguments(func: FunctionSchema) -> List[Binding]: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return [ - r for a in itertools.chain( - func.arguments.positional, - func.arguments.kwarg_only, - func.arguments.out - ) for r in argument(a) - ] - else: - return native.arguments(func) + return [ + r for a in itertools.chain( + func.arguments.positional, + func.arguments.kwarg_only, + func.arguments.out + ) for r in argument(a) + ] diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 936500b560db..af82210b20f4 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -64,8 +64,7 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out # Erase SelfArgument from the distinction return argument(a.argument, is_out=is_out) elif isinstance(a, TensorOptionsArguments): - if local.use_c10_dispatcher() in [UseC10Dispatcher.hacky_wrapper_for_legacy_signatures, - UseC10Dispatcher.with_codegenerated_unboxing_wrapper]: + if local.use_c10_dispatcher() == UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: # TODO: expunge this logic entirely default = None if should_default: diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index bc5cbb440b98..749513cb5c0d 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -3,7 +3,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.local as local from tools.codegen.gen import pythonify_default from tools.codegen.model import * @@ -599,11 +598,8 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if not simple_type or local.use_c10_dispatcher().dispatcher_uses_new_style(): - # Is it desired to keep '?' for simple_type with new style dispatcher? - return 'Tensor?' - else: - return 'Tensor' + # Is it desired to keep '?' for simple_type with new style dispatcher? + return 'Tensor?' elem = argument_type_str(t.elem, simple_type=simple_type) if elem == 'Layout': # TODO: fix this special case in PythonArgParser? 
@@ -1022,10 +1018,7 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return 'optionalTensor' - else: - return 'tensor' + return 'optionalTensor' elif isinstance(t.elem, BaseType): if t.elem.name in [BaseTy.ScalarType, BaseTy.Scalar, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 4768670b6f26..08e9572131e3 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -471,12 +471,6 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # for mypy type refinement; would be fixed by TODO on target assert self.target is not Target.DECLARATION - if f.func.is_out_fn(): - assert local.use_c10_dispatcher().dispatcher_uses_new_style(), \ - ("{} takes out arguments and has to be written in the new style. " + - "Please add `use_c10_dispatcher: full` to your operator in native_functions.yaml " + - "and write the C++ implementation to take out arguments in the end.").format(f.func.name) - if self.dispatch_key not in f.dispatch: return None if f.manual_kernel_registration: @@ -519,8 +513,7 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: const DeviceGuard device_guard(device_or_default(device)); """ else: - assert local.use_c10_dispatcher() in [UseC10Dispatcher.with_codegenerated_unboxing_wrapper, - UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures cuda_guard_from_tensor_options = """\ const DeviceGuard device_guard(options.device()); """ @@ -562,16 +555,14 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # Figure out which signature the function is if local.use_c10_dispatcher() is UseC10Dispatcher.full: payload = f"TORCH_FN({name})" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: + else: + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures payload = f""" c10::impl::hacky_wrapper_for_legacy_signatures< {dispatcher_sig.type()}, {len(f.func.arguments.out)} >(TORCH_FN({name})) """ - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - payload = f"torch::CppFunction::makeUnboxedOnly(&{name})" return f'm.impl("{f.func.name}",\n{payload});\n' else: @@ -790,14 +781,9 @@ def __call__(self, f: NativeFunction) -> Optional[str]: dispatcher_sig = DispatcherSignature.from_schema(f.func) sig: Union[NativeSignature, DispatcherSignature] - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - sig = dispatcher_sig - dispatcher_exprs = dispatcher_sig.exprs() - dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" - else: - sig = native_sig - dispatcher_exprs = native_sig.dispatcher_exprs() - dispatch_key = "options.computeDispatchKey()" + sig = dispatcher_sig + dispatcher_exprs = dispatcher_sig.exprs() + dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" if self.target is Target.DEFINITION: # I don't think there's actually a good reason to generate @@ -823,11 +809,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: }} """ elif self.target is Target.REGISTRATION: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" + return 
f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" elif self.target is Target.DECLARATION: raise AssertionError() else: @@ -1052,7 +1034,6 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('name', cpp.name(f.func)), ('operator_name', str(f.func.name.name)), ('overload_name', str(f.func.name.overload_name)), - ('use_c10_dispatcher', f.use_c10_dispatcher.name), ('manual_kernel_registration', f.manual_kernel_registration), ('category_override', f.category_override if f.category_override is not None else ''), ('matches_jit_signature', True), diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 9c8a0d73e815..1128878fe45c 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -49,12 +49,8 @@ def __str__(self) -> str: class UseC10Dispatcher(Enum): full = 0 - with_codegenerated_unboxing_wrapper = 1 hacky_wrapper_for_legacy_signatures = 2 - def dispatcher_uses_new_style(self) -> bool: - return self in [UseC10Dispatcher.full, UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] - # The basic input to the code generation is native_functions.yaml. # The name "native", BTW, comes from the distinction between native # functions and legacy TH functions. The legacy TH functions are gone, @@ -77,7 +73,7 @@ class NativeFunction: func: 'FunctionSchema' # Corresponds to the 'use_c10_dispatcher' field. The default - # is 'with_codegenerated_unboxing_wrapper' + # is 'full' use_c10_dispatcher: UseC10Dispatcher # Whether or not to omit automatic generation of a DeviceGuard @@ -177,16 +173,14 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert isinstance(cpp_no_default_args_list, list) cpp_no_default_args = set(cpp_no_default_args_list) - use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) - if use_c10_dispatcher_s is None: - use_c10_dispatcher = UseC10Dispatcher.full - elif use_c10_dispatcher_s == 'full': + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', 'full') + if use_c10_dispatcher_s == 'full': use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'hacky_wrapper_for_legacy_signatures': use_c10_dispatcher = UseC10Dispatcher.hacky_wrapper_for_legacy_signatures else: raise AssertionError( - f'use_c10_dispatcher must be unset or set to full, got {use_c10_dispatcher}') + f'use_c10_dispatcher must be full or hacky_wrapper_for_legacy_signatures, got {use_c10_dispatcher}') variants_s = e.pop('variants', 'function') assert isinstance(variants_s, str) diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py index a52c109c603f..19e459e3f7ac 100644 --- a/tools/jit/gen_unboxing_wrappers.py +++ b/tools/jit/gen_unboxing_wrappers.py @@ -377,29 +377,7 @@ def pack_arguments(args): device=device, pin_memory=pin_memory, args_with_tensor_options=pack_arguments(args_with_tensor_options[1:]), first=args_with_tensor_options[0], num_inputs=num_inputs) - elif decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - if len(decl['returns']) == 0: - return_type = "void" - elif len(decl['returns']) == 1: - return_type = decl['returns'][0]['type'] - else: - return_type = "std::tuple<{}>".format(", ".join([r['type'] for r in decl['returns']])) - for a in decl['arguments']: - if 'type' not in a: - raise Exception(decl) - argument_types_with_leading_comma = ", ".join([a['type'] for a in decl['arguments']]) - if argument_types_with_leading_comma != "": - argument_types_with_leading_comma = ", " + argument_types_with_leading_comma - args_with_leading_comma = pack_arguments(args) - if 
args_with_leading_comma != "": - args_with_leading_comma = ", " + args_with_leading_comma - return CALL_UNBOXED_KERNEL.substitute(name=decl['name'], - args_with_leading_comma=args_with_leading_comma, - num_inputs=num_inputs, - return_type=return_type, - formals_types_with_leading_comma=argument_types_with_leading_comma) else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] if is_namespace_function: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), @@ -438,16 +416,6 @@ def emit_decl_variant(decl): returns = decl['returns'] - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - constructor = CONSTRUCTOR.substitute(name=decl['name'], - call=call, - kw_assignments=kw_assignments, - num_inputs=num_inputs, - op_capture=op_capture, - lvalues=lvalues) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - return constructor def filter_decls(jit_decls, disable_autograd, operator_selector: SelectiveBuilder, force_schema_registration): @@ -549,12 +517,6 @@ def expand_options(decl, i, arg): # ops are assigned arbitrarily but stably to a file based on hash for group in jit_decl_groups: x = sum(ord(c) for c in group[0]['name']) % num_shards - for decl in group: - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - shards[x].append(OPERATOR.substitute(signature=decl['schema_string'], - op=emit_decl_variant(decl))) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] for i, shard in enumerate(shards): env = { From 4a14020c0d56c733da381561c32009546ada28dd Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 27/44] Remove .impl_UNBOXED() and functionalities associated with it (#49220) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49220 Since all ops are c10-full, we can remove .impl_UNBOXED now. This also removes the ability of KernelFunction or CppFunction to store unboxedOnly kernels. 
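As an illustration of the registration-site migration this applies (a minimal sketch; the _example namespace, op and kernel below are hypothetical and not part of this diff):

    #include <torch/library.h>

    // Hypothetical kernel used only for illustration.
    at::Tensor my_op(const at::Tensor& self) {
      return self;
    }

    TORCH_LIBRARY(_example, m) {
      m.def("my_op(Tensor self) -> Tensor");
      // Previously this kernel would have been registered as one of:
      //   m.impl_UNBOXED("my_op", my_op);
      //   m.impl("my_op", torch::CppFunction::makeUnboxedOnly(my_op));
      // With all ops c10-full, a plain registration suffices; the templated
      // wrapper makes the kernel callable both boxed and unboxed:
      m.impl("my_op", TORCH_FN(my_op));
    }
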
ghstack-source-id: 119450489 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25490225 fbshipit-source-id: 32de9d591e6a842fe18abc82541580647e9cfdad --- aten/src/ATen/BatchingRegistrations.cpp | 2 +- aten/src/ATen/autocast_mode.cpp | 30 +++++------ aten/src/ATen/core/boxing/KernelFunction.cpp | 21 -------- aten/src/ATen/core/boxing/KernelFunction.h | 42 --------------- .../ATen/core/boxing/KernelFunction_impl.h | 54 ++----------------- .../ATen/core/boxing/KernelFunction_test.cpp | 40 -------------- .../op_registration/op_registration_test.cpp | 2 +- aten/src/ATen/native/vulkan/VulkanAten.cpp | 6 +-- aten/src/ATen/native/vulkan/ops/Clamp.cpp | 8 +-- .../ATen/native/vulkan/ops/Convolution.cpp | 2 +- aten/src/ATen/native/vulkan/ops/Factory.cpp | 2 +- test/cpp_extensions/msnpu_extension.cpp | 8 +-- test/cpp_extensions/rng_extension.cpp | 6 +-- test/mobile/op_deps/simple_ops.cpp | 6 +-- tools/code_analyzer/run_analyzer.sh | 2 +- torch/library.h | 51 +++++++++--------- 16 files changed, 64 insertions(+), 218 deletions(-) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 9bdec2dce77e..2cd7cac4e71b 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -1015,7 +1015,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("_add_batch_dim", native::_add_batch_dim); m.impl("_remove_batch_dim", native::_remove_batch_dim); - m.impl_UNBOXED("sum.dim_IntList", sum_batching_rule); + m.impl("sum.dim_IntList", sum_batching_rule); m.impl("is_complex", native::is_complex); m.impl("conj", native::conj); diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index dfb8e3ac0f32..9a2f34257c57 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -239,13 +239,9 @@ Therefore, for the moment, this is all copy pasted in from VariableTypeEverythin m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); -#define KERNEL_UNBOXED_ONLY(FUNC, REGISTER_NAME, SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ - &WrapFunction::type::call); - // Less-common but still useful case: redispatching to a function with a new signature (e.g. 
appending a dtype) -#define KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ + m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); /***************************************** @@ -367,20 +363,20 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32) KERNEL(ADD_NS(dist), "dist", Tensor (const Tensor &, const Tensor &, Scalar), fp32) KERNEL(ADD_NS(pdist), "pdist", Tensor (const Tensor &, double), fp32) - KERNEL_UNBOXED_ONLY(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) + KERNEL(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) KERNEL(ADD_NS(renorm), "renorm", Tensor (const Tensor &, Scalar, int64_t, Scalar), fp32) // fp32_set_opt_dtype KERNEL(ADD_NS(prod), "prod", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(prod), "prod.dim_int", Tensor (const Tensor &, int64_t, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(softmax), "softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(log_softmax), "log_softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumprod), "cumprod", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumsum), "cumsum", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even // when autocasting. 
// KERNEL(ADD_NS(norm), "norm.ScalarOpt_dtype", Tensor (const Tensor &, c10::optional, ScalarType), fp32_set_opt_dtype) @@ -388,20 +384,20 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { // KERNEL(ADD_NS(norm), "norm.names_ScalarOpt_dim_dtype", Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum.dim_IntList", Tensor (const Tensor &, IntArrayRef, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) // fp32_append_dtype // The fp32_append_dtype wrapper overrides implicit promotion behavior. // norm does not implicitly promote, but be aware when adding new ops to this policy. - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) // promote KERNEL(ADD_NS(addcdiv), "addcdiv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(addcmul), "addcmul", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(atan2), "atan2", Tensor (const Tensor &, const Tensor &), promote) KERNEL(ADD_NS(bilinear), "bilinear", Tensor (const Tensor &, const Tensor &, const Tensor &, const c10::optional&), promote) KERNEL(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote) - KERNEL_UNBOXED_ONLY(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) + KERNEL(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) KERNEL(ADD_NS(_cat), "_cat", Tensor (TensorList, int64_t), promote) KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional), promote) KERNEL(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index f84352ebee1f..58c35557018c 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -57,25 +57,4 @@ bool KernelFunction::_equalsBoxedAndUnboxed(const KernelFunction& other) const { unboxed_kernel_func_ == 
other.unboxed_kernel_func_; } -void KernelFunction::checkBoxedKernel(const OperatorHandle& opHandle) const { - if (C10_UNLIKELY(boxed_kernel_func_ == nullptr)) { - if (unboxed_kernel_func_ == nullptr) { - TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction.", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } else { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this case should be impossible. - TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call().", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } - } -} - } // namespace c10 diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 6817907b12b1..bf847681aac8 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -123,26 +123,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunctor(std::unique_ptr kernelFunctor); - /** - * Create a KernelFunction from an unboxed functor and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. - * - * Example: - * - * > class MyFunctor final { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::make_unique()); - */ - template - static KernelFunction makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor); - /** * Create a KernelFunction from an unboxed function. * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction @@ -158,23 +138,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunction(FuncPtr); - /** - * Create a KernelFunction from an unboxed function and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. - * - * Example: - * - * > Tensor unboxed_func(Tensor a, Tensor b) {...} - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(); - */ - template - static KernelFunction makeFromUnboxedOnlyFunction(FuncPtr); - /** * Create a KernelFunction from an unboxed function. 
* KernelFunction::makeFromUnboxedFunction is usually a better choice than @@ -189,9 +152,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func); - template - static KernelFunction makeFromUnboxedOnlyRuntimeFunction(FuncType* func); - static KernelFunction makeFallthrough(); static KernelFunction makeAmbiguousAutogradOther(); static KernelFunction makeNamedNotSupported(); @@ -226,8 +186,6 @@ class TORCH_API KernelFunction final { template static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, Stack* stack); - void checkBoxedKernel(const OperatorHandle& opHandle) const; - OperatorKernel* getFunctor_() const; std::shared_ptr functor_; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 82a65fa27ffb..f45d8b28105e 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -23,8 +23,7 @@ inline void KernelFunction::make_boxed_function(OperatorKernel*, const OperatorH } inline bool KernelFunction::isValid() const { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this should only check boxed_kernel_func_. - return boxed_kernel_func_ != nullptr || unboxed_kernel_func_ != nullptr; + return boxed_kernel_func_ != nullptr; } inline bool KernelFunction::isFallthrough() const { @@ -32,7 +31,10 @@ inline bool KernelFunction::isFallthrough() const { } inline void KernelFunction::callBoxed(const OperatorHandle& opHandle, Stack* stack) const { - checkBoxedKernel(opHandle); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + boxed_kernel_func_ != nullptr, + "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction." + ); (*boxed_kernel_func_)(functor_.get(), opHandle, stack); } @@ -111,21 +113,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunctor(std::unique_ptr -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - - static_assert(guts::is_functor::value, "Tried to call KernelFunction::makeFromUnboxedFunctor but the argument is not a functor."); - static_assert(std::is_base_of::value, "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - - return KernelFunction( - std::move(kernelFunctor), - nullptr, // Don't create a boxed kernel for this - reinterpret_cast(&impl::wrap_kernel_functor_unboxed::call) - ); -} - template inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) { static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); @@ -144,26 +131,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) #endif } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunction(FuncPtr func_ptr) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with an invalid parameter. 
It must be a function pointer created with TORCH_FN."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); - static_assert(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); - -#if !defined(C10_MOBILE) - return makeFromUnboxedOnlyFunctor::type> ( - guts::make_unique_base::type>() - ); -#else - // On mobile, we rather want to optimize for binary size than for performance, - // so let's not inline the kernel into the wrapper but use makeFromUnboxedOnlyRuntimeFunction - // instead. - return makeFromUnboxedOnlyRuntimeFunction(func_ptr.func_ptr()); -#endif -} - template inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* func) { static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); @@ -175,17 +142,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* f ); } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyRuntimeFunction(FuncType* func) { - static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); - TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); - - return makeFromUnboxedOnlyFunctor>>( - guts::make_unique_base>>(func) - ); -} - template inline std::enable_if_t>::value, KernelFunction> KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { static_assert(guts::is_functor>::value, "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 8ba50db14a2b..e17efab10ba5 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -544,26 +544,6 @@ TEST(KernelFunctionTest, givenUnboxedFunctor_withoutReturn_whenCallingUnboxed_th kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - 
kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, givenUnboxedFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernels::unboxed_function_with_return)); kernels::expectBoxedCallingWithReturnWorks(func); @@ -584,26 +564,6 @@ TEST(KernelFunctionTest, givenUnboxedFunction_withoutReturn_whenCallingUnboxed_t kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, givenUnboxedRuntimeFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedRuntimeFunction(&kernels::unboxed_function_with_return); kernels::expectBoxedCallingWithReturnWorks(func); diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 6259578fdac8..56afe8ca7fb5 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1909,7 +1909,7 @@ TEST(NewOperatorRegistrationTest, CppFunction) { m.def("fn3", [](const Tensor& x) { return x; }); // These require explicit schema m.def("fn4(Tensor x) -> Tensor", CppFunction::makeFallthrough()); - m.def("fn5(Tensor x) -> Tensor", CppFunction::makeUnboxedOnly(dummy_fn)); + m.def("fn5(Tensor x) -> Tensor", CppFunction::makeFromUnboxedFunction(dummy_fn)); m.def("fn6(Tensor x) -> Tensor", CppFunction::makeFromBoxedFunction<&backend_fallback_kernel>()); } diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp index 4dba9de7d5b0..88c519c09ea3 100644 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp @@ -548,7 +548,7 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape)); m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select)); m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose)); - m.impl_UNBOXED("transpose_", at::native::vulkan::aten::transpose_); + m.impl("transpose_", at::native::vulkan::aten::transpose_); m.impl("view", TORCH_FN(at::native::vulkan::aten::view)); m.impl("unsqueeze", 
TORCH_FN(at::native::vulkan::aten::unsqueeze)); m.impl("empty.memory_format", at::native::vulkan::aten::empty); @@ -569,11 +569,11 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("_cat", TORCH_FN(at::native::vulkan::aten::cat)); m.impl("mul.Scalar", TORCH_FN(at::native::vulkan::aten::mul_scalar)); m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar)); - m.impl_UNBOXED( + m.impl( "convolution_overrideable", at::native::vulkan::aten::convolution); m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_); m.impl("relu_", at::native::vulkan::aten::relu_); - m.impl_UNBOXED("add_.Tensor", at::native::vulkan::aten::add_); + m.impl("add_.Tensor", at::native::vulkan::aten::add_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 369a47fee93a..9f25d89bca9b 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -167,10 +167,10 @@ Tensor& relu_(Tensor& self) { TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("clamp", TORCH_FN(clamp)); m.impl("clamp_", TORCH_FN(clamp_)); - m.impl_UNBOXED("hardtanh", hardtanh); - m.impl_UNBOXED("hardtanh_", hardtanh_); - m.impl_UNBOXED("relu", relu); - m.impl_UNBOXED("relu_", relu_); + m.impl("hardtanh", hardtanh); + m.impl("hardtanh_", hardtanh_); + m.impl("relu", relu); + m.impl("relu_", relu_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index c757f6cdac7a..d88545e3a25a 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -688,7 +688,7 @@ Tensor convolution( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("convolution_overrideable", convolution); + m.impl("convolution_overrideable", convolution); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp index 6e48ba120c31..14deb30b9888 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.cpp +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -45,7 +45,7 @@ Tensor empty_strided( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("empty.memory_format", at::native::vulkan::ops::empty_memory_format); + m.impl("empty.memory_format", at::native::vulkan::ops::empty_memory_format); m.impl("empty_strided", TORCH_FN(at::native::vulkan::ops::empty_strided)); } diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/msnpu_extension.cpp index 88c1d509b34c..ea67910f96da 100644 --- a/test/cpp_extensions/msnpu_extension.cpp +++ b/test/cpp_extensions/msnpu_extension.cpp @@ -53,10 +53,10 @@ std::tuple fake_convolution_backward( } TORCH_LIBRARY_IMPL(aten, MSNPU, m) { - m.impl_UNBOXED("empty.memory_format", empty_override); - m.impl_UNBOXED("add.Tensor", add_override); - m.impl_UNBOXED("convolution_overrideable", fake_convolution); - m.impl_UNBOXED("convolution_backward_overrideable", fake_convolution_backward); + m.impl("empty.memory_format", empty_override); + m.impl("add.Tensor", add_override); + m.impl("convolution_overrideable", fake_convolution); + m.impl("convolution_backward_overrideable", fake_convolution_backward); } // TODO: Extend this to exercise multi-device setting. 
In that case, diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index bf16a840dfc9..4a71a526617f 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -54,9 +54,9 @@ size_t getInstanceCount() { } TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) { - m.impl_UNBOXED("aten::random_.from", random_from_to); - m.impl_UNBOXED("aten::random_.to", random_to); - m.impl_UNBOXED("aten::random_", random_); + m.impl("aten::random_.from", random_from_to); + m.impl("aten::random_.to", random_to); + m.impl("aten::random_", random_); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { diff --git a/test/mobile/op_deps/simple_ops.cpp b/test/mobile/op_deps/simple_ops.cpp index 3651d1b05353..a76c58838a72 100644 --- a/test/mobile/op_deps/simple_ops.cpp +++ b/test/mobile/op_deps/simple_ops.cpp @@ -80,7 +80,7 @@ namespace { // cares about the name TORCH_LIBRARY(_test, m) { m.def("AA(Tensor self) -> Tensor"); - m.impl("AA", torch::CppFunction::makeUnboxedOnly(AA_op)); + m.impl("AA", torch::CppFunction::makeFromUnboxedFunction(AA_op)); m.def("BB(Tensor self) -> Tensor"); m.impl("BB", TORCH_FN(BB_op)); @@ -97,10 +97,10 @@ TORCH_LIBRARY_FRAGMENT(_test, m) { } TORCH_LIBRARY_IMPL(_test, CPU, m) { - m.impl_UNBOXED("EE", EE_op); + m.impl("EE", EE_op); m.impl("FF", torch::dispatch(DispatchKey::CPU, - torch::CppFunction::makeUnboxedOnly(FF_op)) + torch::CppFunction::makeFromUnboxedFunction(FF_op)) ); m.impl("GG", torch::dispatch(DispatchKey::CPU, diff --git a/tools/code_analyzer/run_analyzer.sh b/tools/code_analyzer/run_analyzer.sh index 79b366fb1a0d..dc8705cc39f7 100755 --- a/tools/code_analyzer/run_analyzer.sh +++ b/tools/code_analyzer/run_analyzer.sh @@ -15,7 +15,7 @@ echo "Analyze: ${INPUT}" # to operate, so for safety we match a more expansive set. "${ANALYZER_BIN}" \ -op_schema_pattern="^(_aten|_prim|aten|quantized|_quantized|prepacked|profiler|_test)::[a-zA-Z0-9_.]+(\(.*)?$" \ - -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl|impl_UNBOXED)|torch::Library::(_?def|_?impl|_?impl_UNBOXED)" \ + -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl)|torch::Library::(_?def|_?impl)" \ -op_invoke_pattern="c10::Dispatcher::findSchema" \ -root_symbol_pattern="torch::jit::[^(]" \ -torch_library_init_pattern="^.*TORCH_LIBRARY_init_([^(]+)(\(.*)?$" \ diff --git a/torch/library.h b/torch/library.h index d86c1afbd50e..fee98abb2b81 100644 --- a/torch/library.h +++ b/torch/library.h @@ -116,19 +116,6 @@ class TORCH_API CppFunction final { , debug_() {} - /// This static factory lets you create CppFunctions that (1) don't have boxing - /// wrappers (because we don't support it yet) and (2) don't have schema - /// inference (because some ops don't support it). - template - static CppFunction makeUnboxedOnly(Func* f) { - // TODO: Eliminate the necessity for this function entirely. - return CppFunction( - c10::KernelFunction::makeFromUnboxedOnlyRuntimeFunction(f), - /* cpp_signature */ c10::impl::CppSignature::make(), - /* schema */ nullptr - ); - } - /// This creates a fallthrough function. Fallthrough functions /// immediately redispatch to the next available dispatch key, /// but are implemented more efficiently than a hand written @@ -170,6 +157,22 @@ class TORCH_API CppFunction final { ); } + /// Create a function from an unboxed kernel function. + /// This is typically used to register common operators. 
+ template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr* f) { + return CppFunction(f); + } + + /// Create a function from a compile time unboxed kernel function pointer. + /// This is typically used to register common operators. + /// Compile time function pointers can be used to allow the compiler + /// to optimize (e.g. inline) calls to it. + template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr f) { + return CppFunction(f); + } + CppFunction&& debug(std::string d) && { debug_ = std::move(d); return std::move(*this); @@ -496,20 +499,10 @@ class TORCH_API Library final { return impl(name, dispatch(std::forward(key), std::forward(raw_f))); } - /// \private - /// - /// Convenience overload for unboxed only kernels; kernels whose type - /// signatures are not supported by our template based metaprogramming - /// system. These are currently quite common but will be eventually - /// eliminated. - /// - /// This is equivalent to calling CppFunction::makeUnboxedOnly() on - /// the function, but this name for the function makes it easy to grep for. template Library& impl_UNBOXED(Name name, Func* raw_f) & { - // TODO: Remove this overload once the makeUnboxedOnly incidence rate - // goes way down - return impl(name, CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } // These overloads cover cases when a SelectiveStr (see Note [Selective build]) @@ -531,7 +524,10 @@ class TORCH_API Library final { template Library& impl(detail::SelectiveStr, Dispatch&& key, Func&& raw_f) & { return *this; } template - Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { return *this; } + Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; + } template Library& impl(detail::SelectiveStr name, Func&& raw_f) & { @@ -543,7 +539,8 @@ class TORCH_API Library final { } template Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { - return impl(name.operator const char*(), CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } /// Register a fallback implementation for all operators which will be used From 249261ada7a137db6cf9d2114d42d0ca2e0d396b Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 28/44] Remove generated_unboxing_wrappers and setManuallyBoxedKernel (#49251) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49251 Since all ops are c10-full and use templated unboxing now, we don't need to codegenerate any unboxing logic anymore. Since this codegen was the only code using setManuallyBoxedKernel, we can also remove that functionality from KernelFunction, OperatorEntry and Dispatcher. 
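For illustration, a rough sketch (hypothetical kernel, not part of this diff) of why the codegen'ed wrappers are redundant: a KernelFunction created from an unboxed kernel already carries a templated boxed wrapper, so there is nothing left for setManuallyBoxedKernel_ to patch in.

    #include <torch/library.h>

    // Hypothetical kernel used only for illustration.
    at::Tensor add_one(const at::Tensor& self) {
      return self + 1;
    }

    void sketch() {
      auto k = c10::KernelFunction::makeFromUnboxedFunction(TORCH_FN(add_one));
      // k.isValid() is true and k.callBoxed(...) goes through the templated
      // boxing logic, so no generated unboxing wrapper (and no
      // setManuallyBoxedKernel_ call) is needed to make it boxed-callable.
    }
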
ghstack-source-id: 119450486 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25502865 fbshipit-source-id: 49d009df159fda4be41bd02457d4427e6e638c10 --- .jenkins/pytorch/codegen-test.sh | 7 - BUILD.bazel | 3 - aten/src/ATen/core/boxing/KernelFunction.h | 6 - .../ATen/core/boxing/KernelFunction_impl.h | 10 - aten/src/ATen/core/dispatch/Dispatcher.cpp | 6 - aten/src/ATen/core/dispatch/Dispatcher.h | 6 - aten/src/ATen/core/dispatch/OperatorEntry.cpp | 18 - aten/src/ATen/core/dispatch/OperatorEntry.h | 12 - caffe2/CMakeLists.txt | 5 - caffe2/contrib/aten/gen_op.py | 3 +- tools/build_variables.bzl | 6 - tools/jit/gen_unboxing_wrappers.py | 568 ------------------ .../templates/generated_unboxing_wrappers.cpp | 132 ---- tools/setup_helpers/generate_code.py | 8 - 14 files changed, 1 insertion(+), 789 deletions(-) delete mode 100644 tools/jit/gen_unboxing_wrappers.py delete mode 100644 tools/jit/templates/generated_unboxing_wrappers.cpp diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 17e7e9fa3445..47d13f2908d0 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -48,13 +48,6 @@ python -m tools.autograd.gen_autograd \ "$OUT"/autograd \ tools/autograd -# unboxing_wrappers codegen (called by torch codegen but can run independently) -mkdir -p "$OUT"/unboxing_wrappers -python -m tools.jit.gen_unboxing_wrappers \ - "$OUT"/torch/share/ATen/Declarations.yaml \ - "$OUT"/unboxing_wrappers \ - tools/jit/templates - # annotated_fn_args codegen (called by torch codegen but can run independently) mkdir -p "$OUT"/annotated_fn_args python -m tools.autograd.gen_annotated_fn_args \ diff --git a/BUILD.bazel b/BUILD.bazel index b3faea487965..2b4636d850c9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -193,9 +193,6 @@ libtorch_cpp_generated_sources = [ "torch/csrc/autograd/generated/Functions.h", "torch/csrc/autograd/generated/Functions.cpp", "torch/csrc/autograd/generated/variable_factories.h", - "torch/csrc/jit/generated/generated_unboxing_wrappers_0.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_1.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_2.cpp", ] libtorch_python_generated_sources = [ diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index bf847681aac8..ddbbd912777a 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -173,12 +173,6 @@ class TORCH_API KernelFunction final { // For testing internal invariants only bool _equalsBoxedAndUnboxed(const KernelFunction&) const; - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic. This can be done once https://github.com/pytorch/pytorch/issues/32366 is fixed. 
- void setManuallyBoxedKernel_(InternalBoxedKernelFunction* func); - private: explicit KernelFunction(std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, void* unboxed_kernel_func); diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index f45d8b28105e..b248e54a6f94 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -168,14 +168,4 @@ inline std::enable_if_t>::value, ); } -inline void KernelFunction::setManuallyBoxedKernel_(InternalBoxedKernelFunction* func) { - if (boxed_kernel_func_ == &fallthrough_kernel) { - // special case no-op - return; - } - TORCH_INTERNAL_ASSERT(boxed_kernel_func_ == nullptr, "Tried to set a manually boxed kernel for a kernel that already has a boxed kernel set."); - TORCH_INTERNAL_ASSERT(unboxed_kernel_func_ != nullptr, "Tried to set a manually boxed kernel for an invalid KernelFunction."); - boxed_kernel_func_ = func; -} - } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 5e3e91afbb45..270cffaf6d1f 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -295,12 +295,6 @@ void Dispatcher::checkInvariants() const { } } -void Dispatcher::setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func) { - std::lock_guard lock(mutex_); - op.operatorIterator_->op.setManuallyBoxedKernel_(*this, func); - // NB: Do not need to set manually boxed kernel for backend fallbacks -} - std::vector Dispatcher::findDanglingImpls() const { return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { std::vector opsWithDanglingImpls; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 60f9f9bd0579..9641dfbea0cd 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -182,12 +182,6 @@ class TORCH_API Dispatcher final { */ RegistrationHandleRAII registerLibrary(std::string ns, std::string debug); - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. 
- // TODO Delete setBoxedKernelFor_ once all operators work with the templated boxing logic - void setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func); - // ------------------------------------------------------------------------ // // Listeners on registrations diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index f0d7bc6968ed..7c3698beeb06 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -21,7 +21,6 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) , schema_() , dispatchTable_() , dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) -, manuallyBoxedKernel_() , kernels_() , cpp_signature_() , is_observed_(ObservedOperators::isObserved(name_)) @@ -122,10 +121,6 @@ std::list::iterator OperatorEntry::registerKernel( ); } - if (manuallyBoxedKernel_.has_value()) { - kernel.setManuallyBoxedKernel_(*manuallyBoxedKernel_); - } - k.emplace_front(std::move(kernel), std::move(inferred_function_schema), std::move(debug)); std::list::iterator inserted = k.begin(); // update the dispatch table, i.e. re-establish the invariant @@ -331,19 +326,6 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) } } -void OperatorEntry::setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func) { - TORCH_INTERNAL_ASSERT(!manuallyBoxedKernel_); - manuallyBoxedKernel_ = func; - - for (auto& kv : kernels_) { - for (auto& k : kv.second) { - k.kernel.setManuallyBoxedKernel_(func); - } - } - // Refresh entries in dispatchTable_ - updateDispatchTableFull_(dispatcher); -} - void OperatorEntry::checkInvariants() const { if (schema_) { TORCH_INTERNAL_ASSERT(schema_->schema.operator_name() == name_, dumpState()); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 5098fd0d8c28..44b8fac5661e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -148,12 +148,6 @@ class TORCH_API OperatorEntry final { const DispatchKeyExtractor& dispatchKeyExtractor() const { return dispatchKeyExtractor_; } - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic - void setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func); - // Asserts that the given FuncType is correct for calling this operator in an unboxed way. template void assertSignatureIsCorrect() { @@ -189,12 +183,6 @@ class TORCH_API OperatorEntry final { std::array(DispatchKey::NumDispatchKeys)> dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; - // This manuallyBoxedKernel_ member is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. 
- // TODO Delete manuallyBoxedKernel_ once all operators work with the templated boxing logic - c10::optional manuallyBoxedKernel_; - // kernels_ stores all registered kernels for the corresponding dispatch key // and catchAllKernels_ stores the catch-all kernels. // If an operator library gets loaded that overwrites an already existing kernel, diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 191a7ca26835..9b934e4831e8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -340,9 +340,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set(GENERATED_CXX_TORCH "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_0.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_1.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_2.cpp" ) if(NOT INTERN_DISABLE_AUTOGRAD) @@ -434,8 +431,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TOOLS_PATH}/autograd/load_derivatives.py" "${TOOLS_PATH}/autograd/nested_dict.py" "${TOOLS_PATH}/autograd/utils.py" - "${TOOLS_PATH}/jit/gen_unboxing_wrappers.py" - "${TOOLS_PATH}/jit/templates/generated_unboxing_wrappers.cpp" WORKING_DIRECTORY "${TORCH_ROOT}") diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 769f9d59c856..64d3de547bb7 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -285,8 +285,7 @@ def emit_assignments(o, env): real_inputs = 0 for i, arg in enumerate(o['arguments']): env['arguments'].append(arg['name']) - # Emulate logic in gen_unboxing_wrappers.py. Pretend the flat argument - # list is a stack where the end is the top. + # Pretend the flat argument list is a stack where the end is the top. view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs if arg['type'] == 'TensorList': # NOTE: do not advance real_inputs here. 
After this we will diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index dc05ace7c542..5ed0b1340811 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -7,9 +7,6 @@ GENERATED_CPP = [ "autograd/generated/VariableType_2.cpp", "autograd/generated/VariableType_3.cpp", "autograd/generated/VariableType_4.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/TraceType_0.cpp", "autograd/generated/TraceType_1.cpp", "autograd/generated/TraceType_2.cpp", @@ -39,9 +36,6 @@ libtorch_nvfuser_generated_headers = ["{}.h".format(name[36:-3]) for name in lib def libtorch_generated_sources(gencode_pattern): return [gencode_pattern.format(name) for name in [ "autograd/generated/Functions.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/VariableType_0.cpp", "autograd/generated/VariableType_1.cpp", "autograd/generated/VariableType_2.cpp", diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py deleted file mode 100644 index 19e459e3f7ac..000000000000 --- a/tools/jit/gen_unboxing_wrappers.py +++ /dev/null @@ -1,568 +0,0 @@ -""" -To run this file by hand from the root of the PyTorch -repository, run: - -python -m tools.jit.gen_unboxing_wrappers \ - build/aten/src/ATen/Declarations.yaml \ - $OUTPUT_DIR \ - tools/jit/templates - -Where $OUTPUT_DIR is where you would like the files to be -generated. In the full build system, OUTPUT_DIR is -torch/csrc/jit/generated/ -""" - -# This file generates generated_unboxing_wrappers, which contains -# manual unboxing wrappers for ops that aren't use_c10_dispatcher: full -# because the templated unboxing logic in c10 doesn't support them yet. -# The ultimate goal is to make all ops use the templated unboxing and -# delete this codegen file. - -import argparse -import re -from itertools import groupby -from functools import reduce -import yaml - -from ..autograd.gen_autograd import RETURNS_VIEWS_OF_INPUT -from ..autograd.utils import CodeTemplate, YamlLoader, write, is_out_variant, op_name_with_overload -from tools.codegen.selective_build.selector import SelectiveBuilder - -# JIT has a type system of -# Scalar = int | float | bool # int is the largest int (int64_t), -# float is the largest float (double) we don't have the others because they are never held in tensors -# Type = Scalar # primitive numbers -# | Tensor # any tensor, as defined by at::Tensor -# | Type[] # a dynamically sized list[ of a type -# | Scalar[N] # a homogenous fixed size scalar list, single scalars can expand to this list -# | (Type1, Type2, ...) 
# a heterogeneous tuple -# | Layout | ScalarType | Device | Generator # special singleton types for built-in concepts in tensor lib - -# clean up the variety of C++ types in the ATen declarations -# to be in the restricted set of types that the IR represents -# note: no default values for this map, to make it clear what types -# can be passedthrough - -TYPE_MAP = { - 'std::array': 'bool[2]', - 'std::array': 'bool[3]', - 'std::array': 'bool[4]', - 'std::string': 'str', - 'std::string?': 'str?', - 'Scalar': 'Scalar', - 'ScalarList': 'Scalar[]', - 'MemoryFormat': 'MemoryFormat', - 'MemoryFormat?': 'MemoryFormat?', - 'QScheme': 'QScheme', - 'Scalar?': 'Scalar?', - 'Tensor': 'Tensor', - 'Tensor?': 'Tensor?', - 'TensorList': 'Tensor[]', - # this appears in return values instead of TensorList - # since TensorList is a ArrayRef in arguments but a vector - # in returns - 'std::vector': 'Tensor[]', - 'IntArrayRef': 'int[]', - 'IntArrayRef?': 'int[]?', - 'ArrayRef?': 'float[]?', - 'Layout': 'Layout', - 'Layout?': 'Layout?', - 'Device': 'Device', - 'Device?': 'Device?', - 'ScalarType': 'ScalarType', - 'ScalarType?': 'ScalarType?', - 'int64_t': 'int', - 'int64_t?': 'int?', - 'double': 'float', - 'double?': 'float?', - 'bool': 'bool', - 'bool?': 'bool?', - 'Generator': 'Generator?', - 'Generator?': 'Generator?', -} - - -def optional_type_of(arg, typ): - # optional type special handling for Tensor?[] and Tensor - # types that is missing a optional annotation - if arg.get('is_nullable') and '?' not in typ: - if typ == 'TensorList' or typ == 'Tensor[]': - typ = 'Tensor?[]' - else: - typ = '{}?'.format(typ) - return typ - - -def annotated_type_of(arg, typ): - anno = arg.get('annotation') - if anno: - typ = '{}({})'.format(typ, anno) - return typ - - -def jit_type_of(arg): - jit_type = arg.get('jit_type') - if not jit_type: - jit_type = TYPE_MAP[arg['simple_type']] - if is_sized_intlist_arg(arg): - jit_type = 'int[{}]'.format(arg['size']) - jit_type = optional_type_of(arg, jit_type) - jit_type = annotated_type_of(arg, jit_type) - arg['jit_type'] = jit_type - return jit_type - - -# map from aten 'simple_type' to the function that will turn a tensor into -# that type -FROM_IVALUE = { - 'Device': '{}.toDevice()', - 'Device?': '{}.toOptional()', - 'IntArrayRef': '{}.toIntVector()', - 'IntArrayRef?': '{}.toOptionalIntArray()', - 'ArrayRef?': '{}.toOptionalDoubleArray()', - 'Layout': '{}.toLayout()', - 'Layout?': '{}.toOptional()', - 'MemoryFormat': '{}.toMemoryFormat()', - 'MemoryFormat?': '{}.toOptional()', - 'QScheme': '{}.toQScheme()', - 'Scalar': '{}.toScalar()', - 'Scalar?': '{}.toOptional()', - 'ScalarType': '{}.toScalarType()', - 'ScalarType?': '{}.toOptional()', - 'Tensor': '{}.toTensor()', - 'Tensor?': 'toOptionalTensor({})', - 'Tensor?[]': 'toListOfOptionalTensor({})', - 'TensorList': '{}.toTensorVector()', - 'ScalarList': '{}.toScalarVector()', - 'bool': '{}.toBool()', - 'bool?': '{}.toOptional()', - 'double': '{}.toDouble()', - 'double?': '{}.toOptional()', - 'int64_t': '{}.toInt()', - 'int64_t?': '{}.toOptional()', - 'std::string': '{}.toStringRef()', - 'std::string?': '{}.toOptional()', - 'Generator?': '{}.toOptional()', - 'std::array': 'as_bool_array<2>({}.toBoolList())', - 'std::array': 'as_bool_array<3>({}.toBoolList())', - 'std::array': 'as_bool_array<4>({}.toBoolList())', -} - - -def from_ivalue(arg, value): - typ = optional_type_of(arg, arg['simple_type']) - return FROM_IVALUE[typ].format(value) - - -CALL_UNBOXED_KERNEL = CodeTemplate("""\ -auto result_ = 
callUnboxedKernel<${return_type}${formals_types_with_leading_comma}>(unboxedKernel${args_with_leading_comma}); -""") -CALL_NAMESPACE = CodeTemplate("""\ -auto result_ = at::${name}( - ${args} -); -""") -CALL_METHOD = CodeTemplate("""\ -auto result_ = (${first}).${name}( - ${args} -); -""") -CALL_NAMESPACE_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); - auto result_ = torch::${name}(${args_with_tensor_options}); -""") -CALL_METHOD_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); -auto result_ = (${first}).${name}(${args_with_tensor_options}); -""") - -CONSTRUCTOR = CodeTemplate("""\ -[](OperatorKernel* unboxedKernel, const OperatorHandle&, Stack* stack) { - using namespace at; - ${lvalues} - ${call} - drop(*stack, ${num_inputs}); - pack(*stack, std::move(result_)); -} -""") - -OPERATOR = CodeTemplate("""\ - .op("${signature}", - ${op}) -""") - - -disallowed_types = { - 'Storage', - 'DimnameList?', - 'ConstQuantizerPtr', - 'Dimname', - 'DimnameList', -} - -default_only_types = {'Generator'} - - -def is_jit_arg(i, arg): - simple_type = arg['simple_type'] - if simple_type in disallowed_types: - return False - if simple_type in default_only_types and 'default' not in arg: - return False - if simple_type == 'Type': - return False - return True - - -def is_jit_op(decl): - # We currently don't support functions that return nothing - assert all(r['type'] != 'void' for r in decl['returns']) - if len(decl['returns']) == 0: - return False - - arguments = decl['arguments'] - - # there must be a single out variant - if is_out_variant(decl) and sum([not not arg.get('output') for arg in arguments]) > 1: - return False - - return (('namespace' in decl['method_of'] or 'Tensor' in decl['method_of']) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['arguments'])) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['returns']))) - - -def is_tensor_arg(arg): - return arg['simple_type'] in {'Tensor', 'TensorList'} - - -def is_sized_intlist_arg(arg): - """Returns True for arguments declared as IntArrayRef[k], but False for IntArrayRef.""" - return (arg['simple_type'] == 'IntArrayRef') and ('size' in arg) - - -def base_name(decl): - name = decl['name'] - return name[:-1] if decl.get('inplace', False) else name[:-4] if name.endswith('_out') else name - - -def is_view(decl): - return base_name(decl) in RETURNS_VIEWS_OF_INPUT - - -# Copied from ..autograd.gen_python_functions.SKIP_PYTHON_BINDINGS -BACKWARD_OP_PATTERNS = [ - '.*_backward', - '.*_backward_(out|input|weight|bias)', -] - -def is_backward_op(decl): - for pattern in BACKWARD_OP_PATTERNS: - if re.match('^' + pattern + '$', decl['name']): - return True - return False - - -# for each argument in decl, the location it should appear in the -# jit schema declaration. e.g. 
-# arguments = [x, y, z] # the order in aten -# jit_argument_order = [2, 0, 1] -# aten::my_arg(Tensor y, Tensor z, Tensor x) # the order in schema -# used to move 'out' arguments to the end of the list -def argument_order(decl): - return decl.get('jit_argument_order') or list(range(len(decl['arguments']))) - - -def format_return_type(returns): - if len(returns) == 0: - return 'void' - elif len(returns) == 1: - return returns[0]['type'] - else: - return_types = [r['type'] for r in returns] - return 'std::tuple<{}>'.format(','.join(return_types)) - - -def get_simple_type(arg): - simple_type = arg['type'] - simple_type = simple_type.replace(' &', '').replace('const ', '') - simple_type = simple_type.replace('Generator *', 'Generator') - - opt_match = re.match(r'c10::optional<(.+)>', simple_type) - if opt_match: - simple_type = '{}?'.format(opt_match.group(1)) - return simple_type - - -def load_aten_declarations(path): - with open(path, 'r') as f: - declarations = yaml.load(f, Loader=YamlLoader) - - # enrich declarations with additional information - selected_declarations = [] - for declaration in declarations: - if declaration.get('deprecated'): - continue - - for arg in declaration['arguments']: - arg['simple_type'] = get_simple_type(arg) - for arg in declaration['schema_order_arguments']: - arg['simple_type'] = get_simple_type(arg) - for ret in declaration['returns']: - ret['simple_type'] = get_simple_type(ret) - - declaration['formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['arguments']] - declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['schema_order_arguments']] - declaration['args'] = [arg['name'] for arg in declaration['arguments']] - declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - declaration['api_name'] = declaration['name'] - if declaration.get('overload_name'): - declaration['type_wrapper_name'] = "{}_{}".format( - declaration['name'], declaration['overload_name']) - else: - declaration['type_wrapper_name'] = declaration['name'] - declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] - declaration['return_type'] = format_return_type(declaration['returns']) - - declaration['base_name'] = declaration['name'] - selected_declarations.append(declaration) - - return selected_declarations - - -def gen_unboxing_wrappers( - declarations, - out, - template_path, - operator_selector: SelectiveBuilder, - disable_autograd=False, - force_schema_registration=False, -): - GENERATED_UNBOXING_WRAPPERS_CPP = CodeTemplate.from_file(template_path + '/generated_unboxing_wrappers.cpp') - - ops = [] - - def get_invocation(decl, args, num_inputs): - - # because the arg list can get lengthy we put them on a separate line - def pack_arguments(args): - return ',\n'.join(args) - is_namespace_function = 'namespace' in decl['method_of'] - tensor_options_arg_index = decl.get('tensor_options_arg_index', None) - if tensor_options_arg_index is not None: - dtype = args[tensor_options_arg_index] - layout = args[tensor_options_arg_index + 1] - device = args[tensor_options_arg_index + 2] - pin_memory = args[tensor_options_arg_index + 3] - args_with_tensor_options = args[:tensor_options_arg_index] + \ - ['options'] + args[(tensor_options_arg_index + 4):] - if is_namespace_function: - return CALL_NAMESPACE_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], 
dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options)) - else: - return CALL_METHOD_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options[1:]), - first=args_with_tensor_options[0], num_inputs=num_inputs) - else: - if is_namespace_function: - return CALL_NAMESPACE.substitute(name=decl['name'], - args=pack_arguments(args), - num_inputs=num_inputs) - else: - return CALL_METHOD.substitute( - name=decl['name'], first=args[0], - args=pack_arguments(args[1:]), num_inputs=num_inputs) - - def requires_lvalue(arg): - jit_type = jit_type_of(arg) - return jit_type.startswith('Tensor') and '!' in jit_type - - def emit_decl_variant(decl): - if ('emit_dummy_placeholder' in decl): - return "DUMMY_OPERATION" - kw_assignments = [] - - # mutable arguments in aten are passed as non const references - # these must be lvalues, so we have to put them in variables - # before calling the function - lvalues = [] - - arguments = [] - num_inputs = len(decl['arguments']) - op_capture = '' - order = argument_order(decl) - for i, arg in enumerate(decl['arguments']): - value = from_ivalue(arg, '(std::move(peek(*stack, {}, {})))'.format(order[i], num_inputs)) - if requires_lvalue(arg): - lvalues.append('auto {} = {};\n'.format(arg['name'], value)) - value = arg['name'] - arguments.append(value) - - call = get_invocation(decl, arguments, num_inputs) - - returns = decl['returns'] - - return constructor - - def filter_decls(jit_decls, disable_autograd, operator_selector: SelectiveBuilder, force_schema_registration): - result = [] - for decl in jit_decls: - if disable_autograd and is_backward_op(decl): - continue - op_name = op_name_with_overload(decl) - if operator_selector.is_root_operator(op_name): - result.append(decl) - else: - if force_schema_registration: - decl['emit_dummy_placeholder'] = True - result.append(decl) - - return result - - # This function declares an order on declarations. This is necessary because - # there is some ambiguity in the choice of overload: if an argument is overloaded - # to accept both Scalar and Tensor, the schema with the Tensor should come first - # TODO: this can (probably) be removed when we remove the implicit conversion - # from Tensor -> Number. - def sort_decls(jit_decls): - def declkey(decl): - # key = sum_{i < len(args)} {1 if arg is tensor else 2} * (3 ** i) - # This is a ternary encoding where - # 0: No argument at this position - # 1: Tensor argument at this position - # 2: Some other argument at this position. - args = decl['arguments'] - result = 0 - for i in range(len(args)): - result += (3 ** i) * (1 if args[i]['simple_type'] == 'Tensor' else 2) - return result - - # NB: itertools.groupby requires the list be sorted. 
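
For reference, a minimal sketch of how the ternary declkey above orders two overloads of the same name; the decl dicts here are invented for illustration and are not taken from Declarations.yaml:

    def declkey_sketch(decl):
        # same encoding as declkey above: 1 for a Tensor argument, 2 otherwise,
        # weighted by 3**position
        args = decl['arguments']
        return sum((3 ** i) * (1 if args[i]['simple_type'] == 'Tensor' else 2)
                   for i in range(len(args)))

    tensor_overload = {'arguments': [{'simple_type': 'Tensor'}, {'simple_type': 'Tensor'}]}
    scalar_overload = {'arguments': [{'simple_type': 'Tensor'}, {'simple_type': 'Scalar'}]}

    assert declkey_sketch(tensor_overload) == 4  # 1*3**0 + 1*3**1
    assert declkey_sketch(scalar_overload) == 7  # 1*3**0 + 2*3**1
    # within a name group, the all-Tensor overload gets the smaller key and so
    # sorts ahead of the Tensor/Scalar one, as the comment above intends
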
- sorted_decls = sorted(jit_decls, key=lambda decl: decl['name']) - grouped_decls = [list(g) for _, g in - groupby(sorted_decls, key=lambda decl: decl['name'])] - return [sorted(g, key=declkey) for g in grouped_decls] - - aten_decls = load_aten_declarations(declarations) - jit_decls = [d for d in aten_decls if is_jit_op(d)] - - # add arguments dtype and device for functions like zeros - def expand_options(decl, i, arg): - if arg['simple_type'] != 'TensorOptions': - return [arg] - assert decl.get('tensor_options_arg_index') != i - decl['tensor_options_arg_index'] = i - tensor_options_expansion = [ - # XXX - until we actually have first-class interpreter types for these - # concepts, the default values to be encoded in Tensors - # If you change this, you also need to update [TensorOptions in script] - # in the tracer code. - # dtype is specified as an int64_t of at::ScalarType - {'name': 'dtype', 'simple_type': 'ScalarType'}, - # layout is specified as an int64_t of at::Layout - {'name': 'layout', 'simple_type': 'Layout'}, - # device is specified as an IntArrayRef of { at::Device::Type, device_id } - {'name': 'device', 'simple_type': 'Device'}, - # pin_memory is specified as a boolean - {'name': 'pin_memory', 'simple_type': 'bool', 'default': False}, - ] - # TODO: Don't repack this into TensorOptions. Needs various changes in downstream code. - if 'default' in arg: - for el in tensor_options_expansion: - el['simple_type'] += '?' - el['default'] = 'None' - if 'default' in arg and arg['default'] == 'at::kLong': - tensor_options_expansion[0]['default'] = 'long' - if 'kwarg_only' in arg and arg['kwarg_only']: - for el in tensor_options_expansion: - el['kwarg_only'] = True - return tensor_options_expansion - - additional_jit_decls = [] - - for decl in jit_decls: - decl['arguments'] = [a for i, arg in enumerate(decl['arguments']) for a in expand_options(decl, i, arg)] - if is_out_variant(decl): - reorder_out_args(decl) - - jit_decls.extend(additional_jit_decls) - jit_decls = filter_decls(jit_decls, disable_autograd, operator_selector, force_schema_registration) - - # generation is deterministic - jit_decl_groups = sort_decls(jit_decls) - - # NOTE: see Note [Sharded File] at the top of the generated_unboxing_wrappers.cpp - # template regarding sharding of the generated files. - # - # If you edit the number of shards here, you will also have to - # modify generate_code.py, torch/CMakeLists.txt, and the TARGETS - # files. 
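
As a rough worked example of the hash-based shard assignment used just below (the op name 'add' is chosen only for illustration and stands in for group[0]['name']):

    num_shards = 3
    name = 'add'
    shard_index = sum(ord(c) for c in name) % num_shards  # 97 + 100 + 100 = 297
    assert shard_index == 0
    # a group whose first decl is named 'add' would therefore be emitted into
    # generated_unboxing_wrappers_0.cpp, and always into that same file across builds
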
- num_shards = 3 - shards = [[] for _ in range(num_shards)] - - # ops are assigned arbitrarily but stably to a file based on hash - for group in jit_decl_groups: - x = sum(ord(c) for c in group[0]['name']) % num_shards - - for i, shard in enumerate(shards): - env = { - 'constructors': shard, - } - write(out, 'generated_unboxing_wrappers_%d.cpp' % i, GENERATED_UNBOXING_WRAPPERS_CPP, env) - - all_shards = reduce( - lambda lhs, rhs: lhs + rhs, - shards, - ) - env = { - 'constructors': all_shards, - } - write(out, 'generated_unboxing_wrappers_everything.cpp', GENERATED_UNBOXING_WRAPPERS_CPP, env) - - -default_map = {'{}': 'None', 'nullptr': 'None', 'c10::nullopt': 'None'} - - -def reorder_out_args(decl): - first_arg = decl['arguments'][0] - assert(first_arg['output']) - # the output variant must go at the end - # note: this is an annoying side effect of using a single '*' - # to denote kwarg_only - nargs = len(decl['arguments']) - decl['jit_argument_order'] = [nargs - 1] + list(range(nargs - 1)) - - -def is_kwarg_only(a): - return a.get('kwarg_only') or a.get('output') - -def main(): - parser = argparse.ArgumentParser( - description='Generate JIT op dispatch') - parser.add_argument('declarations', metavar='DECL', - help='path to Declarations.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('template_path', metavar='TEMPLATE_PATH', - help='path to templates directory') - args = parser.parse_args() - gen_unboxing_wrappers(args.declarations, args.out, args.template_path, - SelectiveBuilder.get_nop_selector()) - - -if __name__ == '__main__': - main() diff --git a/tools/jit/templates/generated_unboxing_wrappers.cpp b/tools/jit/templates/generated_unboxing_wrappers.cpp deleted file mode 100644 index cd8d12f6b15e..000000000000 --- a/tools/jit/templates/generated_unboxing_wrappers.cpp +++ /dev/null @@ -1,132 +0,0 @@ -#include "torch/csrc/jit/runtime/operator.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/jit/frontend/function_schema_parser.h" - -#include "torch/csrc/autograd/profiler.h" -#include "torch/csrc/autograd/generated/variable_factories.h" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ${generated_comment} - -// This file contains manual unboxing wrappers for ops that aren't -// use_c10_dispatcher: full because the templated unboxing logic in c10 doesn't -// support them yet. The ultimate goal is to make all ops use the templated -// unboxing and delete this codegen file. - -// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up -// incremental rebuilds. See the comment at the top of -// templates/VariableType.cpp for an analogous, in-depth discussion. - -namespace torch { namespace jit { - -using autograd::Variable; -using autograd::variable_list; -using at::Scalar; -using at::ScalarType; -using at::Tensor; -using at::TensorOptions; -using at::DeviceGuard; -using at::MemoryFormat; - -using ::c10::fmap; -using ::c10::filter; -using c10::OperatorKernel; -using c10::OperatorHandle; -using c10::KernelFunction; -using c10::RegistrationHandleRAII; -using c10::Stack; - -namespace { - -template -Return callUnboxedKernel(OperatorKernel* unboxedKernel, Args... 
args) { - using FuncType = Return (Args...); - auto* typedUnboxedKernel = static_cast*>(unboxedKernel); - return (*typedUnboxedKernel)(std::forward(args)...); -} - -// TODO: remove the toOptionalTensor and toListOfOptionalTensor -// when we remove the undefined tensor semantic from TH - -// XXX: This function is to specialize IValue for tensor type in -// interpreter, it should only be used in this file -at::Tensor toOptionalTensor(const IValue& v) { - if (v.isNone()) { - return at::Tensor(); - } - return v.toTensor(); -} - -// XXX: This function is to specialize IValue for list of optional -// tensor type in interpreter, it should only be used in this file -std::vector toListOfOptionalTensor(const IValue& v) { - // v is a list of optional tensor, loop over as generic list - auto vlist = v.toListRef(); - std::vector res; - - for (const IValue &v: vlist) { - res.emplace_back(toOptionalTensor(v)); - } - return res; -} - -template -std::array as_bool_array(const c10::List& list) { - std::array res; - AT_ASSERT(list.size() == N); - std::copy(list.begin(), list.end(), res.begin()); - return res; -} - -KernelFunction::InternalBoxedKernelFunction *DUMMY_OPERATION = - [](c10::OperatorKernel *, const c10::OperatorHandle &, std::vector *) -> void { - TORCH_CHECK(false, "Operator has been stripped in the custom build.") - }; - -class Registerer final { -public: - Registerer&& op(const std::string& schemaStr, KernelFunction::InternalBoxedKernelFunction* boxed_kernel_wrapper) && { - static auto& dispatcher = c10::Dispatcher::singleton(); - auto schema = parseSchema(schemaStr); - schema.setAliasAnalysis(AliasAnalysisKind::FROM_SCHEMA); - c10::OperatorName name = schema.operator_name(); - RegistrationHandleRAII registration = dispatcher.registerName(name); - auto op = dispatcher.findOp(name).value(); - registrationHandles_.push_back(std::move(registration)); - dispatcher.setManuallyBoxedKernelFor_(op, boxed_kernel_wrapper); - return std::move(*this); - } - - Registerer() = default; - Registerer(const Registerer&) = delete; - Registerer& operator=(const Registerer&) = delete; - Registerer(Registerer&&) noexcept = default; - Registerer& operator=(Registerer&&) noexcept = default; -private: - std::vector registrationHandles_; -}; - -static auto registry = Registerer() - // Generated operators - ${constructors} - ; - -} // anon namespace - - -}} // namespace torch::jit diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py index 9ca843abc69f..10bbc33c352f 100644 --- a/tools/setup_helpers/generate_code.py +++ b/tools/setup_helpers/generate_code.py @@ -30,7 +30,6 @@ def generate_code(ninja_global=None, operator_selector=None): from tools.autograd.gen_autograd import gen_autograd, gen_autograd_python from tools.autograd.gen_annotated_fn_args import gen_annotated - from tools.jit.gen_unboxing_wrappers import gen_unboxing_wrappers from tools.codegen.selective_build.selector import SelectiveBuilder @@ -70,13 +69,6 @@ def generate_code(ninja_global=None, disable_autograd=disable_autograd, operator_selector=operator_selector, ) - gen_unboxing_wrappers( - declarations_path or DECLARATIONS_PATH, - jit_gen_dir, - tools_jit_templates, - disable_autograd=disable_autograd, - operator_selector=operator_selector, - force_schema_registration=force_schema_registration) if subset == "python" or not subset: gen_annotated( From 6643e9fbb3c6770fa128be93635e87725de3c839 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 29/44] Remove 
`use_c10_dispatcher: full` lines (#49259) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49259 Since `use_c10_dispatcher: full` is now the default, we can remove all those pesky lines mentioning it. Only the `use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` lines are left. ghstack-source-id: 119450485 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25506526 fbshipit-source-id: 8053618120c0b52ff7c73cacb34bec7eb38f8fe0 --- aten/src/ATen/native/native_functions.yaml | 1244 +------------------- 1 file changed, 4 insertions(+), 1240 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 215ca70bfbae..b474d435398c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7,42 +7,34 @@ # DEPRECATED. DO NOT USE - func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # Computes the gradient of current tensor w.r.t. graph leaves. @@ -59,18 +51,15 @@ # where Variables *are* Tensors (as opposed to them containing tensors, which # is what the previous interpretation was.) - func: set_data(Tensor(a!) self, Tensor new_data) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: data(Tensor self) -> Tensor - use_c10_dispatcher: full manual_kernel_registration: True variants: method # True if this `Variable` is a leaf and thus does not have a `grad_fn`. - func: is_leaf(Tensor self) -> bool - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -85,23 +74,19 @@ # assert y2.output_nr == 2 # - func: output_nr(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: _version(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) - use_c10_dispatcher: full manual_kernel_registration: True variants: method # Enables .grad attribute for non-leaf Tensors. - func: retain_grad(Tensor(a!) self) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -120,47 +105,36 @@ variants: function - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rename(Tensor(a) self, Dimname[]? 
names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: align_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool - use_c10_dispatcher: full dispatch: CUDA: _use_cudnn_ctc_loss - func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _cudnn_ctc_loss - func: _use_cudnn_rnn_flatten_weight() -> bool - use_c10_dispatcher: full - func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, int input_size, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn_flatten_weight @@ -180,71 +154,52 @@ CUDA: _cudnn_init_dropout_state - func: _debug_has_internal_overlap(Tensor self) -> int - use_c10_dispatcher: full variants: function - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) - use_c10_dispatcher: full - func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor - use_c10_dispatcher: full - func: _shape_as_tensor(Tensor self) -> Tensor - use_c10_dispatcher: full - func: dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: abs(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs - func: abs_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs_ @@ -281,18 +236,15 @@ # Absolute, alias for abs - func: absolute(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: absolute_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: angle(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: angle @@ -303,19 +255,16 @@ CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_complex - func: sgn(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sgn @@ -332,15 +281,12 @@ CPU, CUDA: sgn_out - func: real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: imag(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: conj(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -349,19 +295,16 @@ CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: _conj - func: acos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos - func: acos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos_ @@ -373,28 +316,22 @@ # arccos, alias of acos - func: arccos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor - use_c10_dispatcher: full - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor - use_c10_dispatcher: full # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full structured_delegate: add.out variants: function, method dispatch: @@ -403,7 +340,6 @@ MkldnnCPU: mkldnn_add - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method structured_delegate: add.out dispatch: @@ -422,13 +358,11 @@ MkldnnCPU: mkldnn_add_out - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: add_relu - func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU: add_relu_ @@ -441,25 +375,21 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: add - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: add_ - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv_ @@ -470,20 +400,17 @@ CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: addmv_impl_cpu CUDA: addmv_impl_cuda - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addr Math: math_addr - func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addr_ @@ -495,17 +422,14 @@ Math: math_addr_out - func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: affine_grid_generator - func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: all @@ -516,18 +440,15 @@ CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool - use_c10_dispatcher: full variants: function, method - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: any @@ -538,7 +459,6 @@ CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -568,10 +488,8 @@ # preserve tracing. Get rid of this when arange can directly take tensors for bounds # (so that it can be traced directly). - func: _dim_arange(Tensor like, int dim) -> Tensor - use_c10_dispatcher: full - func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmax @@ -582,7 +500,6 @@ CPU, CUDA: argmax_out - func: argmin(Tensor self, int? 
dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmin @@ -593,13 +510,11 @@ CPU, CUDA: argmin_out - func: acosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh - func: acosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh_ @@ -611,24 +526,20 @@ # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: asinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh - func: asinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh_ @@ -640,24 +551,20 @@ # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh - func: atanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh_ @@ -669,18 +576,15 @@ # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: as_strided_tensorimpl @@ -695,14 +599,12 @@ DefaultBackend: as_strided_ - func: asin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin SparseCPU, SparseCUDA: asin_sparse - func: asin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin_ @@ -716,24 +618,20 @@ # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan - func: atan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan_ @@ -745,55 +643,44 @@ # arctan, alias of atan - func: arctan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atleast_1d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: atleast_2d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: atleast_3d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: baddbmm_cpu CUDA: baddbmm_cuda - func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: baddbmm__cpu CUDA: baddbmm__cuda - func: _baddbmm_mkl_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function - func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -825,7 +712,6 @@ # Sample bernoulli with values in `self` as probability. - func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: bernoulli @@ -837,13 +723,11 @@ CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ @@ -852,7 +736,6 @@ # There is no default valid on `p` here because it would introduce ambiguity # with `bernoulli(Tensor self, *, Generator? generator=None)` declaration. - func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor @@ -908,11 +791,9 @@ CUDA: _bincount_cuda - func: bitwise_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -921,13 +802,11 @@ CPU, CUDA: bitwise_not_out - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ @@ -938,23 +817,19 @@ CPU, CUDA: copysign_out - func: copysign.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ - func: logical_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_not_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -963,11 +838,9 @@ CPU, CUDA: logical_not_out - func: logical_xor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -976,11 +849,9 @@ CPU, CUDA: logical_xor_out - func: logical_and(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -989,11 +860,9 @@ CPU, CUDA: logical_and_out - func: logical_or(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -1008,7 +877,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: bmm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: bmm_cpu @@ -1017,7 +885,6 @@ SparseCUDA: bmm_sparse_cuda - func: _bmm(Tensor self, Tensor mat2, *, bool deterministic=False) -> Tensor - use_c10_dispatcher: full variants: function dispatch: SparseCUDA: _bmm_sparse_cuda @@ -1038,7 +905,6 @@ SparseCUDA: _bmm_out_sparse_cuda - func: broadcast_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False - func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) @@ -1048,7 +914,6 @@ Math: broadcast_to - func: cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: cat @@ -1058,23 +923,19 @@ DefaultBackend: cat_out - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor - use_c10_dispatcher: full - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: block_diag(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: ceil(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil - func: ceil_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil_ @@ -1085,25 +946,20 @@ CPU, CUDA: ceil_out - func: chain_matmul(Tensor[] matrices) -> Tensor - use_c10_dispatcher: full variants: function - func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: chunk(Tensor(a) self, int chunks, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: tensor_split.sections(Tensor(a) self, int sections, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.indices(Tensor(a) self, int[] indices, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.tensor_indices_or_sections(Tensor(a) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[] @@ -1111,14 +967,12 @@ variants: function, method - func: clamp(Tensor self, Scalar? min=None, Scalar? 
max=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clamp QuantizedCPU: clamp_quantized_cpu - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_ @@ -1129,13 +983,11 @@ CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max_ @@ -1146,13 +998,11 @@ CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min_ @@ -1164,7 +1014,6 @@ # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) @@ -1175,11 +1024,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cudnn_is_acceptable(Tensor self) -> bool - use_c10_dispatcher: full device_guard: False - func: complex(Tensor real, Tensor imag) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: complex @@ -1190,7 +1037,6 @@ CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: polar @@ -1201,13 +1047,11 @@ CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: constant_pad_nd - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) - use_c10_dispatcher: full variants: method manual_cpp_binding: True @@ -1220,7 +1064,6 @@ DefaultBackend: convolution_overrideable - func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full dispatch: DefaultBackend: convolution_backward_overrideable @@ -1246,12 +1089,10 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: conv_tbc - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full # NB: we inherit the goofy argument order from PyTorch torch.nn.functional - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor @@ -1264,24 +1105,20 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: copy_ - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full dispatch: {} - func: cos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos - func: cos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos_ @@ -1292,13 +1129,11 @@ CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh - func: cosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh_ @@ -1309,28 +1144,23 @@ CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: count_nonzero - func: count_nonzero(Tensor self, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: count_nonzero - func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_forward # TODO: Why do I have to call this grad?! - func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_backward @@ -1351,27 +1181,22 @@ CUDA: cudnn_convolution_deprecated - func: cudnn_convolution.deprecated2(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_deprecated2 - func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution - func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_input - func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward - func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_weight @@ -1381,45 +1206,37 @@ CUDA: cudnn_convolution_transpose_deprecated - func: cudnn_convolution_transpose.deprecated2(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_deprecated2 - func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, 
int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the float # backwards - func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward - func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_input - func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_weight # NB: input is special cased in a way I don't quite understand - func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_forward - func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid) - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_backward - func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummax @@ -1430,7 +1247,6 @@ DefaultBackend: cummax_out - func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1444,7 +1260,6 @@ CUDA: cummax_helper_cuda - func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummin @@ -1455,7 +1270,6 @@ DefaultBackend: cummin_out - func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1469,18 +1283,15 @@ CUDA: cummin_helper_cuda - func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumprod - func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumprod_ @@ -1491,29 +1302,24 @@ DefaultBackend: cumprod_out - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? 
dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cumprod_backward(Tensor grad, Tensor input, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumsum - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumsum_ @@ -1524,137 +1330,111 @@ DefaultBackend: cumsum_out - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full # convenience function that converts to intlists for you - func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: ctc_loss_backward_cpu CUDA: ctc_loss_backward_gpu - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagflat(Tensor self, int offset=0) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: diagonal - func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: diagonal_backward(Tensor grad, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: div.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: div SparseCPU, SparseCUDA: div_sparse - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: div_ SparseCPU, SparseCUDA: div_sparse_ - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
- use_c10_dispatcher: full dispatch: CPU, CUDA: div_out SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor - func: div.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: div - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: div_ # divide, alias for div - func: divide.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method # true_divide, an alias for div - func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: dot(Tensor self, Tensor tensor) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: dot @@ -1666,7 +1446,6 @@ DefaultBackend: dot_out - func: vdot(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: vdot @@ -1678,30 +1457,24 @@ DefaultBackend: vdot_out - func: einsum(str equation, Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: embedding - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor - use_c10_dispatcher: full - func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor - use_c10_dispatcher: full dispatch: CPU: embedding_dense_backward_cpu CUDA: embedding_dense_backward_cuda - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) 
- use_c10_dispatcher: full dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor - use_c10_dispatcher: full # NOTE [ embedding_bag Native Functions ] # The `_embedding_bag.*` variants assume that input tensors except for `weight`, @@ -1720,11 +1493,9 @@ CUDA: _embedding_bag_forward_only_cuda - func: rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor) - use_c10_dispatcher: full # row_stack is the alias of vstack - func: row_stack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full dispatch: Math: row_stack @@ -1755,20 +1526,17 @@ CUDA: _embedding_bag_dense_backward_cuda - func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _embedding_bag_per_sample_weights_backward_cpu CUDA: _embedding_bag_per_sample_weights_backward_cuda - func: empty_meta(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: empty_cpu CUDA: empty_cuda @@ -1776,7 +1544,6 @@ SparseCPU, SparseCUDA: empty_sparse - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full variants: method - func: new_empty_strided(Tensor self, int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -1808,7 +1575,6 @@ QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized - func: resize_(Tensor(a!) self, int[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -1818,7 +1584,6 @@ Meta: resize_meta_ - func: empty_quantized(int[] size, Tensor qtensor) -> Tensor - use_c10_dispatcher: full variants: function dispatch: QuantizedCPU, QuantizedCUDA: empty_quantized @@ -1832,19 +1597,16 @@ device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda - func: erf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erf - func: erf_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erf_ @@ -1855,13 +1617,11 @@ CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erfc - func: erfc_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erfc_ @@ -1872,13 +1632,11 @@ CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp - func: exp_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp_ @@ -1889,13 +1647,11 @@ CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp2 - func: exp2_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp2_ @@ -1906,13 +1662,11 @@ CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: expm1 - func: expm1_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: expm1_ @@ -1923,14 +1677,12 @@ CPU, CUDA: expm1_out - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False dispatch: DefaultBackend: expand - func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False @@ -1953,49 +1705,39 @@ CUDA: eye_out_cuda - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: unflatten.int(Tensor(a) self, int dim, int[] sizes, Dimname[]? names=None) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: fill_ - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: fill_ - func: floor(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: floor - func: floor_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: floor_ @@ -2006,14 +1748,12 @@ CPU, CUDA: floor_out - func: floor_divide(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: floor_divide SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: floor_divide_ @@ -2026,21 +1766,17 @@ SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: frac(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: frac - func: frac_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: frac_ @@ -2074,11 +1810,9 @@ CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -2087,11 +1821,9 @@ CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method # NOTE [ grid_sampler Native Functions ] @@ -2110,37 +1842,30 @@ # Nor does it take in `align_corners` because it only supports the mode # `align_corners = True`. - func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full - func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full dispatch: CPU: grid_sampler_2d_cpu CUDA: grid_sampler_2d_cuda - func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: grid_sampler_2d_backward_cpu CUDA: grid_sampler_2d_backward_cuda # See NOTE [ grid_sample CPU fallback ] - func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _grid_sampler_2d_cpu_fallback - func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full dispatch: CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda - func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: grid_sampler_3d_backward_cpu CUDA: grid_sampler_3d_backward_cuda @@ -2173,7 +1898,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? 
bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -2191,7 +1915,6 @@ # Real to complex forward FFT - func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _fft_r2c_mkl @@ -2206,7 +1929,6 @@ # Complex to real inverse FFT - func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _fft_c2r_mkl @@ -2221,7 +1943,6 @@ # Standard complex to complex FFT (forward or backward) - func: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _fft_c2c_mkl @@ -2235,19 +1956,14 @@ CUDA: _fft_c2c_cufft_out - func: _cufft_get_plan_cache_size(int device_index) -> int - use_c10_dispatcher: full - func: _cufft_get_plan_cache_max_size(int device_index) -> int - use_c10_dispatcher: full - func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> () - use_c10_dispatcher: full - func: _cufft_clear_plan_cache(int device_index) -> () - use_c10_dispatcher: full - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: index @@ -2258,25 +1974,20 @@ # - Tensor Tensor::index(std::initializer_list indices) - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: index_copy_ - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: index_put_ @@ -2287,11 +1998,9 @@ # - Tensor & Tensor::index_put_(std::initializer_list indices, Scalar v) - func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _index_put_impl_ @@ -2301,7 +2010,6 @@ variants: function - func: inverse(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: inverse @@ -2312,18 +2020,15 @@ DefaultBackend: inverse_out - func: _inverse_helper(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _inverse_helper_cpu CUDA: _inverse_helper_cuda - func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isnan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: @@ -2331,52 +2036,42 @@ SparseCPU, SparseCUDA: isnan_sparse - func: is_distributed(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_floating_point(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_complex(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: isreal(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: is_nonzero(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_same_size(Tensor self, Tensor other) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_signed(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: kl_div - func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: kl_div_backward_cpu CUDA: kl_div_backward_cuda - func: kron(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: kron @@ -2387,7 +2082,6 @@ Math: kron_out - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: kthvalue @@ -2399,7 +2093,6 @@ CUDA: kthvalue_out_cuda - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -2422,13 +2115,11 @@ CUDA: layer_norm_backward_cuda - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nan_to_num - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nan_to_num_ @@ -2449,35 +2140,25 @@ MkldnnCPU: mkldnn_linear - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) - use_c10_dispatcher: full - func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor - use_c10_dispatcher: full - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor - use_c10_dispatcher: full - func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -2493,13 +2174,11 @@ CUDA: linspace_cuda_out - func: log(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log - func: log_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log_ @@ -2510,13 +2189,11 @@ CPU, CUDA: log_out - func: log10(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log10 - func: log10_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log10_ @@ -2527,14 +2204,12 @@ CPU, CUDA: log10_out - func: log1p(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: log1p SparseCPU, SparseCUDA: log1p_sparse - func: log1p_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: log1p_ @@ -2547,13 +2222,11 @@ SparseCPU, SparseCUDA: log1p_out_sparse - func: log2(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log2 - func: log2_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log2_ @@ -2569,7 +2242,6 @@ CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: logaddexp @@ -2580,7 +2252,6 @@ CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: logaddexp2 @@ -2636,7 +2307,6 @@ CPU, CUDA: xlogy_out - func: logdet(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logdet @@ -2652,27 +2322,22 @@ # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: CPU: log_softmax_cpu CUDA: log_softmax_cuda - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: CPU: log_softmax_backward_cpu CUDA: log_softmax_backward_cuda - func: _logcumsumexp(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _logcumsumexp_cpu CUDA: _logcumsumexp_cuda @@ -2684,7 +2349,6 @@ CUDA: _logcumsumexp_out_cuda - func: logcumsumexp(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logcumsumexp @@ -2695,14 +2359,12 @@ DefaultBackend: logcumsumexp_out - func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logsumexp @@ -2713,55 +2375,44 @@ DefaultBackend: logsumexp_out - func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: matmul(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor - use_c10_dispatcher: full - func: matrix_rank(Tensor self, bool symmetric=False) -> Tensor - use_c10_dispatcher: full - func: matrix_power(Tensor self, int n) -> Tensor - use_c10_dispatcher: full variants: function, method - func: matrix_exp(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor - use_c10_dispatcher: full - func: _aminmax(Tensor self) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _aminmax_all - func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _aminmax - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _compute_linear_combination @@ -2771,7 +2422,6 @@ CPU, CUDA: _compute_linear_combination_out - func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: max @@ -2782,19 +2432,16 @@ CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, int[] sizes, bool keepdim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: amax @@ -2806,48 +2453,38 @@ # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_max_pool2d - func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_max_pool3d - func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_max_pool1d - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_max_pool2d - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] 
padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu QuantizedCPU: mean_quantized_cpu - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu @@ -2860,21 +2497,18 @@ QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: median(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: median_cpu CUDA: median_cuda - func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: median @@ -2886,21 +2520,18 @@ CUDA: median_out_cuda - func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanmedian(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: nanmedian_cpu CUDA: nanmedian_cuda - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nanmedian @@ -2912,14 +2543,12 @@ CUDA: nanmedian_out_cuda - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: min @@ -2930,14 +2559,12 @@ CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: amin @@ -2953,13 +2580,10 @@ DefaultBackend: mkldnn_convolution - func: mkldnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> Tensor - use_c10_dispatcher: full - func: mkldnn_convolution_backward_weights(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: DefaultBackend: mkldnn_convolution_backward @@ -2979,22 +2603,18 @@ CUDA: miopen_convolution - func: miopen_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_input - func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward - func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_bias - func: miopen_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_weight @@ -3006,17 +2626,14 @@ # NB: output_padding not strictly needed here, but it's helpful for the float # backwards - func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward - func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward_input - func: miopen_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward_weight @@ -3026,17 +2643,14 @@ CUDA: miopen_depthwise_convolution - func: miopen_depthwise_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward_input - func: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool 
benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward - func: miopen_depthwise_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward_weight @@ -3051,7 +2665,6 @@ CUDA: miopen_rnn_backward - func: mm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: mm_cpu @@ -3066,7 +2679,6 @@ SparseCPU, SparseCUDA: _sparse_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor - use_c10_dispatcher: full - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -3080,7 +2692,6 @@ SparseCUDA: sparse_matrix_mask_helper_cuda - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mode @@ -3091,14 +2702,12 @@ DefaultBackend: mode_out - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: mul.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mul @@ -3106,7 +2715,6 @@ MkldnnCPU: mkldnn_mul - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: mul_ @@ -3123,39 +2731,32 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: mul.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: mul - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: mul_ # multiply, alias for mul - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: multiply.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: mv(Tensor self, Tensor vec) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mv @@ -3167,31 +2768,26 @@ DefaultBackend: mv_out - func: mvlgamma(Tensor self, int p) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: mvlgamma - func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: DefaultBackend: mvlgamma_ - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: narrow_copy_dense SparseCPU, SparseCUDA: narrow_copy_sparse - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False @@ -3208,7 +2804,6 @@ CUDA: batch_norm_cuda_out - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: batch_norm_stats_cuda @@ -3256,10 +2851,8 @@ CUDA: batch_norm_update_stats_cuda - func: is_vulkan_available() -> bool - use_c10_dispatcher: full - func: _nnpack_available() -> bool - use_c10_dispatcher: full - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -3268,15 +2861,12 @@ DefaultBackend: _nnpack_spatial_convolution - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward_input(Tensor input, Tensor grad_output, Tensor weight, int[2] padding) -> Tensor - use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward_weight(Tensor input, int[] weightsize, Tensor grad_output, int[2] padding) -> Tensor - use_c10_dispatcher: full variants: function - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -3293,64 +2883,50 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor - use_c10_dispatcher: full - func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor - use_c10_dispatcher: full - func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _euclidean_dist - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor - use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor - use_c10_dispatcher: full variants: function - func: permute(Tensor(a) self, int[] dims) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
dispatch: DefaultBackend: permute - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method # moveaxis, alias for movedim - func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method # Only exposed from C++ -- in Python, @@ -3361,45 +2937,36 @@ # behavior on Windows, for reasons I don't understand # (maybe related to capital letter collation somehow...) - func: numpy_T(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor - use_c10_dispatcher: full - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor use_c10_dispatcher: full - func: channel_shuffle(Tensor self, int groups) -> Tensor - use_c10_dispatcher: full dispatch: CPU: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu - func: is_pinned(Tensor self) -> bool - use_c10_dispatcher: full variants: method - func: pin_memory(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor - use_c10_dispatcher: full variants: function, method - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor - use_c10_dispatcher: full variants: function - func: rad2deg(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rad2deg - func: rad2deg_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rad2deg_ @@ -3410,13 +2977,11 @@ DefaultBackend: rad2deg_out - func: deg2rad(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: deg2rad - func: deg2rad_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: deg2rad_ @@ -3533,17 +3098,14 @@ CUDA: range_cuda_out - func: ravel(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: reciprocal(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: reciprocal - func: reciprocal_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: reciprocal_ @@ -3554,13 +3116,11 @@ CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: neg - func: neg_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: neg_ @@ -3574,61 +3134,50 @@ # Alias for neg - func: negative(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: negative_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: repeat(Tensor self, int[] repeats) -> Tensor - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
dispatch: DefaultBackend: repeat - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: repeat_interleave_cpu CUDA: repeat_interleave_cuda - func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: reshape(Tensor(a) self, int[] shape) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_reshape - func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False - func: round(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: round - func: round_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: round_ @@ -3640,13 +3189,10 @@ CUDA: round_out - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor - use_c10_dispatcher: full - func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full - func: relu(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: relu @@ -3654,7 +3200,6 @@ QuantizedCPU: relu_quantized_cpu - func: relu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: relu_ @@ -3662,59 +3207,50 @@ QuantizedCPU: relu_quantized_cpu_ - func: prelu(Tensor self, Tensor weight) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: prelu_cpu CUDA: prelu_cuda - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function, method dispatch: CPU: prelu_backward_cpu CUDA: prelu_backward_cuda - func: gelu(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: gelu_cpu CUDA: gelu_cuda - func: gelu_backward(Tensor grad, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: gelu_backward_cpu CUDA: gelu_backward_cuda - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn device_guard: False - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rsqrt - func: rsqrt_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rsqrt_ @@ -3725,46 +3261,37 @@ CPU, CUDA: rsqrt_out - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: select.int(Tensor(a) self, int dim, int index) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: select - func: select_backward(Tensor grad, int[] input_sizes, int dim, int index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: selu(Tensor self) -> Tensor - use_c10_dispatcher: full - func: selu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: celu - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) - use_c10_dispatcher: full dispatch: DefaultBackend: celu_ - func: silu(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: silu - func: silu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: silu_ @@ -3776,14 +3303,12 @@ CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: silu_backward Math: math_silu_backward - func: sigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sigmoid @@ -3791,7 +3316,6 @@ MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sigmoid_ @@ -3803,13 +3327,11 @@ CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: logit - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: logit_ @@ -3820,13 +3342,11 @@ CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sin - func: sin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sin_ @@ -3854,13 +3374,11 @@ CPU, CUDA: sinc_out - func: sinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sinh - func: sinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sinh_ @@ -3882,7 +3400,6 @@ # changing metadata of the detached tensor and expecting the original tensor to also # be updated. - func: detach(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: detach @@ -3891,134 +3408,112 @@ # only be called on non-view `Variable`s. You can use `is_view()` to check # this. If this `Variable` is a view, throws an `std::runtime_error()`. - func: detach_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: detach_ - func: size.int(Tensor self, int dim) -> int - use_c10_dispatcher: full variants: function device_guard: False manual_cpp_binding: True - func: size.Dimname(Tensor self, Dimname dim) -> int - use_c10_dispatcher: full variants: function, method device_guard: False - func: slice.Tensor(Tensor(a) self, int dim=0, int start=0, int end=9223372036854775807, int step=1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: slice - func: slice_backward(Tensor grad, int[] input_sizes, int dim, int start, int end, int step) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: slogdet - func: smm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: CPU: softmax_cpu CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: CPU: softmax_backward_cpu CUDA: softmax_backward_cuda - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsafe_split - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: split - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsafe_split_with_sizes - func: split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: split_with_sizes - func: squeeze(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: squeeze - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: squeeze - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: squeeze_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: squeeze_ - func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: squeeze_ - func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False - func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -4030,7 +3525,6 @@ SparseCUDA: _sspaddmm_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: stack @@ -4040,19 +3534,16 @@ DefaultBackend: stack_out - func: hstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: vstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: dstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -4070,30 +3561,25 @@ variants: function, method - func: stride.int(Tensor self, int dim) -> int - use_c10_dispatcher: full variants: function device_guard: False manual_cpp_binding: True - func: stride.Dimname(Tensor self, Dimname dim) -> int - use_c10_dispatcher: full variants: function, method device_guard: False - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -4105,13 +3591,11 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: nansum @@ -4122,18 +3606,15 @@ CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: sqrt(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sqrt - func: sqrt_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sqrt_ @@ -4144,39 +3625,32 @@ CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: square_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method - func: std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4185,20 +3659,17 @@ CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod @@ -4209,34 +3680,29 @@ CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: t(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full device_guard: False variants: function, method dispatch: DefaultBackend: t - func: t_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False variants: method dispatch: DefaultBackend: t_ - func: tan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan - func: tan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan_ @@ -4247,14 +3713,12 @@ CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: tanh QuantizedCPU: tanh_quantized_cpu - func: tanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tanh_ @@ -4265,7 +3729,6 @@ CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor - use_c10_dispatcher: full variants: function - func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!) @@ -4276,7 +3739,6 @@ # TODO: namespace threshold in 'nn' - func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold @@ -4284,7 +3746,6 @@ QuantizedCPU: threshold_quantized_cpu - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU: threshold_ @@ -4297,69 +3758,57 @@ CUDA: threshold_out_cuda - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda - func: tile(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: transpose - func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose - func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: transpose_ - func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose_ - func: one_hot(Tensor self, int num_classes=-1) -> Tensor - use_c10_dispatcher: full python_module: nn variants: function - func: flip(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, QuantizedCPU: flip_cpu CUDA: flip_cuda - func: fliplr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: flipud(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: roll_cpu @@ -4368,33 +3817,26 @@ # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rot90 - func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _trilinear - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: trunc(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc - func: trunc_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc_ @@ -4406,47 +3848,39 @@ # Alias for trunc - func: fix(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: fix_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: type_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool - use_c10_dispatcher: full variants: function - func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique_cpu CUDA: _unique_cuda - func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_cpu CUDA: unique_dim_cuda - func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_consecutive_cpu CUDA: unique_consecutive_cuda - func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_consecutive_cpu @@ -4457,42 +3891,35 @@ # Please don't rely on these two operators, they will be removed soon - func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda - func: _unsafe_view(Tensor self, int[] size) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _unsafe_view - func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsqueeze - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: unsqueeze_ - func: vander(Tensor x, int? N=None, bool increasing=False) -> Tensor - use_c10_dispatcher: full - func: var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var @@ -4503,30 +3930,25 @@ CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False @@ -4534,55 +3956,44 @@ # this allows us to implicitly calculate the broadcast derivative, while only dealing with the # _s_where derivative. 
- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where(Tensor condition) -> Tensor[] - use_c10_dispatcher: full variants: function - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function # VariableType::_weight_norm does not want to be given a gap in the autograd graph, # so we don't define "dispatch" variants for it. - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function - func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda - func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda_backward - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -4599,40 +4010,34 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _standard_gamma_grad_cpu CUDA: _standard_gamma_grad_cuda - func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_gamma_cpu CUDA: _s_gamma_cuda - func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _dirichlet_grad_cpu CUDA: _dirichlet_grad_cuda - func: _sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_dirichlet_cpu CUDA: _s_dirichlet_cuda - func: poisson(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_poisson_cpu CUDA: _s_poisson_cuda - func: binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_binomial_cpu CUDA: _s_binomial_cuda @@ -4641,96 +4046,77 @@ # complicated - func: native_norm(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse - func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? 
dtype) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_sum - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: _sparse_sum_backward_cpu SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm @@ -4746,11 +4132,9 @@ CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) 
@@ -4760,11 +4144,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: frobenius_norm(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4772,7 +4154,6 @@ variants: function - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4780,7 +4161,6 @@ variants: function - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4788,7 +4168,6 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clone @@ -4797,13 +4176,11 @@ QuantizedCPU, QuantizedCUDA: quantized_clone - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: resize_as_ - func: zero_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: zero_ @@ -4817,14 +4194,12 @@ SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sub SparseCPU, SparseCUDA: sub_sparse - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: sub_ @@ -4832,13 +4207,11 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sub - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: sub_ @@ -4848,24 +4221,19 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method # For C++ only, until we have conversion from C++ numbers to Tensor - func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: rsub @@ -4876,7 +4244,6 @@ CPU, CUDA: heaviside_out - func: heaviside(Tensor self, Tensor values) -> Tensor - use_c10_dispatcher: full variants: function, method - func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) 
@@ -4885,7 +4252,6 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: rsub @@ -4893,7 +4259,6 @@ # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. - func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_addmm @@ -4906,7 +4271,6 @@ SparseCUDA: addmm_out_sparse_dense_cuda - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: addmm_cpu @@ -4915,7 +4279,6 @@ SparseCUDA: addmm_sparse_dense_cuda - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: addmm_cpu_ @@ -5049,49 +4412,40 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () - use_c10_dispatcher: full - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_sparse - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_and_tensor_sparse - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_ - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_and_clear_ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda - func: to_dense(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: sparse_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse @@ -5099,14 +4453,12 @@ # legacy method - func: _dimI(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False - func: dense_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse @@ -5114,42 +4466,36 @@ # legacy method - func: _dimV(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False - func: _nnz(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _nnz_sparse device_guard: False - func: coalesce(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: coalesce_sparse_cpu SparseCUDA: coalesce_sparse_cuda - func: is_coalesced(Tensor self) -> bool - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: is_coalesced_sparse device_guard: False - func: _indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _indices_sparse device_guard: False - func: _values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _values_sparse @@ -5159,21 +4505,18 @@ # a bit unsafe. Similar to _indices and _values, this is useful for implementing # custom sparse operations in Python/C++ extension. - func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _coalesced_sparse_ device_guard: False - func: indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: indices_sparse device_guard: False - func: values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: values_sparse @@ -5186,196 +4529,161 @@ SparseCUDA: hspmm_out_sparse_cuda - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - use_c10_dispatcher: full variants: function dispatch: SparseCPU, SparseCUDA: copy_sparse_ - func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: unbind - func: unbind.Dimname(Tensor(a) self, Dimname dim) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_sparse(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_mkldnn(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU: dense_to_mkldnn - func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight - func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv3d_weight - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu - func: dequantize.self(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: dequantize_quant - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: QuantizedCPU: dequantize_tensors_quantized_cpu - func: q_scale(Tensor self) -> float - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_scale_quant - func: q_zero_point(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_zero_point_quant - func: q_per_channel_scales(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_scales - func: q_per_channel_zero_points(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points - func: q_per_channel_axis(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_axis - func: int_repr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU: int_repr_quantized_cpu QuantizedCUDA: int_repr_quantized_cuda - func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_tensor_quantized_tensor_cpu CUDA: make_per_tensor_quantized_tensor_cuda - func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_channel_quantized_tensor_cpu - func: qscheme(Tensor self) -> QScheme - use_c10_dispatcher: full variants: method dispatch: QuantizedCPU, QuantizedCUDA: qscheme_quant - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: 
_fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) - use_c10_dispatcher: full variants: function - func: _saturate_weight_to_fp16(Tensor weight) -> Tensor - use_c10_dispatcher: full variants: function - func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function # to(Device) must not exist because all constructors of Device also works for @@ -5387,61 +4695,47 @@ device_guard: False - func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: meshgrid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: cartesian_prod(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor - use_c10_dispatcher: full variants: function - func: item(Tensor self) -> Scalar - use_c10_dispatcher: full variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType - use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool - use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType - use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 - func: _local_scalar_dense(Tensor self) -> Scalar - use_c10_dispatcher: full dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -5467,7 +4761,6 @@ CUDA: _thnn_fused_gru_cell_cuda - func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_gru_cell_backward_cuda @@ -5476,28 +4769,20 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? 
b_hh=None) -> (Tensor, Tensor) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -5515,55 +4800,46 @@ # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # Quantized GRU layers # - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # Quantized RNN cells - func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: DefaultBackend: _pack_padded_sequence - func: _pack_padded_sequence_backward(Tensor grad, int[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor - use_c10_dispatcher: full - func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor) - use_c10_dispatcher: full # wrappers for legacy TH methods - func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5572,61 +4848,51 @@ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: set_cpu_ CUDA: set_cuda_ - func: is_set_to(Tensor self, Tensor tensor) -> bool - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: view(Tensor(a) self, int[] size) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5634,126 +4900,101 @@ MkldnnCPU: mkldnn_view - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_put_ CUDA: legacy::cuda::_th_put_ - func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: index_add_cpu_ CUDA: index_add_cuda_ - func: index_add(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: index_fill_ - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_ - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_fill_ - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_reduce_ - func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_scalar_reduce_ - func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_add_ - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ @@ -5771,35 +5012,27 @@ CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __and__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5815,35 +5048,27 @@ CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __or__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5859,181 +5084,149 @@ CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: lgamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: _lgamma__cpu CUDA: _lgamma__cuda - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: tril_cpu_ CUDA: tril_cuda_ - func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: triu_cpu_ CUDA: triu_cuda_ - func: digamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_scalar_ CUDA: lerp_cuda_scalar_ - func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_tensor_ CUDA: lerp_cuda_tensor_ - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: addbmm_ @@ -6044,61 +5237,51 @@ CPU, CUDA: addbmm_out - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: addbmm - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcdiv_ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: geometric_ @@ -6112,13 +5295,11 @@ CUDA: diag_cuda_out - func: diag(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: diag - func: diag_backward(Tensor grad, int[] input_sizes, int diagonal) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6128,7 +5309,6 @@ CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: cross @@ -6140,7 +5320,6 @@ CUDA: triu_cuda_out - func: triu(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triu @@ -6152,32 +5331,27 @@ CUDA: tril_cuda_out - func: tril(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: tril - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda - func: trace(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: trace_cpu CUDA: trace_cuda - func: trace_backward(Tensor grad, int[] sizes) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6188,7 +5362,6 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne @@ -6201,20 +5374,17 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ - func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ @@ -6224,22 +5394,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6249,7 +5415,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6262,7 +5427,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6275,7 +5439,6 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge @@ -6288,20 +5451,17 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ - func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ @@ -6311,22 +5471,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6336,7 +5492,6 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le @@ -6349,20 +5504,17 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ - func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ @@ -6372,22 +5524,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6397,7 +5545,6 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt @@ -6410,20 +5557,17 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ - func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ @@ -6433,22 +5577,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6458,7 +5598,6 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt @@ -6471,20 +5610,17 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ - func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ @@ -6494,22 +5630,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6519,14 +5651,12 @@ CUDA: take_out_cuda - func: take(Tensor self, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: take_cpu CUDA: take_cuda - func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6537,7 +5667,6 @@ CUDA: index_select_out_cuda - func: index_select(Tensor self, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: index_select_cpu_ @@ -6549,11 +5678,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function - func: index_select_backward(Tensor grad, int[] self_sizes, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6564,14 +5691,12 @@ CUDA: masked_select_out_cuda - func: masked_select(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: masked_select_cpu CUDA: masked_select_cuda - func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6582,14 +5707,12 @@ CUDA: nonzero_out_cuda - func: nonzero(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_nonzero CUDA: nonzero_cuda - func: nonzero_numpy(Tensor self) -> Tensor[] - use_c10_dispatcher: full variants: method, function - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) @@ -6599,13 +5722,11 @@ CUDA: gather_out_cpu_cuda - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gather - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6613,11 +5734,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor - use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -6625,13 +5744,11 @@ CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcmul - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcmul_ @@ -6642,7 +5759,6 @@ CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcdiv @@ -6654,7 +5770,6 @@ CUDA: legacy::cuda::_th_gels_out - func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_gels @@ -6666,13 +5781,11 @@ DefaultBackend: triangular_solve_out - func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triangular_solve - func: _triangular_solve_helper(Tensor self, Tensor A, bool upper, bool transpose, bool unitriangular) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _triangular_solve_helper_cpu @@ -6684,13 +5797,11 @@ DefaultBackend: symeig_out - func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: symeig - func: _symeig_helper(Tensor self, bool eigenvectors, bool upper) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _symeig_helper_cpu @@ -6702,7 +5813,6 @@ DefaultBackend: eig_out - func: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: eig @@ -6713,13 +5823,11 @@ DefaultBackend: svd_out - func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: svd - func: _svd_helper(Tensor self, bool some, bool compute_uv) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _svd_helper_cpu @@ -6727,23 +5835,19 @@ # swapaxes, alias for transpose - func: swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False # swapdims, alias for transpose - func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False @@ -6753,13 +5857,11 @@ DefaultBackend: cholesky_out - func: cholesky(Tensor self, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: cholesky - func: _cholesky_helper(Tensor self, bool upper) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _cholesky_helper_cpu @@ -6771,20 +5873,17 @@ DefaultBackend: cholesky_solve_out - func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: cholesky_solve - func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _cholesky_solve_helper_cpu CUDA: _cholesky_solve_helper_cuda - func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: solve @@ -6795,7 +5894,6 @@ DefaultBackend: solve_out - func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _solve_helper_cpu @@ -6808,7 +5906,6 @@ CUDA: legacy::cuda::_th_potri_out - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_potri @@ -6820,7 +5917,6 @@ Math: qr_out - func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R) - use_c10_dispatcher: full variants: method, function dispatch: Math: qr @@ -6832,7 +5928,6 @@ CUDA: legacy::cuda::_th_geqrf_out - func: geqrf(Tensor self) -> (Tensor a, Tensor tau) - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_geqrf @@ -6844,7 +5939,6 @@ CPU: legacy::cpu::_th_orgqr_out - func: orgqr(Tensor self, Tensor input2) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_orgqr @@ -6855,13 +5949,11 @@ CPU: legacy::cpu::_th_ormqr_out - func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_ormqr - func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _lu_with_info_cpu @@ -6873,13 +5965,11 @@ DefaultBackend: lu_solve_out - func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: lu_solve - func: _lu_solve_helper(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _lu_solve_helper_cpu @@ -6892,20 +5982,17 @@ CPU, CUDA: multinomial_out - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: multinomial - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: legacy::cpu::_th_multinomial_alias_setup CUDA: legacy::cuda::_th_multinomial_alias_setup - func: _multinomial_alias_draw(Tensor J, Tensor q, int num_samples, *, Generator? 
generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: legacy::cpu::_th_multinomial_alias_draw @@ -6918,7 +6005,6 @@ CUDA: _lgamma_out_cuda - func: lgamma(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lgamma @@ -6929,7 +6015,6 @@ CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: digamma @@ -6940,19 +6025,16 @@ CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: polygamma - func: erfinv(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: erfinv - func: erfinv_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: erfinv_ @@ -6963,13 +6045,11 @@ CPU, CUDA: erfinv_out - func: i0(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: i0 - func: i0_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: i0_ @@ -6980,13 +6060,11 @@ CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sign - func: sign_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: sign_ @@ -6997,7 +6075,6 @@ CPU, CUDA: sign_out - func: signbit(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -7007,7 +6084,6 @@ CUDA: signbit_out - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: dist @@ -7018,7 +6094,6 @@ CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: atan2 @@ -7036,14 +6111,12 @@ CUDA: lerp_cuda_tensor_out - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_tensor @@ -7056,7 +6129,6 @@ CUDA: _histc_out_cuda - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_histc @@ -7068,7 +6140,6 @@ CPU, CUDA: fmod_out - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7079,7 +6150,6 @@ CPU, CUDA: fmod_out - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7090,7 +6160,6 @@ CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: hypot @@ -7107,7 +6176,6 @@ CPU, CUDA: igamma_out - func: igamma(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igamma @@ -7124,13 +6192,11 @@ CPU, CUDA: igammac_out - func: igammac(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igammac - func: igammac_(Tensor(a!) 
self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: igammac_ @@ -7141,7 +6207,6 @@ CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: nextafter @@ -7158,7 +6223,6 @@ CPU, CUDA: remainder_out - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder @@ -7169,27 +6233,23 @@ CPU, CUDA: remainder_out - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder - func: min(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: min QuantizedCPU: min_quantized_cpu - func: max(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: max QuantizedCPU: max_quantized_cpu - func: maximum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: maximum @@ -7202,14 +6262,12 @@ # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max - func: max.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: minimum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: minimum @@ -7225,35 +6283,30 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: min.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) @@ -7263,7 +6316,6 @@ CUDA: legacy::cuda::_th_sort_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU: sort_cpu @@ -7274,7 +6326,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function - func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -7283,17 +6334,14 @@ Math: msort_out - func: msort(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: Math: msort - func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -7303,20 +6351,17 @@ CUDA: legacy::cuda::_th_topk_out - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: topk QuantizedCPU: topk_quantized_cpu - func: all(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: all - func: any(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: any @@ -7329,14 +6374,12 @@ CUDA: legacy::cuda::_th_renorm_out - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -7344,13 +6387,11 @@ QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: unfold_backward - func: equal(Tensor self, Tensor other) -> bool - use_c10_dispatcher: full variants: method, function dispatch: CPU: cpu_equal @@ -7363,7 +6404,6 @@ CPU, CUDA: pow_out - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: pow @@ -7374,7 +6414,6 @@ CPU, CUDA: pow_out - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: pow @@ -7385,7 +6424,6 @@ SparseCPU, SparseCUDA: pow_out_sparse_scalar - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: pow @@ -7397,7 +6435,6 @@ Math: float_power_out - func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power @@ -7408,7 +6445,6 @@ Math: float_power_out - func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: Math: float_power @@ -7418,25 +6454,21 @@ Math: float_power_out - func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power - func: 
float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: normal_ @@ -7447,7 +6479,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7457,7 +6488,6 @@ CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7467,7 +6497,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7478,19 +6507,16 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: alias(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: alias - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ - func: _cumsum(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumsum_cpu CUDA: _cumsum_cuda @@ -7502,7 +6528,6 @@ CUDA: _cumsum_out_cuda - func: _cumprod(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumprod_cpu CUDA: _cumprod_cuda @@ -7514,29 +6539,24 @@ CUDA: _cumprod_out_cuda - func: _var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_var - func: _std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_std - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ - func: _amp_update_scale(Tensor(a!) 
growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_update_scale_cuda - func: _cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cat_cpu CUDA: cat_cuda @@ -7550,644 +6570,552 @@ QuantizedCPU: cat_out_quantized_cpu - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow_ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow_ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ - func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ - func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow_ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ - func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - 
use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ - func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow CUDA: foreach_tensor_add_scalarlist_kernel_cuda - func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ - func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow CUDA: foreach_tensor_sub_scalarlist_kernel_cuda - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow_ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ - func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow CUDA: foreach_tensor_div_scalarlist_kernel_cuda - func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ - func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow CUDA: foreach_tensor_mul_scalarlist_kernel_cuda - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow_ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ - func: _foreach_exp(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow CUDA: foreach_tensor_exp_cuda - func: _foreach_zero_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_zero_slow_ CUDA: foreach_tensor_zero_cuda_ - func: _foreach_exp_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow_ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_abs(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow CUDA: foreach_tensor_abs_cuda - func: _foreach_abs_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow_ CUDA: foreach_tensor_abs_cuda_ - func: _foreach_acos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow CUDA: foreach_tensor_acos_cuda - func: _foreach_acos_(Tensor(a!)[] self) -> 
() - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow_ CUDA: foreach_tensor_acos_cuda_ - func: _foreach_asin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow CUDA: foreach_tensor_asin_cuda - func: _foreach_asin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow_ CUDA: foreach_tensor_asin_cuda_ - func: _foreach_atan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow CUDA: foreach_tensor_atan_cuda - func: _foreach_atan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow_ CUDA: foreach_tensor_atan_cuda_ - func: _foreach_ceil(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow CUDA: foreach_tensor_ceil_cuda - func: _foreach_ceil_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow_ CUDA: foreach_tensor_ceil_cuda_ - func: _foreach_cos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow CUDA: foreach_tensor_cos_cuda - func: _foreach_cos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow_ CUDA: foreach_tensor_cos_cuda_ - func: _foreach_cosh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow CUDA: foreach_tensor_cosh_cuda - func: _foreach_cosh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow_ CUDA: foreach_tensor_cosh_cuda_ - func: _foreach_erf(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow CUDA: foreach_tensor_erf_cuda - func: _foreach_erf_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow_ CUDA: foreach_tensor_erf_cuda_ - func: _foreach_erfc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erfc_slow CUDA: foreach_tensor_erfc_cuda - func: _foreach_erfc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ - func: _foreach_expm1(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow CUDA: foreach_tensor_expm1_cuda - func: _foreach_expm1_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow_ CUDA: foreach_tensor_expm1_cuda_ - func: _foreach_floor(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow CUDA: foreach_tensor_floor_cuda - func: _foreach_floor_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ - func: _foreach_log(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow CUDA: foreach_tensor_log_cuda - func: _foreach_log_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow_ CUDA: foreach_tensor_log_cuda_ - func: _foreach_log10(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: 
full variants: function dispatch: CPU: foreach_tensor_log10_slow CUDA: foreach_tensor_log10_cuda - func: _foreach_log10_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log10_slow_ CUDA: foreach_tensor_log10_cuda_ - func: _foreach_log1p(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow CUDA: foreach_tensor_log1p_cuda - func: _foreach_log1p_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow_ CUDA: foreach_tensor_log1p_cuda_ - func: _foreach_log2(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow CUDA: foreach_tensor_log2_cuda - func: _foreach_log2_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow_ CUDA: foreach_tensor_log2_cuda_ - func: _foreach_neg(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow CUDA: foreach_tensor_neg_cuda - func: _foreach_neg_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ - func: _foreach_tan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow CUDA: foreach_tensor_tan_cuda - func: _foreach_tan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow_ CUDA: foreach_tensor_tan_cuda_ - func: _foreach_tanh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow CUDA: foreach_tensor_tanh_cuda - func: _foreach_tanh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow_ CUDA: foreach_tensor_tanh_cuda_ - func: _foreach_sin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sin_slow CUDA: foreach_tensor_sin_cuda - func: _foreach_sin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sin_slow_ CUDA: foreach_tensor_sin_cuda_ - func: _foreach_sinh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow CUDA: foreach_tensor_sinh_cuda - func: _foreach_sinh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow_ CUDA: foreach_tensor_sinh_cuda_ - func: _foreach_round(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow CUDA: foreach_tensor_round_cuda - func: _foreach_round_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ - func: _foreach_lgamma(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow CUDA: foreach_tensor_lgamma_cuda - func: _foreach_lgamma_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow_ CUDA: foreach_tensor_lgamma_cuda_ - func: _foreach_frac(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_frac_slow CUDA: foreach_tensor_frac_cuda - func: _foreach_frac_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: 
function dispatch: CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ - func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow CUDA: foreach_tensor_reciprocal_cuda - func: _foreach_reciprocal_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow_ CUDA: foreach_tensor_reciprocal_cuda_ - func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow CUDA: foreach_tensor_sigmoid_cuda - func: _foreach_sigmoid_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ - func: _foreach_trunc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow CUDA: foreach_tensor_trunc_cuda - func: _foreach_trunc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow_ CUDA: foreach_tensor_trunc_cuda_ - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow_ CUDA: foreach_tensor_addcdiv_scalar_cuda_ - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow_ CUDA: foreach_tensor_addcmul_scalar_cuda_ - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow_ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow_ CUDA: foreach_tensor_addcmul_scalarlist_cuda_ - func: _foreach_addcdiv.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow CUDA: foreach_tensor_addcdiv_scalar_cuda - func: _foreach_addcmul.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow CUDA: foreach_tensor_addcmul_scalar_cuda - func: _foreach_addcdiv.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow CUDA: foreach_tensor_addcdiv_scalarlist_cuda - func: _foreach_addcmul.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow CUDA: foreach_tensor_addcmul_scalarlist_cuda - func: _foreach_maximum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_maximum_slow CUDA: foreach_tensor_maximum_cuda - func: _foreach_minimum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: 
foreach_tensor_minimum_slow CUDA: foreach_tensor_minimum_cuda - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_mode CUDA: legacy::cuda::_th_mode @@ -8199,7 +7127,6 @@ CUDA: legacy::cuda::_th_mode_out - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda @@ -8211,13 +7138,11 @@ CUDA: bucketize_out_cuda - func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda - func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8229,7 +7154,6 @@ CUDA: searchsorted_out_cuda - func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8243,7 +7167,6 @@ CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss @@ -8255,7 +7178,6 @@ CPU, CUDA: mse_loss_backward_out - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss_backward @@ -8267,7 +7189,6 @@ DefaultBackend: l1_loss_out - func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss @@ -8279,7 +7200,6 @@ CPU, CUDA: l1_loss_backward_out - func: l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss_backward @@ -8317,7 +7237,6 @@ python_module: nn - func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn - func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) 
is_target) -> (Tensor(a!), Tensor(b!)) @@ -8328,7 +7247,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward_out - func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target) - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_forward_cpu @@ -8342,7 +7260,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_backward_cpu @@ -8428,7 +7345,6 @@ CUDA: smooth_l1_loss_out - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: smooth_l1_loss @@ -8441,7 +7357,6 @@ CUDA: smooth_l1_loss_backward_out - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: smooth_l1_loss_backward @@ -8453,7 +7368,6 @@ DefaultBackend: soft_margin_loss_out - func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss @@ -8465,7 +7379,6 @@ DefaultBackend: soft_margin_loss_backward_out - func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss_backward @@ -8477,7 +7390,6 @@ CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu @@ -8489,13 +7401,11 @@ CPU, CUDA: elu_backward_out - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: elu_ @@ -8508,7 +7418,6 @@ CUDA: legacy::cuda::_thnn_glu_forward_out - func: glu(Tensor self, int dim=-1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu @@ -8522,7 +7431,6 @@ CUDA: legacy::cuda::_thnn_glu_backward_out - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu_backward @@ -8535,20 +7443,17 @@ CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid QuantizedCPU: hardsigmoid_quantized_cpu - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_backward @@ -8561,7 +7466,6 @@ QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh @@ -8574,13 +7478,11 @@ CPU, CUDA: hardtanh_backward_out - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_ @@ -8593,19 +7495,16 @@ CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_backward @@ -8618,20 +7517,17 @@ QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu QuantizedCPU: leaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_ @@ -8642,7 +7538,6 @@ python_module: nn - func: log_sigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn - func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!)) @@ -8653,7 +7548,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_forward_out - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_forward_cpu @@ -8667,7 +7561,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_backward_out - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_backward_cpu @@ -8681,20 +7574,17 @@ CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_out - func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: rrelu_with_noise_backward - func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu_ @@ -8707,7 +7597,6 @@ CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus @@ -8719,7 +7608,6 @@ CPU, CUDA: softplus_backward_out - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus_backward @@ -8731,7 +7619,6 @@ CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink @@ -8743,7 +7630,6 @@ CPU, CUDA: softshrink_backward_out - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink_backward @@ -8756,23 +7642,19 @@ MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_adaptive_avg_pool2d - func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: adaptive_avg_pool2d_quantized_cpu - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool2d_backward_cpu @@ -8787,7 +7669,6 @@ QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_cpu @@ -8802,7 +7683,6 @@ CUDA: adaptive_avg_pool3d_backward_out_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_backward_cpu @@ -8818,7 +7698,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_cpu @@ -8832,7 +7711,6 @@ CUDA: adaptive_max_pool2d_backward_out_cuda - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_backward_cpu @@ -8848,7 +7726,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_cpu @@ -8862,7 +7739,6 @@ CUDA: adaptive_max_pool3d_backward_out_cuda - func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_backward_cpu @@ -8877,7 +7753,6 @@ MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_cpu @@ -8893,7 +7768,6 @@ CUDA: avg_pool2d_backward_out_cuda - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_backward_cpu @@ -8908,7 +7782,6 @@ MkldnnCPU: mkldnn_avg_pool3d_out - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_cpu @@ -8924,7 +7797,6 @@ CUDA: avg_pool3d_backward_out_cuda - func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_backward_cpu @@ -8940,7 +7812,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_cpu @@ -8954,7 +7825,6 @@ CUDA: fractional_max_pool2d_backward_out_cuda - func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_backward_cpu @@ -8970,7 +7840,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_cpu @@ -8984,7 +7853,6 @@ CUDA: fractional_max_pool3d_backward_out_cuda - func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_backward_cpu @@ -9000,7 +7868,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_cpu @@ -9014,7 +7881,6 @@ CUDA: max_pool2d_with_indices_backward_out_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_backward_cpu @@ -9030,7 +7896,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_cpu @@ -9044,7 +7909,6 @@ CUDA: max_pool3d_with_indices_backward_out_cuda - func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_backward_cpu @@ 
-9058,7 +7922,6 @@ CUDA: max_unpooling2d_forward_out_cuda - func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_forward_cpu @@ -9072,7 +7935,6 @@ CUDA: max_unpooling2d_backward_out_cuda - func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_backward_cpu @@ -9086,7 +7948,6 @@ CUDA: max_unpooling3d_forward_out_cuda - func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_forward_cpu @@ -9100,7 +7961,6 @@ CUDA: max_unpooling3d_backward_out_cuda - func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_backward_cpu @@ -9114,7 +7974,6 @@ CUDA: reflection_pad1d_out_cuda - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad1d_cpu @@ -9128,7 +7987,6 @@ CUDA: reflection_pad1d_backward_out_cuda - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad1d_backward_cpu @@ -9142,7 +8000,6 @@ CUDA: reflection_pad2d_out_cuda - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad2d_cpu @@ -9156,7 +8013,6 @@ CUDA: reflection_pad2d_backward_out_cuda - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu @@ -9170,7 +8026,6 @@ CUDA: replication_pad1d_out_cuda - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_cpu @@ -9184,7 +8039,6 @@ CUDA: replication_pad1d_backward_out_cuda - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_backward_cpu @@ -9198,7 +8052,6 @@ CUDA: replication_pad2d_out_cuda - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_cpu @@ -9212,7 +8065,6 @@ CUDA: replication_pad2d_backward_out_cuda - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_backward_cpu @@ -9226,7 +8078,6 @@ CUDA: replication_pad3d_out_cuda - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_cpu @@ -9240,28 +8091,24 @@ CUDA: replication_pad3d_backward_out_cuda - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda - func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu CUDA: upsample_linear1d_cuda - func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda - func: upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9269,54 +8116,46 @@ QuantizedCPU: upsample_bilinear2d_quantized_cpu - func: upsample_bilinear2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu CUDA: upsample_bilinear2d_backward_cuda - func: upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu CUDA: upsample_trilinear3d_cuda - func: upsample_trilinear3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu CUDA: upsample_trilinear3d_backward_cuda - func: upsample_bicubic2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu CUDA: upsample_bicubic2d_cuda - func: upsample_bicubic2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu CUDA: upsample_bicubic2d_backward_cuda - func: upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d - func: upsample_nearest1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d_backward - func: upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9324,14 +8163,12 @@ QuantizedCPU: upsample_nearest2d_quantized_cpu - func: upsample_nearest2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu CUDA: upsample_nearest2d_backward_cuda - func: upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9339,7 +8176,6 @@ QuantizedCPU: upsample_nearest3d_quantized_cpu - func: upsample_nearest3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9354,7 +8190,6 @@ CUDA: upsample_linear1d_out_cuda - func: upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu @@ -9368,7 +8203,6 @@ CUDA: upsample_linear1d_backward_out_cuda - func: upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu @@ -9382,7 +8216,6 @@ CUDA: upsample_bilinear2d_out_cuda - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9397,7 +8230,6 @@ CUDA: upsample_bilinear2d_backward_out_cuda - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu @@ -9411,7 +8243,6 @@ CUDA: upsample_bicubic2d_out_cuda - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu @@ -9425,7 +8256,6 @@ CUDA: upsample_bicubic2d_backward_out_cuda - func: upsample_bicubic2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu @@ -9439,7 +8269,6 @@ CUDA: upsample_trilinear3d_out_cuda - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu @@ -9453,7 +8282,6 @@ CUDA: upsample_trilinear3d_backward_out_cuda - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu @@ -9468,7 +8296,6 @@ CUDA: upsample_nearest1d_out_cuda - func: upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d.out @@ -9481,7 +8308,6 @@ CUDA: upsample_nearest1d_backward_out_cuda - func: upsample_nearest1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d_backward.grad_input @@ -9493,7 +8319,6 @@ CUDA: upsample_nearest2d_out_cuda - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9508,7 +8333,6 @@ CUDA: upsample_nearest2d_backward_out_cuda - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? 
scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu @@ -9522,7 +8346,6 @@ CUDA: upsample_nearest3d_out_cuda - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9537,7 +8360,6 @@ CUDA: upsample_nearest3d_backward_out_cuda - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9550,7 +8372,6 @@ CPU, CUDA: sigmoid_backward_out - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: sigmoid_backward @@ -9562,7 +8383,6 @@ CPU, CUDA: logit_backward_out - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: logit_backward @@ -9574,7 +8394,6 @@ CPU, CUDA: tanh_backward_out - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: tanh_backward @@ -9619,7 +8438,6 @@ CUDA: slow_conv_transpose2d_backward_out_cuda - func: slow_conv_transpose2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose2d_backward_cpu @@ -9647,7 +8465,6 @@ CUDA: slow_conv_transpose3d_backward_out_cuda - func: slow_conv_transpose3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose3d_backward_cpu @@ -9683,7 +8500,6 @@ CUDA: slow_conv2d_backward_out_cuda - func: thnn_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv2d_backward_cpu @@ -9716,7 +8532,6 @@ CUDA: thnn_conv_depthwise2d_backward_out - func: thnn_conv_depthwise2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[2] output_mask) -> (Tensor grad_input, Tensor grad_weight) - use_c10_dispatcher: full python_module: nn dispatch: CUDA: thnn_conv_depthwise2d_backward @@ -9748,7 +8563,6 @@ CPU: slow_conv3d_backward_out_cpu - func: slow_conv3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv3d_backward_cpu @@ -9761,7 +8575,6 @@ CUDA: slow_conv_dilated2d_cuda - func: 
slow_conv_dilated2d_backward(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated2d_backward_cpu @@ -9775,7 +8588,6 @@ CUDA: slow_conv_dilated3d_cuda - func: slow_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated3d_backward_cpu @@ -9789,7 +8601,6 @@ CUDA: col2im_out_cuda - func: col2im(Tensor self, int[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: col2im_cpu @@ -9803,14 +8614,12 @@ CUDA: col2im_backward_out_cuda - func: col2im_backward(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: col2im_backward_cpu CUDA: col2im_backward_cuda - func: column_stack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full dispatch: Math: column_stack @@ -9827,7 +8636,6 @@ CUDA: im2col_out_cuda - func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: im2col_cpu @@ -9841,30 +8649,25 @@ CUDA: im2col_backward_out_cuda - func: im2col_backward(Tensor grad_output, int[2] input_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: im2col_backward_cpu CUDA: im2col_backward_cuda - func: isfinite(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False - func: isinf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False - func: record_stream(Tensor(a!) self, Stream s) -> () - use_c10_dispatcher: full variants: method dispatch: CUDA: record_stream_cuda - func: isposinf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9873,7 +8676,6 @@ CPU, CUDA: isposinf_out - func: isneginf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9886,12 +8688,10 @@ # of the vmap frontend API (see torch/_vmap_internals.py). They are not # user-facing, hence the leading underscore. Please don't use them them anywhere else. - func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor - use_c10_dispatcher: full variants: function # See NOTE [_add_batch_dim and _remove_batch_dim] - func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor - use_c10_dispatcher: full variants: function ## Functions related to the fast Fourier transform and the torch.fft namespace @@ -9906,7 +8706,6 @@ # torch.fft.fft # NOTE: NOT an alias for torch.fft, which has different semantics - func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9916,7 +8715,6 @@ variants: function - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? 
norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9926,7 +8724,6 @@ variants: function - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9936,7 +8733,6 @@ variants: function - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9946,7 +8742,6 @@ variants: function - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9956,7 +8751,6 @@ variants: function - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9966,7 +8760,6 @@ variants: function - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9976,7 +8769,6 @@ variants: function - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9986,7 +8778,6 @@ variants: function - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9996,7 +8787,6 @@ variants: function - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10006,7 +8796,6 @@ variants: function - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10016,7 +8805,6 @@ variants: function - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10026,7 +8814,6 @@ variants: function - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10036,7 +8823,6 @@ variants: function - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10066,12 +8852,10 @@ variants: function - func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function - func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10085,7 +8869,6 @@ # See linalg_det as an example. 
- func: linalg_cholesky(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10100,25 +8883,21 @@ # torch.linalg.det, alias for torch.det - func: linalg_det(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: det(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: det - func: _syevd_helper(Tensor self, bool compute_eigenvectors, str uplo) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _syevd_helper_cpu CUDA: _syevd_helper_cuda - func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10131,7 +8910,6 @@ DefaultBackend: linalg_eigh_out - func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10144,7 +8922,6 @@ DefaultBackend: linalg_eigvalsh_out - func: inner(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -10152,14 +8929,12 @@ # torch.outer, alias for torch.ger - func: outer(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method - func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ger(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ger @@ -10170,12 +8945,10 @@ DefaultBackend: ger_out - func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function @@ -10190,7 +8963,6 @@ variants: function - func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10204,7 +8976,6 @@ Math: linalg_cond_out - func: linalg_cond.p_str(Tensor self, str p) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10252,7 +9023,6 @@ Math: linalg_tensorinv_out - func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10287,7 +9057,6 @@ CUDA: _linalg_qr_helper_cuda - func: linalg_matrix_rank(Tensor self, float? tol=None, bool hermitian=False) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10303,32 +9072,27 @@ ## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full # Note: this function is only for testing. - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_filled_intlist(Tensor values, int[2]? 
addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_floatlist # Note: this function is only for testing. - func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor - use_c10_dispatcher: full python_module: nn # Note: this function is only for testing. From eef5eb05bf0468ed5f840d2bf3e09c135b8760df Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 30/44] Remove backward and requires_grad from Autograd backend key (#49613) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49613 Just following a TODO in the code base... ghstack-source-id: 119450484 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25644597 fbshipit-source-id: 26f5fa6af480929d0468b0de3ab103813e40d78b --- torch/csrc/autograd/VariableTypeManual.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index d1f15fff3669..f6c3f23cd0f7 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -387,14 +387,6 @@ TORCH_LIBRARY_IMPL(aten, Autograd, m) { m.impl("detach", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach))); m.impl("detach_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach_))); m.impl("copy_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::copy_))); - // For backward() and requires_grad_(), we need the DefaultBackend kernel, but we also need the Autograd backend - // kernel, because when called with a VariableTensorId tensor, it goes through the variable fallback kernel, - // which calls callBoxed(), which doesn't support optional tensor arguments yet and backward() has an optional - // tensor argument. - // TODO Once callBoxed() supports optional tensor arguments, we can enable `use_c10_dispatcher: full` for backward() - // and requires_grad_(), then remove the backend Autograd kernel here, only leaving the Math kernel. - m.impl("_backward", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_backward))); - m.impl("requires_grad_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::requires_grad_))); m.impl("_fw_primal", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_fw_primal))); } From dde5b6e177ec24d34651ffd8df04b4ebdf264e6e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 14:39:42 -0800 Subject: [PATCH 31/44] [PyTorch] Reapply D25547962: Make tls_local_dispatch_key_set inlineable (reapply) (#49763) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49763 This was reverted because it landed in a stack together with D25542799 (https://github.com/pytorch/pytorch/commit/9ce1df079f6ea90dd4b7f9aa12a1a78d51a8b204), which really was broken. 
ghstack-source-id: 119063016 Test Plan: CI Reviewed By: ezyang Differential Revision: D25685959 fbshipit-source-id: 514d8076eac67c760f119cfebc2ae3d0ddcd4e04 --- c10/core/impl/LocalDispatchKeySet.cpp | 20 +++----------------- c10/core/impl/LocalDispatchKeySet.h | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/c10/core/impl/LocalDispatchKeySet.cpp b/c10/core/impl/LocalDispatchKeySet.cpp index 358e6ef7e1f7..ff3e454eda8a 100644 --- a/c10/core/impl/LocalDispatchKeySet.cpp +++ b/c10/core/impl/LocalDispatchKeySet.cpp @@ -5,10 +5,6 @@ namespace c10 { namespace impl { -C10_DEFINE_bool(disable_variable_dispatch, false, "This flag forcibly disables the Variable code paths from executing, which currently breaks profiling in the process."); - -namespace { - /// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, /// thread_local is not supported. #ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY @@ -18,25 +14,15 @@ thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; #else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) -static PODLocalDispatchKeySet raw_local_dispatch_key_set; +PODLocalDispatchKeySet raw_local_dispatch_key_set; #endif -} // anonymous namespace - +#ifdef _MSC_VER LocalDispatchKeySet tls_local_dispatch_key_set() { - // Hack until variable performance is fixed - // - // ezyang: I'm pretty unhappy about this implementation, it looks wrong - // to me, as it seems to be performing a mutation on - // raw_local_dispatch_key_set. I can't conveniently test the correct - // version though... - if (FLAGS_disable_variable_dispatch) { - raw_local_dispatch_key_set.set_excluded( - raw_local_dispatch_key_set.excluded() | autograd_dispatch_keyset); - } return raw_local_dispatch_key_set; } +#endif // _MSC_VER void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set) { raw_local_dispatch_key_set = PODLocalDispatchKeySet { diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 5262b1d4d6c0..313dc5ca3508 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -23,8 +23,6 @@ namespace c10 { namespace impl { -C10_DECLARE_bool(disable_variable_dispatch); - // POD version of LocalDispatchKeySet. Declared here just so that // we can put it in the guards. struct C10_API PODLocalDispatchKeySet { @@ -54,7 +52,24 @@ struct C10_API LocalDispatchKeySet { DispatchKeySet excluded_; }; +// thread_local variables cannot be C10_API on Windows. +#ifdef _MSC_VER C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); +#else // _MSC_VER +/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +/// thread_local is not supported. +#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY + extern C10_API thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; +#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + extern C10_API PODLocalDispatchKeySet raw_local_dispatch_key_set; +#endif + +inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { + // Don't let people fiddle with the thread_local directly just + // because they include this header. 
+ return raw_local_dispatch_key_set; +} +#endif // _MSC_VER // Internal, use ThreadLocalStateGuard C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); From 3270e661c3e885171ffd3a7fa94cb085b267be3a Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Wed, 6 Jan 2021 14:53:55 -0800 Subject: [PATCH 32/44] [PyTorch Mobile] Skip signature check when converting to typed operator handle (#49469) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49469 In Functions.cpp, there is a call to `typed<...>` that converts to a `TypedOperatorHandle`. This isn't needed at runtime since it's already been exercised during development, and for mobile, there is no possibility of operators or kernels being registered by users (from Python code the way it is possible on server side). ghstack-source-id: 118714246 Test Plan: Sandcastle ### App testing results: FBiOS fails with an error similar to this one: https://fb.workplace.com/groups/2102613013103952/permalink/3815085708523332/ Tested 2 AR effects (gren screen and colors shift) on IGiOS. ### BSB results: D25581159-V1 (https://www.internalfb.com/intern/diff/D25581159/?dest_number=118689912) **fbios: Succeeded** Change in Download Size for arm64 + 3x assets variation: -7.2 KiB Change in Uncompressed Size for arm64 + 3x assets variation: -27.1 KiB Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:135971531636706@base/bsb:135971531636706@diff/ D25581159-V1 (https://www.internalfb.com/intern/diff/D25581159/?dest_number=118689912) **fbios-pika: Succeeded** Change in Download Size for arm64 + 3x assets variation: -11.0 KiB Change in Uncompressed Size for arm64 + 3x assets variation: -7.4 KiB Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:430379774665351@base/bsb:430379774665351@diff/ 3:02 AM D25581159-V1 (https://www.internalfb.com/intern/diff/D25581159/?dest_number=118689912) **igios: Succeeded** Change in Download Size for arm64 + 3x assets variation: -5.3 KiB Change in Uncompressed Size for arm64 + 3x assets variation: -17.3 KiB Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:685843828784135@base/bsb:685843828784135@diff/ Reviewed By: iseeyuan Differential Revision: D25581159 fbshipit-source-id: 4a62982829ec42c2d3f58f47f876f2543bc0099b --- aten/src/ATen/core/dispatch/Dispatcher.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 9641dfbea0cd..d83653f75363 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -304,7 +304,9 @@ class TORCH_API OperatorHandle { // smuggle in a kernel that is typed incorrectly). For everything // in core library this won't happen, because all the static registrations // will be done by the time a typed() handle is acquired. 
+#if !defined C10_MOBILE operatorIterator_->op.assertSignatureIsCorrect(); +#endif return TypedOperatorHandle(operatorIterator_); } From dc41d1765517316073b5faa3be6a46e50005a0fe Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Wed, 6 Jan 2021 15:40:35 -0800 Subject: [PATCH 33/44] .circleci: Add option to not run build workflow (#50162) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50162 Adds an option to not run the build workflow when the `run_build` parameter is set to false Should reduce the amount of double workflows that are run by pytorch-probot Uses functionality introduced in https://github.com/pytorch/pytorch-probot/pull/18 Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: yns88 Differential Revision: D25812971 Pulled By: seemethere fbshipit-source-id: 4832170f6abcabe3f385f47a663d148b0cfe2a28 --- .circleci/config.yml | 4 ++++ .circleci/generate_config_yml.py | 5 ++++- .circleci/verbatim-sources/header-section.yml | 3 +++ .github/pytorch-circleci-labels.yml | 2 ++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0716e516518b..d19c08b2b0b6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins @@ -9762,6 +9765,7 @@ workflows: only: - postnightly executor: windows-with-nvidia-gpu + when: << pipeline.parameters.run_build >> ecr_gc: triggers: - schedule: diff --git a/.circleci/generate_config_yml.py b/.circleci/generate_config_yml.py index f1af924bd3e2..a836d2e510a6 100755 --- a/.circleci/generate_config_yml.py +++ b/.circleci/generate_config_yml.py @@ -112,7 +112,10 @@ def gen_build_workflows_tree(): "when": r"<< pipeline.parameters.run_binary_tests >>", "jobs": [f() for f in binary_build_functions], }, - "build": {"jobs": [f() for f in build_workflows_functions]}, + "build": { + "when": r"<< pipeline.parameters.run_build >>", + "jobs": [f() for f in build_workflows_functions] + }, } } diff --git a/.circleci/verbatim-sources/header-section.yml b/.circleci/verbatim-sources/header-section.yml index 26205a0cccba..43d4c94ee5ed 100644 --- a/.circleci/verbatim-sources/header-section.yml +++ b/.circleci/verbatim-sources/header-section.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins diff --git a/.github/pytorch-circleci-labels.yml b/.github/pytorch-circleci-labels.yml index ccdf2e876af1..3a9eeca0abcc 100644 --- a/.github/pytorch-circleci-labels.yml +++ b/.github/pytorch-circleci-labels.yml @@ -9,3 +9,5 @@ labels_to_circle_params: - release/.* tags: - v[0-9]+(\.[0-9]+)*-rc[0-9]+ + set_to_false: + - run_build From eb8003d8e9639369c25b16f2cec338590beb0cb8 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 6 Jan 2021 15:43:37 -0800 Subject: [PATCH 34/44] [FX] Remove extraneous newlines at end of code (#50117) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50117 Test Plan: Imported from OSS Reviewed By: ansley Differential Revision: D25791847 Pulled By: jamesr66a fbshipit-source-id: 9c0b296e117e6bcf69ed9624ad0b243fa3db0f76 --- test/test_fx.py | 5 +++++ torch/fx/graph.py | 11 ++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 65d5aa3f0101..2511adc52c62 100644 --- 
a/test/test_fx.py +++ b/test/test_fx.py @@ -861,6 +861,11 @@ def forward(self, x, w): x, w = torch.rand(3, 4), torch.rand(4, 4) self.assertTrue(any(n.target == torch.relu for n in traced.graph.nodes)) + def test_empty_graph_codegen(self): + graph = torch.fx.Graph() + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + self.assertEqual(gm(), None) + def test_sequential(self): m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)) gm = torch.fx.symbolic_trace(m) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index fd0087dca398..6e493676f8c2 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -693,13 +693,18 @@ def emit_node(node : Node): import_strs = [f'import {name}' for name in sorted(modules_used)] import_block = '\n'.join(import_strs) + if len(body) == 0: + # If the Graph has no non-placeholder nodes, no lines for the body + # have been emitted. To continue to have valid Python code, emit a + # single pass statement + body.append('pass\n') + code = ''.join(body) - code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' + code = '\n'.join(' ' + line for line in code.split('\n')) fn_code = f"""\ {import_block} def forward(self, {', '.join(free_vars)}){maybe_return_annotation[0]}: -{code} -""" +{code}""" return fn_code From e49372d460c80102bd6bae3dd2f8c1e2b61ebc1b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 6 Jan 2021 16:12:47 -0800 Subject: [PATCH 35/44] Bugfix nightly checkout tool to work on Windows (#49274) Summary: I am submitting this PR on behalf of Janne Hellsten(nurpax) from NVIDIA, for the convenience of CLA. Thanks Janne a lot for the contribution! This fixes the bug when running ` ./tools/nightly.py checkout -b my-nightly-branch` on windows. Before this fix, this command gets the following error on Windows. ``` ERROR:root:Fatal exception Traceback (most recent call last): File "./tools/nightly.py", line 166, in logging_manager yield root_logger File "./tools/nightly.py", line 644, in main install( File "./tools/nightly.py", line 552, in install spdir = _site_packages(pytdir.name, platform) File "./tools/nightly.py", line 325, in _site_packages os.path.join(pytdir.name, "Lib", "site-packages") NameError: name 'pytdir' is not defined log file: d:\pytorch\nightly\log\2020-12-11_16h10m14s_6867a21e-3c0e-11eb-878e-04ed3363a33e\nightly.log ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/49274 Reviewed By: H-Huang Differential Revision: D25808156 Pulled By: malfet fbshipit-source-id: 00778016366ab771fc3fb152710c7849210640fb --- tools/nightly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/nightly.py b/tools/nightly.py index 1fecc67e72f3..55a90e3fd9fb 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -322,10 +322,10 @@ def pytorch_install(url): def _site_packages(dirname, platform): if platform.startswith("win"): - os.path.join(pytdir.name, "Lib", "site-packages") + template = os.path.join(dirname, "Lib", "site-packages") else: template = os.path.join(dirname, "lib", "python*.*", "site-packages") - spdir = glob.glob(template)[0] + spdir = glob.glob(template)[0] return spdir From 6838ecefb6c9d138dfde7f1eaeccff6c8fc72fff Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 16:38:05 -0800 Subject: [PATCH 36/44] Clean up some type annotations in torch/jit (#49939) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49939 Upgrades type annotations from Python2 to Python3 Test Plan: Sandcastle tests Reviewed By: xush6528 Differential Revision: D25717573 fbshipit-source-id: 
7d5c98fafaa224e0504b73dc69b1e4a6410c0494 --- torch/jit/quantized.py | 60 +++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/torch/jit/quantized.py b/torch/jit/quantized.py index 615741f38da7..d853a55b3933 100644 --- a/torch/jit/quantized.py +++ b/torch/jit/quantized.py @@ -130,8 +130,7 @@ def check_forward_input(self, input): input.size(1), self.input_size)) @torch.jit.script_method - def check_forward_hidden(self, input, hx, hidden_label=''): - # type: (Tensor, Tensor, str) -> None + def check_forward_hidden(self, input: Tensor, hx: Tensor, hidden_label: str = '') -> None: if input.size(0) != hx.size(0): raise RuntimeError( "Input batch size {} doesn't match hidden{} batch size {}".format( @@ -169,8 +168,7 @@ def __init__(self, other): self.nonlinearity = other.nonlinearity @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -201,8 +199,7 @@ def __init__(self, other): super(QuantizedLSTMCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] + def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: self.check_forward_input(input) if hx is None: zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -222,8 +219,7 @@ def __init__(self, other): super(QuantizedGRUCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -236,8 +232,7 @@ def forward(self, input, hx=None): ) -def apply_permutation(tensor, permutation, dim=1): - # type: (Tensor, Tensor, int) -> Tensor +def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: return tensor.index_select(dim, permutation) @@ -303,8 +298,7 @@ def get_weight_bias(ihhh): self.all_weights.append(cell_params) @torch.jit.script_method - def check_input(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> None + def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None: expected_input_dim = 2 if batch_sizes is not None else 3 if input.dim() != expected_input_dim: raise RuntimeError( @@ -316,8 +310,7 @@ def check_input(self, input, batch_sizes): self.input_size, input.size(-1))) @torch.jit.script_method - def get_expected_hidden_size(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> Tuple[int, int, int] + def get_expected_hidden_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]: if batch_sizes is not None: mini_batch = int(batch_sizes[0]) else: @@ -328,21 +321,19 @@ def get_expected_hidden_size(self, input, batch_sizes): return expected_hidden_size @torch.jit.script_method - def check_hidden_size(self, hx, expected_hidden_size, msg='Expected hidden size {}, got {}'): - # type: (Tensor, Tuple[int, int, int], str) -> None + def check_hidden_size(self, hx: Tensor, expected_hidden_size: Tuple[int, int, int], + msg: str = 'Expected hidden size {}, got 
{}') -> None: if hx.size() != expected_hidden_size: raise RuntimeError(msg.format(expected_hidden_size, list(hx.size()))) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tensor, Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) self.check_hidden_size(hidden, expected_hidden_size, msg='Expected hidden size {}, got {}') @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tensor, Optional[Tensor]) -> Tensor + def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor: if permutation is None: return hx return apply_permutation(hx, permutation) @@ -355,8 +346,9 @@ def __init__(self, other, dtype): super(QuantizedLSTM, self).__init__(other, dtype) @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]], batch_sizes: Optional[Tensor], + max_batch_size: int, sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 zeros = torch.zeros(self.num_layers * num_directions, @@ -379,8 +371,7 @@ def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] + def forward_tensor(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -391,8 +382,8 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tuple[Tensor, Tensor]]) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]] # noqa + def forward_packed(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None + ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) @@ -404,15 +395,13 @@ def forward_packed(self, input, hx=None): @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tuple[Tensor, Tensor], Optional[Tensor]) -> Tuple[Tensor, Tensor] + def permute_hidden(self, hx: Tuple[Tensor, Tensor], permutation: Optional[Tensor]) -> Tuple[Tensor, Tensor]: if permutation is None: return hx return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tuple[Tensor, Tensor], Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor], batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) @@ -432,8 +421,9 @@ class QuantizedGRU(QuantizedRNNBase): __overloads__ = {'forward': ['forward_packed', 
'forward_tensor']} @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tensor], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tensor] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tensor], batch_sizes: Optional[Tensor], max_batch_size: int, + sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tensor]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 hx = torch.zeros(self.num_layers * num_directions, @@ -459,8 +449,7 @@ def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tuple[Tensor, Tensor] + def forward_tensor(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -470,8 +459,7 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tensor]) -> Tuple[PackedSequence, Tensor] + def forward_packed(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) From fa160d18e701f673c0af99d2e92f485b181bb789 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 18:06:31 -0800 Subject: [PATCH 37/44] [PyTorch][jit] Add Type::{castRaw,expectRef} (#50061) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50061 These are more efficient than creating an extra `shared_ptr` when you just want to access the casted value. ghstack-source-id: 119325644 Test Plan: CI Reviewed By: ezyang Differential Revision: D25766630 fbshipit-source-id: 46f11f70333b44714cab708a4850922ab7486793 --- aten/src/ATen/core/jit_type_base.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 37da9ad7ef8d..e5a6d48340cf 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -152,6 +152,20 @@ struct TORCH_API Type : std::enable_shared_from_this { return nullptr; } template + T* castRaw() { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template + const T* castRaw() const { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template std::shared_ptr expect() { auto r = cast(); AT_ASSERT(r); @@ -163,6 +177,18 @@ struct TORCH_API Type : std::enable_shared_from_this { AT_ASSERT(r); return r; } + template + T& expectRef() { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } + template + const T& expectRef() const { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } virtual ~Type() = default; virtual bool hasFreeVariables() const { return false; From ef1fa547ba015b620b7bf53aa2908ab68ea5d5d2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 18:06:31 -0800 Subject: [PATCH 38/44] [PyTorch] Use expectRef() when calling listConstruct (#50062) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50062 Avoids creating an extra shared_ptr. 
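To make the shared_ptr saving concrete, here is a short sketch (illustrative only, not part of the patch; the function name `inspect_list_type` and its argument are hypothetical) contrasting the pre-existing `expect<T>()` accessor with the `expectRef<T>()`/`castRaw<T>()` accessors added in the previous commit:

```
// Minimal sketch assuming only the public Type API from ATen/core/jit_type.h.
#include <ATen/core/jit_type.h>

void inspect_list_type(const c10::TypePtr& type) {
  // Old pattern: expect<T>() materializes a fresh shared_ptr<ListType>
  // (an extra refcount bump) even when we only want to read a field.
  auto list_ptr = type->expect<c10::ListType>();
  auto elem_old = list_ptr->getElementType();

  // New pattern: expectRef<T>() borrows a reference to the same object,
  // asserting on a kind mismatch, without constructing a shared_ptr.
  const c10::ListType& list_ref = type->expectRef<c10::ListType>();
  auto elem_new = list_ref.getElementType();

  // castRaw<T>() is the non-asserting counterpart: it returns nullptr
  // instead of failing when the kind does not match.
  if (const c10::ListType* maybe_list = type->castRaw<c10::ListType>()) {
    (void)maybe_list;
  }
  (void)elem_old;
  (void)elem_new;
}
```

This is what lets the call sites below pass `type->expectRef<at::ListType>()` straight into `listConstruct`, whose parameter becomes `const at::ListType&` instead of a `ListTypePtr`.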
ghstack-source-id: 119325645 Test Plan: CI Reviewed By: ezyang Differential Revision: D25766631 fbshipit-source-id: f2ab8349dfea325054820fa2c1055180c740574e --- torch/csrc/jit/mobile/interpreter.cpp | 2 +- torch/csrc/jit/passes/constant_propagation.cpp | 4 +++- torch/csrc/jit/runtime/interpreter.cpp | 3 ++- torch/csrc/jit/runtime/static/ops.cpp | 2 +- torch/csrc/jit/runtime/vararg_functions.cpp | 9 +++------ torch/csrc/jit/runtime/vararg_functions.h | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 031c21474618..681eddfaa832 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -148,7 +148,7 @@ bool InterpreterState::run(Stack& stack) { case RET: return false; case LIST_CONSTRUCT: { - auto type = code_->types_[inst.X]->expect(); + const auto& type = code_->types_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++pc; } break; diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index da9d551a6c88..75be7e86acab 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -45,7 +45,9 @@ c10::optional> runNodeIfInputsAreConstant( } break; case prim::ListConstruct: { listConstruct( - stack, n->output()->type()->expect(), n->inputs().size()); + stack, + n->output()->type()->expectRef(), + n->inputs().size()); } break; case prim::DictConstruct: { dictConstruct( diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index ce4718becaf7..7d588b6d96e7 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1495,7 +1495,8 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { ++frame.pc; } break; case LIST_CONSTRUCT: { - auto type = frame.function->type_table_[inst.X]->expect(); + const auto& type = + frame.function->type_table_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++frame.pc; } break; diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 89519d3765b5..4d66c6382c2d 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -361,7 +361,7 @@ getNativeOperation(Node* n) { // run op listConstruct( stack, - p_node->get_node()->output()->type()->expect(), + p_node->get_node()->output()->type()->expectRef(), p_node->input_regs().size()); // put output back p_node->Output(0, reg) = std::move(stack[0]); diff --git a/torch/csrc/jit/runtime/vararg_functions.cpp b/torch/csrc/jit/runtime/vararg_functions.cpp index 44bc56206eaf..220a5e67f723 100644 --- a/torch/csrc/jit/runtime/vararg_functions.cpp +++ b/torch/csrc/jit/runtime/vararg_functions.cpp @@ -204,16 +204,13 @@ void namedTupleConstruct( c10::ivalue::Tuple::createNamed(std::move(elems), std::move(type))); } -void listConstruct( - Stack& stack, - const at::ListTypePtr& type, - size_t num_inputs) { +void listConstruct(Stack& stack, const at::ListType& type, size_t num_inputs) { // Structuring the implementation this way allows NRVO to avoid // move-constructing vals on its way onto the stack. Moving a List // isn't free. 
auto makeList = - [](Stack& stack, const at::ListTypePtr& type, size_t num_inputs) { - c10::List vals(type->getElementType()); + [](Stack& stack, const at::ListType& type, size_t num_inputs) { + c10::List vals(type.getElementType()); vals.reserve(num_inputs); for (size_t i = stack.size() - num_inputs; i < stack.size(); ++i) { vals.emplace_back(std::move(stack[i])); diff --git a/torch/csrc/jit/runtime/vararg_functions.h b/torch/csrc/jit/runtime/vararg_functions.h index d6eba7f5d191..e9580411212a 100644 --- a/torch/csrc/jit/runtime/vararg_functions.h +++ b/torch/csrc/jit/runtime/vararg_functions.h @@ -25,7 +25,7 @@ void namedTupleConstruct( void listConstruct( Stack& stack, - const at::ListTypePtr& list_type, + const at::ListType& list_type, size_t num_inputs); void dictConstruct( From b6b76a105511b4bf896447321caf091cdb00a507 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 18:09:27 -0800 Subject: [PATCH 39/44] Mod lists to neutral+descriptive terms in caffe2/caffe2/opt (#49801) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49801 Per "https://fb.workplace.com/groups/e/permalink/3320810064641820/" we can no longer use the terms "whitelist" and "blacklist", and editing any file containing them results in a critical error signal. Let's embrace the change. This diff changes "blacklist" to "blocklist" in a number of non-interface contexts (interfaces would require more extensive testing and might interfere with reading stored data, so those are deferred until later). Test Plan: Sandcastle Reviewed By: xush6528 Differential Revision: D25686949 fbshipit-source-id: e07de4d228674ae61559719cbe4717f8044778d2 --- caffe2/opt/fakefp16_transform.cc | 6 +++--- caffe2/opt/glow_net_transform.cc | 22 +++++++++++----------- caffe2/opt/glow_net_transform.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/caffe2/opt/fakefp16_transform.cc b/caffe2/opt/fakefp16_transform.cc index 424056bd2c80..cbd3132dfc08 100644 --- a/caffe2/opt/fakefp16_transform.cc +++ b/caffe2/opt/fakefp16_transform.cc @@ -299,8 +299,8 @@ void fakeFp16Transform(NetDef* net) { FLAGS_fake_fp16_conversion_use_fp16_acc, FLAGS_fake_fp16_conversion_use_nnpi); - auto blacklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); - auto blacklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); + auto blocklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); // A hack to only do fakefp16 transformation for operators which will be // lowered to ONNXIFI. 
@@ -320,7 +320,7 @@ void fakeFp16Transform(NetDef* net) { auto* op = net->mutable_op(i); auto net_pos = ArgumentHelper::GetSingleArgument(*op, "net_pos", -1); - if (blacklist_pos.count(net_pos) || blacklist_type.count(op->type())) { + if (blocklist_pos.count(net_pos) || blocklist_type.count(op->type())) { continue; } auto it = kFakeFp16OpConversionMap.find(op->type()); diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index ee3ce1b27e2c..45ce9a487fbb 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -107,7 +107,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size, @@ -154,19 +154,19 @@ void onnxifi( // Before applying backlist, make sure the ops in the net all have an net_pos; caffe2::BackendTransformerBase::annotateOpIndex(net); - // Parse the blacklist - auto more_blacklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); - for (const auto& b : blacklist) { - more_blacklist.emplace(b); + // Parse the blocklist + auto more_blocklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); + for (const auto& b : blocklist) { + more_blocklist.emplace(b); } // ONNX mode will change the op order so it doesn't apply here if (!opts.use_onnx) { - auto blacklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); for (const auto& op : net->op()) { - if (blacklisted_ops.count(op.type())) { + if (blocklisted_ops.count(op.type())) { ArgumentHelper helper(op); - more_blacklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); + more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); } } } @@ -179,7 +179,7 @@ void onnxifi( // 1. for specified op, we find its input and outputs. // 2. for each input and output, we create a new copy op and attach it as an // input to the copy. - // 3. we blacklist these new copy operators from onnxification. This forces + // 3. we blocklist these new copy operators from onnxification. This forces // these intermediate tensors to also become outputs of the onnxifi op. // 4. we put the right arguments on the copy ops so TensorObserver can print // out the values. @@ -213,11 +213,11 @@ void onnxifi( AddArgument(kNetPos, pos, ©_op); AddArgument("observe_input_tensors", 1, ©_op); net->add_op()->CopyFrom(copy_op); - more_blacklist.emplace(pos); + more_blocklist.emplace(pos); } OnnxifiTransformer ts(opts); - ts.transform(ws, net, weight_names, more_shape_hints, more_blacklist); + ts.transform(ws, net, weight_names, more_shape_hints, more_blocklist); // Cleanup the input from the workspace for (const auto& i : input_names) { diff --git a/caffe2/opt/glow_net_transform.h b/caffe2/opt/glow_net_transform.h index e8d1c9b9054f..f6cd975a6e91 100644 --- a/caffe2/opt/glow_net_transform.h +++ b/caffe2/opt/glow_net_transform.h @@ -16,7 +16,7 @@ namespace caffe2 { namespace glow { /// Onnxifi transformation on the net and workspace. We also /// needed the input data/shape to populate the shape. In addition, we take a \p -/// blacklist to control and mask what ops we want to consider in onnxifi +/// blocklist to control and mask what ops we want to consider in onnxifi /// process. We can also set whether to use ONNX proto or C2 proto through /// ONNXIFI interface. 
void onnxifi( @@ -25,7 +25,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size = 0, From 4e2ab2cd734b5142622c7406b4bddc662caf7501 Mon Sep 17 00:00:00 2001 From: Qifan Lu Date: Wed, 6 Jan 2021 18:25:02 -0800 Subject: [PATCH 40/44] Move generator state APIs to ATen (#49589) Summary: ## Rationale While most of the `torch.Generator` properties and methods are implemented as a thin wrapper of the corresponding `at::Generator` methods, `torch.Generator.get_state()` and `torch.Generator.set_state()` are implemented in legacy Torch code and are not dispatched through the `c10::GeneratorImpl` interface. This is not structured well and makes implementing generators for new backends (e.g. `XLAGeneratorImpl` for the XLA backend) inconvenient. As such, this pull request seeks to move these generator state APIs to c10 and ATen. ## What is being refactored? * Interfaces - Added `c10::GeneratorImpl::set_state` and `c10::GeneratorImpl::state` for getting and setting the internal state of a random number generator. - `at::Generator::set_state` and `at::Generator::state` wraps the above-mentioned APIs, as it's basically a PIMPL. - Added helper function `at::detail::check_rng_state` for checking the validity of new RNG state tensor. * CPU Generator - Renamed and moved `THTensor_(setRNGState)` and `THTensor_(getRNGState)` to `CPUGeneratorImpl::set_state` and `CPUGenerator::state`. - Renamed and moved `THGeneratorState` and `THGeneratorStateNew` to `CPUGeneratorStateLegacy` and `CPUGeneratorState`. * CUDA Generator - Renamed and moved `THCRandom_setRNGState` and `THCRandom_getRNGState` to `CUDAGeneratorImpl::set_state` and `CUDAGeneratorImpl::state`. * PyTorch Bindings - `THPGenerator_setState` and `THPGenerator_getState` now simply forward to `at::Generator::set_state` and `at::Generator::state`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49589 Reviewed By: H-Huang Differential Revision: D25785774 Pulled By: pbelevich fbshipit-source-id: 8ed79209c4ffb1a0ae8b19952ac8871ac9e0255f --- aten/src/ATen/CPUGeneratorImpl.cpp | 160 +++++++++++++++++++++++ aten/src/ATen/CPUGeneratorImpl.h | 2 + aten/src/ATen/CUDAGeneratorImpl.h | 4 +- aten/src/ATen/core/Generator.cpp | 16 +++ aten/src/ATen/core/Generator.h | 28 ++++ aten/src/ATen/cuda/CUDAGeneratorImpl.cpp | 63 ++++++++- aten/src/ATen/test/cpu_rng_test.cpp | 2 + aten/src/TH/CMakeLists.txt | 1 - aten/src/TH/THGenerator.hpp | 39 ------ aten/src/TH/generic/THTensorRandom.cpp | 116 ---------------- aten/src/TH/generic/THTensorRandom.h | 5 - aten/src/THC/THCTensorRandom.cu | 54 -------- aten/src/THC/THCTensorRandom.h | 5 - c10/core/GeneratorImpl.h | 3 + test/cpp_extensions/rng_extension.cpp | 2 + torch/csrc/Generator.cpp | 46 +++---- 16 files changed, 294 insertions(+), 252 deletions(-) create mode 100644 aten/src/ATen/core/Generator.cpp delete mode 100644 aten/src/TH/THGenerator.hpp diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index bfa4a2a8f72f..ff4a2f1c61e2 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -6,6 +8,42 @@ namespace at { namespace detail { +/** + * CPUGeneratorImplStateLegacy is a POD class needed for memcpys + * in torch.get_rng_state() and torch.set_rng_state(). 
+ * It is a legacy class and even though it is replaced with + * at::CPUGeneratorImpl, we need this class and some of its fields + * to support backward compatibility on loading checkpoints. + */ +struct CPUGeneratorImplStateLegacy { + /* The initial seed. */ + uint64_t the_initial_seed; + int left; /* = 1; */ + int seeded; /* = 0; */ + uint64_t next; + uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ + + /********************************/ + + /* For normal distribution */ + double normal_x; + double normal_y; + double normal_rho; + int normal_is_valid; /* = 0; */ +}; + +/** + * CPUGeneratorImplState is a POD class containing + * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used + * as a helper for torch.get_rng_state() and torch.set_rng_state() + * functions. + */ +struct CPUGeneratorImplState { + CPUGeneratorImplStateLegacy legacy_pod; + float next_float_normal_sample; + bool is_next_float_normal_sample_valid; +}; + /** * PyTorch maintains a collection of default generators that get * initialized once. The purpose of these default generators is to @@ -75,6 +113,128 @@ uint64_t CPUGeneratorImpl::seed() { return random; } +/** + * Sets the internal state of CPUGeneratorImpl. The new internal state + * must be a strided CPU byte tensor and of the same size as either + * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or + * CPUGeneratorImplState (for new state). + * + * FIXME: Remove support of the legacy state in the future? + */ +void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + using detail::CPUGeneratorImplState; + using detail::CPUGeneratorImplStateLegacy; + + static_assert(std::is_pod::value, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_pod::value, "CPUGeneratorImplState is not a PODType"); + + static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); + static const size_t size_current = sizeof(CPUGeneratorImplState); + static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size"); + + detail::check_rng_state(new_state); + + at::mt19937 engine; + auto float_normal_sample = c10::optional(); + auto double_normal_sample = c10::optional(); + + // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. + CPUGeneratorImplStateLegacy* legacy_pod; + auto new_state_size = new_state.numel(); + if (new_state_size == size_legacy) { + legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); + // Note that in CPUGeneratorImplStateLegacy, we didn't have float version + // of normal sample and hence we leave the c10::optional as is + + // Update next_double_normal_sample. + // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) + // and a rho value (normal_rho). These three values were redundant and in the new + // DistributionsHelper.h, we store the actual extra normal sample, rather than three + // intermediate values. 
+ if (legacy_pod->normal_is_valid) { + auto r = legacy_pod->normal_rho; + auto theta = 2.0 * M_PI * legacy_pod->normal_x; + // we return the sin version of the normal sample when in caching mode + double_normal_sample = c10::optional(r * ::sin(theta)); + } + } else if (new_state_size == size_current) { + auto rng_state = (CPUGeneratorImplState*)new_state.data(); + legacy_pod = &rng_state->legacy_pod; + // update next_float_normal_sample + if (rng_state->is_next_float_normal_sample_valid) { + float_normal_sample = c10::optional(rng_state->next_float_normal_sample); + } + + // Update next_double_normal_sample. + // Note that in getRNGState, we now return the actual normal sample in normal_y + // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho + // are squashed to 0.0. + if (legacy_pod->normal_is_valid) { + double_normal_sample = c10::optional(legacy_pod->normal_y); + } + } else { + AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, + " or a CPUGeneratorImplState of size ", size_current, + " but found the input RNG state size to be ", new_state_size); + } + + // construct engine_ + // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our + // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are + // doing a std::copy. + at::mt19937_data_pod rng_data; + std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); + rng_data.seed_ = legacy_pod->the_initial_seed; + rng_data.left_ = legacy_pod->left; + rng_data.seeded_ = legacy_pod->seeded; + rng_data.next_ = static_cast(legacy_pod->next); + engine.set_data(rng_data); + TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state"); + this->engine_ = engine; + this->next_float_normal_sample_ = float_normal_sample; + this->next_double_normal_sample_ = double_normal_sample; +} + +/** + * Gets the current internal state of CPUGeneratorImpl. The internal + * state is returned as a CPU byte tensor. 
+ */
+c10::intrusive_ptr<c10::TensorImpl> CPUGeneratorImpl::get_state() const {
+  using detail::CPUGeneratorImplState;
+
+  static const size_t size = sizeof(CPUGeneratorImplState);
+  static_assert(std::is_pod<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");
+
+  auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
+  auto rng_state = state_tensor.data_ptr();
+
+  // accumulate generator data to be copied into byte tensor
+  auto accum_state = std::make_unique<CPUGeneratorImplState>();
+  auto rng_data = this->engine_.data();
+  accum_state->legacy_pod.the_initial_seed = rng_data.seed_;
+  accum_state->legacy_pod.left = rng_data.left_;
+  accum_state->legacy_pod.seeded = rng_data.seeded_;
+  accum_state->legacy_pod.next = rng_data.next_;
+  std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state));
+  accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy
+  accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy
+  accum_state->legacy_pod.normal_is_valid = false;
+  accum_state->legacy_pod.normal_y = 0.0;
+  accum_state->next_float_normal_sample = 0.0f;
+  accum_state->is_next_float_normal_sample_valid = false;
+  if (this->next_double_normal_sample_) {
+    accum_state->legacy_pod.normal_is_valid = true;
+    accum_state->legacy_pod.normal_y = *(this->next_double_normal_sample_);
+  }
+  if (this->next_float_normal_sample_) {
+    accum_state->is_next_float_normal_sample_valid = true;
+    accum_state->next_float_normal_sample = *(this->next_float_normal_sample_);
+  }
+
+  memcpy(rng_state, accum_state.get(), size);
+  return state_tensor.getIntrusivePtr();
+}
+
 /**
  * Gets the DeviceType of CPUGeneratorImpl.
  * Used for type checking during run time.
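
For context: the new `CPUGeneratorImpl::get_state`/`set_state` pair above is what ends up backing the Python-level `torch.Generator.get_state()` / `torch.Generator.set_state()` round trip. A minimal sketch of the expected behavior (public Python API only; the returned byte tensor is treated here as an opaque blob whose size is `sizeof(CPUGeneratorImplState)`):

```python
import torch

g = torch.Generator()                  # CPU generator backed by CPUGeneratorImpl
state = g.get_state()                  # ByteTensor holding a CPUGeneratorImplState blob
assert state.dtype == torch.uint8

before = torch.randn(3, generator=g)   # advances the generator
g.set_state(state)                     # restore the saved state
after = torch.randn(3, generator=g)
assert torch.equal(before, after)      # same draws after restoring the state
```

Per the FIXME above, `set_state` also still accepts a legacy-sized `CPUGeneratorImplStateLegacy` blob, so RNG states saved by older checkpoints keep loading.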
diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h
index eceb338966fd..f8b43a04c73c 100644
--- a/aten/src/ATen/CPUGeneratorImpl.h
+++ b/aten/src/ATen/CPUGeneratorImpl.h
@@ -17,6 +17,8 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
   void set_current_seed(uint64_t seed) override;
   uint64_t current_seed() const override;
   uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
   static DeviceType device_type();
   uint32_t random();
   uint64_t random64();
diff --git a/aten/src/ATen/CUDAGeneratorImpl.h b/aten/src/ATen/CUDAGeneratorImpl.h
index 9a9febd01f8e..1179a049aa08 100644
--- a/aten/src/ATen/CUDAGeneratorImpl.h
+++ b/aten/src/ATen/CUDAGeneratorImpl.h
@@ -129,8 +129,10 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   void set_current_seed(uint64_t seed) override;
   uint64_t current_seed() const override;
   uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
   void set_philox_offset_per_thread(uint64_t offset);
-  uint64_t philox_offset_per_thread();
+  uint64_t philox_offset_per_thread() const;
   void capture_prologue(int64_t* offset_extragraph);
   uint64_t capture_epilogue();
   PhiloxCudaState philox_cuda_state(uint64_t increment);
diff --git a/aten/src/ATen/core/Generator.cpp b/aten/src/ATen/core/Generator.cpp
new file mode 100644
index 000000000000..800f8c7c88ec
--- /dev/null
+++ b/aten/src/ATen/core/Generator.cpp
@@ -0,0 +1,16 @@
+#include <ATen/core/Generator.h>
+#include <ATen/core/Tensor.h>
+#include <c10/util/Exception.h>
+
+namespace at {
+
+void Generator::set_state(const at::Tensor& new_state) {
+  TORCH_CHECK(new_state.defined(), "Undefined tensor is not allowed");
+  this->impl_->set_state(*new_state.unsafeGetTensorImpl());
+}
+
+at::Tensor Generator::get_state() const {
+  return at::Tensor::wrap_tensor_impl(this->impl_->get_state());
+}
+
+} // namespace at
diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h
index de3f6e46f8f2..b5bbb2fe3c74 100644
--- a/aten/src/ATen/core/Generator.h
+++ b/aten/src/ATen/core/Generator.h
@@ -56,6 +56,8 @@
 
 namespace at {
 
+class Tensor;
+
 struct TORCH_API Generator {
   Generator() {}
@@ -96,6 +98,12 @@ struct TORCH_API Generator {
   uint64_t seed() { return impl_->seed(); }
 
+  // Implementation not inlined to prevent cycle reference between
+  // `ATen/core/Generator.h` and `ATen/core/Tensor.h`
+  void set_state(const at::Tensor& new_state);
+
+  at::Tensor get_state() const;
+
   std::mutex& mutex() {
     return impl_->mutex_;
   }
@@ -130,4 +138,24 @@ Generator make_generator(Args&&... args) {
   return Generator(c10::make_intrusive<Impl>(std::forward<Args>(args)...));
 }
 
+namespace detail {
+
+/**
+ * Helper function for checking the validity of new random generator
+ * state. Right now following conditions are checked:
+ *
+ * - The new state tensor must be a torch.ByteTensor
+ * - Data of the new state tensor must be contiguous
+ */
+static inline void check_rng_state(const c10::TensorImpl& new_state) {
+  TORCH_CHECK_TYPE(
+    new_state.layout() == kStrided && new_state.device().type() == kCPU && new_state.dtype() == kByte,
+    "RNG state must be a torch.ByteTensor"
+  );
+
+  TORCH_CHECK(new_state.is_contiguous(), "RNG state must be contiguous");
+}
+
+} // namespace detail
+
 } // namespace at
diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
index 8a5e4f48e0c0..f0572bb6d809 100644
--- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
@@ -130,6 +130,67 @@ uint64_t CUDAGeneratorImpl::seed() {
   return random;
 }
 
+/**
+ * Gets the current internal state of CUDAGeneratorImpl. The internal
+ * state is returned as a CPU byte tensor.
+ */
+c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
+  // The RNG state comprises the seed, and an offset used for Philox.
+  // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120.
+  // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. Hardcoding these numbers here
+  // because this is just host side code and we don't want to worry about linking with cuda
+  static const size_t states_size = 200 * sizeof(4120);
+  static const size_t seed_size = sizeof(uint64_t);
+  static const size_t offset_size = sizeof(int64_t);
+  static const size_t total_size = states_size + seed_size + offset_size;
+
+  auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
+  auto rng_state = state_tensor.data_ptr<uint8_t>();
+  // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1
+  // gen_states in THCGenerator struct was an array of curandStateMtgp32s.
+  memset(rng_state, -1, states_size);
+  auto current_seed = this->current_seed();
+  auto offset = static_cast<int64_t>(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic<int64_t>
+  memcpy(rng_state + states_size, &current_seed, seed_size);
+  memcpy(rng_state + states_size + seed_size, &offset, offset_size);
+
+  return state_tensor.getIntrusivePtr();
+}
+
+/**
+ * Sets the internal state of CUDAGeneratorImpl. The new internal state
+ * must be a strided CPU byte tensor and have appropriate size. See
+ * comments of CUDAGeneratorImpl::state for information about the layout
+ * and size of the internal state.
+ */
+void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
+  static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason
+  static const size_t seed_size = sizeof(uint64_t);
+  static const size_t offset_size = sizeof(int64_t);
+  static const size_t total_size = states_size + seed_size + offset_size;
+
+  detail::check_rng_state(new_state);
+
+  bool no_philox_seed = false;
+  auto new_state_size = new_state.numel();
+  if (new_state_size == total_size - offset_size) {
+    no_philox_seed = true;
+  } else {
+    TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size");
+  }
+
+  uint64_t input_seed;
+  auto new_rng_state = new_state.data<uint8_t>();
+  memcpy(&input_seed, new_rng_state + states_size, seed_size);
+  this->set_current_seed(input_seed);
+  int64_t philox_offset = 0;
+  if (!no_philox_seed) {
+    memcpy(&philox_offset, new_rng_state + states_size + seed_size, offset_size);
+  }
+  this->set_philox_offset_per_thread(static_cast<uint64_t>(philox_offset));
+}
+
 /**
  * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10
  *
@@ -143,7 +204,7 @@ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
 /**
  * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
  */
-uint64_t CUDAGeneratorImpl::philox_offset_per_thread() {
+uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
   at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread");
   return philox_offset_per_thread_;
 }
diff --git a/aten/src/ATen/test/cpu_rng_test.cpp b/aten/src/ATen/test/cpu_rng_test.cpp
index 6d596095d7a0..805ed40557b6 100644
--- a/aten/src/ATen/test/cpu_rng_test.cpp
+++ b/aten/src/ATen/test/cpu_rng_test.cpp
@@ -28,6 +28,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl {
   void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); }
   uint64_t current_seed() const override { throw std::runtime_error("not implemented"); }
   uint64_t seed() override { throw std::runtime_error("not implemented"); }
+  void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); }
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override { throw std::runtime_error("not implemented"); }
   TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); }
 
   static DeviceType device_type() { return DeviceType::CPU; }
diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt
index a3ed10126b93..5661a697da38 100644
--- a/aten/src/TH/CMakeLists.txt
+++ b/aten/src/TH/CMakeLists.txt
@@ -79,7 +79,6 @@ install(FILES
   THHalf.h
   THTensor.hpp
   THStorageFunctions.hpp
-  THGenerator.hpp
   DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH")
 
 install(FILES
diff --git a/aten/src/TH/THGenerator.hpp b/aten/src/TH/THGenerator.hpp
deleted file mode 100644
index 1a40611f8b5b..000000000000
--- a/aten/src/TH/THGenerator.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include <ATen/core/MT19937RNGEngine.h>
-
-/**
- * THGeneratorState is a POD class needed for memcpys
- * in torch.get_rng_state() and torch.set_rng_state().
- * It is a legacy class and even though it is replaced with
- * at::CPUGeneratorImpl, we need this class and some of its fields
- * to support backward compatibility on loading checkpoints.
- */
-struct THGeneratorState {
-  /* The initial seed.
*/ - uint64_t the_initial_seed; - int left; /* = 1; */ - int seeded; /* = 0; */ - uint64_t next; - uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ - - /********************************/ - - /* For normal distribution */ - double normal_x; - double normal_y; - double normal_rho; - int normal_is_valid; /* = 0; */ -}; - -/** - * THGeneratorStateNew is a POD class containing - * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used - * as a helper for torch.get_rng_state() and torch.set_rng_state() - * functions. - */ -struct THGeneratorStateNew { - THGeneratorState legacy_pod; - float next_float_normal_sample; - bool is_next_float_normal_sample_valid; -}; diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 399bcc38e1de..c37b0b9bb7f0 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) @@ -149,119 +148,4 @@ void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTens } } #endif - -#if defined(TH_REAL_IS_BYTE) -void THTensor_(getRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - static const size_t size = sizeof(THGeneratorStateNew); - THTensor_(resize1d)(self, size); - THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - // cast byte tensor to POD type - THGeneratorStateNew* rng_state = (THGeneratorStateNew*)self->data(); - - // accumulate generator data to be copied into byte tensor - auto accum_state = std::make_unique(); - auto cast_generator = at::check_generator(_generator); - auto rng_data = cast_generator->engine().data(); - accum_state->legacy_pod.the_initial_seed = rng_data.seed_; - accum_state->legacy_pod.left = rng_data.left_; - accum_state->legacy_pod.seeded = rng_data.seeded_; - accum_state->legacy_pod.next = rng_data.next_; - std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state)); - accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_is_valid = false; - accum_state->legacy_pod.normal_y = 0.0; - accum_state->next_float_normal_sample = 0.0f; - accum_state->is_next_float_normal_sample_valid = false; - if(cast_generator->next_double_normal_sample()) { - accum_state->legacy_pod.normal_is_valid = true; - accum_state->legacy_pod.normal_y = *(cast_generator->next_double_normal_sample()); - } - if(cast_generator->next_float_normal_sample()) { - accum_state->is_next_float_normal_sample_valid = true; - accum_state->next_float_normal_sample = *(cast_generator->next_float_normal_sample()); - } - - memcpy(rng_state, accum_state.get(), size); -} - -void THTensor_(setRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - auto cast_generator = at::check_generator(_generator); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorState is not a 
PODType"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - static const size_t size_legacy = sizeof(THGeneratorState); - static const size_t size_current = sizeof(THGeneratorStateNew); - static_assert(size_legacy != size_current, "Legacy THGeneratorState and THGeneratorStateNew can't be of the same size"); - - at::mt19937 engine; - auto float_normal_sample = c10::optional(); - auto double_normal_sample = c10::optional(); - - // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. - THGeneratorState* legacy_pod; - if (THTensor_(nElement)(self) == size_legacy) { - legacy_pod = (THGeneratorState*)self->data(); - // Note that in legacy THGeneratorState, we didn't have float version - // of normal sample and hence we leave the c10::optional as is - - // Update next_double_normal_sample. - // Note that legacy THGeneratorState stores two uniform values (normal_x, normal_y) - // and a rho value (normal_rho). These three values were redundant and in the new - // DistributionsHelper.h, we store the actual extra normal sample, rather than three - // intermediate values. - if (legacy_pod->normal_is_valid) { - auto r = legacy_pod->normal_rho; - auto theta = 2.0 * M_PI * legacy_pod->normal_x; - // we return the sin version of the normal sample when in caching mode - double_normal_sample = c10::optional(r * ::sin(theta)); - } - } else if (THTensor_(nElement)(self) == size_current) { - auto rng_state = (THGeneratorStateNew*)self->data(); - legacy_pod = &rng_state->legacy_pod; - // update next_float_normal_sample - if (rng_state->is_next_float_normal_sample_valid) { - float_normal_sample = c10::optional(rng_state->next_float_normal_sample); - } - - // Update next_double_normal_sample. - // Note that in getRNGState, we now return the actual normal sample in normal_y - // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho - // are squashed to 0.0. - if (legacy_pod->normal_is_valid) { - double_normal_sample = c10::optional(legacy_pod->normal_y); - } - } else { - AT_ERROR("Expected either a THGeneratorState of size ", size_legacy, - " or a THGeneratorStateNew of size ", size_current, - " but found the input RNG state size to be ", THTensor_(nElement)(self)); - } - - // construct engine_ - // Note that legacy THGeneratorState stored a state array of 64 bit uints, whereas in our - // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are - // doing a std::copy. 
- at::mt19937_data_pod rng_data; - std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); - rng_data.seed_ = legacy_pod->the_initial_seed; - rng_data.left_ = legacy_pod->left; - rng_data.seeded_ = legacy_pod->seeded; - rng_data.next_ = static_cast(legacy_pod->next); - engine.set_data(rng_data); - THArgCheck(engine.is_valid(), 1, "Invalid mt19937 state"); - cast_generator->set_engine(engine); - cast_generator->set_next_float_normal_sample(float_normal_sample); - cast_generator->set_next_double_normal_sample(double_normal_sample); -} -#endif #endif diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h index ffc52bc69390..ddeb905680cd 100644 --- a/aten/src/TH/generic/THTensorRandom.h +++ b/aten/src/TH/generic/THTensorRandom.h @@ -9,9 +9,4 @@ TH_API void THTensor_(multinomialAliasSetup)(THTensor *prob_dist, THLongTensor * TH_API void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTensor *J, int n_sample, c10::optional _generator); #endif -#if defined(TH_REAL_IS_BYTE) -TH_API void THTensor_(getRNGState)(at::Generator _generator, THTensor *self); -TH_API void THTensor_(setRNGState)(at::Generator _generator, THTensor *self); -#endif - #endif diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index aefb427f4e67..8655ea2fb829 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -12,60 +12,6 @@ #define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 -// NB: ROCm compiler seems to have a bug where __host__ functions must be -// explicitly specified extern "C" otherwise ROCm compiler doesn't respect it. -// See https://github.com/RadeonOpenCompute/hcc/issues/839 -__host__ void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - // The RNG state comprises the seed, and an offset used for Philox. - // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120. - // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); - // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. Hardcoding these numbers here - // because this is just host side code and we don't want to worry about linking with cuda - static const size_t states_size = 200 * sizeof(4120); - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - THByteTensor_resize1d(rng_state, total_size); - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1 - // gen_states in THCGenerator struct was an array of curandStateMtgp32s. 
- memset(THByteTensor_data(rng_state), -1, states_size); - auto current_seed = gen->current_seed(); - auto offset = static_cast(gen->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic - memcpy(THByteTensor_data(rng_state) + states_size, ¤t_seed, seed_size); - memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &offset, offset_size); -} - -__host__ void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - bool no_philox_seed = false; - if (THByteTensor_nElement(rng_state) == total_size - offset_size) { - no_philox_seed = true; - } - else { - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - } - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - uint64_t input_seed; - memcpy(&input_seed, THByteTensor_data(rng_state) + states_size, seed_size); - gen->set_current_seed(input_seed); - int64_t philox_offset = 0; - if (!no_philox_seed) { - memcpy(&philox_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size); - } - gen->set_philox_offset_per_thread(static_cast(philox_offset)); -} - #include #include diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h index b1d7f1ef1797..696e36f70bec 100644 --- a/aten/src/THC/THCTensorRandom.h +++ b/aten/src/THC/THCTensorRandom.h @@ -9,9 +9,4 @@ #include #include -#include - -TORCH_CUDA_API void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state); -TORCH_CUDA_API void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state); - #endif diff --git a/c10/core/GeneratorImpl.h b/c10/core/GeneratorImpl.h index 3af652a1a3b2..84e620e93a72 100644 --- a/c10/core/GeneratorImpl.h +++ b/c10/core/GeneratorImpl.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * Note [Generator] @@ -71,6 +72,8 @@ struct C10_API GeneratorImpl : public c10::intrusive_ptr_target { virtual void set_current_seed(uint64_t seed) = 0; virtual uint64_t current_seed() const = 0; virtual uint64_t seed() = 0; + virtual void set_state(const c10::TensorImpl& new_state) = 0; + virtual c10::intrusive_ptr get_state() const = 0; Device device() const; // See Note [Acquire lock when using random generators] diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index 4a71a526617f..f3ab91fb3cab 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -22,6 +22,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } uint64_t seed() override { throw std::runtime_error("not implemented"); } + void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); } + c10::intrusive_ptr get_state() const override { throw std::runtime_error("not implemented"); } TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); } static DeviceType device_type() { return DeviceType::CPU; } diff --git a/torch/csrc/Generator.cpp 
b/torch/csrc/Generator.cpp index 55e5abc29ef9..2bc478f36007 100644 --- a/torch/csrc/Generator.cpp +++ b/torch/csrc/Generator.cpp @@ -15,7 +15,6 @@ #include #ifdef USE_CUDA -#include #include #endif @@ -78,45 +77,32 @@ static PyObject * THPGenerator_getState(PyObject *_self, PyObject *noargs) { using namespace torch::autograd; HANDLE_TH_ERRORS - auto self = (THPGenerator*)_self; - Variable var = torch::empty({0}, at::device(at::kCPU).dtype(at::kByte)); - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } - return THPVariable_Wrap(std::move(var)); + auto& gen = ((THPGenerator*)_self)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + auto state_tensor = gen.get_state(); + + return THPVariable_Wrap(std::move(state_tensor)); END_HANDLE_TH_ERRORS } static PyObject * THPGenerator_setState(PyObject *_self, PyObject *_new_state) { using namespace torch::autograd; - auto self = (THPGenerator*)_self; + HANDLE_TH_ERRORS if (!THPVariable_Check(_new_state)) { throw torch::TypeError("expected a torch.ByteTensor, but got %s", Py_TYPE(_new_state)->tp_name); } - auto& tensor = ((THPVariable*)_new_state)->cdata; - if (tensor.layout() != kStrided || tensor.device().type() != kCPU || tensor.scalar_type() != kByte) { - auto type_name = torch::utils::options_to_string(tensor.options()); - throw torch::TypeError("expected a torch.ByteTensor, but got %s", type_name.c_str()); - } - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } + auto self = (THPGenerator*)_self; + auto& gen = self->cdata; + auto& new_state_tensor = ((THPVariable*)_new_state)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + gen.set_state(new_state_tensor); + Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS From 838e73de2042083503021ae8505e066b93d4c2d4 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 6 Jan 2021 18:35:09 -0800 Subject: [PATCH 41/44] enable alltoall_single torchscript support (#48345) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48345 Test Plan: wait for sandcastle Differential Revision: D25074475 fbshipit-source-id: 04261f8453567154b0464f8348320e936ca06384 --- .../check_backward_compatibility.py | 2 +- test/distributed/test_jit_c10d.py | 27 ++++++++++--------- torch/csrc/distributed/c10d/init.cpp | 27 ++++++++++++++++--- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 8527293189a9..2d5d50096c81 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -72,7 +72,7 @@ def allow_listed(schema, allow_list): dont_parse_list = [ ("_TorchScriptTesting.*", datetime.date(2099, 9, 
17)), ("test_backend", datetime.date(2099, 9, 17)), - ("c10d.frontend", datetime.date(2020, 12, 30)), + ("dist_c10d", datetime.date(2021, 1, 30)), ] diff --git a/test/distributed/test_jit_c10d.py b/test/distributed/test_jit_c10d.py index 85788b914059..182a405d0e78 100644 --- a/test/distributed/test_jit_c10d.py +++ b/test/distributed/test_jit_c10d.py @@ -4,6 +4,7 @@ import torch import torch.distributed as c10d import time +from datetime import timedelta from typing import List import torch.testing._internal.common_utils as common @@ -31,6 +32,14 @@ def unique_process_group_name(prefix): now = int(time.time() * 1000) return "%s_%d" % (prefix, now) +def _create_tcp_store(): + addr = "localhost" + port = common.find_free_port() + timeout = timedelta(minutes=5) + timeout_millisecond = int(timeout / timedelta(milliseconds=1)) + return torch.classes.dist_c10d.TCPStore(addr, port, 1, True, timeout_millisecond) + + @unittest.skipIf( TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment", @@ -48,19 +57,15 @@ def setUp(self): raise unittest.SkipTest("NCCL test requires 2+ GPUs") def _create_nccl_pg(self, name_prefix): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() opts = torch.classes.dist_c10d.ProcessGroupNCCLOptions(0, True) name = unique_process_group_name(name_prefix) - return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) + return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) def _create_nccl_pg_as_base_process_group(self, name): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() return torch.classes.dist_c10d.frontend().new_process_group_helper( self.world_size, self.rank, [], "nccl", tcp_store, name, 0) @@ -155,9 +160,7 @@ def test_frontend_singleton(self): frontend1 = torch.classes.dist_c10d.frontend() frontend2 = torch.classes.dist_c10d.frontend() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() pg_name = unique_process_group_name("singleton_test_process_group") @@ -180,9 +183,7 @@ def test_process_group_as_module_member(self): class TestModule(torch.nn.Module): def __init__(self): super(TestModule, self).__init__() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() name = unique_process_group_name("module_member_process_group") self.pg = torch.classes.dist_c10d.frontend().new_process_group_helper( diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 76b466c91f10..0d4250eddd13 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1260,11 +1260,25 @@ static const auto TCPStoreTorchBind = .def(torch::init([](const std::string& host_name, int64_t port, int64_t world_size, - bool is_master) { + bool is_master, + int64_t timeout) { + auto timeout_miliseconds = std::chrono::milliseconds(timeout); return c10::make_intrusive<::c10d::TCPStore>( - host_name, port, world_size, is_master); + host_name, port, world_size, is_master, timeout_miliseconds); })); +// TODO: This should really take Store as constructor argument instead of +// TCPStore, but the fact that TorchScript 
does not support polymorphism +// forced us to cast in C++ instead of automatic casting +static const auto PrefixStoreTorchBind = + torch::class_<::c10d::PrefixStore>("dist_c10d", "PrefixStore") + .def(torch::init([](const std::string& prefix, + const c10::intrusive_ptr<::c10d::TCPStore>& store) { + return c10::make_intrusive<::c10d::PrefixStore>( + prefix, store); + })); + + // Torchbind the ProcessGroup to make it available in TorchScript static const auto ProcessGroupWorkTorchBind = torch::class_<::c10d::ProcessGroup::Work>("dist_c10d", "Work") @@ -1624,7 +1638,14 @@ static const auto ProcessGroupNCCLTorchBind = outputSplitSizes, inputSplitSizes, ::c10d::AllToAllOptions()); - }); + + }) + .def("size", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getSize(); + }) + .def("rank", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getRank(); + }); #endif static const auto DistributedC10dFrontendTorchBind = From 11cdb910b4af2b4abb6a9c98a325f6f378347fba Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Wed, 6 Jan 2021 21:46:56 -0800 Subject: [PATCH 42/44] [fx] Add matrix multiplication fusion pass (#50151) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50151 **Summary** This commit adds a graph transformation pass that merges several matrix multiplications that use the same RHS operand into one large matrix multiplication. The LHS operands from all of the smaller matrix multiplications are concatenated together and used as an input in the large matrix multiply, and the result is split in order to obtain the same products as the original set of matrix multiplications. **Test Plan** This commit adds a simple unit test with two matrix multiplications that share the same RHS operand. `python test/test_fx_experimental.py -k merge_matmul -v` Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25809409 Pulled By: SplitInfinity fbshipit-source-id: fb55c044a54dea9f07b71aa60d44b7a8f3966ed0 --- test/test_fx_experimental.py | 123 ++++++++++++++ torch/fx/experimental/merge_matmul.py | 220 ++++++++++++++++++++++++++ 2 files changed, 343 insertions(+) create mode 100644 torch/fx/experimental/merge_matmul.py diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 6e9c877b8de6..ac71d6037591 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -21,6 +21,7 @@ PartitionMode ) from torch.fx.experimental.fuser import fuse +from torch.fx.experimental import merge_matmul try: from torchvision.models import resnet18 @@ -844,6 +845,128 @@ def forward(self, a): for p_name in para_list: assert p_name in node.attrs_for_lowering + def test_merge_matmuls(self): + """ + A collection of test cases for torch.fx.experimental.merge_matmul, + a graph transformation that merges matrix multiplication operations. + """ + # Utility function for counting matmuls for test assertions. + def _count_matmuls(mod): + gm = torch.fx.symbolic_trace(mod) + + num_matmuls = 0 + for node in gm.graph.nodes: + if node.target == torch.matmul: + num_matmuls += 1 + + return num_matmuls + + # Simple test case in which there are two matmuls of the same size to merge. + class SimpleMergeMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, x, y): + a = torch.matmul(x, self.rhs) + b = torch.matmul(y, self.rhs) + return a + b + + # Initialize inputs. 
+ a = torch.randn(3, 3) + b = torch.randn(3, 3) + + # Initialize RHS for matmuls. + rhs = torch.randn(3, 4) + + # Construct SimpleMergeMatmulModule and call merge_matmul on it. + module = SimpleMergeMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(a, b) + after = opt_module(a, b) + before.allclose(after) + + # Basic graph structure check; original module should have 2 matmuls + # and optimized module should have 1. + self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 1) + + # Test case in which there are multiple matmuls of different sizes to merge. + class FiveMergeMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, a, b, c, d, e): + s = torch.Tensor((0)) + matmuls = [] + + # For some reason using a list comprehension or for-loop for this + # doesn't work. + matmuls.append(torch.matmul(a, self.rhs)) + matmuls.append(torch.matmul(b, self.rhs)) + matmuls.append(torch.matmul(c, self.rhs)) + matmuls.append(torch.matmul(d, self.rhs)) + matmuls.append(torch.matmul(e, self.rhs)) + + for m in matmuls: + s += torch.sum(m) + + return s + + # Initialize inputs. + inputs = [torch.randn(2 * i + 1, 5) for i in range(5)] + + # Initialize RHS. + rhs = torch.randn(5, 4) + + # Construct FiveMergeMatmulModule and call merge_matmul on it. + module = FiveMergeMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(*inputs) + after = opt_module(*inputs) + before.allclose(after) + + # Basic graph structure check; original module should have len(inputs) matmuls + # and optimized module should have 1. + self.assertEqual(_count_matmuls(module), len(inputs)) + self.assertEqual(_count_matmuls(opt_module), 1) + + # Simple test case in which two matmuls cannot be merged due to a data dependency between + # the LHS operands. + class UnmergeableMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, x): + a = torch.matmul(x, self.rhs) + a_abs = torch.abs(a) + b = torch.matmul(a_abs.transpose(1, 0), self.rhs) + return b + + # Initialize inputs. + a = torch.randn(3, 3) + + # Initialize RHS for matmuls. + rhs = torch.randn(3, 4) + + # Construct UnmergeableMatmulModule and call merge_matmul on it. + module = UnmergeableMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(a) + after = opt_module(a) + before.allclose(after) + + # Basic graph structure check; the number of matrix multiplcations should not have changed. + self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 2) if __name__ == "__main__": run_tests() diff --git a/torch/fx/experimental/merge_matmul.py b/torch/fx/experimental/merge_matmul.py new file mode 100644 index 000000000000..b72bbe633dd9 --- /dev/null +++ b/torch/fx/experimental/merge_matmul.py @@ -0,0 +1,220 @@ +import torch + +from torch.fx.graph import Graph +from torch.fx.graph_module import GraphModule +from torch.fx.node import Node +from torch.fx.symbolic_trace import symbolic_trace + +import itertools +import operator + +from typing import Dict, List + + +def get_first_dim(t: torch.Tensor) -> int: + """ + A free function primarily for use in the merge_matmul graph transformation below + that returns the first dimension of a Tensor. 
This is necessary because torch.Tensor.shape + is an attribute (and cannot be the target of a call_function node) and also helps save + a getitem op in the graph. + + Arguments: + t: The tensor to get the first dimension of. + + Returns: + The first dimension of t. + """ + return t.shape[0] + + +def legalize_graph(gm: GraphModule): + """ + Replace the graph of the given GraphModule with one that contains the same nodes as the + original, but in topologically sorted order. + + This is used by the merge_matmul transformation below, which disturbs the topologically sorted + order of its input GraphModule, so that this order is restored before further transformation. + + Arguments: + gm: The graph module to topologically sort. It is modified in-place. + + """ + # Build an adjacency list representation of node dependencies in the graph. This also + # serves as a list of nodes that still need to be inserted into the new, topologically + # sorted graph. + dependencies = {node: node.all_input_nodes.copy() for node in gm.graph.nodes} + + # Construct a new graph that will contain all nodes in topologically sorted order. + new_graph = Graph() + value_remap: Dict[Node, Node] = {} + + # Copy over all nodes with no dependencies. + for node, deps in dependencies.items(): + if not deps: + value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) + + # Remove the copied over nodes from the adjacency list. + for copied_node in value_remap.keys(): + del dependencies[copied_node] + + # While there are still nodes to insert into the new graph: + while dependencies: + copied_this_round = [] + + # Copy over all nodes whose dependencies already exist in the new graph. + for node, deps in dependencies.items(): + all_deps_copied = True + for dep in deps: + if dep not in value_remap: + all_deps_copied = False + + if all_deps_copied: + value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) + copied_this_round.append(node) + + # Delete all nodes copied over in this iteration from dependencies. + for copied_node in copied_this_round: + del dependencies[copied_node] + + # Replace the old graph with the new, topologically sorted one. + gm.graph = new_graph + + +def may_depend_on(a: Node, b: Node, search_depth: int = 6): + """ + Determine if one node depends on another in a torch.fx.Graph. + + Arguments: + a: The node that may have a dependency on b. + b: The node that a may have a dependency on. + search_depth: In the case of an indirect dependency, this function + searches upto this many nodes away in search of a + data dependency. If none is found, the function + makes the conservative assumption that there is a + dependency. + + Returns: + True if a may depend on b, False if it definitely does not. + """ + # Equivalence is defined as dependence. + if a == b: + return True + + # If a has no inputs, it cannot depend on b. + if len(a.all_input_nodes) == 0: + return False + + # If the search depth has been exhausted and no conclusion has been + # reached, assume that there is a data dependency. + if search_depth == 0: + return True + + # Recursively check all inputs of a. + for inp in a.all_input_nodes: + if may_depend_on(inp, b, search_depth - 1): + return True + + return False + + +def are_nodes_independent(nodes: List[Node]): + """ + Check if all of the given nodes are pairwise-data independent. + + Arguments: + nodes: The nodes to check for data dependencies. + + Returns: + True if any pair in nodes has a data dependency. 
+ """ + # For each pair in nodes: + for i, j in itertools.combinations(nodes, 2): + if may_depend_on(i, j) or may_depend_on(j, i): + return False + + return True + + +def merge_matmul(in_mod: torch.nn.Module): + """ + A graph transformation that merges matrix multiplication operations that share the same right-hand + side operand into one large matrix multiplication. + ____ _________ _________ + ---- | | | | M| A * C | + M| A | T| B | * K| C | = |---------| + ---- , | | | | T| B * C | + K ---- --------- --------- + K R R + """ + gm = symbolic_trace(in_mod) + + rhs_users: Dict[Node, List[Node]] = {} + lhs_users: Dict[Node, List[Node]] = {} + + # Populate rhs_users and lhs_users - maps from LHS/RHS matrix multiply operands to + # the matmul of which they are the LHS/RHS. + for node in gm.graph.nodes: + if node.op != "call_function" or node.target is not torch.matmul: + continue + + lhs, rhs = node.args + + # TODO: Properly handle aliasing caused by get_attr. For now, + # use the attribute name as the operand if the node is a + # get_attr. + lhs = lhs.target if lhs.op == "get_attr" else lhs + rhs = rhs.target if rhs.op == "get_attr" else rhs + + lhs_users.setdefault(lhs, []).append(node) + rhs_users.setdefault(rhs, []).append(node) + + for rhs, mms in rhs_users.items(): + # There must be at least matmuls for a merge to make sense. + if len(mms) < 2: + continue + + # All matmuls must not depend on each other directly or indirectly + # in order for the merge to be possible. + if not are_nodes_independent(mms): + continue + + lhs_vals = [mm.args[0] for mm in mms] + + # Merge the matmul. + # Collect a list of LHS operands and the single RHS operand. + lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals] + rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs + + # Concatenate all the LHS operands. + merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {}) + + # Multiply the concatenated LHS operands with the one RHS. This will produce + # the same results as all the individual matmuls involving rhs in the original graph, + # but they will all be concatenated together. + merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {}) + + # Split the result of the merged matmul using the shapes of the LHS operands + # to ascertain how large each chunk should be. + merge_mm_sizes = [ + gm.graph.call_function(get_first_dim, (l,), {}) for l in lhs + ] + merge_mm_split = gm.graph.call_function( + torch.split, (merge_mm, merge_mm_sizes), {} + ) + merge_mm_res = [ + gm.graph.call_function(operator.getitem, (merge_mm_split, out), {}) + for out in range(len(lhs)) + ] + + # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul. + for old, new in zip(mms, merge_mm_res): + old.replace_all_uses_with(new) + gm.graph.erase_node(old) + + # All of the new nodes created above were inserted at the end, so we need to sort + # the nodes topologically to make sure all definitions precede uses. + legalize_graph(gm) + + gm.recompile() + gm.graph.lint(in_mod) + return gm From 968ad47b410b93d2600d163db50eb9fb45c24a2b Mon Sep 17 00:00:00 2001 From: UNO Leo Date: Wed, 6 Jan 2021 22:19:37 -0800 Subject: [PATCH 43/44] Fix error messages thrown when the padding size is not valid (#50135) Summary: Hi, I changed error messages so that they correspond to the actual implementation. Acording to the implementation, half of kernel size is valid as padding size. 
This is minor but an example that the padding size is exactly equal to the half of kernel size, Input: 5 x 5 Kernel: 4 x 4 Stride: 4 Padding: 2 ==> Output: 2 x 2 You don't get the error in the above case, like following: ```python import torch import torch.nn as nn # no error input = torch.randn(1, 1, 5, 5) pool = nn.MaxPool2d(4, 4, padding=2) print(pool(input).shape) # >>> torch.Size([1, 1, 2, 2]) ``` You get the error when you set the padding size larger then half of kernel size like: ```python # it raises error input = torch.randn(1, 1, 5, 5) pool = nn.MaxPool2d(4, 4, padding=3) print(pool(input).shape) ``` The error message is: ``` --------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) in () 1 input = torch.randn(1, 1, 5, 5) 2 pool = nn.MaxPool2d(4, 4, padding=3) ----> 3 print(pool(input).shape) 3 frames /usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in _max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode, return_indices) 584 stride = torch.jit.annotate(List[int], []) 585 return torch.max_pool2d( --> 586 input, kernel_size, stride, padding, dilation, ceil_mode) 587 588 max_pool2d = boolean_dispatch( RuntimeError: pad should be smaller than half of kernel size, but got padW = 3, padH = 3, kW = 4, kH = 4 ``` Thanks in advance. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50135 Reviewed By: hl475 Differential Revision: D25815337 Pulled By: H-Huang fbshipit-source-id: 98142296fa6e6849d2e1407d2c1d4e3c2f83076d --- aten/src/ATen/native/Pool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 071460b090cd..8b5d65a8a60f 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -72,7 +72,7 @@ pool2d_shape_check( TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4), "non-empty 3D or 4D input tensor expected but got ndim: ", ndim); TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, - "pad should be smaller than half of kernel size, but got ", + "pad should be smaller than or equal to half of kernel size, but got ", "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, @@ -172,7 +172,7 @@ pool3d_shape_check( } TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, - "pad should be smaller than half of kernel size, but got " + "pad should be smaller than or equal to half of kernel size, but got " "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH); TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1, From 321b98830e17e9e1a366ababeb5475f9f202c815 Mon Sep 17 00:00:00 2001 From: Chunli Fu Date: Thu, 7 Jan 2021 02:01:25 -0800 Subject: [PATCH 44/44] [script] Validator for unsupported ops on accelerator Summary: ATT Next step: 1. integrate with dper flow. 2. Support in bento after diff is pushed to prod. 
Test Plan: buck run mode/opt-clang sigrid/predictor/scripts:check_accelerator_unsupported_ops -- --model_entity_id=232891739

I0106 17:08:36.425796 1238141 pybind_state.cc:531] Unsupported ops: Fused8BitRowwiseQuantizedToFloat

Reviewed By: khabinov

Differential Revision: D25818253

fbshipit-source-id: 8d8556b0400c1747f154b0517352f1685f1aa8b1
---
 caffe2/opt/onnxifi_transformer.cc |  5 +++--
 caffe2/opt/onnxifi_transformer.h  | 22 +++++++++++-----------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc
index 8089314c3100..2dd8c8d2d8b4 100644
--- a/caffe2/opt/onnxifi_transformer.cc
+++ b/caffe2/opt/onnxifi_transformer.cc
@@ -1195,11 +1195,11 @@ void OnnxifiTransformer::applyFilteringRules(
   blocklistCpuPartition(net, blocklisted_ops);
 }
 
-void OnnxifiTransformer::getBackendId() {
+std::vector<onnxBackendID> OnnxifiTransformer::getBackendId() {
   idx_ = 0;
 
   if (opts_.use_onnx) {
-    return;
+    return backend_ids_;
   }
   // Try to find a backend that support Caffe2 proto. Note that this is quite
   // opportunistic as we don't officially support Caffe2 proto.
@@ -1214,6 +1214,7 @@ void OnnxifiTransformer::getBackendId() {
       break;
     }
   }
+  return backend_ids_;
 }
 
 NetDef OnnxifiTransformer::TransformViaC2(
diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h
index d88eb739750c..d1af1731013d 100644
--- a/caffe2/opt/onnxifi_transformer.h
+++ b/caffe2/opt/onnxifi_transformer.h
@@ -61,6 +61,17 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase {
       const ShapeInfoMap& shape_hints,
       const std::unordered_set<int>& blocklisted_ops) override;
 
+  // Query whether an operator is supported by passing C2 protobuf
+  bool supportOpC2(
+      const caffe2::OperatorDef& op,
+      const ShapeInfoMap& shape_hints,
+      const std::unordered_set<std::string>& weights,
+      const std::unordered_set<int>& blocklisted_ops,
+      onnxBackendID backend_id) const;
+
+  // Determine backend id
+  std::vector<onnxBackendID> getBackendId();
+
  private:
   // Since we create new tensors during the conversion process, we actually need
   // to inject them into the original workspace
@@ -114,14 +125,6 @@
       ShapeInfoMap* shape_hints_max_bs,
       const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs);
 
-  // Query whether an operator is supported by passing C2 protobuf
-  bool supportOpC2(
-      const caffe2::OperatorDef& op,
-      const ShapeInfoMap& shape_hints,
-      const std::unordered_set<std::string>& weights,
-      const std::unordered_set<int>& blocklisted_ops,
-      onnxBackendID backend_id) const;
-
   // Query whether an operator is supported by passing ONNX protobuf
   bool supportOpOnnx(
       const caffe2::OperatorDef& op,
@@ -152,9 +155,6 @@
       const std::unordered_set<std::string>& weights,
       std::unordered_set<int>* blocklisted_ops) const;
 
-  // Determine backend id
-  void getBackendId();
-
   // Extract partition info from the original net
   void extractPartitionInfo(const NetDef& net);