From f6f0fde8411882af712d53fc7f7c0bdffeb47683 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 5 Jan 2021 20:25:56 -0800 Subject: [PATCH 01/44] [reland][quant][graphmode][fx] Standalone module support {input/output}_quantized_idxs (#49754) (#50058) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50058 This PR adds the support for {input/output}_quantized_idxs for standalone module. if input_quantized_idxs = [] and output_quantized_idxs = [], the standalone module will be expecting float input and produce float output, and will quantize the input and dequantize output internally if input_quantized_idxs = [0] and otuput_qiuantized_idxs = [0], the standalone module will be expecting quantized input and produce quantized output, the input will be quantized in the parent module, and output will be dequantized in the parent module as well, this is similar to current quantized modules like nn.quantized.Conv2d For more details, please see the test case Test Plan: python test/test_quantization.py TestQuantizeFx.test_standalone_module Imported from OSS Imported from OSS Reviewed By: vkuzo Differential Revision: D25768910 fbshipit-source-id: 96c21a3456cf192c8f1400afa4e86273ee69197b --- test/quantization/test_quantize_fx.py | 126 ++++++++++++---- torch/quantization/fx/fuse.py | 11 +- torch/quantization/fx/fusion_patterns.py | 23 ++- torch/quantization/fx/observed_module.py | 10 +- .../quantization/fx/quantization_patterns.py | 4 +- torch/quantization/fx/quantize.py | 138 +++++++++++++----- torch/quantization/fx/utils.py | 6 +- torch/quantization/quantize_fx.py | 23 ++- 8 files changed, 253 insertions(+), 88 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index d014bd31f02e..7965b3cc88a4 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -573,7 +573,16 @@ def forward(self, x): m = convert_fx(m) m(tensor_input) - def test_standalone_module(self): + def _test_standalone_module( + self, + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check): + """ Test standalone module with different quantized input/quantized output + configurations + """ class StandaloneModule(torch.nn.Module): def __init__(self): super().__init__() @@ -613,45 +622,32 @@ def forward(self, x): original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) - qconfig_dict = {"": default_qconfig} - config_name = {"standalone_module_name": [("standalone", None, None)]} - config_class = {"standalone_module_class": [(StandaloneModule, None, None)]} - for prepare_config in [config_name, config_class]: + for is_name in [True, False]: + if is_name: + prepare_config = { + "standalone_module_name": [("standalone", None, interface_config)] + } + else: + prepare_config = { + "standalone_module_class": [(StandaloneModule, None, interface_config)] + } + original_m_copy = copy.deepcopy(original_m) original_ref_m_copy = copy.deepcopy(original_ref_m) + + qconfig_dict = {"": default_qconfig} # check prepared model m = prepare_fx( original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) # calibration m(data) - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - 
self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for input and output of conv in the standalone module - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=prepare_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_prepare_count_check) # check converted/quantized model m = convert_fx(m) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - count_check = { - # standalone module will take float as input and output - # so we'll see quantize and dequantize in the modoule - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d): 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=convert_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_convert_count_check) res = m(data) # quantize the reference model @@ -661,6 +657,76 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + def test_standalone_module_float_interface(self): + float_interface_config = { + "input_quantized_idxs": [], # float input + "output_quantized_idxs": [], # float output + } + interface_config = float_interface_config + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for input and output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + convert_count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # standalone module will take float as input and output + # so we'll see quantize and dequantize in the modoule + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d): 1, + ns.call_method("dequantize") : 1, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + + def test_standalone_module_quantized_interface(self): + quantized_interface_config = { + "input_quantized_idxs": [0], # quantized input + "output_quantized_idxs": [0], # quantized output + } + interface_config = quantized_interface_config + # observer for input and output of first conv + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + convert_count_check = { + # quantizing input for conv + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + # dequantizing output of standalone module + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + ns.call_module(nnq.Conv2d): 1, + # dequantization 
for output happens in parent module + ns.call_method("dequantize") : 0, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + @skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 5aabbd66c4b1..59e3851dcd57 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -21,7 +21,7 @@ from .quantization_types import Pattern -from typing import Callable, Tuple, Optional +from typing import Callable, Tuple class Fuser: @@ -59,11 +59,12 @@ def load_arg(a): model = GraphModule(input_root, self.fused_graph) return model - def _find_matches(self, root: GraphModule, graph: Graph, - patterns: Dict[Pattern, Callable] - ) -> Dict[str, Tuple[Node, Optional[Any]]]: + def _find_matches( + self, root: GraphModule, graph: Graph, + patterns: Dict[Pattern, Callable] + ) -> Dict[str, Tuple[Node, FuseHandler]]: modules = dict(root.named_modules()) - match_map = {} # node name -> (root_node, match_value?) + match_map : Dict[str, Tuple[Node, FuseHandler]] = {} # node name -> (root_node, match_value) def apply_match(pattern, node, match): if isinstance(pattern, tuple): diff --git a/torch/quantization/fx/fusion_patterns.py b/torch/quantization/fx/fusion_patterns.py index b7af6008b3f3..1749484fccec 100644 --- a/torch/quantization/fx/fusion_patterns.py +++ b/torch/quantization/fx/fusion_patterns.py @@ -6,12 +6,25 @@ from .utils import _parent_name from .quantization_types import QuantizerCls from ..fuser_method_mappings import get_fuser_method +from abc import ABC, abstractmethod from typing import Any, Callable, Dict # --------------------- -# Fusion Patterns +# Fusion Pattern Registrations # --------------------- +# Base Pattern Handler +class FuseHandler(ABC): + """ Base handler class for the fusion patterns + """ + def __init__(self, quantizer: QuantizerCls, node: Node): + pass + + @abstractmethod + def fuse(self, quantizer: QuantizerCls, load_arg: Callable, + fuse_custom_config_dict: Dict[str, Any] = None) -> Node: + pass + @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv1d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv3d)) @@ -27,9 +40,9 @@ @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -class ConvBNReLUFusion(): +class ConvBNReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = None self.bn_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ @@ -94,9 +107,9 @@ def fuse(self, quantizer: QuantizerCls, load_arg: Callable, @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm2d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm3d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm3d)) -class ModuleReLUFusion(): +class ModuleReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = node assert isinstance(node.args[0], Node) node = node.args[0] diff --git a/torch/quantization/fx/observed_module.py 
b/torch/quantization/fx/observed_module.py index a95bc184fa10..808a3b36fb4a 100644 --- a/torch/quantization/fx/observed_module.py +++ b/torch/quantization/fx/observed_module.py @@ -2,11 +2,11 @@ import copy from torch.fx import GraphModule # type: ignore from torch.fx.graph import Graph -from typing import Union, Dict, Any +from typing import Union, Dict, Any, List class ObservedGraphModule(GraphModule): - def get_preserved_attr_names(self): + def get_preserved_attr_names(self) -> List[str]: return ['_activation_post_process_map', '_patterns', '_qconfig_map', @@ -35,6 +35,12 @@ def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) class ObservedStandaloneGraphModule(ObservedGraphModule): + def get_preserved_attr_names(self) -> List[str] : + return super().get_preserved_attr_names() + [ + "_standalone_module_input_quantized_idxs", + "_standalone_module_output_quantized_idxs" + ] + def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 46fbed74bdc8..fb5bef0bd0ad 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -755,10 +755,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, qconfig = quantizer.qconfig_map[node.name] convert = torch.quantization.quantize_fx._convert_standalone_module_fx # type: ignore observed_standalone_module = quantizer.modules[node.target] + input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs.tolist() quantized_standalone_module = convert(observed_standalone_module, debug=debug) parent_name, name = _parent_name(node.target) # update the modules dict setattr(quantizer.modules[parent_name], name, quantized_standalone_module) quantizer.modules[node.target] = quantized_standalone_module - # standalone module takes float input - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index af9496a66a63..318295270b61 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -102,14 +102,15 @@ def insert_observer( 'call_module', observer_name, (load_arg(node),), {}) observed_node_names_set.add(node.name) -def insert_observer_for_special_module( +def maybe_insert_observer_for_special_module( quantize_handler: QuantizeHandler, modules: Dict[str, torch.nn.Module], - prepare_custom_config_dict: Any, qconfig: Any, node: Node): + prepare_custom_config_dict: Any, qconfig: Any, node: Node) -> Optional[List[int]]: """ Insert observer for custom module and standalone module Returns: standalone_module_input_idxs: the indexs for inputs that needs to be observed by parent module """ assert modules is not None + standalone_module_input_idxs = None if isinstance(quantize_handler, CustomModuleQuantizeHandler): custom_module = modules[node.target] # type: ignore custom_module_class_mapping = prepare_custom_config_dict.get( @@ -129,19 +130,22 @@ def insert_observer_for_special_module( class_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_class_configs} name_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_name_configs} config = class_config_map.get(type(standalone_module), (None, None)) - config = 
name_config_map.get(node.target, (None, None)) - standalone_module_qconfig_dict = {"": qconfig} if config[0] is None else config[0] - standalone_prepare_config_dict = {} if config[1] is None else config[1] + config = name_config_map.get(node.target, config) + sm_qconfig_dict = {"": qconfig} if config[0] is None else config[0] + sm_prepare_config_dict = {} if config[1] is None else config[1] prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ - prepare(standalone_module, standalone_module_qconfig_dict, standalone_prepare_config_dict) + prepare(standalone_module, sm_qconfig_dict, sm_prepare_config_dict) + standalone_module_input_idxs = observed_standalone_module.\ + _standalone_module_input_quantized_idxs.int().tolist() observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) setattr(modules[parent_name], name, observed_standalone_module) modules[node.target] = observed_standalone_module # type: ignore + return standalone_module_input_idxs def insert_observer_for_output_of_the_node( node: Node, @@ -155,7 +159,8 @@ def insert_observer_for_output_of_the_node( observed_graph: Graph, load_arg: Callable, observed_node_names_set: Set[str], - matched_nodes: Optional[List[Node]]): + matched_nodes: Optional[List[Node]], + standalone_module_input_idxs: Optional[List[int]]): """ Insert observer/fake_quantize module for output of the observed module if needed """ @@ -215,8 +220,13 @@ def input_is_observed(arg): observed_node_names_set.add(node.name) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - # output is observed in the standalone module - return + assert node.op == "call_module" + assert isinstance(node.target, str) + sm_out_qidxs = modules[node.target]._standalone_module_output_quantized_idxs.tolist() # type: ignore + output_is_quantized = 0 in sm_out_qidxs + + if output_is_quantized: + observed_node_names_set.add(node.name) elif (quantize_handler.all_node_args and input_output_observed(quantize_handler)): # observer for outputs @@ -226,6 +236,16 @@ def input_is_observed(arg): activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) + # insert observer for input of standalone module + if standalone_module_input_idxs is not None: + for idx in standalone_module_input_idxs: + if node.args[idx].name not in observed_node_names_set: # type: ignore + new_observer = qconfig.activation() + insert_observer( + node, new_observer, model, + activation_post_process_map, env, observed_graph, + load_arg, observed_node_names_set) + def insert_observer_for_input_arg_of_observed_node( node: Node, observed_node_names_set: Set[str], quants: Dict[str, Tuple[DefaultQuantizeHandler, Callable]], @@ -373,10 +393,19 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
- When we are preparing a standalone module: - both input and output are observed in prepared standalone module + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module Returns: model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexs for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} @@ -430,8 +459,6 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, def load_arg(a): return map_arg(a, lambda node: env[node.name]) - # indexes for the inputs that needs to be observed - standalone_module_observed_input_idxs: List[int] = [] graph_inputs = [] for node in model.graph.nodes: if node.op == 'placeholder': @@ -487,14 +514,15 @@ def load_arg(a): # parent if qconfig is not None: assert obj is not None - insert_observer_for_special_module( - obj, self.modules, prepare_custom_config_dict, qconfig, - node) + standalone_module_input_idxs = \ + maybe_insert_observer_for_special_module( + obj, self.modules, prepare_custom_config_dict, qconfig, + node) insert_observer_for_output_of_the_node( node, obj, qconfig, self.modules, model, pattern, self.activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set, - matched_nodes) + matched_nodes, standalone_module_input_idxs) else: env[node.name] = observed_graph.node_copy(node, load_arg) @@ -516,6 +544,21 @@ def load_arg(a): model = GraphModule(model, observed_graph) self.save_state(model) model = mark_observed_module(model) + if is_standalone_module: + assert result_node is not None + assert isinstance(result_node.args[0], Node), \ + "standalone module only supports returning simple value currently"\ + "(not tuple, dict etc.)" + # indicator for whether output is observed or not. + # This used for correctly quantize standalone modules + output_is_observed = \ + result_node.args[0].name in observed_node_names_set + # these inputs are observed in parent + # converting List[int] to Tensor since module attribute is + # Union[Tensor, Module] + model._standalone_module_input_quantized_idxs = \ + torch.Tensor(input_quantized_idxs) + model._standalone_module_output_quantized_idxs = torch.Tensor(output_quantized_idxs) return model def save_state(self, observed: GraphModule) -> None: @@ -569,8 +612,10 @@ def _convert(self, model: GraphModule, debug: bool = False, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Returns a quantized standalone module which accepts float input - and produces float output. 
+ Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ if convert_custom_config_dict is None: convert_custom_config_dict = {} @@ -627,36 +672,50 @@ def load_x(n: Node) -> Node: else: return env[n.name] - def load_arg(quantized: Optional[Union[List[Any], bool, Tuple[Any, ...]]] + def load_arg(quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] ) -> Callable[[Node], Argument]: """ Input: quantized, which can be None, list, boolean or tuple - - if quantized is a list or tuple, then arg should be a list and - the args with corresponding indexes will be quantized - - if quantized is a boolean, then all args will be - quantized/not quantized - if quantized is None, then we'll load the node as long as it exists + - if quantized is a boolean, then all args will be + quantized/not quantized + - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=False) + - if quantized is a list or tuple, then arg should be a list and + the args with corresponding indexes will be quantized Output: fn which takes arg_or_args, and loads them from the corresponding environment depending on the value of quantized. """ assert quantized is None or \ isinstance(quantized, (tuple, list, bool)), type(quantized) + if isinstance(quantized, (tuple, list)) and len(quantized) == 0: + # empty tuple or list means nothing is quantized + quantized = False def load_arg_impl(arg_or_args): - if quantized is None: + # we'll update the format of `quantized` + # to better match arg_or_args + updated_quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] = quantized + + if isinstance(quantized, (tuple, list)) and \ + len(quantized) == 1 and isinstance(arg_or_args, Node): + # when argument is one Node instead of tuple, we just need to check + # 0 is in the quantized list + updated_quantized = 0 in quantized + + if updated_quantized is None: return map_arg(arg_or_args, load_x) - if isinstance(quantized, bool): + if isinstance(updated_quantized, bool): return map_arg( arg_or_args, - load_quantized if quantized else load_non_quantized) - elif isinstance(quantized, (tuple, list)): + load_quantized if updated_quantized else load_non_quantized) + elif isinstance(updated_quantized, (tuple, list)): assert isinstance(arg_or_args, (tuple, list)), arg_or_args loaded_args = [] # for now, we only support quantizing positional arguments for i, a in enumerate(arg_or_args): - if i in quantized: + if i in updated_quantized: loaded_args.append(map_arg(a, load_quantized)) else: loaded_args.append(map_arg(a, load_non_quantized)) @@ -690,10 +749,10 @@ def node_arg_is_quantized(node_arg: Any) -> bool: def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None - # by default the output is expected to be quantized + # by default the output for a quantizable node is expected to be quantized quantized = True - # Need to get correct quantized/non-quantized state for the output + # Need to get correct quantized/non-quantized state forn the output # of CopyNode if type(obj) in [ CopyNode, @@ -750,7 +809,7 @@ def insert_quantize_node(node: Node) -> None: "output_quantized_idxs", []) for node in model.graph.nodes: - if node.op == 'output': + if node.op == "output": cur_output_node_idx = output_node_seen_cnt output_node_seen_cnt += 1 if cur_output_node_idx in 
output_quantized_idxs: @@ -775,12 +834,19 @@ def insert_quantize_node(node: Node) -> None: quantized = False else: assert obj is not None + # We will get whether the output is quantized or not before + # convert for standalone module and after convert + # for non-standalone module, since _standalone_module_output_quantized_idxs + # is only available in observed standalone module + if is_observed_standalone_module_node: + out_quant_idxs = self.modules[node.target]._standalone_module_output_quantized_idxs.tolist() # type: ignore + assert len(out_quant_idxs) <= 1, "Currently standalone only support one output" + quantized = 0 in out_quant_idxs + result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) - if is_observed_standalone_module_node: - quantized = False - else: + if not is_observed_standalone_module_node: quantized = is_output_quantized(node, obj) if quantized: @@ -929,7 +995,7 @@ def _find_matches( standalone_module_names = [] match_map: Dict[str, MatchResult] = {} - all_matched = set() + all_matched : Set[str] = set() def record_match(pattern, node, matched): if isinstance(pattern, tuple): diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index c1f849803342..8285e204b1ed 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -9,7 +9,7 @@ Node, ) -from typing import Callable, Optional, List, Dict, Any +from typing import Callable, Optional, List, Dict, Any, Set # turn foo.bar -> ['foo', 'bar'] def _parent_name(target): @@ -140,7 +140,7 @@ def get_next_qparams_idx(module, qparams): inputs.append(graph.create_node('get_attr', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) -def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key): +def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key) -> List[Any]: r""" Get all the unique custom module keys in the custom config dict e.g. Input: @@ -163,7 +163,7 @@ def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key): [CustomModule1, CustomModule2, CustomModule3] """ # using set to dedup - float_custom_module_classes = set() + float_custom_module_classes : Set[Any] = set() custom_module_mapping = custom_config_dict.get(custom_config_dict_key, {}) for quant_mode in ["static", "dynamic", "weight_only"]: quant_mode_custom_module_config = custom_module_mapping.get(quant_mode, {}) diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index cba104b8f783..89ba877ffe78 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -107,8 +107,20 @@ def _prepare_standalone_module_fx( standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Both input and output of the module are observed in the - standalone module. 
+ How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module + + Returns: + model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexs for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module """ return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) @@ -378,8 +390,9 @@ def _convert_standalone_module_fx( r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model - Return: - A quantized standalone module which accepts float input - and produces float output. + Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) From 57d489e43a5b915cdb4bd8a16112ac68eb792581 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Tue, 5 Jan 2021 22:34:19 -0800 Subject: [PATCH 02/44] Fix for possible RNG offset calculation bug in cuda vectorized dropout with VEC=2 (#50110) Summary: The [offset calculation](https://github.com/pytorch/pytorch/blob/e3c56ddde67ca1a49159ffa886d889b6e65c7033/aten/src/ATen/native/cuda/Dropout.cu#L328) (which gives an estimated ceiling on the most 32-bit values in the philox sequence any thread in the launch will use) uses the hardcoded UNROLL value of 4, and assumes the hungriest threads can use every value (.x, .y, .z, and .w) their curand_uniform4 calls provide. However, the way fused_dropout_kernel_vec is currently written, that assumption isn't true in the VEC=2 case: Each iteration of the `grid x VEC` stride loop, each thread calls curand_uniform4 once, uses rand.x and rand.y, and discards rand.z and rand.w. This means (I _think_) curand_uniform4 may be called twice as many times per thread in the VEC=2 case as for the VEC=4 case or the fully unrolled code path, which means the offset calculation (which is a good estimate for the latter two cases) is probably wrong for the `fused_dropout_kernel_vec<..., /*VEC=*/2>` code path. The present PR inserts some value-reuse in fused_dropout_kernel_vec to align the number of times curand_uniform4 is called for launches with the same totalElements in the VEC=2 and VEC=4 cases. The diff should - make the offset calculation valid for all code paths - provide a very small perf boost by reducing the number of curand_uniform4 calls in the VEC=2 path - ~~make results bitwise accurate for all code paths~~ nvm, tensor elements are assigned to threads differently in the unrolled, VEC 2 and VEC 4 cases, so we're screwed here no matter what. ngimel what do you think? 
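For intuition, here is a tiny host-side model of the per-thread accounting described above. This is an illustration only, not code from Dropout.cu; the element count and helper names are made up, and it only counts calls rather than generating anything:

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative model only: how many curand_uniform4 calls a single thread ends
// up making when it is responsible for `elems` tensor elements.
static int64_t calls_one_per_iteration(int64_t elems, int64_t vec) {
  // old VEC=2 behaviour (and the VEC=4 path): one curand_uniform4 call per
  // iteration of the grid x VEC stride loop
  return (elems + vec - 1) / vec;
}

static int64_t calls_four_values_per_call(int64_t elems) {
  // behaviour after this PR for every path: all four generated values are
  // consumed before drawing again
  return (elems + 3) / 4;
}

int main() {
  const int64_t elems = 1024;
  std::printf("VEC=4:            %lld calls\n", (long long)calls_one_per_iteration(elems, 4));
  std::printf("VEC=2 before fix: %lld calls\n", (long long)calls_one_per_iteration(elems, 2));
  std::printf("VEC=2 after fix:  %lld calls\n", (long long)calls_four_values_per_call(elems));
  // The per-thread philox offset is reserved using the UNROLL=4 figure, so the
  // pre-fix VEC=2 count (twice as large) could walk past that reservation.
  return 0;
}
```

With the reuse in place, the VEC=2 path draws the same number of float4s as the VEC=4 and fully unrolled paths, so the UNROLL=4-based offset estimate becomes a valid upper bound for all three code paths.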
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50110 Reviewed By: smessmer Differential Revision: D25790121 Pulled By: ngimel fbshipit-source-id: f8f533ad997268c6f323cf4d225de547144247a8 --- aten/src/ATen/native/cuda/Dropout.cu | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 67adbaabbb84..c3e456d97056 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -57,6 +57,12 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, accscalar_t pinv = accscalar_t(1)/p; + // Helps align the total number of times curand_uniform4 is called by each thread for the same totalElements + // in the vec=2 and vec=4 cases. + bool gridxvec_loop_state = 0; + + float4 rand; + // Note: Vectorized loads means we'll stride each thread by an additional VEC factor, as we'll load VEC elements at a time for (IndexType linearIndex = idx * VEC; linearIndex < totalElements; @@ -69,12 +75,21 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, //curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything // Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4) // sets of rand. - float4 rand = curand_uniform4(&state); + if ((VEC == 4) || (gridxvec_loop_state == 0)) { + rand = curand_uniform4(&state); + } else { + // sets up the last two values we generated last iteration to be used this iteration. + rand.x = rand.z; + rand.y = rand.w; + gridxvec_loop_state ^= 1; + } rand.x = rand.x < p; rand.y = rand.y < p; - rand.z = rand.z < p; - rand.w = rand.w < p; + if (VEC == 4) { + rand.z = rand.z < p; + rand.w = rand.w < p; + } // Note: We explicitly check for is_contiguous() before launching the vectorized kernel // and replace IndexToOffset call with linearIndex to allow vectorization of NHWC (or other) From 282552dde2415d3cb3e4b1f0b18356810cf1ecd4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 5 Jan 2021 22:57:12 -0800 Subject: [PATCH 03/44] [PyTorch] Reapply D25546409: Use .sizes() isntead of .size() in cat_serial_kernel_impl (#49762) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49762 This was reverted because it landed in a stack together with D25542799 (https://github.com/pytorch/pytorch/commit/9ce1df079f6ea90dd4b7f9aa12a1a78d51a8b204), which really was broken. 
ghstack-source-id: 119326870 Test Plan: CI Reviewed By: maratsubkhankulov Differential Revision: D25685905 fbshipit-source-id: f4ec9e114993f988d4af380677331c72dfe41c44 --- aten/src/ATen/native/cpu/CatKernel.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index 299850407da3..f86adb8e6318 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -15,18 +15,20 @@ struct InputMeta { InputMeta(const Tensor& t, int64_t dim, int64_t inner) : data_ptr(t.data_ptr()) - , inner_size(t.size(dim) * inner) {} + , inner_size(t.sizes()[dim] * inner) {} }; template void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { - int64_t outer = result.numel() / (result.size(dim) * result.stride(dim)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + dim >= 0 && dim < result.dim(), "dim out of range in cat_serial_kernel_impl"); + int64_t outer = result.numel() / (result.sizes()[dim] * result.strides()[dim]); scalar_t* result_data = result.data_ptr(); int64_t ninputs = tensors.size(); std::vector inputs; inputs.reserve(ninputs); for (auto const &tensor : tensors) { - inputs.emplace_back(tensor, dim, result.stride(dim)); + inputs.emplace_back(tensor, dim, result.strides()[dim]); } using Vec = vec256::Vec256; From ad7d208ba5f2c5614679a7999918b75ae74530e9 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Tue, 5 Jan 2021 23:20:42 -0800 Subject: [PATCH 04/44] Revert D25239967: [fx] Add matrix multiplication fusion pass Test Plan: revert-hammer Differential Revision: D25239967 (https://github.com/pytorch/pytorch/commit/9b7f3fa146d350628b295ab9b794d64173f17da1) Original commit changeset: fb99ad25b7d8 fbshipit-source-id: 370167b5ade8bf2b3a6cccdf4290ea07b8347c79 --- test/test_fx_experimental.py | 123 --------------- torch/fx/experimental/merge_matmul.py | 215 -------------------------- 2 files changed, 338 deletions(-) delete mode 100644 torch/fx/experimental/merge_matmul.py diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index ac71d6037591..6e9c877b8de6 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -21,7 +21,6 @@ PartitionMode ) from torch.fx.experimental.fuser import fuse -from torch.fx.experimental import merge_matmul try: from torchvision.models import resnet18 @@ -845,128 +844,6 @@ def forward(self, a): for p_name in para_list: assert p_name in node.attrs_for_lowering - def test_merge_matmuls(self): - """ - A collection of test cases for torch.fx.experimental.merge_matmul, - a graph transformation that merges matrix multiplication operations. - """ - # Utility function for counting matmuls for test assertions. - def _count_matmuls(mod): - gm = torch.fx.symbolic_trace(mod) - - num_matmuls = 0 - for node in gm.graph.nodes: - if node.target == torch.matmul: - num_matmuls += 1 - - return num_matmuls - - # Simple test case in which there are two matmuls of the same size to merge. - class SimpleMergeMatmulModule(torch.nn.Module): - def __init__(self, rhs): - super().__init__() - self.rhs = rhs - - def forward(self, x, y): - a = torch.matmul(x, self.rhs) - b = torch.matmul(y, self.rhs) - return a + b - - # Initialize inputs. - a = torch.randn(3, 3) - b = torch.randn(3, 3) - - # Initialize RHS for matmuls. - rhs = torch.randn(3, 4) - - # Construct SimpleMergeMatmulModule and call merge_matmul on it. 
- module = SimpleMergeMatmulModule(rhs) - opt_module = merge_matmul.merge_matmul(module) - - # Numerical correctness check. - before = module(a, b) - after = opt_module(a, b) - before.allclose(after) - - # Basic graph structure check; original module should have 2 matmuls - # and optimized module should have 1. - self.assertEqual(_count_matmuls(module), 2) - self.assertEqual(_count_matmuls(opt_module), 1) - - # Test case in which there are multiple matmuls of different sizes to merge. - class FiveMergeMatmulModule(torch.nn.Module): - def __init__(self, rhs): - super().__init__() - self.rhs = rhs - - def forward(self, a, b, c, d, e): - s = torch.Tensor((0)) - matmuls = [] - - # For some reason using a list comprehension or for-loop for this - # doesn't work. - matmuls.append(torch.matmul(a, self.rhs)) - matmuls.append(torch.matmul(b, self.rhs)) - matmuls.append(torch.matmul(c, self.rhs)) - matmuls.append(torch.matmul(d, self.rhs)) - matmuls.append(torch.matmul(e, self.rhs)) - - for m in matmuls: - s += torch.sum(m) - - return s - - # Initialize inputs. - inputs = [torch.randn(2 * i + 1, 5) for i in range(5)] - - # Initialize RHS. - rhs = torch.randn(5, 4) - - # Construct FiveMergeMatmulModule and call merge_matmul on it. - module = FiveMergeMatmulModule(rhs) - opt_module = merge_matmul.merge_matmul(module) - - # Numerical correctness check. - before = module(*inputs) - after = opt_module(*inputs) - before.allclose(after) - - # Basic graph structure check; original module should have len(inputs) matmuls - # and optimized module should have 1. - self.assertEqual(_count_matmuls(module), len(inputs)) - self.assertEqual(_count_matmuls(opt_module), 1) - - # Simple test case in which two matmuls cannot be merged due to a data dependency between - # the LHS operands. - class UnmergeableMatmulModule(torch.nn.Module): - def __init__(self, rhs): - super().__init__() - self.rhs = rhs - - def forward(self, x): - a = torch.matmul(x, self.rhs) - a_abs = torch.abs(a) - b = torch.matmul(a_abs.transpose(1, 0), self.rhs) - return b - - # Initialize inputs. - a = torch.randn(3, 3) - - # Initialize RHS for matmuls. - rhs = torch.randn(3, 4) - - # Construct UnmergeableMatmulModule and call merge_matmul on it. - module = UnmergeableMatmulModule(rhs) - opt_module = merge_matmul.merge_matmul(module) - - # Numerical correctness check. - before = module(a) - after = opt_module(a) - before.allclose(after) - - # Basic graph structure check; the number of matrix multiplcations should not have changed. - self.assertEqual(_count_matmuls(module), 2) - self.assertEqual(_count_matmuls(opt_module), 2) if __name__ == "__main__": run_tests() diff --git a/torch/fx/experimental/merge_matmul.py b/torch/fx/experimental/merge_matmul.py deleted file mode 100644 index a5bd24c84c12..000000000000 --- a/torch/fx/experimental/merge_matmul.py +++ /dev/null @@ -1,215 +0,0 @@ -import torch - -import itertools -import operator - -from typing import List - - -def get_first_dim(t: torch.Tensor) -> int: - """ - A free function primarily for use in the merge_matmul graph transformation below - that returns the first dimension of a Tensor. This is necessary because torch.Tensor.shape - is an attribute (and cannot be the target of a call_function node) and also helps save - a getitem op in the graph. - - Arguments: - t: The tensor to get the first dimension of. - - Returns: - The first dimension of t. 
- """ - return t.shape[0] - - -def legalize_graph(gm: torch.fx.GraphModule): - """ - Replace the graph of the given GraphModule with one that contains the same nodes as the - original, but in topologically sorted order. - - This is used by the merge_matmul transformation below, which disturbs the topologically sorted - order of its input GraphModule, so that this order is restored before further transformation. - - Arguments: - gm: The graph module to topologically sort. It is modified in-place. - - """ - # Build an adjacency list representation of node dependencies in the graph. This also - # serves as a list of nodes that still need to be inserted into the new, topologically - # sorted graph. - dependencies = {node: node.all_input_nodes.copy() for node in gm.graph.nodes} - - # Construct a new graph that will contain all nodes in topologically sorted order. - new_graph = torch.fx.Graph() - value_remap = {} - - # Copy over all nodes with no dependencies. - for node, deps in dependencies.items(): - if not deps: - value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) - - # Remove the copied over nodes from the adjacency list. - for copied_node in value_remap.keys(): - del dependencies[copied_node] - - # While there are still nodes to insert into the new graph: - while dependencies: - copied_this_round = [] - - # Copy over all nodes whose dependencies already exist in the new graph. - for node, deps in dependencies.items(): - all_deps_copied = True - for dep in deps: - if dep not in value_remap: - all_deps_copied = False - - if all_deps_copied: - value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) - copied_this_round.append(node) - - # Delete all nodes copied over in this iteration from dependencies. - for copied_node in copied_this_round: - del dependencies[copied_node] - - # Replace the old graph with the new, topologically sorted one. - gm.graph = new_graph - - -def may_depend_on(a: torch.fx.Node, b: torch.fx.Node, search_depth: int = 6): - """ - Determine if one node depends on another in a torch.fx.Graph. - - Arguments: - a: The node that may have a dependency on b. - b: The node that a may have a dependency on. - search_depth: In the case of an indirect dependency, this function - searches upto this many nodes away in search of a - data dependency. If none is found, the function - makes the conservative assumption that there is a - dependency. - - Returns: - True if a may depend on b, False if it definitely does not. - """ - # Equivalence is defined as dependence. - if a == b: - return True - - # If a has no inputs, it cannot depend on b. - if len(a.all_input_nodes) == 0: - return False - - # If the search depth has been exhausted and no conclusion has been - # reached, assume that there is a data dependency. - if search_depth == 0: - return True - - # Recursively check all inputs of a. - for inp in a.all_input_nodes: - if may_depend_on(inp, b, search_depth - 1): - return True - - return False - - -def are_nodes_independent(nodes: List[torch.fx.Node]): - """ - Check if all of the given nodes are pairwise-data independent. - - Arguments: - nodes: The nodes to check for data dependencies. - - Returns: - True if any pair in nodes has a data dependency. 
- """ - # For each pair in nodes: - for i, j in itertools.combinations(nodes, 2): - if may_depend_on(i, j) or may_depend_on(j, i): - return False - - return True - - -def merge_matmul(in_mod: torch.nn.Module): - """ - A graph transformation that merges matrix multiplication operations that share the same right-hand - side operand into one large matrix multiplication. - ____ _________ _________ - ---- | | | | M| A * C | - M| A | T| B | * K| C | = |---------| - ---- , | | | | T| B * C | - K ---- --------- --------- - K R R - """ - gm = torch.fx.symbolic_trace(in_mod) - - rhs_users = {} - lhs_users = {} - - # Populate rhs_users and lhs_users - maps from LHS/RHS matrix multiply operands to - # the matmul of which they are the LHS/RHS. - for node in gm.graph.nodes: - if node.op != "call_function" or node.target is not torch.matmul: - continue - - lhs, rhs = node.args - - # TODO: Properly handle aliasing caused by get_attr. For now, - # use the attribute name as the operand if the node is a - # get_attr. - lhs = lhs.target if lhs.op == "get_attr" else lhs - rhs = rhs.target if rhs.op == "get_attr" else rhs - - lhs_users.setdefault(lhs, []).append(node) - rhs_users.setdefault(rhs, []).append(node) - - for rhs, mms in rhs_users.items(): - # There must be at least matmuls for a merge to make sense. - if len(mms) < 2: - continue - - # All matmuls must not depend on each other directly or indirectly - # in order for the merge to be possible. - if not are_nodes_independent(mms): - continue - - lhs_vals = [mm.args[0] for mm in mms] - - # Merge the matmul. - # Collect a list of LHS operands and the single RHS operand. - lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals] - rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs - - # Concatenate all the LHS operands. - merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {}) - - # Multiply the concatenated LHS operands with the one RHS. This will produce - # the same results as all the individual matmuls involving rhs in the original graph, - # but they will all be concatenated together. - merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {}) - - # Split the result of the merged matmul using the shapes of the LHS operands - # to ascertain how large each chunk should be. - merge_mm_sizes = [ - gm.graph.call_function(get_first_dim, (l,), {}) for l in lhs - ] - merge_mm_split = gm.graph.call_function( - torch.split, (merge_mm, merge_mm_sizes), {} - ) - merge_mm_res = [ - gm.graph.call_function(operator.getitem, (merge_mm_split, out), {}) - for out in range(len(lhs)) - ] - - # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul. - for old, new in zip(mms, merge_mm_res): - old.replace_all_uses_with(new) - gm.graph.erase_node(old) - - # All of the new nodes created above were inserted at the end, so we need to sort - # the nodes topologically to make sure all definitions precede uses. 
- legalize_graph(gm) - - gm.recompile() - gm.graph.lint(in_mod) - return gm From 0ad6f066843537d6cf86e57910f4bbf8faa60f9e Mon Sep 17 00:00:00 2001 From: cyy Date: Wed, 6 Jan 2021 06:50:56 -0800 Subject: [PATCH 05/44] drop a unneeded comma from cmakelist.txt (#50091) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50091 Reviewed By: smessmer Differential Revision: D25782083 Pulled By: ezyang fbshipit-source-id: f90f57c6c9fc0c1e68ab30dd3b56dfe971798df2 --- aten/src/ATen/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index fd3c95f2573b..6fedef185b21 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -72,7 +72,7 @@ file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") file(GLOB_RECURSE native_metal_h "native/metal/*.h") file(GLOB metal_test_srcs "native/metal/mpscnn/tests/*.mm") -file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm", "native/metal/*.cpp") +file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm" "native/metal/*.cpp") EXCLUDE(native_metal_srcs "${native_metal_srcs}" ${metal_test_srcs}) file(GLOB metal_prepack_h "native/metal/MetalPrepackOpContext.h") file(GLOB metal_prepack_cpp "native/metal/MetalPrepackOpRegister.cpp") From 45ec35827ed73c27c114ba0444517baa5b3cdbee Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 6 Jan 2021 06:55:10 -0800 Subject: [PATCH 06/44] Set USE_RCCL cmake option (dependent on USE_NCCL) [REDUX] (#34683) Summary: Refiled duplicate of https://github.com/pytorch/pytorch/issues/31341 which was reverted in commit 63964175b52197a75e03b73c59bd2573df66b398. This PR enables RCCL support when building Gloo as part of PyTorch for ROCm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/34683 Reviewed By: glaringlee Differential Revision: D25540578 Pulled By: ezyang fbshipit-source-id: fcb02e5745d62e1b7d2e02048160e9e7a4b4df2d --- CMakeLists.txt | 2 ++ tools/amd_build/build_amd.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e346087c0cdb..3df73f8a3041 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option( USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON + USE_NCCL OFF) cmake_dependent_option( USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 026293a9281a..9d4fa54c93b3 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -131,6 +131,20 @@ def is_hip_clang(): sources.write(line) print("%s updated" % gloo_cmake_file) +gloo_cmake_file = "third_party/gloo/cmake/Modules/Findrccl.cmake" +if os.path.exists(gloo_cmake_file): + do_write = False + with open(gloo_cmake_file, "r") as sources: + lines = sources.readlines() + newlines = [line.replace('RCCL_LIBRARY', 'RCCL_LIBRARY_PATH') for line in lines] + if lines == newlines: + print("%s skipped" % gloo_cmake_file) + else: + with open(gloo_cmake_file, "w") as sources: + for line in newlines: + sources.write(line) + print("%s updated" % gloo_cmake_file) + hipify_python.hipify( project_directory=proj_dir, output_directory=out_dir, From 2ac180a5dddf04178068dba7cbced33df250eb60 Mon Sep 17 00:00:00 2001 From: Chester Liu Date: Wed, 6 Jan 2021 07:08:16 -0800 Subject: [PATCH 07/44] Fix cl.exe detection in cpu/fused_kernel.cpp (#50085) Summary: The 
command used here is essentially `where cl.exe`. By using `system()` we will not be able to find cl.exe unless we are using VS Developer Prompt, which makes `activate()` meaningless. Change `system()` to `run()` fixes this. Found during https://github.com/pytorch/pytorch/issues/49781. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50085 Reviewed By: smessmer Differential Revision: D25782054 Pulled By: ezyang fbshipit-source-id: e8e3cac903a73f3bd78def667ebe0e93201814c8 --- torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index 4e76dc23e55d..4f4aa0d1536b 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -45,11 +45,17 @@ constexpr int so_suffix_len = 3; constexpr int cpp_suffix_len = 4; #endif +intptr_t run(const std::string& cmd); + static bool programExists(const std::string& program) { TemplateEnv env; env.s("program", program); std::string cmd = format(check_exists_string, env); +#ifdef _MSC_VER + return (run(cmd.c_str()) == 0); +#else return (system(cmd.c_str()) == 0); +#endif } #ifdef _MSC_VER From c517e15d79b8ae672ee2a94581fc57fa62155adf Mon Sep 17 00:00:00 2001 From: Nathan Howell Date: Wed, 6 Jan 2021 07:36:12 -0800 Subject: [PATCH 08/44] Add support for converting sparse bool tensors to dense (#50019) Summary: Fixes https://github.com/pytorch/pytorch/issues/49977 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50019 Reviewed By: smessmer Differential Revision: D25782045 Pulled By: ezyang fbshipit-source-id: a8389cbecb7e79099292a423a6fd8ac28631905b --- aten/src/ATen/native/sparse/SparseTensorMath.cpp | 2 +- aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu | 4 ++-- test/test_sparse.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 9bb679beb3d0..6c3298b72e75 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -650,7 +650,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen dstBuffer.add_(srcBuffer, value); } } else { - AT_DISPATCH_ALL_TYPES( + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, commonDtype, "add_dense_sparse", [&] { add_dense_sparse_worker_cpu(resultBuffer, value, sparse, indices, valuesBuffer); }); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index c8366f71618e..fce3446816e7 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -338,8 +338,8 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT if (sparse.dense_dim() == 0) { TORCH_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); - AT_DISPATCH_ALL_TYPES_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { apply::sparseElementwiseKernelScalar, uint64_t, scalar_t> <<>>( TensorCAddOp(value.to()), diff --git a/test/test_sparse.py b/test/test_sparse.py index 4e982b8333d9..228c66aa403e 
100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -356,6 +356,11 @@ def test_to_sparse(self): sp, _, _ = self._gen_sparse(2, 10, [3, 3, 3]) self.assertRaises(RuntimeError, lambda: sp.to_sparse()) + def test_sparse_bool(self): + a = self.value_tensor([True, False]).to(torch.bool) + b = a.to_sparse().to_dense() + self.assertEqual(a, b) + def test_scalar(self): # tensor with value a = self.sparse_tensor(self.index_tensor([]).unsqueeze(1), 12.3, []) From 5f2ec6293d6a443b8acca1d3ff7d57f9121afcc7 Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Wed, 6 Jan 2021 08:15:08 -0800 Subject: [PATCH 09/44] Unused variables in neural net classes and functions (#50100) Summary: These unused variables were identified by [pyflakes](https://pypi.org/project/pyflakes/). They can be safely removed to simplify the code and possibly improve performance. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50100 Reviewed By: ezyang Differential Revision: D25797764 Pulled By: smessmer fbshipit-source-id: ced341aee692f429d2dcc3a4ef5c46c8ee99cabb --- torch/nn/modules/module.py | 1 - torch/nn/parallel/replicate.py | 1 - torch/nn/quantized/dynamic/modules/rnn.py | 2 -- torch/nn/quantized/modules/embedding_ops.py | 1 - torch/nn/quantized/modules/normalization.py | 5 ----- torch/nn/utils/prune.py | 1 - 6 files changed, 11 deletions(-) diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 297a4edf15bf..f054590da66a 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -843,7 +843,6 @@ def _slow_forward(self, *input, **kwargs): if recording_scopes: name = torch.jit._trace._trace_module_map[self] if self in torch.jit._trace._trace_module_map else None if name: - cur_scope_name = tracing_state.current_scope() tracing_state.push_scope(name) else: recording_scopes = False diff --git a/torch/nn/parallel/replicate.py b/torch/nn/parallel/replicate.py index a069c6c6f939..8effeece5908 100644 --- a/torch/nn/parallel/replicate.py +++ b/torch/nn/parallel/replicate.py @@ -108,7 +108,6 @@ def replicate(network, devices, detach=False): modules = list(network.modules()) module_copies = [[] for device in devices] module_indices = {} - scriptmodule_skip_attr = {"_parameters", "_buffers", "_modules", "forward", "_c"} for i, module in enumerate(modules): module_indices[module] = i diff --git a/torch/nn/quantized/dynamic/modules/rnn.py b/torch/nn/quantized/dynamic/modules/rnn.py index df88169471ca..59c0195d7858 100644 --- a/torch/nn/quantized/dynamic/modules/rnn.py +++ b/torch/nn/quantized/dynamic/modules/rnn.py @@ -239,8 +239,6 @@ def from_float(cls, mod): _all_weight_values = [] for layer in range(qRNNBase.num_layers): for direction in range(num_directions): - layer_input_size = qRNNBase.input_size if layer == 0 else qRNNBase.hidden_size * num_directions - suffix = '_reverse' if direction == 1 else '' def retrieve_weight_bias(ihhh): diff --git a/torch/nn/quantized/modules/embedding_ops.py b/torch/nn/quantized/modules/embedding_ops.py index d16748b3baf7..e41d55347741 100644 --- a/torch/nn/quantized/modules/embedding_ops.py +++ b/torch/nn/quantized/modules/embedding_ops.py @@ -52,7 +52,6 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) self.dtype = state_dict[prefix + 'dtype'] state_dict.pop(prefix + 'dtype') diff --git a/torch/nn/quantized/modules/normalization.py 
b/torch/nn/quantized/modules/normalization.py index 4664120ec8b5..c12f74374863 100644 --- a/torch/nn/quantized/modules/normalization.py +++ b/torch/nn/quantized/modules/normalization.py @@ -29,7 +29,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.normalized_shape, mod.weight, mod.bias, float(scale), @@ -63,7 +62,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_groups, mod.num_channels, mod.weight, mod.bias, float(scale), int(zero_point), @@ -98,7 +96,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -133,7 +130,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -168,7 +164,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), diff --git a/torch/nn/utils/prune.py b/torch/nn/utils/prune.py index 84fa30021ed1..851a551da0d8 100644 --- a/torch/nn/utils/prune.py +++ b/torch/nn/utils/prune.py @@ -587,7 +587,6 @@ def compute_mask(self, t, default_mask): # Compute number of units to prune: amount if int, # else amount * tensor_size nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) - nparams_tokeep = tensor_size - nparams_toprune # This should raise an error if the number of units to prune is larger # than the number of units in the tensor _validate_pruning_amount(nparams_toprune, tensor_size) From 688992c775e2eeef53f3184b2e3428ef2f3a2967 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 08:33:26 -0800 Subject: [PATCH 10/44] [PyTorch] Additional IValue tests (#49718) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49718 Improving test coverage in preparation for updating the implementation of IValue. ghstack-source-id: 119327373 Test Plan: ivalue_test Reviewed By: hlu1 Differential Revision: D25674605 fbshipit-source-id: 37a82bb135f75ec52d2d8bd929c4329e8dcc4d25 --- aten/src/ATen/test/ivalue_test.cpp | 217 +++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index 14e75205aa66..a0e2648758ff 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -51,6 +51,91 @@ TEST(IValueTest, Basic) { ASSERT_EQ(tv.use_count(), 2); } +static std::array makeSampleIValues() { + return { at::rand({3, 4}), "hello", 42, true, 1.5 }; +} + +static std::array makeMoreSampleIValues() { + return { at::rand({3, 4}), "goodbye", 23, false, 0.5 }; +} + +// IValue::operator== doesn't seem to work on Tensors. 
+#define EXPECT_IVALUE_EQ(a, b) \ + EXPECT_EQ((a).isTensor(), (b).isTensor()); \ + if ((a).isTensor()) { \ + EXPECT_TRUE(a.toTensor().equal(b.toTensor())); \ + } else { \ + EXPECT_EQ(a, b); \ + } + +TEST(IValueTest, Swap) { + // swap() has the following 3 cases: tensor, intrusive_ptr, or + // neither. Exercise all pairs of the three. + + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + for (const auto& input: sampleInputs) { + for (const auto& target: sampleTargets) { + IValue a(input); + IValue b(target); + EXPECT_IVALUE_EQ(a, input); + EXPECT_IVALUE_EQ(b, target); + a.swap(b); + EXPECT_IVALUE_EQ(a, target); + EXPECT_IVALUE_EQ(b, input); + } + } +} + +TEST(IValueTest, CopyConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue copy(v); + EXPECT_IVALUE_EQ(copy, v); + } +} + +TEST(IValueTest, MoveConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue source(v); + IValue target(std::move(source)); + EXPECT_IVALUE_EQ(target, v); + EXPECT_TRUE(source.isNone()); + } +} + +TEST(IValueTest, CopyAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue copyTo(target); + IValue copyFrom(input); + copyTo = copyFrom; + EXPECT_IVALUE_EQ(copyTo, input); + EXPECT_IVALUE_EQ(copyFrom, input); + EXPECT_IVALUE_EQ(copyTo, copyFrom); + } + } +} + +TEST(IValueTest, MoveAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue moveTo(target); + IValue moveFrom(input); + moveTo = std::move(moveFrom); + EXPECT_IVALUE_EQ(moveTo, input); + EXPECT_TRUE(moveFrom.isNone()); + } + } +} + TEST(IValueTest, Tuple) { std::tuple t = std::make_tuple(123, at::randn({1})); auto iv = IValue(t); @@ -318,5 +403,137 @@ TEST(IValueTest, EnumEquality) { ); } +TEST(IValueTest, isPtrType) { + IValue tensor(at::rand({3, 4})); + IValue undefinedTensor((at::Tensor())); + IValue integer(42); + IValue str("hello"); + + EXPECT_TRUE(tensor.isPtrType()); + EXPECT_FALSE(undefinedTensor.isPtrType()); + EXPECT_FALSE(integer.isPtrType()); + EXPECT_TRUE(str.isPtrType()); +} + +TEST(IValueTest, isAliasOf) { + auto sampleIValues = makeSampleIValues(); + for (auto& iv: sampleIValues) { + for (auto& iv2: sampleIValues) { + if (&iv == &iv2 && iv.isPtrType()) { + EXPECT_TRUE(iv.isAliasOf(iv2)); + } else { + EXPECT_FALSE(iv.isAliasOf(iv2)); + } + } + } +} + +TEST(IValueTest, internalToPointer) { + IValue tensor(at::rand({3, 4})); + IValue str("hello"); + + EXPECT_EQ(tensor.internalToPointer(), tensor.unsafeToTensorImpl()); + EXPECT_NE(str.internalToPointer(), nullptr); + + IValue nullStr((c10::intrusive_ptr())); + ASSERT_TRUE(nullStr.isString()); + EXPECT_EQ(nullStr.internalToPointer(), nullptr); +} + +TEST(IValueTest, IdentityComparisonAndHashing) { + at::Tensor t1 = at::rand({3, 4}); + at::Tensor t2 = at::rand({3, 4}); + IValue tv1(t1), tv2(t2); + IValue tv1b(t1); + + EXPECT_EQ(tv1.hash(), tv1b.hash()); + EXPECT_NE(tv1.hash(), tv2.hash()); + + EXPECT_TRUE(tv1.is(tv1)); + EXPECT_TRUE(tv1.is(tv1b)); + EXPECT_TRUE(tv1b.is(tv1)); + EXPECT_TRUE(tv2.is(tv2)); + + EXPECT_FALSE(tv1.is(tv2)); + EXPECT_FALSE(tv2.is(tv1)); + + IValue none; + IValue undefinedTensor((at::Tensor())); + + EXPECT_TRUE(none.is(undefinedTensor)); + 
EXPECT_TRUE(undefinedTensor.is(none)); + + // Is this a bug? We should probably have a is b => a.hash() == b.hash() + EXPECT_NE(none.hash(), undefinedTensor.hash()); + + auto sampleIValues = makeSampleIValues(); + auto sampleIValues2 = makeSampleIValues(); + auto moreSampleIValues = makeMoreSampleIValues(); + + ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size()); + for (int ii = 0; ii < sampleIValues.size(); ++ii) { + // Constant strings will have the same pointer value. + if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) { + EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } else { + EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } + EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash()); + } +} + +TEST(IValueTest, getSubValues) { + // Scalars have no subvalues. + IValue integer(42), float_(1.5); + + IValue::HashAliasedIValues subvalues; + + integer.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + float_.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + at::Tensor t1(at::rand({3, 4})), t2(at::rand({3, 4})); + IValue tv1(t1), tv2(t2); + IValue list(std::vector{t1, t2}); + IValue tuple(ivalue::Tuple::create({tv1, tv2})); + + std::unordered_map m; + m[1] = t1; + m[2] = t2; + + IValue dict(std::move(m)); + + auto objType = ClassType::create(nullopt, {}); + objType->addAttribute("t1", tv1.type()); + objType->addAttribute("t2", tv2.type()); + + auto o = ivalue::Object::create(StrongTypePtr(nullptr, objType), 2); + o->setSlot(0, tv1); + o->setSlot(1, tv2); + + IValue object(o); + tv1.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + + subvalues.clear(); + + for (auto& container: {list, tuple, dict, object}) { + container.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 3); + EXPECT_EQ(subvalues.count(container), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + EXPECT_EQ(subvalues.count(tv2), 1); + + subvalues.clear(); + } +} + // TODO(gmagogsfm): Add type conversion test? } // namespace c10 From 1b31e1353903eb52140aedef04c6edff5bb7b7e6 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 08:33:26 -0800 Subject: [PATCH 11/44] [PyTorch] Store Tensor explicitly in IValue (#48824) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48824 Enables following diff, which will make toTensor() return `const Tensor&` and allow callers to avoid refcounting overhead. ghstack-source-id: 119327370 Test Plan: ivalue_test Internal benchmark to ensure perf parity. Some interesting steps during the debugging process: - First version was about a 5% regression - Directly implementing move construction instead of using swap lowered the regression to 2-3% - Directly implementing move assign was maybe an 0.5% improvement - Adding C10_ALWAYS_INLINE on move assign got our regression to negligible - Fixing toTensor() to actually be correct regressed us again, but omitting the explicit dtor call as exhaustively spelled out in a comment fixed it. 
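The payload layout change is easier to see in a minimal stand-in sketch. Below, std::string plays the role of the non-trivially-copyable at::Tensor member; the class name and helpers are simplified assumptions, not the real IValue, but the technique is the same one this diff uses: a tagged union whose non-trivial member needs placement new, a tag-driven destroy(), and a moveFrom() that leaves the source in the None state.

#include <cstdint>
#include <memory>
#include <new>
#include <string>
#include <utility>

// Stand-in for IValue: stores either a trivially copyable payload or a
// non-trivially-copyable object (std::string here, at::Tensor in the diff)
// directly inside a nested union, selected by a tag.
class SmallValue {
 public:
  SmallValue() noexcept : tag_(Tag::None) {}
  explicit SmallValue(int64_t i) : tag_(Tag::Int) { payload_.u.as_int = i; }
  explicit SmallValue(std::string s) : tag_(Tag::Str) {
    new (&payload_.as_str) std::string(std::move(s));  // placement new for the non-trivial member
  }
  SmallValue(SmallValue&& rhs) noexcept : tag_(rhs.tag_) { moveFrom(std::move(rhs)); }
  SmallValue& operator=(SmallValue&& rhs) noexcept {
    if (this != &rhs) {
      destroy();
      tag_ = rhs.tag_;
      moveFrom(std::move(rhs));
    }
    return *this;
  }
  ~SmallValue() { destroy(); }

  bool isNone() const { return tag_ == Tag::None; }
  bool isInt() const { return tag_ == Tag::Int; }
  bool isStr() const { return tag_ == Tag::Str; }
  int64_t toInt() const { return payload_.u.as_int; }
  const std::string& toStr() const& { return payload_.as_str; }

 private:
  enum class Tag { None, Int, Str };

  void destroy() {
    // Only the non-trivial member needs real cleanup (cf. IValue::destroy()).
    if (tag_ == Tag::Str) {
      std::destroy_at(&payload_.as_str);
    }
  }
  void moveFrom(SmallValue&& rhs) noexcept {
    if (rhs.tag_ == Tag::Str) {
      new (&payload_.as_str) std::string(std::move(rhs.payload_.as_str));
      std::destroy_at(&rhs.payload_.as_str);
    } else {
      payload_.u = rhs.payload_.u;  // trivially copyable path: one plain assignment
    }
    // Leave the source in the None state, like moveFrom/clearToNone in the diff.
    rhs.tag_ = Tag::None;
    rhs.payload_.u.as_int = 0;
  }

  union Payload {
    union Trivial { int64_t as_int; double as_double; bool as_bool; } u;
    std::string as_str;   // non-trivial member lives inline, like at::Tensor
    Payload() : u{0} {}
    ~Payload() {}         // members are destroyed manually, driven by the tag
  } payload_;
  Tag tag_;
};

The nested Trivial sub-union mirrors the comment added in the diff: the common, trivially copyable cases can be copied or moved with a single assignment and no switch on the tag, while only the Tensor-like member pays for placement new and an explicit destructor call.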
Reviewed By: bwasti Differential Revision: D25324617 fbshipit-source-id: 7518c1c67f6f2661f151b43310aaddf4fb6e511a --- aten/src/ATen/core/ivalue.cpp | 12 +- aten/src/ATen/core/ivalue.h | 279 +++++++++++++++++++++++--------- aten/src/ATen/core/ivalue_inl.h | 95 +++++++---- aten/src/ATen/core/jit_type.h | 8 +- c10/util/intrusive_ptr.h | 4 +- 5 files changed, 275 insertions(+), 123 deletions(-) diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 320fa6294638..1223577c59c6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -265,7 +265,7 @@ bool IValue::ptrEqual(const IValue& lhs, const IValue& rhs) { TORCH_INTERNAL_ASSERT(lhs.is_intrusive_ptr); TORCH_INTERNAL_ASSERT(rhs.is_intrusive_ptr); return lhs.tag == rhs.tag && - lhs.payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + lhs.payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } IValue IValue::equals(const IValue& rhs) const { @@ -325,17 +325,17 @@ size_t IValue::hash(const IValue& v) { case Tag::None: return 0; case Tag::Bool: - return c10::get_hash(v.payload.as_bool); + return c10::get_hash(v.payload.u.as_bool); case Tag::Double: - return c10::get_hash(v.payload.as_double); + return c10::get_hash(v.payload.u.as_double); case Tag::Tensor: // Tensor __hash__ is equivalent to `id()`, so take the pointer value of // the tensor to emulate it - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.as_tensor.unsafeGetTensorImpl()); case Tag::Storage: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::Int: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::String: return c10::get_hash(v.toStringRef()); case Tag::Tuple: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 4a7e15c4008b..5370294b2f2c 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -131,10 +131,15 @@ struct Capsule { // they are marked `@private`, which hides them on the doxygen documentation for // this page. -/// IValue (Interpreter Value) is a tagged union over the types supported by the -/// TorchScript interpreter. IValues contain their values as an -/// `IValue::Payload`, which holds primitive types (`int64_t`, `bool`, `double`, -/// `Device`), as values and all other types as a `c10::intrusive_ptr`. +/// IValue (Interpreter Value) is a tagged union over the types +/// supported by the TorchScript interpreter. IValues contain their +/// values as an `IValue::Payload`, which holds primitive types +/// (`int64_t`, `bool`, `double`, `Device`) and `Tensor` as values, +/// and all other types as a `c10::intrusive_ptr`. In order to +/// optimize performance of the destructor and related operations by +/// making the `Tensor` and `c10::intrusive_ptr` paths generate the +/// same code, we represent a null `c10::intrusive_ptr` as +/// `UndefinedTensorImpl::singleton()`, *not* `nullptr`. /// /// IValues are used as inputs to and outputs from the TorchScript interpreter. 
/// To retrieve the value contained within an IValue, use the `.toX()` methods, @@ -160,27 +165,35 @@ struct Capsule { struct TORCH_API IValue final { IValue(const IValue& rhs) : IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + if (is_intrusive_ptr && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr); } } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); + + IValue(IValue&& rhs) noexcept : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + moveFrom(std::move(rhs)); } + /// @private [doxygen private] ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } + destroy(); } - IValue& operator=(IValue&& rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + + C10_ALWAYS_INLINE IValue& operator=(IValue&& rhs) & noexcept { + if (&rhs == this) { + return *this; + } + + destroy(); + moveFrom(std::move(rhs)); return *this; } + IValue& operator=(IValue const& rhs) & { IValue(rhs).swap(*this); return *this; } + void dump() const; /** @@ -260,13 +273,6 @@ struct TORCH_API IValue final { return false; } - if (!this->is_intrusive_ptr) { - // Primitive types don't alias anything - return false; - } - - AT_ASSERT(rhs.is_intrusive_ptr); - // Tensors should be compared based on internal storage if (this->isTensor()) { const auto thisTensor = this->toTensor(); @@ -274,22 +280,56 @@ struct TORCH_API IValue final { return thisTensor.is_alias_of(rhsTensor); } + if (!this->is_intrusive_ptr) { + // Primitive types don't alias anything + return false; + } + + AT_ASSERT(rhs.is_intrusive_ptr); + // Other types can be compared by their ptr value - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } /// @private [doxygen private] size_t use_count() const noexcept { + if (isTensor()) { + return payload.as_tensor.use_count(); + } + if (!is_intrusive_ptr) { return 1; } - return c10::raw::intrusive_ptr::use_count(payload.as_intrusive_ptr); + if (payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()) { + return 0; + } + return c10::raw::intrusive_ptr::use_count(payload.u.as_intrusive_ptr); } /// @private [doxygen private] void swap(IValue& rhs) noexcept { - std::swap(payload, rhs.payload); + if (isTensor() && rhs.isTensor()) { + std::swap(payload.as_tensor, rhs.payload.as_tensor); + } else if (isTensor()) { + at::Tensor t = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + payload.u = rhs.payload.u; + new (&rhs.payload.as_tensor) at::Tensor(std::move(t)); + } else if (rhs.isTensor()) { + rhs.swap(*this); + return; + } else { + std::swap(payload.u, rhs.payload.u); + } std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); std::swap(tag, rhs.tag); } @@ -298,13 +338,8 @@ struct TORCH_API IValue final { // While some of these accessors could be generated through templates, // we prefer to write them manually for clarity - IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. - payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(false) { + new (&payload.as_tensor) at::Tensor(std::move(t)); } bool isTensor() const { return Tag::Tensor == tag; @@ -312,7 +347,7 @@ struct TORCH_API IValue final { at::Tensor toTensor() &&; at::Tensor toTensor() const&; at::TensorImpl* unsafeToTensorImpl() const { - return static_cast(payload.as_intrusive_ptr); + return payload.as_tensor.unsafeGetTensorImpl(); } IValue(at::Storage s) : tag(Tag::Storage), is_intrusive_ptr(static_cast(s)) { @@ -321,7 +356,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. - payload.as_intrusive_ptr = s.unsafeReleaseStorageImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(s.unsafeReleaseStorageImpl()); } bool isStorage() const { return Tag::Storage == tag; @@ -341,7 +376,7 @@ struct TORCH_API IValue final { : tag(Tag::Blob), is_intrusive_ptr(true) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and store it as a Tensor instead. 
- payload.as_intrusive_ptr = blob.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); } /// @private [doxygen private] @@ -397,14 +432,14 @@ struct TORCH_API IValue final { // Double IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; + payload.u.as_double = d; } bool isDouble() const { return Tag::Double == tag; } double toDouble() const { AT_ASSERT(isDouble()); - return payload.as_double; + return payload.u.as_double; } // Future @@ -433,7 +468,7 @@ struct TORCH_API IValue final { // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; + payload.u.as_int = i; } // allow you to pass literals (3, 4) without ambiguity @@ -445,7 +480,7 @@ struct TORCH_API IValue final { int64_t toInt() const { AT_ASSERT(isInt()); - return payload.as_int; + return payload.u.as_int; } // Bool @@ -454,9 +489,9 @@ struct TORCH_API IValue final { // Initializing entire payload stops valgrind's from reporting // "jump or move depends on uninitialised value" in IValue copy constructor // See https://github.com/pytorch/pytorch/issues/37117 - payload.as_int = b; + payload.u.as_int = b; #else - payload.as_bool = b; + payload.u.as_bool = b; #endif } bool isBool() const { @@ -464,7 +499,7 @@ struct TORCH_API IValue final { } bool toBool() const { AT_ASSERT(isBool()); - return payload.as_bool; + return payload.u.as_bool; } // IntList @@ -580,7 +615,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : payload{0}, tag(Tag::None), is_intrusive_ptr(false) {} + IValue() : tag(Tag::None), is_intrusive_ptr(false) {} bool isNone() const { return Tag::None == tag; } @@ -616,21 +651,21 @@ struct TORCH_API IValue final { // Device IValue(c10::Device d) : tag(Tag::Device), is_intrusive_ptr(false) { - payload.as_device.type = d.type(); - payload.as_device.index = d.index(); + payload.u.as_device.type = d.type(); + payload.u.as_device.index = d.index(); } bool isDevice() const { return Tag::Device == tag; } c10::Device toDevice() const { AT_ASSERT(isDevice()); - return c10::Device(payload.as_device.type, payload.as_device.index); + return c10::Device(payload.u.as_device.type, payload.u.as_device.index); } //Stream IValue(c10::Stream stream) : tag(Tag::Stream), is_intrusive_ptr(false) { - payload.as_int = stream.pack(); + payload.u.as_int = stream.pack(); } c10::Stream toStream() &&; c10::Stream toStream() const &; @@ -659,7 +694,7 @@ struct TORCH_API IValue final { // QScheme IValue(at::QScheme qscheme) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = static_cast(qscheme); + payload.u.as_int = static_cast(qscheme); } at::QScheme toQScheme() const { @@ -680,7 +715,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined generator. 
- payload.as_intrusive_ptr = g.unsafeReleaseGeneratorImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(g.unsafeReleaseGeneratorImpl()); } bool isGenerator() const { return Tag::Generator == tag; @@ -749,14 +784,19 @@ struct TORCH_API IValue final { const IValue& v); bool isPtrType() const { - return is_intrusive_ptr; + return (isTensor() && payload.as_tensor.defined()) || is_intrusive_ptr; } /// @private [doxygen private] const void* internalToPointer() const { TORCH_INTERNAL_ASSERT( isPtrType(), "Can only call internalToPointer() for pointer types"); - return payload.as_intrusive_ptr; + if (isTensor()) { + return payload.as_tensor.unsafeGetTensorImpl(); + } else { + return payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton() + ? payload.u.as_intrusive_ptr : nullptr; + } } TypePtr type() const; @@ -770,7 +810,7 @@ struct TORCH_API IValue final { } // If it is not a Tensor, then two mutable IValues alias each other only // if they are the same pointer. - return val.payload.as_int; + return val.payload.u.as_int; } }; @@ -800,6 +840,10 @@ struct TORCH_API IValue final { IValue deepcopy(HashAliasedIValueMap& memo) const; private: + static c10::intrusive_ptr_target* null_to_undefined_tensor(c10::intrusive_ptr_target* p) { + return p ? p : static_cast(c10::UndefinedTensorImpl::singleton()); + } + static bool ptrEqual(const IValue& lhs, const IValue& rhs); // NOTE: IValue tags are intentionally private. In the future we may encode // this value different (e.g. using NaN boxing), and this would make it more @@ -822,24 +866,77 @@ struct TORCH_API IValue final { class NullType = c10::detail::intrusive_target_default_null_type> c10::intrusive_ptr toIntrusivePtr() const; - void clearToNone() { - payload.as_int = 0; + void destroy() { + // We carefully construct this call to both 1) avoid UB by using + // the "wrong" one of as_tensor and as_intrusive_ptr and 2) enable + // the compiler to generate the same code for each case. It is + // surprisingly difficult to get this right. + if (isTensor() || is_intrusive_ptr) { + c10::intrusive_ptr_target* p = isTensor() ? payload.as_tensor.unsafeGetTensorImpl() : payload.u.as_intrusive_ptr; + c10::intrusive_ptr::reclaim(p); + // No need to make this destructor call! + // payload.as_tensor.~Tensor(); + } + } + + C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept { + if (rhs.isTensor()) { + new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor)); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. + // + // rhs.payload.as_tensor.~Tensor(); + } else { + payload.u = rhs.payload.u; + } + tag = rhs.tag; + is_intrusive_ptr = rhs.is_intrusive_ptr; + rhs.clearToNone(); + } + + void clearToNone() noexcept { + payload.u.as_int = 0; tag = Tag::None; is_intrusive_ptr = false; } union Payload { - int64_t as_int; - double as_double; - bool as_bool; - c10::intrusive_ptr_target* as_intrusive_ptr; - struct { - DeviceType type; - DeviceIndex index; - } as_device; + // We use a nested union here so that we can make the copy easy + // and efficient in the non-tensor (i.e., trivially copyable) + // case. 
Specifically, we do not have to do a switch-on-tag to + // figure out which union member to assign; we can just use + // TriviallyCopyablePayload::operator=. + union TriviallyCopyablePayload { + TriviallyCopyablePayload() : as_int(0) {} + int64_t as_int; + double as_double; + bool as_bool; + // Invariant: never nullptr; null state is represented as + // c10::UndefinedTensorImpl::singleton() for consistency of + // representation with Tensor. + c10::intrusive_ptr_target* as_intrusive_ptr; + struct { + DeviceType type; + DeviceIndex index; + } as_device; + } u; + at::Tensor as_tensor; + Payload() : u() {} + ~Payload() {} }; - IValue(Payload p, Tag t, bool i) : payload(p), tag(t), is_intrusive_ptr(i) {} + IValue(const Payload& p, Tag t, bool i) : tag(t), is_intrusive_ptr(i) { + if (isTensor()) { + new (&payload.as_tensor) at::Tensor(p.as_tensor); + } else { + payload.u = p.u; + } + } Payload payload; Tag tag; @@ -848,29 +945,36 @@ struct TORCH_API IValue final { }; struct TORCH_API WeakIValue final { - WeakIValue() : payload{0}, tag(IValue::Tag::None), is_intrusive_ptr(false) {} + WeakIValue() : tag(IValue::Tag::None), is_intrusive_ptr(false) {} WeakIValue(const WeakIValue& rhs) : payload(rhs.payload), tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); } } WeakIValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), + : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (rhs.isTensor()) { + payload.as_intrusive_ptr = rhs.unsafeToTensorImpl(); + is_intrusive_ptr = true; + } else { + payload = rhs.payload.u; + } if (is_intrusive_ptr) { - c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + if (payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + } } } WeakIValue(WeakIValue&& rhs) noexcept : WeakIValue() { swap(rhs); } ~WeakIValue() { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::decref(payload.as_intrusive_ptr); } } @@ -895,17 +999,33 @@ struct TORCH_API WeakIValue final { IValue lock() const { if (!is_intrusive_ptr) { - return IValue(payload, tag, false); + IValue::Payload newPayload; + newPayload.u = payload; + return IValue(newPayload, tag, false); } - auto temp = c10::weak_intrusive_ptr::reclaim( - payload.as_intrusive_ptr); - IValue::Payload pl; - pl.as_intrusive_ptr = temp.lock().release(); - temp.release(); - if (!pl.as_intrusive_ptr) { - return IValue(); + if (IValue::Tag::Tensor == tag) { + auto temp = c10::weak_intrusive_ptr::reclaim( + static_cast(payload.as_intrusive_ptr)); + c10::intrusive_ptr ip(temp.lock()); + temp.release(); + if (!ip) { + return IValue(); + } else { + return IValue(at::Tensor(std::move(ip))); + } } else { - return IValue(pl, tag, true); + auto temp = c10::weak_intrusive_ptr::reclaim( + payload.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? 
nullptr + : payload.as_intrusive_ptr); + IValue::Payload pl; + pl.u.as_intrusive_ptr = temp.lock().release(); + temp.release(); + if (!pl.u.as_intrusive_ptr) { + return IValue(); + } else { + return IValue(pl, tag, true); + } } } @@ -913,7 +1033,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.use_count(); temp.release(); @@ -924,7 +1044,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.weak_use_count(); temp.release(); @@ -935,7 +1055,8 @@ struct TORCH_API WeakIValue final { } private: - IValue::Payload payload; + using Payload = IValue::Payload::TriviallyCopyablePayload; + Payload payload; IValue::Tag tag; bool is_intrusive_ptr; }; diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 89c8e669c138..fe55d783e780 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -48,14 +48,18 @@ struct tagged_capsule { template c10::intrusive_ptr IValue::moveToIntrusivePtr() { auto t = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); clearToNone(); return t; } template c10::intrusive_ptr IValue::toIntrusivePtr() const { auto r = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); auto p = r; r.release(); return p; @@ -131,12 +135,22 @@ inline c10::intrusive_ptr IValue::toEnumHolder() const& { } inline at::Tensor IValue::toTensor() && { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor( - moveToIntrusivePtr()); + auto result = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + clearToNone(); + return result; } inline at::Tensor IValue::toTensor() const& { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor(toIntrusivePtr()); + return payload.as_tensor; } inline c10::Storage IValue::toStorage() && { AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind()); @@ -148,10 +162,10 @@ inline c10::Storage IValue::toStorage() const& { return c10::Storage(toIntrusivePtr()); } inline c10::Stream IValue::toStream() && { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::Stream IValue::toStream() const& { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::intrusive_ptr IValue::toBlob() && { AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind()); @@ -713,7 +727,8 @@ using _guarded_unsigned_long = std::conditional_t< inline const ivalue::Object& IValue::toObjectRef() const { AT_ASSERT(isObject(), "Expected Object but got ", tagKind()); - return *static_cast(payload.as_intrusive_ptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "Attempted to create null reference"); + return *static_cast(payload.u.as_intrusive_ptr); } // note: when adding a DEFINE_TO case here you should also add a @@ -980,8 +995,11 @@ inline c10::List IValue::toIntList() const& { } inline std::vector IValue::toIntVector() const { AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toIntVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toDoubleList() && { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); @@ -993,8 +1011,11 @@ inline c10::List IValue::toDoubleList() const& { } inline std::vector IValue::toDoubleVector() const { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toDoubleVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toBoolList() && { AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind()); @@ -1014,8 +1035,11 @@ inline c10::List IValue::toTensorList() const& { } inline std::vector IValue::toTensorVector() const { AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toTensorVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toList() && { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); @@ -1027,7 +1051,10 @@ inline c10::List IValue::toList() const& { } inline c10::ArrayRef IValue::toListRef() const { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); - return static_cast(payload.as_intrusive_ptr) + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toListRef on null intrusive_ptr IValue"); + return static_cast(payload.u.as_intrusive_ptr) ->list; } 
inline c10::Dict IValue::toGenericDict() && { @@ -1049,7 +1076,7 @@ inline c10::intrusive_ptr IValue::toTuple() const& { inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template < typename... Args, @@ -1065,14 +1092,14 @@ inline IValue::IValue(const std::tuple& t) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(std::string v) : IValue(ivalue::ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::impl::GenericList v) : tag(Tag::GenericList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template > @@ -1104,7 +1131,7 @@ inline IValue::IValue(std::array v) : IValue(c10::List()) { inline IValue::IValue(c10::impl::GenericDict v) : tag(Tag::GenericDict), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template inline IValue::IValue(c10::Dict v) @@ -1131,17 +1158,17 @@ inline IValue::IValue(c10::nullopt_t) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::PyObject), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Enum), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue IValue::make_capsule( @@ -1149,7 +1176,7 @@ inline IValue IValue::make_capsule( IValue iv; iv.tag = Tag::Capsule; iv.is_intrusive_ptr = true; - iv.payload.as_intrusive_ptr = blob.release(); + iv.payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); return iv; } @@ -1170,30 +1197,33 @@ IValue::IValue(c10::intrusive_ptr custom_class) { auto ivalue_obj = c10::ivalue::Object::create( c10::StrongTypePtr(nullptr, classType), /*num_slots=*/1); ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class))); - payload.as_intrusive_ptr = ivalue_obj.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(ivalue_obj.release()); tag = Tag::Object; is_intrusive_ptr = true; } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Future), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::RRef), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Quantizer), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline const std::string& IValue::toStringRef() const { AT_ASSERT(isString(), "Expected String but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called 
toStringRef on null intrusive_ptr IValue"); return static_cast( - payload.as_intrusive_ptr) + payload.u.as_intrusive_ptr) ->string(); } inline c10::optional> IValue:: @@ -1202,8 +1232,11 @@ inline c10::optional> IValue:: return c10::nullopt; } AT_ASSERT(isString(), "Expected optional but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toOptionalStringRef on null intrusive_ptr IValue"); return std::reference_wrapper( - static_cast(payload.as_intrusive_ptr) + static_cast(payload.u.as_intrusive_ptr) ->string()); } @@ -1241,15 +1274,13 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for bool type, do equality check return this->toBool() == rhs.toBool(); } else if (this->isTensor() && rhs.isTensor()) { - // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr - // is false for undefined tensor - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.as_tensor.is_same(rhs.payload.as_tensor); } else if (this->isTensor() && rhs.isNone()) { // special case: undefined tensor and None are the same identity - return !this->is_intrusive_ptr; + return !this->payload.as_tensor.defined(); } else if (this->isNone() && rhs.isTensor()) { // special case: undefined tensor and None are the same identity - return !rhs.is_intrusive_ptr; + return !rhs.payload.as_tensor.defined(); } else if (this->isInt() && rhs.isInt()) { return this->toInt() == rhs.toInt(); } else if (this->isDouble() && rhs.isDouble()) { @@ -1260,7 +1291,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for objects holding in IValue, do shallow compare on pointer address to // testify the identity return this->is_intrusive_ptr && rhs.is_intrusive_ptr && - this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } } diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index a3ae813616e0..7d3890f582b8 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -2370,19 +2370,19 @@ struct TORCH_API AnyClassType : public Type { inline bool IValue::isDoubleList() const { // note: avoids calling type() to avoid extra referencing counting for the returned type. 
- return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; } inline bool IValue::isTensorList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; } inline bool IValue::isIntList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == IntType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == IntType::Kind; } inline bool IValue::isBoolList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; } template<> diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 637db95991f2..790d97ee3994 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -206,7 +206,7 @@ class intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; @@ -509,7 +509,7 @@ class weak_intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; From 480a756194f27580753a63d908393dfda3baeb25 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 08:33:26 -0800 Subject: [PATCH 12/44] [PyTorch] IValue::toTensor can now return const Tensor& (#48868) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48868 Building on the previous diff, we can make `toTensor()` return a `const Tensor&`, which should make it easier to avoid reference counting. ghstack-source-id: 119327372 Test Plan: internal benchmarks. 
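The caller-side payoff is just reference binding. Here is a minimal stand-in sketch of the accessor pattern (std::shared_ptr<int> plays the role of the refcounted at::Tensor; Holder and its members are illustrative assumptions, not PyTorch API): ref-qualified overloads let lvalue access hand out a const reference with no refcount bump, while rvalue access moves the value out.

#include <memory>
#include <utility>

// Stand-in for the accessor split: const& on lvalues, move-out on rvalues.
class Holder {
 public:
  explicit Holder(std::shared_ptr<int> v) : value_(std::move(v)) {}

  // Lvalue access: no copy, so no reference-count traffic.
  const std::shared_ptr<int>& value() const& { return value_; }
  // Rvalue access: steal the payload instead of bumping and dropping the count.
  std::shared_ptr<int> value() && { return std::move(value_); }

 private:
  std::shared_ptr<int> value_;
};

int main() {
  Holder h(std::make_shared<int>(7));
  const auto& v = h.value();                               // use_count stays at 1
  auto owned = Holder(std::make_shared<int>(9)).value();   // && overload: moved out
  return (*v + *owned == 16) ? 0 : 1;
}

This is the same reason the diff below rewrites call sites from `auto t = ... .toTensor()` to `auto& t = ... .toTensor()`: binding a reference instead of a value is what removes the atomic incref/decref pair at each use.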
Reviewed By: bwasti Differential Revision: D25325379 fbshipit-source-id: ca699632901691bcee432f595f75b0a4416d55dd --- aten/src/ATen/core/ivalue.h | 7 +- aten/src/ATen/core/ivalue_inl.h | 7 +- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 2 +- torch/csrc/jit/frontend/tracer.cpp | 6 +- torch/csrc/jit/passes/freeze_module.cpp | 8 +- torch/csrc/jit/runtime/argument_spec.h | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 4 +- torch/csrc/jit/runtime/profiling_record.cpp | 2 +- torch/csrc/jit/runtime/static/ops.cpp | 82 +++++++++---------- torch/csrc/jit/serialization/pickler.cpp | 2 +- torch/csrc/jit/serialization/python_print.cpp | 4 +- torch/csrc/jit/serialization/unpickler.cpp | 2 +- 12 files changed, 67 insertions(+), 61 deletions(-) diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 5370294b2f2c..ca68a8df46e1 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -275,8 +275,8 @@ struct TORCH_API IValue final { // Tensors should be compared based on internal storage if (this->isTensor()) { - const auto thisTensor = this->toTensor(); - const auto rhsTensor = rhs.toTensor(); + const auto& thisTensor = this->toTensor(); + const auto& rhsTensor = rhs.toTensor(); return thisTensor.is_alias_of(rhsTensor); } @@ -345,7 +345,8 @@ struct TORCH_API IValue final { return Tag::Tensor == tag; } at::Tensor toTensor() &&; - at::Tensor toTensor() const&; + at::Tensor& toTensor() &; + const at::Tensor& toTensor() const&; at::TensorImpl* unsafeToTensorImpl() const { return payload.as_tensor.unsafeGetTensorImpl(); } diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index fe55d783e780..b96f4b834989 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -148,7 +148,11 @@ inline at::Tensor IValue::toTensor() && { clearToNone(); return result; } -inline at::Tensor IValue::toTensor() const& { +inline at::Tensor& IValue::toTensor() & { + AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); + return payload.as_tensor; +} +inline const at::Tensor& IValue::toTensor() const& { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); return payload.as_tensor; } @@ -744,6 +748,7 @@ inline const ivalue::Object& IValue::toObjectRef() const { inline type IValue::to() const& { \ return this->method_name(); \ } + DEFINE_TO(at::Tensor, toTensor) DEFINE_TO(at::Storage, toStorage) DEFINE_TO(c10::Stream, toStream) diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index f1a0a634727a..5bddc510fe56 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -209,7 +209,7 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( std::stringstream encoded_inputs; for (const auto& input : inputs) { if (input.isTensor()) { - auto input_tensor = input.toTensor(); + auto& input_tensor = input.toTensor(); encoded_inputs << ";"; auto sep = ""; diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 1bab391bd393..0c88371399de 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -137,7 +137,7 @@ Value* TracingState::getValue(const IValue& var) { return graph->insertNode(dict_node)->output(); } if (var.isTensor()) { - auto ten = var.toTensor(); + auto& ten = var.toTensor(); if (!ten.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -237,7 +237,7 @@ bool TracingState::hasValue(const IValue& var) const { Value* 
TracingState::getOutput(const IValue& iv, size_t i) { bool tracing_mode_strict = getTracingState()->strict; if (iv.isTensor()) { - at::Tensor var = iv.toTensor(); + const at::Tensor& var = iv.toTensor(); if (!var.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -506,7 +506,7 @@ void setValueTrace(const IValue& v, Value* value) { } void TracingState::setValue(const IValue& v, Value* value) { if (v.isTensor()) { - auto var = v.toTensor(); + auto& var = v.toTensor(); AT_ASSERT(var.defined()); env_stack.back()[v] = value; } else if (v.isTensorList()) { diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 2778c7712f23..f66f54eeb567 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -289,11 +289,11 @@ class AttributePropagator { IValue overrideGradient(IValue attr) { if (attr.isTensor()) { - auto t = attr.toTensor(); + auto& t = attr.toTensor(); if (t.requires_grad()) { - t = t.detach(); - t.set_requires_grad(false); - attr = IValue(t); + auto detached = t.detach(); + detached.set_requires_grad(false); + attr = IValue(std::move(detached)); } } else if (attr.isTuple()) { auto tuple = std::move(attr).toTuple(); diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 401933c6d67e..a0e60e879146 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -237,7 +237,7 @@ struct CompleteArgumentSpec { for (int32_t i = 0; i < num_inputs; i++) { if (!inputs[i].isTensor()) continue; - auto tensor = inputs[i].toTensor(); + auto& tensor = inputs[i].toTensor(); all_dims += tensor.defined() ? tensor.ndimension() : 0; } // allocate enough room for all TensorPODs and dimensions diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 24ca9dbf9793..ce4718becaf7 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1418,7 +1418,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // Check every input's shape against profiled (expected) shape. 
for (i = 0; i < num_inputs; i++) { auto& input = peek(stack, i, num_inputs); - auto t = input.toTensor(); + auto& t = input.toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X + i]; auto expected_type = expected->cast(); if (t.defined() && !expected_type->matchTensor(t)) { @@ -1439,7 +1439,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // so it's safe to pass this guard check push(stack, true); } else { - auto t = stack.back().toTensor(); + auto& t = stack.back().toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X]; auto expected_type = expected->cast(); if (t.defined() && diff --git a/torch/csrc/jit/runtime/profiling_record.cpp b/torch/csrc/jit/runtime/profiling_record.cpp index 8d276dd58b50..d233f089f187 100644 --- a/torch/csrc/jit/runtime/profiling_record.cpp +++ b/torch/csrc/jit/runtime/profiling_record.cpp @@ -165,7 +165,7 @@ void ProfilingRecord::insertShapeProfile(Node* n, size_t offset) { if (v.isTensor()) { std::lock_guard lock(this->mutex_); auto& profiled_types = profiled_types_per_frame_[frame_id]; - auto t = v.toTensor(); + auto& t = v.toTensor(); if (t.defined()) { auto pttp = tensorTypeInCurrentExecutionContext(t); GRAPH_DEBUG( diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 5c118f513565..89519d3765b5 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -79,13 +79,13 @@ struct static_add final : public at::native::structured_add_out { REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); static_add op{out_t}; op.meta(in0_t, in1_t, in2_s); op.impl(in0_t, in1_t, in2_s, out_t); @@ -94,12 +94,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::mul_out(out_t, in0_t, in1_t); }; @@ -107,15 +107,15 @@ REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); - auto in2_t = p_node->Input(2, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); + auto& in2_t = p_node->Input(2, reg).toTensor(); auto in3_s = p_node->Input(3, reg).toScalar(); auto in4_s = p_node->Input(4, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, 
reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::addmm_cpu_out(out_t, in0_t, in1_t, in2_t, in3_s, in4_s); }; @@ -123,13 +123,13 @@ REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::clamp_out(out_t, in0_t, in1_s, in2_s); }; @@ -137,12 +137,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::bmm, aten_bmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::bmm_out_cpu(out_t, in0_t, in1_t); }; @@ -154,7 +154,7 @@ REGISTER_OPERATOR_FUNCTOR( [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { auto input_size = p_node->input_regs().size(); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = input_size > 1 ? p_node->Input(1, reg).toDouble() : 0; double in2_d = input_size > 2 ? 
p_node->Input(2, reg).toDouble() : std::numeric_limits::infinity(); @@ -164,7 +164,7 @@ REGISTER_OPERATOR_FUNCTOR( if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::nan_to_num_out(out_t, in0_t, in1_d, in2_d, in3_d); }; @@ -176,18 +176,18 @@ REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator { if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_tl[0]); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, in0_tl, in1_i); }; }); REGISTER_OPERATOR_FUNCTOR(aten::tanh, aten_tanh, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::tanh_out(out_t, in0_t); }; @@ -217,7 +217,7 @@ SROperator aten_stack(Node* n) { for (auto i = 0; i < inputs.size(); i++) { inputs[i] = inputs[i].unsqueeze(dim); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, inputs, dim); }; @@ -230,11 +230,11 @@ REGISTER_OPERATOR_FUNCTOR( aten_sigmoid, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::sigmoid_out(out_t, in0_t); }; @@ -247,57 +247,57 @@ REGISTER_OPERATOR_FUNCTOR( if (in1) { auto in1_s = in1->toScalar(); return [=](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } else { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } }); REGISTER_OPERATOR_FUNCTOR(aten::relu, aten_relu, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::threshold_out(out_t, in0_t, 0, 0); }; }); 
REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = p_node->input_regs().size() > 1 ? p_node->Input(1, reg).toDouble() : -1.0; if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::logit_out(out_t, in0_t, in1_d); }; }); REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::resize_as_(out_t, in0_t, c10::nullopt); at::native::copy_(out_t, in0_t, false); }; @@ -317,14 +317,14 @@ std::function&)> getNativeOperation(Node* n) { if (n->kind() == c10::Symbol::fromQualString("aten::transpose")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::transpose(in0_t, in1_i, in2_i); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::flatten")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::flatten(in0_t, in1_i, in2_i); @@ -386,19 +386,19 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::permute")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::permute(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::reshape")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::reshape(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::slice")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); auto in3_i = p_node->Input(3, reg).toInt(); @@ -408,13 +408,13 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::narrow")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto self = p_node->Input(0, reg).toTensor(); // self + auto& self = p_node->Input(0, reg).toTensor(); // self auto dim = p_node->Input(1, reg).toInt(); // dim int64_t start = 0; if (p_node->Input(2, reg).isScalar()) { start = p_node->Input(2, reg).toInt(); } else { - auto t = p_node->Input(2, 
reg).toTensor(); + auto& t = p_node->Input(2, reg).toTensor(); start = t.item(); } auto length = p_node->Input(3, reg).toInt(); // length @@ -440,7 +440,7 @@ getNativeOperation(Node* n) { } else if (n->kind() == c10::Symbol::fromQualString("aten::to")) { return [](const ProcessedNode* p_node, std::vector& reg) { DCHECK(p_node->input_regs().size() == 5); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toScalarType(); auto in2_i = p_node->Input(2, reg).toBool(); auto in3_i = p_node->Input(3, reg).toBool(); diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6e5c3b927c38..811569485888 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -354,7 +354,7 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) { // // The format here is the same one used by `torch.save()`. The code for the // format can be found in `torch/serialization.py`. - auto tensor = ivalue.toTensor(); + auto& tensor = ivalue.toTensor(); bool quantized = tensor.is_quantized(); // The arguments to this function are: // storage, storage_offset, size, stride, requires_grad, backward_hooks diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index c86cbc460c9c..18d656c98f32 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -309,12 +309,12 @@ struct PythonPrintImpl { // because it doesn't hash any information about the tensors. // We will probably need to optimize this at some point using hashing. if (val.isTensor()) { - auto t = val.toTensor(); + auto& t = val.toTensor(); for (size_t i = 0; i < constant_table_.size(); ++i) { if (!constant_table_[i].isTensor()) { continue; } - auto t2 = constant_table_[i].toTensor(); + auto& t2 = constant_table_[i].toTensor(); if (t.options().type_equal(t2.options()) && t.equal(t2)) { return i; } diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 3ff5da29fe1f..841e87592be9 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -632,7 +632,7 @@ void Unpickler::rebuildTensor(bool quantized) { auto tup = pop(stack_).toTuple(); const auto& elements = tup->elements(); size_t idx = 0; - auto storage_tensor = elements.at(idx++).toTensor(); + auto& storage_tensor = elements.at(idx++).toTensor(); int64_t storage_offset = elements.at(idx++).toInt(); std::vector size = tupleToIntList(elements.at(idx++)); std::vector stride = tupleToIntList(elements.at(idx++)); From 68a6e4637903dba279c60daae5cff24e191ff9b4 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 6 Jan 2021 08:39:11 -0800 Subject: [PATCH 13/44] Push anonymous namespace into codegen, not template (#49498) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49498 In the near future, I want to code generate some functions that are visible externally to this compilation unit. I cannot easily do this if all the codegen code is wrapped in a global anonymous namespace, so push the namespace in. Registration has to stay in an anonymous namespace to avoid name conflicts. This could also have been solved by making the wrapper functions have more unique names but I didn't do this in the end. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: albanD, smessmer Differential Revision: D25616104 Pulled By: ezyang fbshipit-source-id: 323c0dda05a081502aab702f359a08dfac8c41a4 --- aten/src/ATen/templates/RegisterDispatchKey.cpp | 7 +++++-- tools/codegen/gen.py | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index e923f6d73bd0..ed4359c6883e 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -37,10 +37,13 @@ namespace at { -namespace { - ${dispatch_definitions} +// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid +// ambiguity with conflicting identifiers that may have been defined in +// at namespace already. +namespace { + TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) { ${dispatch_registrations} } diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 8f521e6651bc..4768670b6f26 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -435,6 +435,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: # For an overview of what this template code looks like, see # https://github.com/pytorch/rfcs/pull/9 return f"""\ +namespace {{ + {self.gen_structured_class( f, k, class_name=class_name, @@ -448,6 +450,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: {impl_call} return {ret_expr}; }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: @@ -540,9 +544,13 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: """ return f"""\ +namespace {{ + {returns_type} {name}({args_str}) {{ {cuda_guard}{return_kw}{impl_name}({args_exprs_str}); }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: From 74c055b24065d0202aecdf4bc837d3698d1639e1 Mon Sep 17 00:00:00 2001 From: Loi Ly Date: Wed, 6 Jan 2021 09:45:15 -0800 Subject: [PATCH 14/44] Fix mypy type hint for AdaptiveAvgPool2,3d, AdaptiveMaxPool2,3d (#49963) Summary: Fixes https://github.com/pytorch/pytorch/issues/49918 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49963 Reviewed By: mrshenli, heitorschueroff Differential Revision: D25760110 Pulled By: ezyang fbshipit-source-id: aeb655b784689544000ea3b948f7d6d025aee441 --- test/type_hint_tests/opt_size.py | 6 ++++++ torch/nn/common_types.py | 7 ++++++- torch/nn/functional.pyi.in | 10 +++++----- torch/nn/modules/pooling.py | 15 ++++++++------- 4 files changed, 25 insertions(+), 13 deletions(-) create mode 100644 test/type_hint_tests/opt_size.py diff --git a/test/type_hint_tests/opt_size.py b/test/type_hint_tests/opt_size.py new file mode 100644 index 000000000000..f24e57e6e56f --- /dev/null +++ b/test/type_hint_tests/opt_size.py @@ -0,0 +1,6 @@ +import torch.nn as nn + +avg_pool1 = nn.AdaptiveAvgPool2d((1, None)) +avg_pool2 = nn.AdaptiveAvgPool2d((None, 1)) +max_pool1 = nn.AdaptiveMaxPool2d((1, None)) +max_pool2 = nn.AdaptiveMaxPool2d((None, 1)) diff --git a/torch/nn/common_types.py b/torch/nn/common_types.py index fa9d5bb1eb00..884f739e2781 100644 --- a/torch/nn/common_types.py +++ b/torch/nn/common_types.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Union, Tuple +from typing import TypeVar, Union, Tuple, Optional from .. 
import Tensor # Create some useful type aliases @@ -24,6 +24,11 @@ _size_5_t = _scalar_or_tuple_5_t[int] _size_6_t = _scalar_or_tuple_6_t[int] +# For arguments which represent optional size parameters (eg, adaptive pool parameters) +_size_any_opt_t = _scalar_or_tuple_any_t[Optional[int]] +_size_2_opt_t = _scalar_or_tuple_2_t[Optional[int]] +_size_3_opt_t = _scalar_or_tuple_3_t[Optional[int]] + # For arguments that represent a ratio to adjust each dimension of an input with (eg, upsampling parameters) _ratio_2_t = _scalar_or_tuple_2_t[float] _ratio_3_t = _scalar_or_tuple_3_t[float] diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index 94071556e144..208dc7c2df40 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -1,7 +1,7 @@ from torch import Tensor from torch.types import _size from typing import Any, Optional, Tuple, Dict, List, Callable, Sequence, Union -from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t +from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t, _size_2_opt_t, _size_3_opt_t # 'TypedDict' is a new accepted type that represents a dictionary with a fixed set of allowed keys. # It is standards-track but not in `typing` yet. We leave this hear to be uncommented once the feature @@ -75,21 +75,21 @@ def adaptive_max_pool1d_with_indices(input: Tensor, output_size: _size, return_i Tensor, Tensor]: ... -def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size_2_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... -def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size_3_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... def adaptive_avg_pool1d(input: Tensor, output_size: _size_1_t) -> Tensor: ... -def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_t) -> Tensor: ... +def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_opt_t) -> Tensor: ... -def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_t) -> Tensor: ... +def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_opt_t) -> Tensor: ... def dropout(input: Tensor, p: float = ..., training: bool = ..., inplace: bool = ...) -> Tensor: ... diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index e8f68307f230..78aae504083b 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -5,7 +5,8 @@ from .utils import _single, _pair, _triple from .. 
import functional as F -from ..common_types import _size_any_t, _size_1_t, _size_2_t, _size_3_t, _ratio_3_t, _ratio_2_t +from ..common_types import (_size_any_t, _size_1_t, _size_2_t, _size_3_t, + _ratio_3_t, _ratio_2_t, _size_any_opt_t, _size_2_opt_t, _size_3_opt_t) class _MaxPoolNd(Module): @@ -953,7 +954,7 @@ class _AdaptiveMaxPoolNd(Module): __constants__ = ['output_size', 'return_indices'] return_indices: bool - def __init__(self, output_size: _size_any_t, return_indices: bool = False) -> None: + def __init__(self, output_size: _size_any_opt_t, return_indices: bool = False) -> None: super(_AdaptiveMaxPoolNd, self).__init__() self.output_size = output_size self.return_indices = return_indices @@ -1020,7 +1021,7 @@ class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) @@ -1057,7 +1058,7 @@ class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool3d(input, self.output_size, self.return_indices) @@ -1066,7 +1067,7 @@ def forward(self, input: Tensor) -> Tensor: class _AdaptiveAvgPoolNd(Module): __constants__ = ['output_size'] - def __init__(self, output_size: _size_any_t) -> None: + def __init__(self, output_size: _size_any_opt_t) -> None: super(_AdaptiveAvgPoolNd, self).__init__() self.output_size = output_size @@ -1125,7 +1126,7 @@ class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool2d(input, self.output_size) @@ -1159,7 +1160,7 @@ class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool3d(input, self.output_size) From efe0533a24796c7402e1e8eba2317eb5424d90e3 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 10:39:15 -0800 Subject: [PATCH 15/44] Clean up some type annotations in torch/testing/_internal (#50078) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50078 Upgrades type annotations from Python2 to Python3 Test Plan: Sandcastle tests Reviewed By: pritamdamania87 Differential Revision: D25717560 fbshipit-source-id: cec631f3121ef9ab87ff8b3b00f1fae6df9a2155 --- .../_internal/distributed/rpc/dist_autograd_test.py | 7 +++---- .../distributed/rpc/jit/dist_autograd_test.py | 3 +-- .../_internal/distributed/rpc/jit/rpc_test_faulty.py | 10 ++++------ 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index c7fdbe536061..15d5cfeca214 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -11,6 +11,7 @@ import torch.testing._internal.dist_utils from torch.autograd import Function from torch.autograd.function import once_differentiable +from torch.distributed.rpc import RRef from torch.testing._internal.common_utils import IS_MACOS from torch.testing._internal.dist_utils import ( dist_init, @@ -70,8 +71,7 @@ def create_tensor(): @torch.jit.script -def create_torchscript_tensor(): - # type: () -> Tensor +def create_torchscript_tensor() -> torch.Tensor: return torch.ones((3, 3)).requires_grad_() @@ -94,8 +94,7 @@ def 
my_script_add(t1, t2): @torch.jit.script -def my_script_ref_add(ref_t1, t2): - # type: (RRef[Tensor], Tensor) -> Tensor +def my_script_ref_add(ref_t1: RRef[torch.Tensor], t2: torch.Tensor) -> torch.Tensor: t1 = ref_t1.to_here() return torch.add(t1, t2) diff --git a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py index ee3ebdb33eff..5ae40cdea065 100644 --- a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py @@ -34,8 +34,7 @@ def test_get_gradients(self): dst_rank = self.rank @torch.jit.script - def dist_get_gradients(context_id): - # type: (int) -> (Dict[Tensor, Tensor]) + def dist_get_gradients(context_id: int) -> (Dict[Tensor, Tensor]): return dist_autograd.get_gradients(context_id) FileCheck().check("get_gradients").run(str(dist_get_gradients.graph)) diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py index 656f25322274..96ede7231a97 100644 --- a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py +++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py @@ -3,6 +3,7 @@ import torch import torch.distributed.rpc as rpc from torch import Tensor +from torch.distributed.rpc import RRef from torch.testing._internal.dist_utils import ( dist_init, worker_name, @@ -63,18 +64,15 @@ def rpc_async_call_future_ret( return fut @torch.jit.script -def rref_to_here(rref_var): - # type: (RRef[Tensor]) -> Tensor +def rref_to_here(rref_var: RRef[Tensor]) -> Tensor: return rref_var.to_here() @torch.jit.script -def rref_to_here_with_timeout(rref_var, timeout): - # type: (RRef[Tensor], float) -> Tensor +def rref_to_here_with_timeout(rref_var: RRef[Tensor], timeout: float) -> Tensor: return rref_var.to_here(timeout) @torch.jit.script -def rpc_async_with_rref_arg(dst_worker_name, args): - # type: (str, Tuple[RRef[Tensor]]) -> Tensor +def rpc_async_with_rref_arg(dst_worker_name: str, args: Tuple[RRef[Tensor]]) -> Tensor: fut = rpc.rpc_async(dst_worker_name, rref_to_here, args) ret = fut.wait() return ret From e606e603312cf874127b560bcbb8b78b9574ac84 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 10:41:07 -0800 Subject: [PATCH 16/44] [Needs Review] Convert some files to Python3 (#49351) Summary: Uses the Python standard library 2to3 script to convert a number of Python 2 files to Python 3. This facilitates code maintenance such as dropping unused imports in D25500422. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49351 Test Plan: Standard sandcastle tests Reviewed By: xush6528 Differential Revision: D25499576 fbshipit-source-id: 0c44718ac734771ce0758b1cb30676cc3d76ac10 --- docs/caffe2/process.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/caffe2/process.py b/docs/caffe2/process.py index 9fa37e5fbb5a..3b94b9d38502 100644 --- a/docs/caffe2/process.py +++ b/docs/caffe2/process.py @@ -1,20 +1,21 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 ## @package process # Module doxygen.process # Script to insert preamble for doxygen and regen API docs -import glob, os, shutil +import os +import shutil # Module caffe2...caffe2.python.control_test -def insert(originalfile,first_line,description): - with open(originalfile,'r') as f: +def insert(originalfile, first_line, description): + with open(originalfile, 'r') as f: f1 = f.readline() - if(f1.find(first_line)<0): + if(f1.find(first_line) < 0): docs = first_line + description + f1 - with open('newfile.txt','w') as f2: + with open('newfile.txt', 'w') as f2: f2.write(docs) f2.write(f.read()) - os.rename('newfile.txt',originalfile) + os.rename('newfile.txt', originalfile) else: print('already inserted') @@ -29,15 +30,15 @@ def insert(originalfile,first_line,description): for file in files: if (file.endswith(".py") and not file.endswith("_test.py") and not file.endswith("__.py")): filepath = os.path.join(root, file) - print("filepath: " + filepath) + print(("filepath: " + filepath)) directory = os.path.dirname(filepath)[2:] - directory = directory.replace("/",".") - print "directory: " + directory + directory = directory.replace("/", ".") + print("directory: " + directory) name = os.path.splitext(file)[0] first_line = "## @package " + name description = "\n# Module " + directory + "." + name + "\n" - print first_line,description - insert(filepath,first_line,description) + print(first_line, description) + insert(filepath, first_line, description) if os.path.exists("doxygen/doxygen-python"): print("Looks like you ran this before, so we need to cleanup those old files...") From 7d9eb6c6802e85193cdd6139833ff66dc0be228f Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Wed, 6 Jan 2021 10:49:00 -0800 Subject: [PATCH 17/44] Implementation of torch::cuda::synchronize (#50072) Summary: Adding `torch::cuda::synchronize()` to libtorch. Note that the implementation here adds a new method to the `CUDAHooksInterface`. An alternative that was suggested to me is to add a method to the `DeviceGuard` interface. 
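For readers who want to see the new libtorch entry point in action, here is a minimal, hypothetical usage sketch (not taken from this patch; the tensor workload is purely illustrative, and only `torch::cuda::synchronize()` itself is what this change adds):

```
// Illustrative only: queue asynchronous GPU work, then block the host thread
// until every kernel queued on the device has completed.
#include <torch/torch.h>

int main() {
  if (!torch::cuda::is_available()) {
    return 0;  // nothing to synchronize without a CUDA device
  }
  auto a = torch::randn({1024, 1024}, torch::kCUDA);
  auto b = torch::mm(a, a);      // enqueued asynchronously on the current stream
  torch::cuda::synchronize();    // device_index defaults to -1; pass an explicit
                                 // index, e.g. torch::cuda::synchronize(0), to
                                 // wait on a specific GPU
  return 0;
}
```

This mirrors the existing Python-side `torch.cuda.synchronize()`.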
Fixes https://github.com/pytorch/pytorch/issues/47722 Pull Request resolved: https://github.com/pytorch/pytorch/pull/50072 Reviewed By: H-Huang Differential Revision: D25804342 Pulled By: jbschlosser fbshipit-source-id: 45aa61d7c6fbfd3178caf2eb5ec053d6c01b5a43 --- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 5 +++++ aten/src/ATen/cuda/detail/CUDAHooks.h | 1 + aten/src/ATen/detail/CUDAHooksInterface.h | 4 ++++ torch/csrc/api/include/torch/cuda.h | 3 +++ torch/csrc/api/src/cuda.cpp | 9 +++++++++ 5 files changed, 22 insertions(+) diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index f38860e8ef13..b75ef8219b1c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -369,6 +369,11 @@ int CUDAHooks::getNumGPUs() const { return at::cuda::device_count(); } +void CUDAHooks::deviceSynchronize(int64_t device_index) const { + at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index)); + c10::cuda::device_synchronize(); +} + // Sigh, the registry doesn't support namespaces :( using at::CUDAHooksRegistry; using at::RegistererCUDAHooksRegistry; diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index dff8913b153f..abef2e7ff835 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -38,6 +38,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { int64_t cuFFTGetPlanCacheSize(int64_t device_index) const override; void cuFFTClearPlanCache(int64_t device_index) const override; int getNumGPUs() const override; + void deviceSynchronize(int64_t device_index) const override; }; }}} // at::cuda::detail diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index af4eb6fd0739..afe88761d88f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -181,6 +181,10 @@ struct TORCH_API CUDAHooksInterface { virtual int getNumGPUs() const { return 0; } + + virtual void deviceSynchronize(int64_t device_index) const { + TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP); + } }; // NB: dummy argument to suppress "ISO C++11 requires at least one argument diff --git a/torch/csrc/api/include/torch/cuda.h b/torch/csrc/api/include/torch/cuda.h index 5f6f2a9eb8a9..a7e063b90af9 100644 --- a/torch/csrc/api/include/torch/cuda.h +++ b/torch/csrc/api/include/torch/cuda.h @@ -23,5 +23,8 @@ void TORCH_API manual_seed(uint64_t seed); /// Sets the seed for all available GPUs. void TORCH_API manual_seed_all(uint64_t seed); +/// Waits for all kernels in all streams on a CUDA device to complete. 
+void TORCH_API synchronize(int64_t device_index = -1); + } // namespace cuda } // namespace torch diff --git a/torch/csrc/api/src/cuda.cpp b/torch/csrc/api/src/cuda.cpp index d40cd8611c42..b8f3ffa0ee0a 100644 --- a/torch/csrc/api/src/cuda.cpp +++ b/torch/csrc/api/src/cuda.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -49,5 +50,13 @@ void manual_seed_all(uint64_t seed) { } } +void synchronize(int64_t device_index) { + TORCH_CHECK(is_available(), "No CUDA GPUs are available"); + int64_t num_gpus = cuda::device_count(); + TORCH_CHECK(device_index == -1 || device_index < num_gpus, + "Device index out of range: ", device_index); + at::detail::getCUDAHooks().deviceSynchronize(device_index); +} + } // namespace cuda } // namespace torch From 638086950d3f339de49c6b5393733aea2fee6a55 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 11:01:27 -0800 Subject: [PATCH 18/44] Clean up type annotations in torch/nn/quantized/modules (#49941) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49941 Test Plan: Sandcastle Reviewed By: jerryzh168 Differential Revision: D25718715 fbshipit-source-id: bbe450d937cf7ef634e003c09146e308180d1d58 --- torch/nn/quantized/modules/conv.py | 21 +++----- torch/nn/quantized/modules/embedding_ops.py | 6 +-- .../quantized/modules/functional_modules.py | 54 +++++++------------ 3 files changed, 27 insertions(+), 54 deletions(-) diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index a9ba3293630d..00ceba7ab367 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -240,8 +240,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv1d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -327,8 +326,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv2d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -412,8 +410,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv3d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -466,8 +463,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode) - def _input_padding(self, kernel_size, dilation, padding): - # type: (List[int], List[int], List[int]) -> List[int] + def _input_padding(self, kernel_size: List[int], dilation: List[int], padding: List[int]) -> List[int]: res = torch.jit.annotate(List[int], []) for kdx in range(len(kernel_size)): pad = (dilation[kdx] * (kernel_size[kdx] - 1) - padding[kdx]) @@ -561,8 +557,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 
'QuantizedConvTranpose1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose1d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -645,8 +640,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose2d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -730,8 +724,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose3d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) diff --git a/torch/nn/quantized/modules/embedding_ops.py b/torch/nn/quantized/modules/embedding_ops.py index e41d55347741..523994b364c8 100644 --- a/torch/nn/quantized/modules/embedding_ops.py +++ b/torch/nn/quantized/modules/embedding_ops.py @@ -22,8 +22,7 @@ def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8): raise NotImplementedError('Unsupported dtype on quantized embedding! Supports quint8 and quint4x2.') @torch.jit.export - def set_weight(self, weight): - # type: (torch.Tensor) -> None + def set_weight(self, weight: torch.Tensor) -> None: if self.dtype in [torch.quint8, torch.quint4x2]: self._packed_weight = torch.ops.quantized.embedding_bag_prepack(weight) else: @@ -125,8 +124,7 @@ def extra_repr(self): return extra_repr_str - def set_weight(self, w): - # type: (torch.Tensor) -> None + def set_weight(self, w: torch.Tensor) -> None: self._packed_params.set_weight(w) def weight(self): diff --git a/torch/nn/quantized/modules/functional_modules.py b/torch/nn/quantized/modules/functional_modules.py index b9fab962d563..08b5447bb925 100644 --- a/torch/nn/quantized/modules/functional_modules.py +++ b/torch/nn/quantized/modules/functional_modules.py @@ -40,45 +40,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. 
return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) r = self.activation_post_process(r) @@ -101,38 +95,32 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) return r @@ -195,45 +183,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.ops.quantized.add``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.add_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. 
return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.mul(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.mul_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.ops.quantized.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = ops.quantized.cat(x, scale=self.scale, zero_point=self.zero_point, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add_relu``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add_relu(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r From 3ce539881a12df901538d6cd93f752469583f65b Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 6 Jan 2021 11:26:03 -0800 Subject: [PATCH 19/44] Back out "Revert D25757721: [pytorch][PR] Run mypy on more test files" (#50142) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50142 Original commit changeset: 58437d719285 Test Plan: OSS CI Reviewed By: walterddr, ngimel Differential Revision: D25803866 fbshipit-source-id: d6b83a5211e430c0451994391876103f1ad96315 --- mypy.ini | 11 +++++++++++ test/test_bundled_inputs.py | 4 +++- test/test_expecttest.py | 3 ++- test/test_numpy_interop.py | 18 +++++++++--------- torch/testing/_internal/expecttest.py | 4 +++- torch/utils/bundled_inputs.py | 4 ++-- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/mypy.ini b/mypy.ini index 7d6161bddd17..bab4ce5dfd42 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,8 +17,13 @@ check_untyped_defs = True files = torch, caffe2, + test/test_bundled_images.py, + test/test_bundled_inputs.py, test/test_complex.py, + test/test_dataset.py, + test/test_expecttest.py, test/test_futures.py, + test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, test/test_type_info.py @@ -119,6 +124,12 @@ ignore_errors = True [mypy-torch.overrides] ignore_errors = True +# +# Adding type annotations to caffe2 is probably not worth the effort +# only work on this if you have a specific reason for it, otherwise +# leave these ignores as they are. +# + [mypy-caffe2.python.*] ignore_errors = True diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py index f57407c9b1d1..e12339f3acea 100644 --- a/test/test_bundled_inputs.py +++ b/test/test_bundled_inputs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 import io +from typing import List + import torch import torch.utils.bundled_inputs from torch.testing._internal.common_utils import TestCase, run_tests @@ -27,7 +29,7 @@ def forward(self, arg): sm = torch.jit.script(SingleTensorModel()) original_size = model_size(sm) - get_expr = [] + get_expr : List[str] = [] samples = [ # Tensor with small numel and small storage. 
(torch.tensor([1]),), diff --git a/test/test_expecttest.py b/test/test_expecttest.py index 652a33c41869..5e2461797705 100644 --- a/test/test_expecttest.py +++ b/test/test_expecttest.py @@ -4,6 +4,7 @@ import string import textwrap import doctest +from typing import Dict, Any import hypothesis from hypothesis.strategies import text, integers, composite, sampled_from, booleans @@ -38,7 +39,7 @@ def test_replace_string_literal_roundtrip(self, t, raw, quote): r3 = {r}{quote}placeholder3{quote} """.format(r='r' if raw else '', quote=quote * 3) new_prog = expecttest.replace_string_literal(textwrap.dedent(prog), 2, t)[0] - ns = {} + ns : Dict[str, Any] = {} exec(new_prog, ns) msg = "program was:\n{}".format(new_prog) self.assertEqual(ns['r'], 'placeholder', msg=msg) # noqa: F821 diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 35ac4eb94889..81c385ae90a2 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -47,10 +47,8 @@ def get_castable_tensor(shape, dtype): else: # can't directly use min and max, because for int64_t, max - min # is greater than int64_t range and triggers UB. - dtype_info = torch.iinfo(dtype) - low = max(dtype_info.min, int(-1e10)) - high = min(dtype_info.max, int(1e10)) - dtype_info = torch.iinfo(dtype) + low = max(torch.iinfo(dtype).min, int(-1e10)) + high = min(torch.iinfo(dtype).max, int(1e10)) t = torch.empty(shape, dtype=torch.int64).random_(low, high) return t.to(dtype) @@ -272,10 +270,12 @@ def test_numpy_array_interface(self, device): ] for tp, dtype in zip(types, dtypes): if np.dtype(dtype).kind == 'u': - x = torch.Tensor([1, 2, 3, 4]).type(tp) + # .type expects a XxxTensor, which have no type hints on + # purpose, so ignore during mypy type checking + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) else: - x = torch.Tensor([1, -2, 3, -4]).type(tp) + x = torch.Tensor([1, -2, 3, -4]).type(tp) # type: ignore array = np.array([1, -2, 3, -4], dtype=dtype) # Test __array__ w/o dtype argument @@ -309,7 +309,7 @@ def test_numpy_array_interface(self, device): float_types = [torch.DoubleTensor, torch.FloatTensor] float_dtypes = [np.float64, np.float32] for tp, dtype in zip(float_types, float_dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) for func in ['sin', 'sqrt', 'ceil']: ufunc = getattr(np, func) @@ -321,7 +321,7 @@ def test_numpy_array_interface(self, device): # Test functions with boolean return value for tp, dtype in zip(types, dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) geq2_x = np.greater_equal(x, 2) geq2_array = np.greater_equal(array, 2).astype('uint8') @@ -360,7 +360,7 @@ def test_parse_numpy_int(self, device): self.assertEqual(torch.ones([2, 2, 2, 2]).mean(scalar), torch.ones([2, 2, 2, 2]).mean(np_val)) # numpy integral type parses like a python int in custom python bindings: - self.assertEqual(torch.Storage(np_val).size(), scalar) + self.assertEqual(torch.Storage(np_val).size(), scalar) # type: ignore tensor = torch.tensor([2], dtype=torch.int) tensor[0] = np_val diff --git a/torch/testing/_internal/expecttest.py b/torch/testing/_internal/expecttest.py index 9e46a9a84a37..4dae7ebf03dc 100644 --- a/torch/testing/_internal/expecttest.py +++ b/torch/testing/_internal/expecttest.py @@ -3,6 +3,7 @@ import traceback import os import string +from typing import 
Tuple # This file implements expect tests (also known as "golden" tests). @@ -139,7 +140,8 @@ def ok_for_raw_triple_quoted_string(s, quote): r"(?Pr?)", re.DOTALL) -def replace_string_literal(src, lineno, new_string): +def replace_string_literal(src : str, lineno : int, + new_string : str) -> Tuple[str, int]: r""" Replace a triple quoted string literal with new contents. Only handles printable ASCII correctly at the moment. This diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py index c5d603885e4a..741c0841778a 100644 --- a/torch/utils/bundled_inputs.py +++ b/torch/utils/bundled_inputs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union +from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union, Sequence import textwrap import torch from torch._C import TupleType, OptionalType, ListType @@ -17,7 +17,7 @@ class InflatableArg(NamedTuple): def augment_model_with_bundled_inputs( model: torch.jit.ScriptModule, - inputs: Optional[List[Tuple[Any, ...]]] = None, + inputs: Optional[Sequence[Tuple[Any, ...]]] = None, _receive_inflate_expr: Optional[List[str]] = None, # For debugging. ) -> None: """Add bundled sample inputs to a model. From 6eee2a0a9f3545eb3d923408eedf2a2136bf4d14 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Wed, 6 Jan 2021 11:34:50 -0800 Subject: [PATCH 20/44] [JIT] disable masked fill (#50147) Summary: There is an internal user who is experiencing a bug with masked_fill. While I am almost certain this corresponds to an old pytorch version with the bug, the model that is breaking is important and time-sensitive and we are covering all bases to try to get it to work again. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50147 Reviewed By: nhsoukai Differential Revision: D25806541 Pulled By: eellison fbshipit-source-id: 131bd71b5db9717a8a9cb97973d0b4f0e96455d6 --- test/test_jit_fuser_te.py | 1 + torch/csrc/jit/passes/tensorexpr_fuser.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 2143b4e19020..4886abc58758 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1281,6 +1281,7 @@ def forward(self, x): self.assertEqual(ref, mod.forward(x)) self.assertLastGraphAllFused() + @unittest.skip("Temporarily disabled") def test_masked_fill(self): dtypes = [ torch.int8, diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index e8091957ba65..166238cebe17 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -127,7 +127,7 @@ bool isSupported(Node* node) { "aten::round(Tensor self) -> Tensor", "aten::trunc(Tensor self) -> Tensor", "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", - "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", + // "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", // "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", TODO: requires 0-dim Tensor "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor", From ba691e1a428eb133e9c7b8e2f73ae723b8182f65 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Wed, 6 Jan 2021 11:47:35 -0800 Subject: [PATCH 21/44] Remove incorrect links to zdevito/ATen (#50065) Summary: Similar to https://github.com/pytorch/pytorch/issues/49028, this PR removes a few 
more references to https://github.com/zdevito/ATen. - The links for Functions.h, Tensor.h, and Type.h are simply broken, probably because they refer to `master` rather than a specific commit (cf. https://github.com/pytorch/pytorch/issues/47066) - I'm unsure about the change to the `about` section of `aten/conda/meta.yaml`; can someone comment on whether I am understanding that field correctly? - The reference to https://github.com/zdevito/ATen/issues/163 remains [in `tools/autograd/derivatives.yaml`](https://github.com/pytorch/pytorch/blob/cd608fe59b70fa7cafb07110096b2e023a8b6e9c/tools/autograd/derivatives.yaml#L91), because the contents of that issue discussion don't seem to be mirrored anywhere else. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50065 Reviewed By: ezyang, walterddr Differential Revision: D25767353 Pulled By: samestep fbshipit-source-id: 265f46f058bc54ef6d1a77f112cdfa1f115b3247 --- aten/conda/meta.yaml | 2 +- caffe2/contrib/aten/README.md | 6 +++--- caffe2/contrib/aten/docs/pytorch_to_caffe2.md | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/aten/conda/meta.yaml b/aten/conda/meta.yaml index d8096fc73a0f..a502690a5447 100644 --- a/aten/conda/meta.yaml +++ b/aten/conda/meta.yaml @@ -24,7 +24,7 @@ requirements: - mkl # [not osx] about: - home: https://github.com/zdevito/ATen + home: https://github.com/pytorch/pytorch license: BSD summary: A TENsor library for C++14 diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md index 377a1f780271..593079ef1393 100644 --- a/caffe2/contrib/aten/README.md +++ b/caffe2/contrib/aten/README.md @@ -1,6 +1,6 @@ # An ATen operator for Caffe2 -[ATen](https://github.com/zdevito/aten) is a simple tensor library thats exposes the Tensor operations in Torch +ATen is a simple tensor library thats exposes the Tensor operations in Torch and PyTorch directly in C++14. This library provides a generated wrapper around the ATen API that makes these functions available in Caffe2 as an operator. It also makes it accessible using the ToffeeIR. @@ -8,8 +8,8 @@ ToffeeIR. ### Example Usage in Caffe2 -First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). +First identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. We will call the `pow` operator: diff --git a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md index 85c275bb5178..c3f615ee37b9 100644 --- a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md +++ b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md @@ -6,7 +6,7 @@ operators that haven't been standardized yet, or custom `torch.autograd.Function are specific to a network. To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library. -[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten) +[ATen](https://github.com/pytorch/pytorch/tree/master/aten) is the underlying C++ library that PyTorch uses to do tensor operations. 
Caffe2 has an [ATen operator](https://github.com/pytorch/pytorch/tree/master/caffe2/contrib/aten) that can run these tensor functions in a Caffe2 network after importing them through ONNX. This guide explains how to configure Caffe2 and modify your PyTorch program to use @@ -61,8 +61,8 @@ We can add a `symbolic` method to it like so: The function `graph.at` adds a new ATen op the computation graph. You can call any ATen function using this facility. To do so, -first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). +first identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. As an example, we might want to call the `pow` operator: @@ -86,9 +86,9 @@ To call methods of ATen's `Type` objects, you provide an additional string attri that determines the type. For instance, `ones` creates a new constant tensor of all ones: ``` class Type { - ... - virtual Tensor ones(IntArrayRef size) const; - ... + ... + virtual Tensor ones(IntArrayRef size) const; + ... }; ``` From 9b519b4a3f101fc799f1a9fcec79b21a31e3af2e Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Wed, 6 Jan 2021 12:15:11 -0800 Subject: [PATCH 22/44] [PyTorch Mobile] Generate Kernel dtype selection code in selected_mobile_ops.h during the build (#49279) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49279 Now that the YAML files for tracing based selective build optionally have the information regarding the selected kernel function dtypes, we can start generating constexpr selection code in the include file (`selected_mobile_ops.h`) to make the inclusion of code for specific dtypes selective based on compile time decisions. The way this is done is that if we detect that the code for a specific dtype should not be in the binary, we add an exception (throw) statement just before the method is called (see the first diff in this stack) and allow the compiler to optimize away the rest of the function's body. This has the advantage of allowing the compiler to know the lambda's return type (since it's inferred from the `return` statements in the body of the method, and if we compile out all the cases, then the compiler won't know the return type and it will result in a compilation error). The generated `selected_mobile_ops.h` is being used (included) in `Dispatch.h`. In case `XPLAT_MOBILE_BUILD` is not defined, then we should include code for all kernel dtypes (non-selective build). When merging, we need to handle the case of both older and newer (tracing based) operator lists. If we detect any operator that includes all overloads, it indicates that an old style operator list is part of the build, and we need to set `include_all_kernel_dtypes` for this build. ghstack-source-id: 119439497 Test Plan: For Segmentation v220, here is one of the intermediate generated YAML files (selected_operators.yaml): {P154480509} and here is the generated `selected_mobile_ops.h` file: {P159808798} Here is the `selected_mobile_ops.h` file for lite_predictor (which includes all ops and all dtypes): {P159806443} Continuous build for ~8 checked-in models validates that the selection code works as expected when we build based on dtype selection.
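To make the generated selection concrete, the following is a rough sketch of the kind of constexpr code `selected_mobile_ops.h` could contain; the kernel tag names (`add_stub`, `mul_stub`), the chosen dtypes, and the `same_tag()` helper are made up for illustration and are not the actual generator output. The non-mobile fallback in `Dispatch.h` simply returns `true` for every tag/dtype pair.

```
// Hypothetical sketch of a generated selected_mobile_ops.h for a build that
// keeps only Float for the "add_stub" kernel tag and Float/Int for "mul_stub".
// (at::ScalarType is an alias of c10::ScalarType, so the signature matches the
// fallback declared in Dispatch.h.)
#include <c10/core/ScalarType.h>

namespace at {

// Minimal constexpr string equality; the real generator may emit a different helper.
constexpr bool same_tag(const char* a, const char* b) {
  return (*a == *b) && (*a == '\0' || same_tag(a + 1, b + 1));
}

inline constexpr bool should_include_kernel_dtype(
    const char* kernel_tag_str,
    c10::ScalarType scalar_type) {
  return (same_tag(kernel_tag_str, "add_stub") &&
          scalar_type == c10::ScalarType::Float) ||
         (same_tag(kernel_tag_str, "mul_stub") &&
          (scalar_type == c10::ScalarType::Float ||
           scalar_type == c10::ScalarType::Int));
}

} // namespace at
```

Keeping the check `constexpr` is what lets the guard fold away at compile time, so the dead dtype branches inside the dispatch macros can be eliminated.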
Reviewed By: iseeyuan Differential Revision: D25388949 fbshipit-source-id: 1c182a4831a7f94f7b152f02dbd3bc01c0d22443 --- aten/src/ATen/Dispatch.h | 4 ++ tools/codegen/selective_build/selector.py | 71 ++++++++++++++++++++++- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 2e663b4f48dd..341e20cab1f3 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -10,6 +10,9 @@ #include #include +#ifdef XPLAT_MOBILE_BUILD +#include +#else namespace at { /** * The method should_include_kernel_dtype() returns true/false @@ -25,6 +28,7 @@ inline constexpr bool should_include_kernel_dtype( return true; } } +#endif /** * In the Facebook internal build (using BUCK), this macro is enabled by diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py index 3e80e168d31c..eeb15049075e 100644 --- a/tools/codegen/selective_build/selector.py +++ b/tools/codegen/selective_build/selector.py @@ -1,4 +1,4 @@ -from typing import Dict, Set, Optional, Tuple +from typing import Dict, Set, Optional, Tuple, List import yaml from dataclasses import dataclass @@ -26,6 +26,20 @@ class SelectiveBuilder: # A dictionary of operator -> operator metadata. operators: Dict[str, SelectiveBuildOperator] + # A dictionary of selected kernel tags and dtypes. Typically a + # PyTorch Operator Kernel (function) may have many code paths + # that are specialized for many many Tensor dtypes, so it's not + # one per kernel function, but there could be many per kernel + # function. The tag isn't a kernel function name, but some fragment + # of the kernel function implementation itself. + kernel_metadata: Dict[str, List[str]] + + # If true, then fragments for all dtypes for all kernel functions + # are included. This is typically set when any one of the + # operator lists is generated from a mechanism other than + # tracing based selective build. 
+ include_all_kernel_dtypes: bool + @staticmethod def get_nop_selector() -> 'SelectiveBuilder': return SelectiveBuilder.from_yaml_dict({'include_all_operators': True}) @@ -33,9 +47,11 @@ def get_nop_selector() -> 'SelectiveBuilder': @staticmethod def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': valid_top_level_keys = { + 'include_all_kernel_dtypes', 'include_all_operators', 'debug_info', 'operators', + 'kernel_metadata', } top_level_keys = set(data.keys()) if len(top_level_keys - valid_top_level_keys) > 0: @@ -58,7 +74,24 @@ def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': for (k, v) in operators_dict.items(): operators[k] = SelectiveBuildOperator.from_yaml_dict(k, v) - return SelectiveBuilder(include_all_operators, debug_info, operators) + + kernel_metadata = {} + kernel_metadata_dict = data.get('kernel_metadata', {}) + assert isinstance(kernel_metadata_dict, dict) + + for (k, v) in kernel_metadata_dict.items(): + kernel_metadata[str(k)] = list(map(lambda dtype: str(dtype), v)) + + include_all_kernel_dtypes = data.get('include_all_kernel_dtypes', False) + assert isinstance(include_all_kernel_dtypes, bool) + + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + ) @staticmethod def from_yaml_str(config_contents: str) -> 'SelectiveBuilder': @@ -86,6 +119,7 @@ def from_legacy_op_registration_allow_list( } return SelectiveBuilder.from_yaml_dict({ 'operators': operators, + 'include_all_kernel_dtypes': True, }) def is_operator_selected(self, name: str) -> bool: @@ -147,8 +181,15 @@ def is_root_operator(self, name: str) -> bool: base_op: SelectiveBuildOperator = self.operators[name] return base_op.include_all_overloads and base_op.is_root_operator + def is_kernel_dtype_selected(self, kernel_tag: str, dtype: str) -> bool: + if self.include_all_operators or self.include_all_kernel_dtypes: + return True + + return kernel_tag in self.kernel_metadata and dtype in self.kernel_metadata[kernel_tag] + def to_dict(self) -> Dict[str, object]: ret: Dict[str, object] = { + 'include_all_kernel_dtypes': self.include_all_kernel_dtypes, 'include_all_operators': self.include_all_operators, } operators = {} @@ -159,14 +200,38 @@ def to_dict(self) -> Dict[str, object]: if self._debug_info is not None: ret['debug_info'] = self._debug_info + ret['kernel_metadata'] = {k: list(v) for (k, v) in self.kernel_metadata.items()} + return ret +def merge_kernel_metadata( + lhs: Dict[str, List[str]], + rhs: Dict[str, List[str]], +) -> Dict[str, List[str]]: + kernel_metadata: Dict[str, List[str]] = {} + for (tag_name, dtypes) in list(lhs.items()) + list(rhs.items()): + dtypes_copy = set(dtypes) + if tag_name in kernel_metadata: + dtypes_copy |= set(kernel_metadata[tag_name]) + + kernel_metadata[tag_name] = list(dtypes_copy) + + return kernel_metadata + def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> SelectiveBuilder: include_all_operators = lhs.include_all_operators or rhs.include_all_operators debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) operators = merge_operator_dicts(lhs.operators, rhs.operators) - return SelectiveBuilder(include_all_operators, debug_info, operators) + kernel_metadata = merge_kernel_metadata(lhs.kernel_metadata, rhs.kernel_metadata) + include_all_kernel_dtypes = lhs.include_all_kernel_dtypes or rhs.include_all_kernel_dtypes + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + 
) def op_name_from_native_function(f: NativeFunction) -> str: From 09eb468398763fe2914fa4e28eb04dcbf1b3e615 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Wed, 6 Jan 2021 12:35:09 -0800 Subject: [PATCH 23/44] [vulkan] 2D prepacking for conv2d (#48816) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48816 Test Plan: Imported from OSS Reviewed By: AshkanAliabadi Differential Revision: D25786280 Pulled By: SS-JIA fbshipit-source-id: b41bf55dcff8f3dfbbf1994171e2ef62f16ff29a --- aten/src/ATen/native/vulkan/glsl/conv2d.glsl | 24 +- .../ATen/native/vulkan/glsl/conv2d_dw.glsl | 5 +- .../ATen/native/vulkan/glsl/conv2d_pw.glsl | 13 +- aten/src/ATen/native/vulkan/ops/Common.h | 2 +- .../ATen/native/vulkan/ops/Convolution.cpp | 355 ++++++++++-------- 5 files changed, 221 insertions(+), 178 deletions(-) diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index bb2508aefe65..547eec7fafef 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -17,7 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; - int stacks_per_tower; + ivec3 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -28,9 +28,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.z + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -42,16 +39,15 @@ void main() { vec4 sum = uBias.data[pos.z]; for (int z = 0; z < uBlock.kernel.z; z+=4) { - const ivec4 kz = block + z; - - for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { - const vec4 In = texelFetch(uInput, ivec3(x, y, z/4), 0); - - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.w), 0), sum); + const int z4 = z/4; + for (int y = start.y, ky = kstart.y + pos.z * uBlock.src_kernel.y; y < end.y; y += uBlock.dilate.y, ++ky) { + for (int x = start.x, kx = 4*kstart.x + z4*uBlock.src_kernel.z; x < end.x; x += uBlock.dilate.x, kx+=4) { + const vec4 In = texelFetch(uInput, ivec3(x, y, z4), 0); + + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0 + kx, ky, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(1 + kx, ky, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(2 + kx, ky, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(3 + kx, ky, 0), 0), sum); } } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index 0f49515718b2..f8f929461ce7 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -17,6 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; + ivec2 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, 
local_size_z_id = 2) in; @@ -38,10 +39,10 @@ void main() { vec4 sum = uBias.data[pos.z]; for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { + for (int x = start.x, kx = kstart.x + ky*uBlock.src_kernel.x; x < end.x; x += uBlock.dilate.x, ++kx) { sum = fma( texelFetch(uInput, ivec3(x, y, pos.z), 0), - texelFetch(uKernel, ivec3(kx, ky, pos.z), 0), + texelFetch(uKernel, ivec3(kx, pos.z, 0), 0), sum); } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 1355b2c09b05..b28f0550132f 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -16,7 +16,6 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 stride; ivec2 padding; vec2 clamp; - int stacks_per_tower; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -27,9 +26,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.x + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -38,12 +34,11 @@ void main() { for (int z = 0; z < uBlock.kernel.x; z+=4) { const vec4 In = texelFetch(uInput, ivec3(ipos.x, ipos.y, z/4), 0); - const ivec4 kz = block + z; - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0, tower, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(0, tower, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(0, tower, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(0, tower, kz.w), 0), sum); + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(z+0, pos.z, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(z+1, pos.z, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(z+2, pos.z, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(z+3, pos.z, 0), 0), sum); } imageStore( diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h index 6f7080f71a80..b0bbeeaf34f1 100644 --- a/aten/src/ATen/native/vulkan/ops/Common.h +++ b/aten/src/ATen/native/vulkan/ops/Common.h @@ -36,7 +36,7 @@ struct Layout final { }; struct Experimentation { - static constexpr bool kUseConv2dOldApi = true; + static constexpr bool kUseConv2dOldApi = false; }; struct ConvPrepackLimits final { diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 5af2c14b80cb..c757f6cdac7a 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -25,7 +25,7 @@ inline bool is_pointwise(const IntArrayRef filter) { (1 == filter[Layout::Filter::width]); } -vTensor pack_weights( +vTensor pack_weights_dw( api::Resource::Pool& pool, const Tensor& weight_arg, const int64_t groups) { @@ -39,161 +39,201 @@ vTensor pack_weights( const IntArrayRef src_filter = weight.sizes(); const float* const src_weight_ptr = weight.data_ptr(); - // - // Depthwise - // + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; + const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); + vTensor v_weight{ + api::context(), + 
&pool, + { + 4, + num_stacks, + src_kw_sz * src_kh_sz, + }, + weight.options(), + }; - if (is_depthwise(src_filter, groups)) { - vTensor v_weight{ - api::context(), - &pool, - src_filter, - weight.options(), - }; + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + /* Source */ + const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; + const int64_t src_block_sz = + src_kernel_sz * src_filter[Layout::Filter::input]; - memcpy( - v_weight_payload.get(), - src_weight_ptr, - std::min(weight.nbytes(), v_weight.nbytes())); + /* Destination */ + const int64_t dst_kw_sz = src_kw_sz * src_kh_sz; + const int64_t dst_kh_sz = num_stacks; + const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - return v_weight; - } + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); - // - // General - // + for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { + /* Source */ + const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; - if (Experimentation::kUseConv2dOldApi) { - const uint32_t OC = src_filter[Layout::Filter::output]; - const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); - const uint32_t C = src_filter[Layout::Filter::input]; - const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); - const uint32_t KH = src_filter[Layout::Filter::height]; - const uint32_t KW = src_filter[Layout::Filter::width]; - - vTensor v_weight{ - api::context(), - &pool, - { - 1, - 4 * KH * KW, - OC_4, - 4 * C_4 - }, - weight.options(), - }; + /* Destination */ + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; - float* const dst_weight_ptr = v_weight_payload.get(); - memset(dst_weight_ptr, 0, v_weight.nbytes()); + for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) { + memcpy( + dst_weight_c_ptr + dst_oh * dst_kw_sz + src_ih * src_kw_sz, + src_weight_oc_ptr + src_ih * src_kw_sz, + sizeof(float) * src_kw_sz); + } + } - const float* src = src_weight_ptr; - float* const dst = dst_weight_ptr; + return v_weight; +} +vTensor pack_weights_old( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); + } + + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const uint32_t OC = src_filter[Layout::Filter::output]; + const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); + const uint32_t C = src_filter[Layout::Filter::input]; + const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); + const uint32_t KH = src_filter[Layout::Filter::height]; + const uint32_t KW = src_filter[Layout::Filter::width]; + + vTensor v_weight{ + api::context(), + &pool, { - uint32_t ridx = 0; - const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; - for (uint32_t oc = 0; oc < OC; ++oc) { - int oc_4 = oc / 4; - int oc_4_i = oc % 4; - float* dst_oc = dst + oc_4 * oc_4SizeNumel; - for (uint32_t ic = 0; ic < C; ++ic) { - int ic_4 = ic / 4; - int 
ic_4_i = ic % 4; - float* dst_ic = dst_oc + ic_4 * KW * KH * 16; - for (uint32_t ky = 0; ky < KH; ++ky) { - float* dst_ky = dst_ic + ky * KW * 16; - for (uint32_t kx = 0; kx < KW; ++kx) { - float* dst_kx = dst_ky + kx * 16; - dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; - } + 1, + 4 * KH * KW, + OC_4, + 4 * C_4 + }, + weight.options(), + }; + + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); + + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); + + const float* src = src_weight_ptr; + float* const dst = dst_weight_ptr; + + { + uint32_t ridx = 0; + const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; + for (uint32_t oc = 0; oc < OC; ++oc) { + int oc_4 = oc / 4; + int oc_4_i = oc % 4; + float* dst_oc = dst + oc_4 * oc_4SizeNumel; + for (uint32_t ic = 0; ic < C; ++ic) { + int ic_4 = ic / 4; + int ic_4_i = ic % 4; + float* dst_ic = dst_oc + ic_4 * KW * KH * 16; + for (uint32_t ky = 0; ky < KH; ++ky) { + float* dst_ky = dst_ic + ky * KW * 16; + for (uint32_t kx = 0; kx < KW; ++kx) { + float* dst_kx = dst_ky + kx * 16; + dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; } } } + } - // shader KO4C4HW_to_image - struct Image3D { - float* data_; - uint32_t dim0_, dim1_, dim2_; - - Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { - dim0_ = dim0; - dim1_ = dim1; - dim2_ = dim2; - data_ = new float[dim0 * dim1 * dim2 * 4]; - memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); - } + // shader KO4C4HW_to_image + struct Image3D { + float* data_; + uint32_t dim0_, dim1_, dim2_; + + Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { + dim0_ = dim0; + dim1_ = dim1; + dim2_ = dim2; + data_ = new float[dim0 * dim1 * dim2 * 4]; + memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); + } - inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; - } + inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; + } - void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { - data_[idx(i0, i1, i2, i3)] = value; - } + void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { + data_[idx(i0, i1, i2, i3)] = value; + } - float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return data_[idx(i0, i1, i2, i3)]; - } - } image{4 * C_4, OC_4, KH * KW}; - - for (uint32_t sx = 0; sx < C_4; ++sx) { - for (uint32_t sy = 0; sy < OC_4; ++sy) { - for (uint32_t sz = 0; sz < (KH * KW); ++sz) { - for (uint32_t vi = 0; vi < 4; ++vi) { - int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; - image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); - image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); - image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); - image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); - } + float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return data_[idx(i0, i1, i2, i3)]; + } + } image{4 * C_4, OC_4, KH * KW}; + + for (uint32_t sx = 0; sx < C_4; ++sx) { + for (uint32_t sy = 0; sy < OC_4; ++sy) { + for (uint32_t sz = 0; sz < (KH * KW); ++sz) { + for (uint32_t vi = 0; vi < 4; ++vi) { + int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; + image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); + image.set(4 * sx + 1, sy, sz, vi, dst[4 * 
(bufferVIdx + 1) + vi]); + image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); + image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); } } } + } - // inverse function of nchw_to_image - const uint32_t W = 4 * C_4; - const uint32_t H = OC_4; - const uint32_t D = KH * KW; - for (uint32_t sx = 0; sx < W; ++sx) { - for (uint32_t sy = 0; sy < H; ++sy) { - for (uint32_t sz = 0; sz < D; ++sz) { - for (uint32_t szvi = 0; szvi < 4; ++szvi) { - dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); - } + // inverse function of nchw_to_image + const uint32_t W = 4 * C_4; + const uint32_t H = OC_4; + const uint32_t D = KH * KW; + for (uint32_t sx = 0; sx < W; ++sx) { + for (uint32_t sy = 0; sy < H; ++sy) { + for (uint32_t sz = 0; sz < D; ++sz) { + for (uint32_t szvi = 0; szvi < 4; ++szvi) { + dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); } } } } + } - return v_weight; + return v_weight; +} + +vTensor pack_weights_2d( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); } + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); - const int64_t stack_depth = - 4 * api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); - const int64_t max_stacks_per_tower = - ConvPrepackLimits::maxStackDepth / stack_depth; - const int64_t num_towers = div_up(num_stacks, max_stacks_per_tower); - int64_t stacks_per_tower = num_stacks; - if (num_towers > 1) { - stacks_per_tower = div_up(num_stacks, num_towers); - } + const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); vTensor v_weight{ api::context(), &pool, { - stacks_per_tower, - stack_depth, - src_filter[Layout::Filter::height] * num_towers, - src_filter[Layout::Filter::width], + 4, + src_kh_sz * num_stacks, + src_kw_sz * stack_depth, }, weight.options(), }; @@ -203,53 +243,59 @@ vTensor pack_weights( Future::Payload v_weight_payload = v_weight_future.wait(); /* Source */ - const int64_t src_kw_sz = src_filter[Layout::Filter::width]; - const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input]; /* Destination */ - const IntArrayRef dst_filter = v_weight.sizes(); - const int64_t dst_kw_sz = src_filter[Layout::Filter::width]; - const int64_t dst_kh_sz = src_filter[Layout::Filter::height] * num_towers; + const int64_t dst_kw_sz = src_kw_sz * stack_depth; + const int64_t dst_kh_sz = src_kh_sz * num_stacks; const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - const int64_t dst_block_sz = - dst_kernel_sz * dst_filter[Layout::Filter::input]; - - TORCH_INTERNAL_ASSERT(src_kernel_sz*num_towers == dst_kernel_sz, "Internal error!"); float* const dst_weight_ptr = v_weight_payload.get(); memset(dst_weight_ptr, 0, v_weight.nbytes()); for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { - const int64_t i_tower = src_oc / (stacks_per_tower * 4); /* Source */ - const float* const src_weight_oc_ptr = - src_weight_ptr + src_oc * src_block_sz; + const float* const 
src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; /* Destination */ - const int64_t local_oc = src_oc % (stacks_per_tower * 4); - const int64_t dst_oc = local_oc / 4; - const int64_t dst_oc_offset = local_oc % 4; + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - float* const dst_weight_oc_ptr = dst_weight_ptr + dst_oc * dst_block_sz + - dst_oc_offset * dst_kernel_sz; + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) { - const int64_t dst_ic = 4 * src_ic; - - memcpy( - dst_weight_oc_ptr + dst_ic * dst_kernel_sz + - (i_tower * src_kernel_sz), - src_weight_oc_ptr + src_ic * src_kernel_sz, - sizeof(float) * src_kernel_sz); + const int64_t dst_ic4 = src_ic/4; + for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) { + for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) { + memcpy( + dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz + + dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4, + src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw, + sizeof(float)); + } + } } } return v_weight; } +vTensor pack_weights( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (is_depthwise(weight_arg.sizes(), groups)) { + return pack_weights_dw(pool, weight_arg, groups); + } + + if (Experimentation::kUseConv2dOldApi) { + return pack_weights_old(pool, weight_arg, groups); + } + return pack_weights_2d(pool, weight_arg, groups); +} + vTensor pack_biases( api::Resource::Pool& pool, const c10::optional& bias, @@ -394,6 +440,7 @@ void conv2d_depthwise( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -406,6 +453,7 @@ void conv2d_depthwise( int32_t padding_x, padding_y; int32_t dilate_x, dilate_y; float clamp_x, clamp_y; + int32_t src_filter_w, src_filter_h; } block { safe_downcast(filter[Layout::Filter::width]), safe_downcast(filter[Layout::Filter::height]), @@ -417,6 +465,8 @@ void conv2d_depthwise( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), }; context->dispatch( @@ -473,14 +523,12 @@ void conv2d_pointwise( const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; } block { safe_downcast(filter[Layout::Filter::input]), safe_downcast(filter[Layout::Filter::output]), @@ -490,7 +538,6 @@ void conv2d_pointwise( safe_downcast(padding[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), }; context->dispatch( @@ -542,20 +589,20 @@ void conv2d( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_x, kernel_y, kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; int32_t 
dilate_x, dilate_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; + int32_t src_filter_w, src_filter_h, src_filter_w4; } block { safe_downcast(filter[Layout::Filter::width]), safe_downcast(filter[Layout::Filter::height]), @@ -569,7 +616,9 @@ void conv2d( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), + safe_downcast(src_filter[Layout::Filter::width]*4), }; context->dispatch( @@ -859,6 +908,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, @@ -904,6 +954,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, From 473e78c0faac1d14f6a02799dfc4940bcfe6e07d Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Wed, 6 Jan 2021 12:36:12 -0800 Subject: [PATCH 24/44] Remove redundant code for unsupported Python versions (#49486) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49486 Remove code for Python 3.5 and lower. There's more that can be removed/modernised, but sticking mainly to redundant version checks here, to keep the diff/PR smaller. Pull Request resolved: https://github.com/pytorch/pytorch/pull/46579 Reviewed By: zou3519 Differential Revision: D24453571 Pulled By: ezyang fbshipit-source-id: c2cfcf05d6c5f65df64d89c331692c9aec09248e --- .jenkins/pytorch/macos-test.sh | 5 -- .../win-test-helpers/setup_pytorch_env.bat | 2 - caffe2/python/compatibility.py | 8 --- caffe2/python/onnx/backend.py | 3 +- caffe2/python/onnx/frontend.py | 5 +- caffe2/python/pybind_state.h | 12 ---- caffe2/python/utils.py | 4 +- .../caffe2/jenkins/common/install_python.sh | 5 -- test/test_dataloader.py | 67 +++++++------------ test/test_jit_profiling.py | 7 +- test/test_jit_simple.py | 7 +- tools/shared/module_loader.py | 2 - torch/_six.py | 1 - torch/csrc/utils/python_compat.h | 15 ----- torch/csrc/utils/six.h | 8 --- torch/cuda/__init__.py | 10 +-- torch/multiprocessing/__init__.py | 2 +- torch/multiprocessing/spawn.py | 17 ----- torch/serialization.py | 2 +- torch/testing/_internal/common_nn.py | 2 +- torch/utils/data/dataloader.py | 4 -- 21 files changed, 39 insertions(+), 149 deletions(-) delete mode 100644 caffe2/python/compatibility.py diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 0c34ddcc6179..24ec02c76df5 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -9,11 +9,6 @@ pip install -q hypothesis "librosa>=0.6.2" "numba<=0.49.1" psutil # TODO move this to docker pip install unittest-xml-reporting pytest -# faulthandler become built-in since 3.3 -if [[ ! 
$(python -c "import sys; print(int(sys.version_info >= (3, 3)))") == "1" ]]; then - pip install -q faulthandler -fi - if [ -z "${IN_CI}" ]; then rm -rf ${WORKSPACE_DIR}/miniconda3/lib/python3.6/site-packages/torch* fi diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index a052a1b67d59..ed6482890993 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -41,8 +41,6 @@ popd :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest coverage if %errorlevel% neq 0 ( exit /b %errorlevel% ) -:: No need to install faulthandler since we only test Python >= 3.6 on Windows -:: faulthandler is builtin since Python 3.3 set DISTUTILS_USE_SDK=1 diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py deleted file mode 100644 index 9d615a308333..000000000000 --- a/caffe2/python/compatibility.py +++ /dev/null @@ -1,8 +0,0 @@ -from six import PY2, PY3 - -if PY2: - import collections - container_abcs = collections -elif PY3: - import collections.abc - container_abcs = collections.abc diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 2c80fadafaee..193a6f217f93 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -17,7 +17,6 @@ # system protobuf. import onnx.backend from caffe2.python import core, workspace, rnn_cell, gru_cell -from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils @@ -771,7 +770,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, container_abcs.Iterable): + if not isinstance(ops, collections.abc.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index bb2778d1a991..b5121602aff5 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -10,13 +10,12 @@ - +import collections import itertools import logging import re from caffe2.python import core as caffe2_core -from caffe2.python.compatibility import container_abcs from onnx import (checker, helper, numpy_helper, mapping, GraphProto, NodeProto, TensorProto, OperatorSetIdProto) from onnx.helper import make_tensor_value_info, make_model @@ -153,7 +152,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, container_abcs.Iterable): + if not isinstance(nodes, collections.abc.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index b3926e941194..6513f216a9be 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -232,7 +232,6 @@ class TensorFeeder : public BlobFeederBase { for (int i = 0; i < tensor.numel(); ++i) { char* str; Py_ssize_t strSize; -#if PY_MAJOR_VERSION > 2 if (PyBytes_Check(input[i])) { CAFFE_ENFORCE( PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, @@ -246,11 +245,6 @@ class TensorFeeder : public BlobFeederBase { } else { CAFFE_THROW("Unsupported 
python object type passed into ndarray."); } -#else - CAFFE_ENFORCE( - PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, - "Unsupported python object type passed into ndarray."); -#endif // PY_MAJOR_VERSION > 2 outPtr[i] = std::string(str, strSize); } break; @@ -342,18 +336,12 @@ class PythonOpBase : public Operator { try { builder_call = loads(py::bytes(pickled)).cast(); } catch (const py::error_already_set& e) { -#if PY_MAJOR_VERSION >= 3 LOG(INFO) << "Cannot unpickle python operator: " << e.what(); LOG(INFO) << "Try latin1 encoding for python3 run"; // to use the `_a` literal for arguments using namespace pybind11::literals; builder_call = loads(py::bytes(pickled), "encoding"_a = "latin1") .template cast(); -#else - // for py2, simply re-throw the exception, as there is no encoding - // argument for pickle.loads - throw; -#endif } CAFFE_ENFORCE(builder_call); CAFFE_ENFORCE_EQ(py::len(builder_call), 3); diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 947dd9bf296d..289d107303fa 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,12 +6,12 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys +import collections import copy import functools import numpy as np @@ -126,7 +126,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name = key - iterable = isinstance(value, container_abcs.Iterable) + iterable = isinstance(value, collections.abc.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. The entire array is guaranteed to have the same diff --git a/docker/caffe2/jenkins/common/install_python.sh b/docker/caffe2/jenkins/common/install_python.sh index 48a47b271107..19633d451ab3 100755 --- a/docker/caffe2/jenkins/common/install_python.sh +++ b/docker/caffe2/jenkins/common/install_python.sh @@ -135,11 +135,6 @@ if [ -z "${INSTALL_SETUPTOOLS}" ]; then pip install -U pip setuptools!=38.5.2 fi -# tornado 5.0 requires Python 2.7.9+ or 3.4+ -if [[ $($PYTHON -c 'import sys; print(int(sys.version_info <= (2, 7, 9) or sys.version_info <= (3, 4)))' == 1) ]]; then - pip install 'tornado<5' -fi - # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by # defaults installs the most recent networkx version, so we install this lower # version explicitly before scikit-image pulls it in as a dependency diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 047297c438b7..c257dd8a2fd7 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3,6 +3,7 @@ import errno import os import ctypes +import faulthandler import torch import gc import time @@ -34,18 +35,6 @@ else: warnings.warn(err_msg) -try: - import faulthandler - HAS_FAULTHANDLER = True -except ImportError: - HAS_FAULTHANDLER = False - err_msg = ("faulthandler not found. Some data loader tests use it for error " - "reporting (e.g., TestDataLoader.test_proper_exit).") - if IS_PYTORCH_CI: - raise ImportError(err_msg) from None - else: - warnings.warn(err_msg) - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings @@ -86,9 +75,7 @@ JOIN_TIMEOUT = 60.0 # seconds -supported_multiprocessing_contexts = [None] -if torch.multiprocessing._supports_context: - supported_multiprocessing_contexts += list(torch.multiprocessing.get_all_start_methods()) +supported_multiprocessing_contexts = [None] + list(torch.multiprocessing.get_all_start_methods()) @unittest.skipIf( @@ -312,29 +299,25 @@ def test_iterable_dataset_err(self): # takes in dummy var so this can also be used as a `worker_init_fn` def set_faulthander_if_available(_=None): - if HAS_FAULTHANDLER: - faulthandler.enable(sys.__stderr__) - if not IS_WINDOWS: - # windows does not have faulthandler.register - # chain=False prevents the default behavior of killing the process - faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) + faulthandler.enable(sys.__stderr__) + if not IS_WINDOWS: + # windows does not have faulthandler.register + # chain=False prevents the default behavior of killing the process + faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) set_faulthander_if_available() # Process `pid` must have called `set_faulthander_if_available` def print_traces_of_all_threads(pid): - if HAS_FAULTHANDLER: - if not IS_WINDOWS: - # use the custom signal if available - os.kill(pid, signal.SIGUSR1) - else: - # otherwise we can still use the handler given by faulthandler.enable() - # at the cost of killing the process. - os.kill(pid, signal.SIGSEGV) + if not IS_WINDOWS: + # use the custom signal if available + os.kill(pid, signal.SIGUSR1) else: - # if there is no faulthandler, use SIGINT otherwise and hope for the best - os.kill(pid, signal.SIGINT) + # otherwise we can still use the handler given by faulthandler.enable() + # at the cost of killing the process. 
+ os.kill(pid, signal.SIGSEGV) + # wait in parent process to give subprocess some time to print time.sleep(5) @@ -1037,17 +1020,13 @@ def test_invalid_ctor_args_combinations(self): "batch_size=None option disables auto-batching and is mutually exclusive"): self._get_data_loader(self.dataset, batch_size=None, drop_last=True) - if torch.multiprocessing._supports_context: - valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] - with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): - self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) - with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') - with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) - else: - with self.assertRaisesRegex(ValueError, "multiprocessing_context relies on Python >= 3.4"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='fork') + valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] + with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): + self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) + with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') + with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) # map-style sampler = torch.utils.data.SequentialSampler(self.dataset) @@ -1504,7 +1483,7 @@ def _test_sampler(self, **kwargs): def test_sampler(self): self._test_sampler() self._test_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') def _test_batch_sampler(self, **kwargs): @@ -1529,7 +1508,7 @@ def _test_batch_sampler(self, **kwargs): def test_batch_sampler(self): self._test_batch_sampler() self._test_batch_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index dc6bb2fbf878..1cf67f87ded9 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 23da6602c572..23c7f3b4b6f6 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git 
a/tools/shared/module_loader.py b/tools/shared/module_loader.py index c24a19678c39..51c57aa161c9 100644 --- a/tools/shared/module_loader.py +++ b/tools/shared/module_loader.py @@ -1,5 +1,3 @@ - - def import_module(name, path): import importlib.util spec = importlib.util.spec_from_file_location(name, path) diff --git a/torch/_six.py b/torch/_six.py index c53feed94cce..00f9fa6b7f95 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -33,7 +33,6 @@ FileNotFoundError = builtins.FileNotFoundError StringIO = io.StringIO container_abcs = collections.abc -PY3 = sys.version_info[0] == 3 PY37 = sys.version_info[0] == 3 and sys.version_info[1] >= 7 def with_metaclass(meta: type, *bases) -> type: diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h index 28d990c64c42..7e1cb0c4f92d 100644 --- a/torch/csrc/utils/python_compat.h +++ b/torch/csrc/utils/python_compat.h @@ -63,20 +63,5 @@ __PySlice_Unpack(PyObject *_r, (PySlice_Unpack(SLICE, START, STOP, STEP) == 0) #endif -// https://bugsfiles.kde.org/attachment.cgi?id=61186 -#if PY_VERSION_HEX >= 0x03020000 #define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ (PySlice_GetIndicesEx(SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#else -#define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ - (PySlice_GetIndicesEx((PySliceObject*)SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#endif - -// This function was introduced in Python 3.4 -#if PY_VERSION_HEX < 0x03040000 -inline int -PyGILState_Check() { - PyThreadState * tstate = _PyThreadState_Current; - return tstate && (tstate == PyGILState_GetThisThreadState()); -} -#endif diff --git a/torch/csrc/utils/six.h b/torch/csrc/utils/six.h index 932f0bf61a29..b83e60c77cf3 100644 --- a/torch/csrc/utils/six.h +++ b/torch/csrc/utils/six.h @@ -23,11 +23,7 @@ inline bool isTuple(pybind11::handle input) { if (PyTuple_Check(input.ptr())) { return true; } -#if PY_MAJOR_VERSION == 2 - return isStructSeq(input); -#else return false; -#endif } inline bool isTuple(PyObject* obj) { @@ -40,12 +36,8 @@ inline bool isTuple(PyObject* obj) { // But on Python 2, structseq is not a subtype of tuple, so we need to manually create a // new tuple object from structseq. inline THPObjectPtr maybeAsTuple(PyStructSequence *obj) { -#if PY_MAJOR_VERSION == 2 - return THPObjectPtr(torch::utils::structseq_slice(obj, 0, Py_SIZE(obj))); -#else Py_INCREF(obj); return THPObjectPtr((PyObject *)obj); -#endif } inline THPObjectPtr maybeAsTuple(PyObject *obj) { diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 5535cef78395..e59c798a59be 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -153,15 +153,9 @@ def _lazy_init(): # immediately, while we are still guaranteed to have the GIL, because some # of the C calls we make below will release the GIL if _is_in_bad_fork(): - from sys import version_info - if version_info < (3, 4): - msg = ("To use CUDA with multiprocessing, you must use Python " - "3.4+ and the 'spawn' start method") - else: - msg = ("To use CUDA with multiprocessing, you must use the " - "'spawn' start method") raise RuntimeError( - "Cannot re-initialize CUDA in forked subprocess. " + msg) + "Cannot re-initialize CUDA in forked subprocess. 
To use CUDA with " + "multiprocessing, you must use the 'spawn' start method") if not hasattr(torch._C, '_cuda_getDeviceCount'): raise AssertionError("Torch not compiled with CUDA enabled") if _cudart is None: diff --git a/torch/multiprocessing/__init__.py b/torch/multiprocessing/__init__.py index 561eddfb02a2..039ddf2a1b09 100644 --- a/torch/multiprocessing/__init__.py +++ b/torch/multiprocessing/__init__.py @@ -35,7 +35,7 @@ """Add helper function to spawn N processes and wait for completion of any of them. This depends `mp.get_context` which was added in Python 3.4.""" -from .spawn import spawn, SpawnContext, _supports_context, start_processes, ProcessContext, \ +from .spawn import spawn, SpawnContext, start_processes, ProcessContext, \ ProcessRaisedException, ProcessExitedException diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py index b2008912dbb5..9ad17c94ccf8 100644 --- a/torch/multiprocessing/spawn.py +++ b/torch/multiprocessing/spawn.py @@ -66,24 +66,8 @@ def _wrap(fn, i, args, error_queue): sys.exit(1) -# Multiprocessing contexts are introduced at Python 3.4 -_supports_context = sys.version_info >= (3, 4) - - -def _python_version_check(): - if not _supports_context: - raise RuntimeError("Requires python 3.4 or higher to use " - "torch.multiprocessing.spawn and " - "torch.multiprocessing.ProcessContext helper " - "to launch multiple processes. If you are using " - "this for distributed training and have a lower " - "version of python, please use " - "torch.distributed.launch instead.") - - class ProcessContext: def __init__(self, processes, error_queues): - _python_version_check() self.error_queues = error_queues self.processes = processes self.sentinels = { @@ -182,7 +166,6 @@ def __init__(self, processes, error_queues): # Currently we only add this API first, we can consider adding it to documentation as # needed in the future. 
def start_processes(fn, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): - _python_version_check() mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] diff --git a/torch/serialization.py b/torch/serialization.py index ebc5d0a08541..3b6f5828d858 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -192,7 +192,7 @@ def storage_to_tensor_type(storage): def _is_path(name_or_buffer): return isinstance(name_or_buffer, str) or \ - (sys.version_info[0] == 3 and isinstance(name_or_buffer, pathlib.Path)) + isinstance(name_or_buffer, pathlib.Path) class _opener(object): diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 714361497d94..022255a5298b 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -4866,7 +4866,7 @@ def __call__(self, test_case): if self.should_test_pickle: # TODO: do this with in-memory files as soon as torch.save will support it - with TemporaryFile() as f: + with tempfile.TemporaryFile() as f: test_case._forward(module, input) torch.save(module, f) f.seek(0) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index f75e4cca195e..d4ef1a99a2df 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -308,10 +308,6 @@ def multiprocessing_context(self): def multiprocessing_context(self, multiprocessing_context): if multiprocessing_context is not None: if self.num_workers > 0: - if not multiprocessing._supports_context: - raise ValueError('multiprocessing_context relies on Python >= 3.4, with ' - 'support for different start methods') - if isinstance(multiprocessing_context, string_classes): valid_start_methods = multiprocessing.get_all_start_methods() if multiprocessing_context not in valid_start_methods: From fcb69d2ebaede960e7708706436d372b68807921 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Wed, 6 Jan 2021 12:56:59 -0800 Subject: [PATCH 25/44] Add android.permission.INTERNET permission to Android test_app. (#49996) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49996 According to section 5.2.1 of Snapdragon Profiler User Guide (https://developer.qualcomm.com/qfile/30580/snapdragon_profiler_user_guide_reva.pdf) OpenGL ES, Vulkan, and OpenCL apps must include android.permission.INTERNET in the app's AndroidManifest.xml to enable API tracing and GPU metrics. Test Plan: Imported from OSS Reviewed By: SS-JIA Differential Revision: D25809555 Pulled By: AshkanAliabadi fbshipit-source-id: c4d88a7ea98d9166efbc4157df7d822d99ba0df9 --- android/test_app/app/src/main/AndroidManifest.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/android/test_app/app/src/main/AndroidManifest.xml b/android/test_app/app/src/main/AndroidManifest.xml index a83bf223bdaf..abdd9a8d986a 100644 --- a/android/test_app/app/src/main/AndroidManifest.xml +++ b/android/test_app/app/src/main/AndroidManifest.xml @@ -18,4 +18,10 @@ + + + + From e4c41b6936ed433aff8e60735eba938ba66334e8 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 26/44] Remove codegen logic to support non-c10-full ops (#49164) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49164 This PR removes the logic paths in codegen that were responsible for handling non-c10-full ops. This only goes through our basic codegen. It does not simplify C++ code yet and it does not remove the codegen for generated unboxing wrappers yet. 
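For reviewers, the visible effect on the generated TraceType/VariableType files is that the two registration templates collapse into one. A minimal sketch of the emitted code, using a hypothetical operator name my_op and an illustrative dispatch key (neither is taken from this PR):

    TORCH_LIBRARY_IMPL(aten, Tracer, m) {
      // Old template, emitted only for ops that were not `use_c10_dispatcher: full`
      // (UNBOXEDONLY_WRAPPER_REGISTRATION, deleted below):
      //   m.impl_UNBOXED("my_op", &TraceType::my_op);

      // The single form the codegen emits after this change, for every op:
      m.impl("my_op",
             TORCH_FN(TraceType::my_op)
      );
    }
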
ghstack-source-id: 119450487 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25462977 fbshipit-source-id: 7e70d14bea96948f5056d98125f3e6ba6bd78285 --- tools/autograd/gen_trace_type.py | 48 ++++++----------------- tools/autograd/gen_variable_type.py | 61 ++++++++--------------------- tools/codegen/api/cpp.py | 11 +----- tools/codegen/api/dispatcher.py | 39 ++++++------------ tools/codegen/api/native.py | 3 +- tools/codegen/api/python.py | 13 ++---- tools/codegen/gen.py | 33 ++++------------ tools/codegen/model.py | 14 ++----- tools/jit/gen_unboxing_wrappers.py | 38 ------------------ 9 files changed, 56 insertions(+), 204 deletions(-) diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index d8d42762e4fb..d8e68606e6ba 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -117,13 +117,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] - args: List[Union[Argument, TensorOptionsArguments]] = [] - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - args = list(f.func.schema_order_arguments()) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - args = [cpp_args.argument for cpp_args in sig_group.signature.arguments() - if not isinstance(cpp_args.argument, SelfArgument)] + args: List[Union[Argument, TensorOptionsArguments]] = list(f.func.schema_order_arguments()) if f.func.is_out_fn(): # *_out functions take the result as a separate argument, but we don't want to @@ -131,12 +125,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # So first, we need to remove the out argument from the list of arguments to trace. # TODO: byte-for-byte compatible with old codegen behavior - it's incorrect to assume # there is only one output argument. - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - # for c10-full ops, the out argument is in the end - args = args[:-1] - else: - # for legacy ops, the out argument is in the beginning. 
- args = args[1:] + args = args[:-1] trace_inputs = itertools.chain.from_iterable(dispatch_trace_input(arg) for arg in args) @@ -374,14 +363,10 @@ def method_definition(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - formals = ', '.join( - f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments() - ) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) + formals = ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) return METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns), @@ -396,27 +381,16 @@ def method_definition(f: NativeFunction) -> Optional[str]: ); """) -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${name}", &${class_type}::${type_wrapper_name}); -""") - @with_native_function def method_registration(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - return WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) - else: - return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) + return WRAPPER_REGISTRATION.substitute( + name=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='TraceType', + ) def gen_trace_type_shard( fm: FileManager, native_functions: Sequence[NativeFunction], suffix: str diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index f49f5e15845b..c78e1e5f66cc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -191,19 +191,6 @@ } """) -# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once -# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because -# ops that are `use_c10_dispatcher: full` need different c++ code than ops -# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants -# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates -# can be deleted once all ops are `use_c10_dispatcher: full`. -# If you update one of the templates, you likely also have to update the other. 
- -# See NOTE[UnboxedOnly] -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name}); -""") - WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}) @@ -349,30 +336,18 @@ def gen_variable_type( @with_native_function def gen_formals(f: NativeFunction) -> str: - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - formals = ', '.join( - f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments() - ) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) - return formals + return ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) @with_native_function def gen_wrapper_registration(f: NativeFunction) -> str: - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - return WRAPPER_REGISTRATION.substitute( - unqual_operator_name_with_overload=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='VariableType', - ) - else: - return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - unqual_operator_name_with_overload=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='VariableType', - ) + return WRAPPER_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) def gen_variable_type_shard( fm: FileManager, @@ -669,7 +644,7 @@ def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequen call = CALL_DISPATCH_VIA_NAMESPACE.substitute( api_name=cpp.name( f.func, - faithful_name_for_out_overloads=f.use_c10_dispatcher.dispatcher_uses_new_style(), + faithful_name_for_out_overloads=True, ), unpacked_args=unpacked_args) else: @@ -887,16 +862,12 @@ def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: body: List[str] = [] unpacked_bindings: List[Binding] = [] - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - bindings = [r for a in f.func.schema_order_arguments() - for r in cpp.argument(a, - method=False, - cpp_no_default_args=set(), - faithful=False, - has_tensor_options=False)] - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - bindings = list(sig_group.signature.arguments()) + bindings = [r for a in f.func.schema_order_arguments() + for r in cpp.argument(a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False)] for i, binding in enumerate(bindings): assert not isinstance(binding.argument, SelfArgument) diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index 8a1d2a5272f5..0debd52ca896 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,6 +1,5 @@ from tools.codegen.model import * from tools.codegen.api.types import * -import tools.codegen.local as local from typing import Optional, Sequence, Union, List, Set # This file describes the translation of JIT schema to the public C++ @@ -88,10 +87,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: if mutable: return MutRefCType(BaseCType('Tensor', binds)) # TODO: fix this discrepancy else: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) - else: - return ConstRefCType(BaseCType('Tensor', 
binds)) + return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) return OptionalCType(elem) elif isinstance(t, ListType): @@ -105,10 +101,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: elif str(t.elem) == 'Dimname': return BaseCType("DimnameList", binds) elif str(t.elem) == 'Tensor?': - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return ConstRefCType(BaseCType("c10::List>", binds)) - else: - return BaseCType("TensorList", binds) + return ConstRefCType(BaseCType("c10::List>", binds)) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) # TODO: explicitly qualify namespace here return BaseCType(f"ArrayRef<{elem.cpp_type()}>", binds) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 3adc2465b607..bb65bc386e64 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -2,8 +2,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.api.native as native -import tools.codegen.local as local import itertools from typing import Sequence, List, Union @@ -31,17 +29,11 @@ def name(func: FunctionSchema) -> str: return cpp.name(func) def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - # This is a faux amis. If it makes sense in the future to add - # more special cases here, or invert things so cpp.argument_type - # calls this, or just completely inline the function, please do - # it. - return cpp.argumenttype_type(t, mutable=mutable, binds=binds) - else: - # This is real sharing. If you're modifying this path, ask - # yourself why you are changing the native functions protocol - # here and not in native. - return native.argumenttype_type(t, mutable=mutable, binds=binds) + # This is a faux amis. If it makes sense in the future to add + # more special cases here, or invert things so cpp.argument_type + # calls this, or just completely inline the function, please do + # it. 
+ return cpp.argumenttype_type(t, mutable=mutable, binds=binds) def argument_type(a: Argument, *, binds: ArgName) -> CType: return argumenttype_type(a.type, mutable=a.is_write, binds=binds) @@ -53,10 +45,6 @@ def returns_type(rs: Sequence[Return]) -> str: def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument] ) -> List[Binding]: - # We could forward to native.argument but it is a bit suspect because - # the grouping may not be set correctly - assert local.use_c10_dispatcher().dispatcher_uses_new_style() - if isinstance(a, Argument): return [Binding( ctype=argument_type(a, binds=a.name), @@ -71,13 +59,10 @@ def argument( assert_never(a) def arguments(func: FunctionSchema) -> List[Binding]: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return [ - r for a in itertools.chain( - func.arguments.positional, - func.arguments.kwarg_only, - func.arguments.out - ) for r in argument(a) - ] - else: - return native.arguments(func) + return [ + r for a in itertools.chain( + func.arguments.positional, + func.arguments.kwarg_only, + func.arguments.out + ) for r in argument(a) + ] diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 936500b560db..af82210b20f4 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -64,8 +64,7 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out # Erase SelfArgument from the distinction return argument(a.argument, is_out=is_out) elif isinstance(a, TensorOptionsArguments): - if local.use_c10_dispatcher() in [UseC10Dispatcher.hacky_wrapper_for_legacy_signatures, - UseC10Dispatcher.with_codegenerated_unboxing_wrapper]: + if local.use_c10_dispatcher() == UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: # TODO: expunge this logic entirely default = None if should_default: diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index bc5cbb440b98..749513cb5c0d 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -3,7 +3,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.local as local from tools.codegen.gen import pythonify_default from tools.codegen.model import * @@ -599,11 +598,8 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if not simple_type or local.use_c10_dispatcher().dispatcher_uses_new_style(): - # Is it desired to keep '?' for simple_type with new style dispatcher? - return 'Tensor?' - else: - return 'Tensor' + # Is it desired to keep '?' for simple_type with new style dispatcher? + return 'Tensor?' elem = argument_type_str(t.elem, simple_type=simple_type) if elem == 'Layout': # TODO: fix this special case in PythonArgParser? 
@@ -1022,10 +1018,7 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return 'optionalTensor' - else: - return 'tensor' + return 'optionalTensor' elif isinstance(t.elem, BaseType): if t.elem.name in [BaseTy.ScalarType, BaseTy.Scalar, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 4768670b6f26..08e9572131e3 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -471,12 +471,6 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # for mypy type refinement; would be fixed by TODO on target assert self.target is not Target.DECLARATION - if f.func.is_out_fn(): - assert local.use_c10_dispatcher().dispatcher_uses_new_style(), \ - ("{} takes out arguments and has to be written in the new style. " + - "Please add `use_c10_dispatcher: full` to your operator in native_functions.yaml " + - "and write the C++ implementation to take out arguments in the end.").format(f.func.name) - if self.dispatch_key not in f.dispatch: return None if f.manual_kernel_registration: @@ -519,8 +513,7 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: const DeviceGuard device_guard(device_or_default(device)); """ else: - assert local.use_c10_dispatcher() in [UseC10Dispatcher.with_codegenerated_unboxing_wrapper, - UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures cuda_guard_from_tensor_options = """\ const DeviceGuard device_guard(options.device()); """ @@ -562,16 +555,14 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # Figure out which signature the function is if local.use_c10_dispatcher() is UseC10Dispatcher.full: payload = f"TORCH_FN({name})" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: + else: + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures payload = f""" c10::impl::hacky_wrapper_for_legacy_signatures< {dispatcher_sig.type()}, {len(f.func.arguments.out)} >(TORCH_FN({name})) """ - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - payload = f"torch::CppFunction::makeUnboxedOnly(&{name})" return f'm.impl("{f.func.name}",\n{payload});\n' else: @@ -790,14 +781,9 @@ def __call__(self, f: NativeFunction) -> Optional[str]: dispatcher_sig = DispatcherSignature.from_schema(f.func) sig: Union[NativeSignature, DispatcherSignature] - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - sig = dispatcher_sig - dispatcher_exprs = dispatcher_sig.exprs() - dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" - else: - sig = native_sig - dispatcher_exprs = native_sig.dispatcher_exprs() - dispatch_key = "options.computeDispatchKey()" + sig = dispatcher_sig + dispatcher_exprs = dispatcher_sig.exprs() + dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" if self.target is Target.DEFINITION: # I don't think there's actually a good reason to generate @@ -823,11 +809,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: }} """ elif self.target is Target.REGISTRATION: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" + return 
f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" elif self.target is Target.DECLARATION: raise AssertionError() else: @@ -1052,7 +1034,6 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('name', cpp.name(f.func)), ('operator_name', str(f.func.name.name)), ('overload_name', str(f.func.name.overload_name)), - ('use_c10_dispatcher', f.use_c10_dispatcher.name), ('manual_kernel_registration', f.manual_kernel_registration), ('category_override', f.category_override if f.category_override is not None else ''), ('matches_jit_signature', True), diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 9c8a0d73e815..1128878fe45c 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -49,12 +49,8 @@ def __str__(self) -> str: class UseC10Dispatcher(Enum): full = 0 - with_codegenerated_unboxing_wrapper = 1 hacky_wrapper_for_legacy_signatures = 2 - def dispatcher_uses_new_style(self) -> bool: - return self in [UseC10Dispatcher.full, UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] - # The basic input to the code generation is native_functions.yaml. # The name "native", BTW, comes from the distinction between native # functions and legacy TH functions. The legacy TH functions are gone, @@ -77,7 +73,7 @@ class NativeFunction: func: 'FunctionSchema' # Corresponds to the 'use_c10_dispatcher' field. The default - # is 'with_codegenerated_unboxing_wrapper' + # is 'full' use_c10_dispatcher: UseC10Dispatcher # Whether or not to omit automatic generation of a DeviceGuard @@ -177,16 +173,14 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert isinstance(cpp_no_default_args_list, list) cpp_no_default_args = set(cpp_no_default_args_list) - use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) - if use_c10_dispatcher_s is None: - use_c10_dispatcher = UseC10Dispatcher.full - elif use_c10_dispatcher_s == 'full': + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', 'full') + if use_c10_dispatcher_s == 'full': use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'hacky_wrapper_for_legacy_signatures': use_c10_dispatcher = UseC10Dispatcher.hacky_wrapper_for_legacy_signatures else: raise AssertionError( - f'use_c10_dispatcher must be unset or set to full, got {use_c10_dispatcher}') + f'use_c10_dispatcher must be full or hacky_wrapper_for_legacy_signatures, got {use_c10_dispatcher}') variants_s = e.pop('variants', 'function') assert isinstance(variants_s, str) diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py index a52c109c603f..19e459e3f7ac 100644 --- a/tools/jit/gen_unboxing_wrappers.py +++ b/tools/jit/gen_unboxing_wrappers.py @@ -377,29 +377,7 @@ def pack_arguments(args): device=device, pin_memory=pin_memory, args_with_tensor_options=pack_arguments(args_with_tensor_options[1:]), first=args_with_tensor_options[0], num_inputs=num_inputs) - elif decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - if len(decl['returns']) == 0: - return_type = "void" - elif len(decl['returns']) == 1: - return_type = decl['returns'][0]['type'] - else: - return_type = "std::tuple<{}>".format(", ".join([r['type'] for r in decl['returns']])) - for a in decl['arguments']: - if 'type' not in a: - raise Exception(decl) - argument_types_with_leading_comma = ", ".join([a['type'] for a in decl['arguments']]) - if argument_types_with_leading_comma != "": - argument_types_with_leading_comma = ", " + argument_types_with_leading_comma - args_with_leading_comma = pack_arguments(args) - if 
args_with_leading_comma != "": - args_with_leading_comma = ", " + args_with_leading_comma - return CALL_UNBOXED_KERNEL.substitute(name=decl['name'], - args_with_leading_comma=args_with_leading_comma, - num_inputs=num_inputs, - return_type=return_type, - formals_types_with_leading_comma=argument_types_with_leading_comma) else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] if is_namespace_function: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), @@ -438,16 +416,6 @@ def emit_decl_variant(decl): returns = decl['returns'] - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - constructor = CONSTRUCTOR.substitute(name=decl['name'], - call=call, - kw_assignments=kw_assignments, - num_inputs=num_inputs, - op_capture=op_capture, - lvalues=lvalues) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - return constructor def filter_decls(jit_decls, disable_autograd, operator_selector: SelectiveBuilder, force_schema_registration): @@ -549,12 +517,6 @@ def expand_options(decl, i, arg): # ops are assigned arbitrarily but stably to a file based on hash for group in jit_decl_groups: x = sum(ord(c) for c in group[0]['name']) % num_shards - for decl in group: - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - shards[x].append(OPERATOR.substitute(signature=decl['schema_string'], - op=emit_decl_variant(decl))) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] for i, shard in enumerate(shards): env = { From 4a14020c0d56c733da381561c32009546ada28dd Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 27/44] Remove .impl_UNBOXED() and functionalities associated with it (#49220) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49220 Since all ops are c10-full, we can remove .impl_UNBOXED now. This also removes the ability of KernelFunction or CppFunction to store unboxedOnly kernels. 
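As an illustration of the registration-site migration this applies (a minimal sketch; the _example namespace, op and kernel below are hypothetical and not part of this diff):

    #include <torch/library.h>

    // Hypothetical kernel used only for illustration.
    at::Tensor my_op(const at::Tensor& self) {
      return self;
    }

    TORCH_LIBRARY(_example, m) {
      m.def("my_op(Tensor self) -> Tensor");
      // Previously this kernel would have been registered as one of:
      //   m.impl_UNBOXED("my_op", my_op);
      //   m.impl("my_op", torch::CppFunction::makeUnboxedOnly(my_op));
      // With all ops c10-full, a plain registration suffices; the templated
      // wrapper makes the kernel callable both boxed and unboxed:
      m.impl("my_op", TORCH_FN(my_op));
    }
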
ghstack-source-id: 119450489 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25490225 fbshipit-source-id: 32de9d591e6a842fe18abc82541580647e9cfdad --- aten/src/ATen/BatchingRegistrations.cpp | 2 +- aten/src/ATen/autocast_mode.cpp | 30 +++++------ aten/src/ATen/core/boxing/KernelFunction.cpp | 21 -------- aten/src/ATen/core/boxing/KernelFunction.h | 42 --------------- .../ATen/core/boxing/KernelFunction_impl.h | 54 ++----------------- .../ATen/core/boxing/KernelFunction_test.cpp | 40 -------------- .../op_registration/op_registration_test.cpp | 2 +- aten/src/ATen/native/vulkan/VulkanAten.cpp | 6 +-- aten/src/ATen/native/vulkan/ops/Clamp.cpp | 8 +-- .../ATen/native/vulkan/ops/Convolution.cpp | 2 +- aten/src/ATen/native/vulkan/ops/Factory.cpp | 2 +- test/cpp_extensions/msnpu_extension.cpp | 8 +-- test/cpp_extensions/rng_extension.cpp | 6 +-- test/mobile/op_deps/simple_ops.cpp | 6 +-- tools/code_analyzer/run_analyzer.sh | 2 +- torch/library.h | 51 +++++++++--------- 16 files changed, 64 insertions(+), 218 deletions(-) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 9bdec2dce77e..2cd7cac4e71b 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -1015,7 +1015,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("_add_batch_dim", native::_add_batch_dim); m.impl("_remove_batch_dim", native::_remove_batch_dim); - m.impl_UNBOXED("sum.dim_IntList", sum_batching_rule); + m.impl("sum.dim_IntList", sum_batching_rule); m.impl("is_complex", native::is_complex); m.impl("conj", native::conj); diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index dfb8e3ac0f32..9a2f34257c57 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -239,13 +239,9 @@ Therefore, for the moment, this is all copy pasted in from VariableTypeEverythin m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); -#define KERNEL_UNBOXED_ONLY(FUNC, REGISTER_NAME, SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ - &WrapFunction::type::call); - // Less-common but still useful case: redispatching to a function with a new signature (e.g. 
appending a dtype) -#define KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ + m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); /***************************************** @@ -367,20 +363,20 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32) KERNEL(ADD_NS(dist), "dist", Tensor (const Tensor &, const Tensor &, Scalar), fp32) KERNEL(ADD_NS(pdist), "pdist", Tensor (const Tensor &, double), fp32) - KERNEL_UNBOXED_ONLY(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) + KERNEL(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) KERNEL(ADD_NS(renorm), "renorm", Tensor (const Tensor &, Scalar, int64_t, Scalar), fp32) // fp32_set_opt_dtype KERNEL(ADD_NS(prod), "prod", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(prod), "prod.dim_int", Tensor (const Tensor &, int64_t, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(softmax), "softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(log_softmax), "log_softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumprod), "cumprod", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumsum), "cumsum", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even // when autocasting. 
// KERNEL(ADD_NS(norm), "norm.ScalarOpt_dtype", Tensor (const Tensor &, c10::optional, ScalarType), fp32_set_opt_dtype) @@ -388,20 +384,20 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { // KERNEL(ADD_NS(norm), "norm.names_ScalarOpt_dim_dtype", Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum.dim_IntList", Tensor (const Tensor &, IntArrayRef, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) // fp32_append_dtype // The fp32_append_dtype wrapper overrides implicit promotion behavior. // norm does not implicitly promote, but be aware when adding new ops to this policy. - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) // promote KERNEL(ADD_NS(addcdiv), "addcdiv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(addcmul), "addcmul", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(atan2), "atan2", Tensor (const Tensor &, const Tensor &), promote) KERNEL(ADD_NS(bilinear), "bilinear", Tensor (const Tensor &, const Tensor &, const Tensor &, const c10::optional&), promote) KERNEL(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote) - KERNEL_UNBOXED_ONLY(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) + KERNEL(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) KERNEL(ADD_NS(_cat), "_cat", Tensor (TensorList, int64_t), promote) KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional), promote) KERNEL(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index f84352ebee1f..58c35557018c 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -57,25 +57,4 @@ bool KernelFunction::_equalsBoxedAndUnboxed(const KernelFunction& other) const { unboxed_kernel_func_ == 
other.unboxed_kernel_func_; } -void KernelFunction::checkBoxedKernel(const OperatorHandle& opHandle) const { - if (C10_UNLIKELY(boxed_kernel_func_ == nullptr)) { - if (unboxed_kernel_func_ == nullptr) { - TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction.", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } else { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this case should be impossible. - TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call().", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } - } -} - } // namespace c10 diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 6817907b12b1..bf847681aac8 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -123,26 +123,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunctor(std::unique_ptr kernelFunctor); - /** - * Create a KernelFunction from an unboxed functor and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. - * - * Example: - * - * > class MyFunctor final { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::make_unique()); - */ - template - static KernelFunction makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor); - /** * Create a KernelFunction from an unboxed function. * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction @@ -158,23 +138,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunction(FuncPtr); - /** - * Create a KernelFunction from an unboxed function and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. - * - * Example: - * - * > Tensor unboxed_func(Tensor a, Tensor b) {...} - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(); - */ - template - static KernelFunction makeFromUnboxedOnlyFunction(FuncPtr); - /** * Create a KernelFunction from an unboxed function. 
* KernelFunction::makeFromUnboxedFunction is usually a better choice than @@ -189,9 +152,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func); - template - static KernelFunction makeFromUnboxedOnlyRuntimeFunction(FuncType* func); - static KernelFunction makeFallthrough(); static KernelFunction makeAmbiguousAutogradOther(); static KernelFunction makeNamedNotSupported(); @@ -226,8 +186,6 @@ class TORCH_API KernelFunction final { template static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, Stack* stack); - void checkBoxedKernel(const OperatorHandle& opHandle) const; - OperatorKernel* getFunctor_() const; std::shared_ptr functor_; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 82a65fa27ffb..f45d8b28105e 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -23,8 +23,7 @@ inline void KernelFunction::make_boxed_function(OperatorKernel*, const OperatorH } inline bool KernelFunction::isValid() const { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this should only check boxed_kernel_func_. - return boxed_kernel_func_ != nullptr || unboxed_kernel_func_ != nullptr; + return boxed_kernel_func_ != nullptr; } inline bool KernelFunction::isFallthrough() const { @@ -32,7 +31,10 @@ inline bool KernelFunction::isFallthrough() const { } inline void KernelFunction::callBoxed(const OperatorHandle& opHandle, Stack* stack) const { - checkBoxedKernel(opHandle); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + boxed_kernel_func_ != nullptr, + "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction." + ); (*boxed_kernel_func_)(functor_.get(), opHandle, stack); } @@ -111,21 +113,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunctor(std::unique_ptr -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - - static_assert(guts::is_functor::value, "Tried to call KernelFunction::makeFromUnboxedFunctor but the argument is not a functor."); - static_assert(std::is_base_of::value, "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - - return KernelFunction( - std::move(kernelFunctor), - nullptr, // Don't create a boxed kernel for this - reinterpret_cast(&impl::wrap_kernel_functor_unboxed::call) - ); -} - template inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) { static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); @@ -144,26 +131,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) #endif } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunction(FuncPtr func_ptr) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with an invalid parameter. 
It must be a function pointer created with TORCH_FN."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); - static_assert(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); - -#if !defined(C10_MOBILE) - return makeFromUnboxedOnlyFunctor::type> ( - guts::make_unique_base::type>() - ); -#else - // On mobile, we rather want to optimize for binary size than for performance, - // so let's not inline the kernel into the wrapper but use makeFromUnboxedOnlyRuntimeFunction - // instead. - return makeFromUnboxedOnlyRuntimeFunction(func_ptr.func_ptr()); -#endif -} - template inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* func) { static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); @@ -175,17 +142,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* f ); } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyRuntimeFunction(FuncType* func) { - static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); - TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); - - return makeFromUnboxedOnlyFunctor>>( - guts::make_unique_base>>(func) - ); -} - template inline std::enable_if_t>::value, KernelFunction> KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { static_assert(guts::is_functor>::value, "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 8ba50db14a2b..e17efab10ba5 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -544,26 +544,6 @@ TEST(KernelFunctionTest, givenUnboxedFunctor_withoutReturn_whenCallingUnboxed_th kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - 
kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, givenUnboxedFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernels::unboxed_function_with_return)); kernels::expectBoxedCallingWithReturnWorks(func); @@ -584,26 +564,6 @@ TEST(KernelFunctionTest, givenUnboxedFunction_withoutReturn_whenCallingUnboxed_t kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, givenUnboxedRuntimeFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedRuntimeFunction(&kernels::unboxed_function_with_return); kernels::expectBoxedCallingWithReturnWorks(func); diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 6259578fdac8..56afe8ca7fb5 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1909,7 +1909,7 @@ TEST(NewOperatorRegistrationTest, CppFunction) { m.def("fn3", [](const Tensor& x) { return x; }); // These require explicit schema m.def("fn4(Tensor x) -> Tensor", CppFunction::makeFallthrough()); - m.def("fn5(Tensor x) -> Tensor", CppFunction::makeUnboxedOnly(dummy_fn)); + m.def("fn5(Tensor x) -> Tensor", CppFunction::makeFromUnboxedFunction(dummy_fn)); m.def("fn6(Tensor x) -> Tensor", CppFunction::makeFromBoxedFunction<&backend_fallback_kernel>()); } diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp index 4dba9de7d5b0..88c519c09ea3 100644 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp @@ -548,7 +548,7 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape)); m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select)); m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose)); - m.impl_UNBOXED("transpose_", at::native::vulkan::aten::transpose_); + m.impl("transpose_", at::native::vulkan::aten::transpose_); m.impl("view", TORCH_FN(at::native::vulkan::aten::view)); m.impl("unsqueeze", 
TORCH_FN(at::native::vulkan::aten::unsqueeze)); m.impl("empty.memory_format", at::native::vulkan::aten::empty); @@ -569,11 +569,11 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("_cat", TORCH_FN(at::native::vulkan::aten::cat)); m.impl("mul.Scalar", TORCH_FN(at::native::vulkan::aten::mul_scalar)); m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar)); - m.impl_UNBOXED( + m.impl( "convolution_overrideable", at::native::vulkan::aten::convolution); m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_); m.impl("relu_", at::native::vulkan::aten::relu_); - m.impl_UNBOXED("add_.Tensor", at::native::vulkan::aten::add_); + m.impl("add_.Tensor", at::native::vulkan::aten::add_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 369a47fee93a..9f25d89bca9b 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -167,10 +167,10 @@ Tensor& relu_(Tensor& self) { TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("clamp", TORCH_FN(clamp)); m.impl("clamp_", TORCH_FN(clamp_)); - m.impl_UNBOXED("hardtanh", hardtanh); - m.impl_UNBOXED("hardtanh_", hardtanh_); - m.impl_UNBOXED("relu", relu); - m.impl_UNBOXED("relu_", relu_); + m.impl("hardtanh", hardtanh); + m.impl("hardtanh_", hardtanh_); + m.impl("relu", relu); + m.impl("relu_", relu_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index c757f6cdac7a..d88545e3a25a 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -688,7 +688,7 @@ Tensor convolution( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("convolution_overrideable", convolution); + m.impl("convolution_overrideable", convolution); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp index 6e48ba120c31..14deb30b9888 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.cpp +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -45,7 +45,7 @@ Tensor empty_strided( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("empty.memory_format", at::native::vulkan::ops::empty_memory_format); + m.impl("empty.memory_format", at::native::vulkan::ops::empty_memory_format); m.impl("empty_strided", TORCH_FN(at::native::vulkan::ops::empty_strided)); } diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/msnpu_extension.cpp index 88c1d509b34c..ea67910f96da 100644 --- a/test/cpp_extensions/msnpu_extension.cpp +++ b/test/cpp_extensions/msnpu_extension.cpp @@ -53,10 +53,10 @@ std::tuple fake_convolution_backward( } TORCH_LIBRARY_IMPL(aten, MSNPU, m) { - m.impl_UNBOXED("empty.memory_format", empty_override); - m.impl_UNBOXED("add.Tensor", add_override); - m.impl_UNBOXED("convolution_overrideable", fake_convolution); - m.impl_UNBOXED("convolution_backward_overrideable", fake_convolution_backward); + m.impl("empty.memory_format", empty_override); + m.impl("add.Tensor", add_override); + m.impl("convolution_overrideable", fake_convolution); + m.impl("convolution_backward_overrideable", fake_convolution_backward); } // TODO: Extend this to exercise multi-device setting. 
In that case, diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index bf16a840dfc9..4a71a526617f 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -54,9 +54,9 @@ size_t getInstanceCount() { } TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) { - m.impl_UNBOXED("aten::random_.from", random_from_to); - m.impl_UNBOXED("aten::random_.to", random_to); - m.impl_UNBOXED("aten::random_", random_); + m.impl("aten::random_.from", random_from_to); + m.impl("aten::random_.to", random_to); + m.impl("aten::random_", random_); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { diff --git a/test/mobile/op_deps/simple_ops.cpp b/test/mobile/op_deps/simple_ops.cpp index 3651d1b05353..a76c58838a72 100644 --- a/test/mobile/op_deps/simple_ops.cpp +++ b/test/mobile/op_deps/simple_ops.cpp @@ -80,7 +80,7 @@ namespace { // cares about the name TORCH_LIBRARY(_test, m) { m.def("AA(Tensor self) -> Tensor"); - m.impl("AA", torch::CppFunction::makeUnboxedOnly(AA_op)); + m.impl("AA", torch::CppFunction::makeFromUnboxedFunction(AA_op)); m.def("BB(Tensor self) -> Tensor"); m.impl("BB", TORCH_FN(BB_op)); @@ -97,10 +97,10 @@ TORCH_LIBRARY_FRAGMENT(_test, m) { } TORCH_LIBRARY_IMPL(_test, CPU, m) { - m.impl_UNBOXED("EE", EE_op); + m.impl("EE", EE_op); m.impl("FF", torch::dispatch(DispatchKey::CPU, - torch::CppFunction::makeUnboxedOnly(FF_op)) + torch::CppFunction::makeFromUnboxedFunction(FF_op)) ); m.impl("GG", torch::dispatch(DispatchKey::CPU, diff --git a/tools/code_analyzer/run_analyzer.sh b/tools/code_analyzer/run_analyzer.sh index 79b366fb1a0d..dc8705cc39f7 100755 --- a/tools/code_analyzer/run_analyzer.sh +++ b/tools/code_analyzer/run_analyzer.sh @@ -15,7 +15,7 @@ echo "Analyze: ${INPUT}" # to operate, so for safety we match a more expansive set. "${ANALYZER_BIN}" \ -op_schema_pattern="^(_aten|_prim|aten|quantized|_quantized|prepacked|profiler|_test)::[a-zA-Z0-9_.]+(\(.*)?$" \ - -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl|impl_UNBOXED)|torch::Library::(_?def|_?impl|_?impl_UNBOXED)" \ + -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl)|torch::Library::(_?def|_?impl)" \ -op_invoke_pattern="c10::Dispatcher::findSchema" \ -root_symbol_pattern="torch::jit::[^(]" \ -torch_library_init_pattern="^.*TORCH_LIBRARY_init_([^(]+)(\(.*)?$" \ diff --git a/torch/library.h b/torch/library.h index d86c1afbd50e..fee98abb2b81 100644 --- a/torch/library.h +++ b/torch/library.h @@ -116,19 +116,6 @@ class TORCH_API CppFunction final { , debug_() {} - /// This static factory lets you create CppFunctions that (1) don't have boxing - /// wrappers (because we don't support it yet) and (2) don't have schema - /// inference (because some ops don't support it). - template - static CppFunction makeUnboxedOnly(Func* f) { - // TODO: Eliminate the necessity for this function entirely. - return CppFunction( - c10::KernelFunction::makeFromUnboxedOnlyRuntimeFunction(f), - /* cpp_signature */ c10::impl::CppSignature::make(), - /* schema */ nullptr - ); - } - /// This creates a fallthrough function. Fallthrough functions /// immediately redispatch to the next available dispatch key, /// but are implemented more efficiently than a hand written @@ -170,6 +157,22 @@ class TORCH_API CppFunction final { ); } + /// Create a function from an unboxed kernel function. + /// This is typically used to register common operators. 
+ template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr* f) { + return CppFunction(f); + } + + /// Create a function from a compile time unboxed kernel function pointer. + /// This is typically used to register common operators. + /// Compile time function pointers can be used to allow the compiler + /// to optimize (e.g. inline) calls to it. + template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr f) { + return CppFunction(f); + } + CppFunction&& debug(std::string d) && { debug_ = std::move(d); return std::move(*this); @@ -496,20 +499,10 @@ class TORCH_API Library final { return impl(name, dispatch(std::forward(key), std::forward(raw_f))); } - /// \private - /// - /// Convenience overload for unboxed only kernels; kernels whose type - /// signatures are not supported by our template based metaprogramming - /// system. These are currently quite common but will be eventually - /// eliminated. - /// - /// This is equivalent to calling CppFunction::makeUnboxedOnly() on - /// the function, but this name for the function makes it easy to grep for. template Library& impl_UNBOXED(Name name, Func* raw_f) & { - // TODO: Remove this overload once the makeUnboxedOnly incidence rate - // goes way down - return impl(name, CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } // These overloads cover cases when a SelectiveStr (see Note [Selective build]) @@ -531,7 +524,10 @@ class TORCH_API Library final { template Library& impl(detail::SelectiveStr, Dispatch&& key, Func&& raw_f) & { return *this; } template - Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { return *this; } + Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; + } template Library& impl(detail::SelectiveStr name, Func&& raw_f) & { @@ -543,7 +539,8 @@ class TORCH_API Library final { } template Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { - return impl(name.operator const char*(), CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } /// Register a fallback implementation for all operators which will be used From 249261ada7a137db6cf9d2114d42d0ca2e0d396b Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 28/44] Remove generated_unboxing_wrappers and setManuallyBoxedKernel (#49251) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49251 Since all ops are c10-full and use templated unboxing now, we don't need to codegenerate any unboxing logic anymore. Since this codegen was the only code using setManuallyBoxedKernel, we can also remove that functionality from KernelFunction, OperatorEntry and Dispatcher. 
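For illustration, a rough sketch (hypothetical kernel, not part of this diff) of why the codegen'ed wrappers are redundant: a KernelFunction created from an unboxed kernel already carries a templated boxed wrapper, so there is nothing left for setManuallyBoxedKernel_ to patch in.

    #include <torch/library.h>

    // Hypothetical kernel used only for illustration.
    at::Tensor add_one(const at::Tensor& self) {
      return self + 1;
    }

    void sketch() {
      auto k = c10::KernelFunction::makeFromUnboxedFunction(TORCH_FN(add_one));
      // k.isValid() is true and k.callBoxed(...) goes through the templated
      // boxing logic, so no generated unboxing wrapper (and no
      // setManuallyBoxedKernel_ call) is needed to make it boxed-callable.
    }
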
ghstack-source-id: 119450486 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25502865 fbshipit-source-id: 49d009df159fda4be41bd02457d4427e6e638c10 --- .jenkins/pytorch/codegen-test.sh | 7 - BUILD.bazel | 3 - aten/src/ATen/core/boxing/KernelFunction.h | 6 - .../ATen/core/boxing/KernelFunction_impl.h | 10 - aten/src/ATen/core/dispatch/Dispatcher.cpp | 6 - aten/src/ATen/core/dispatch/Dispatcher.h | 6 - aten/src/ATen/core/dispatch/OperatorEntry.cpp | 18 - aten/src/ATen/core/dispatch/OperatorEntry.h | 12 - caffe2/CMakeLists.txt | 5 - caffe2/contrib/aten/gen_op.py | 3 +- tools/build_variables.bzl | 6 - tools/jit/gen_unboxing_wrappers.py | 568 ------------------ .../templates/generated_unboxing_wrappers.cpp | 132 ---- tools/setup_helpers/generate_code.py | 8 - 14 files changed, 1 insertion(+), 789 deletions(-) delete mode 100644 tools/jit/gen_unboxing_wrappers.py delete mode 100644 tools/jit/templates/generated_unboxing_wrappers.cpp diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 17e7e9fa3445..47d13f2908d0 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -48,13 +48,6 @@ python -m tools.autograd.gen_autograd \ "$OUT"/autograd \ tools/autograd -# unboxing_wrappers codegen (called by torch codegen but can run independently) -mkdir -p "$OUT"/unboxing_wrappers -python -m tools.jit.gen_unboxing_wrappers \ - "$OUT"/torch/share/ATen/Declarations.yaml \ - "$OUT"/unboxing_wrappers \ - tools/jit/templates - # annotated_fn_args codegen (called by torch codegen but can run independently) mkdir -p "$OUT"/annotated_fn_args python -m tools.autograd.gen_annotated_fn_args \ diff --git a/BUILD.bazel b/BUILD.bazel index b3faea487965..2b4636d850c9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -193,9 +193,6 @@ libtorch_cpp_generated_sources = [ "torch/csrc/autograd/generated/Functions.h", "torch/csrc/autograd/generated/Functions.cpp", "torch/csrc/autograd/generated/variable_factories.h", - "torch/csrc/jit/generated/generated_unboxing_wrappers_0.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_1.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_2.cpp", ] libtorch_python_generated_sources = [ diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index bf847681aac8..ddbbd912777a 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -173,12 +173,6 @@ class TORCH_API KernelFunction final { // For testing internal invariants only bool _equalsBoxedAndUnboxed(const KernelFunction&) const; - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic. This can be done once https://github.com/pytorch/pytorch/issues/32366 is fixed. 
- void setManuallyBoxedKernel_(InternalBoxedKernelFunction* func); - private: explicit KernelFunction(std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, void* unboxed_kernel_func); diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index f45d8b28105e..b248e54a6f94 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -168,14 +168,4 @@ inline std::enable_if_t>::value, ); } -inline void KernelFunction::setManuallyBoxedKernel_(InternalBoxedKernelFunction* func) { - if (boxed_kernel_func_ == &fallthrough_kernel) { - // special case no-op - return; - } - TORCH_INTERNAL_ASSERT(boxed_kernel_func_ == nullptr, "Tried to set a manually boxed kernel for a kernel that already has a boxed kernel set."); - TORCH_INTERNAL_ASSERT(unboxed_kernel_func_ != nullptr, "Tried to set a manually boxed kernel for an invalid KernelFunction."); - boxed_kernel_func_ = func; -} - } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 5e3e91afbb45..270cffaf6d1f 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -295,12 +295,6 @@ void Dispatcher::checkInvariants() const { } } -void Dispatcher::setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func) { - std::lock_guard lock(mutex_); - op.operatorIterator_->op.setManuallyBoxedKernel_(*this, func); - // NB: Do not need to set manually boxed kernel for backend fallbacks -} - std::vector Dispatcher::findDanglingImpls() const { return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { std::vector opsWithDanglingImpls; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 60f9f9bd0579..9641dfbea0cd 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -182,12 +182,6 @@ class TORCH_API Dispatcher final { */ RegistrationHandleRAII registerLibrary(std::string ns, std::string debug); - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. 
- // TODO Delete setBoxedKernelFor_ once all operators work with the templated boxing logic - void setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func); - // ------------------------------------------------------------------------ // // Listeners on registrations diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index f0d7bc6968ed..7c3698beeb06 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -21,7 +21,6 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) , schema_() , dispatchTable_() , dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) -, manuallyBoxedKernel_() , kernels_() , cpp_signature_() , is_observed_(ObservedOperators::isObserved(name_)) @@ -122,10 +121,6 @@ std::list::iterator OperatorEntry::registerKernel( ); } - if (manuallyBoxedKernel_.has_value()) { - kernel.setManuallyBoxedKernel_(*manuallyBoxedKernel_); - } - k.emplace_front(std::move(kernel), std::move(inferred_function_schema), std::move(debug)); std::list::iterator inserted = k.begin(); // update the dispatch table, i.e. re-establish the invariant @@ -331,19 +326,6 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) } } -void OperatorEntry::setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func) { - TORCH_INTERNAL_ASSERT(!manuallyBoxedKernel_); - manuallyBoxedKernel_ = func; - - for (auto& kv : kernels_) { - for (auto& k : kv.second) { - k.kernel.setManuallyBoxedKernel_(func); - } - } - // Refresh entries in dispatchTable_ - updateDispatchTableFull_(dispatcher); -} - void OperatorEntry::checkInvariants() const { if (schema_) { TORCH_INTERNAL_ASSERT(schema_->schema.operator_name() == name_, dumpState()); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 5098fd0d8c28..44b8fac5661e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -148,12 +148,6 @@ class TORCH_API OperatorEntry final { const DispatchKeyExtractor& dispatchKeyExtractor() const { return dispatchKeyExtractor_; } - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic - void setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func); - // Asserts that the given FuncType is correct for calling this operator in an unboxed way. template void assertSignatureIsCorrect() { @@ -189,12 +183,6 @@ class TORCH_API OperatorEntry final { std::array(DispatchKey::NumDispatchKeys)> dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; - // This manuallyBoxedKernel_ member is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. 
- // TODO Delete manuallyBoxedKernel_ once all operators work with the templated boxing logic - c10::optional manuallyBoxedKernel_; - // kernels_ stores all registered kernels for the corresponding dispatch key // and catchAllKernels_ stores the catch-all kernels. // If an operator library gets loaded that overwrites an already existing kernel, diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 191a7ca26835..9b934e4831e8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -340,9 +340,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set(GENERATED_CXX_TORCH "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_0.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_1.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_2.cpp" ) if(NOT INTERN_DISABLE_AUTOGRAD) @@ -434,8 +431,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TOOLS_PATH}/autograd/load_derivatives.py" "${TOOLS_PATH}/autograd/nested_dict.py" "${TOOLS_PATH}/autograd/utils.py" - "${TOOLS_PATH}/jit/gen_unboxing_wrappers.py" - "${TOOLS_PATH}/jit/templates/generated_unboxing_wrappers.cpp" WORKING_DIRECTORY "${TORCH_ROOT}") diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 769f9d59c856..64d3de547bb7 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -285,8 +285,7 @@ def emit_assignments(o, env): real_inputs = 0 for i, arg in enumerate(o['arguments']): env['arguments'].append(arg['name']) - # Emulate logic in gen_unboxing_wrappers.py. Pretend the flat argument - # list is a stack where the end is the top. + # Pretend the flat argument list is a stack where the end is the top. view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs if arg['type'] == 'TensorList': # NOTE: do not advance real_inputs here. 
After this we will diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index dc05ace7c542..5ed0b1340811 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -7,9 +7,6 @@ GENERATED_CPP = [ "autograd/generated/VariableType_2.cpp", "autograd/generated/VariableType_3.cpp", "autograd/generated/VariableType_4.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/TraceType_0.cpp", "autograd/generated/TraceType_1.cpp", "autograd/generated/TraceType_2.cpp", @@ -39,9 +36,6 @@ libtorch_nvfuser_generated_headers = ["{}.h".format(name[36:-3]) for name in lib def libtorch_generated_sources(gencode_pattern): return [gencode_pattern.format(name) for name in [ "autograd/generated/Functions.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/VariableType_0.cpp", "autograd/generated/VariableType_1.cpp", "autograd/generated/VariableType_2.cpp", diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py deleted file mode 100644 index 19e459e3f7ac..000000000000 --- a/tools/jit/gen_unboxing_wrappers.py +++ /dev/null @@ -1,568 +0,0 @@ -""" -To run this file by hand from the root of the PyTorch -repository, run: - -python -m tools.jit.gen_unboxing_wrappers \ - build/aten/src/ATen/Declarations.yaml \ - $OUTPUT_DIR \ - tools/jit/templates - -Where $OUTPUT_DIR is where you would like the files to be -generated. In the full build system, OUTPUT_DIR is -torch/csrc/jit/generated/ -""" - -# This file generates generated_unboxing_wrappers, which contains -# manual unboxing wrappers for ops that aren't use_c10_dispatcher: full -# because the templated unboxing logic in c10 doesn't support them yet. -# The ultimate goal is to make all ops use the templated unboxing and -# delete this codegen file. - -import argparse -import re -from itertools import groupby -from functools import reduce -import yaml - -from ..autograd.gen_autograd import RETURNS_VIEWS_OF_INPUT -from ..autograd.utils import CodeTemplate, YamlLoader, write, is_out_variant, op_name_with_overload -from tools.codegen.selective_build.selector import SelectiveBuilder - -# JIT has a type system of -# Scalar = int | float | bool # int is the largest int (int64_t), -# float is the largest float (double) we don't have the others because they are never held in tensors -# Type = Scalar # primitive numbers -# | Tensor # any tensor, as defined by at::Tensor -# | Type[] # a dynamically sized list[ of a type -# | Scalar[N] # a homogenous fixed size scalar list, single scalars can expand to this list -# | (Type1, Type2, ...) 
# a heterogeneous tuple -# | Layout | ScalarType | Device | Generator # special singleton types for built-in concepts in tensor lib - -# clean up the variety of C++ types in the ATen declarations -# to be in the restricted set of types that the IR represents -# note: no default values for this map, to make it clear what types -# can be passedthrough - -TYPE_MAP = { - 'std::array': 'bool[2]', - 'std::array': 'bool[3]', - 'std::array': 'bool[4]', - 'std::string': 'str', - 'std::string?': 'str?', - 'Scalar': 'Scalar', - 'ScalarList': 'Scalar[]', - 'MemoryFormat': 'MemoryFormat', - 'MemoryFormat?': 'MemoryFormat?', - 'QScheme': 'QScheme', - 'Scalar?': 'Scalar?', - 'Tensor': 'Tensor', - 'Tensor?': 'Tensor?', - 'TensorList': 'Tensor[]', - # this appears in return values instead of TensorList - # since TensorList is a ArrayRef in arguments but a vector - # in returns - 'std::vector': 'Tensor[]', - 'IntArrayRef': 'int[]', - 'IntArrayRef?': 'int[]?', - 'ArrayRef?': 'float[]?', - 'Layout': 'Layout', - 'Layout?': 'Layout?', - 'Device': 'Device', - 'Device?': 'Device?', - 'ScalarType': 'ScalarType', - 'ScalarType?': 'ScalarType?', - 'int64_t': 'int', - 'int64_t?': 'int?', - 'double': 'float', - 'double?': 'float?', - 'bool': 'bool', - 'bool?': 'bool?', - 'Generator': 'Generator?', - 'Generator?': 'Generator?', -} - - -def optional_type_of(arg, typ): - # optional type special handling for Tensor?[] and Tensor - # types that is missing a optional annotation - if arg.get('is_nullable') and '?' not in typ: - if typ == 'TensorList' or typ == 'Tensor[]': - typ = 'Tensor?[]' - else: - typ = '{}?'.format(typ) - return typ - - -def annotated_type_of(arg, typ): - anno = arg.get('annotation') - if anno: - typ = '{}({})'.format(typ, anno) - return typ - - -def jit_type_of(arg): - jit_type = arg.get('jit_type') - if not jit_type: - jit_type = TYPE_MAP[arg['simple_type']] - if is_sized_intlist_arg(arg): - jit_type = 'int[{}]'.format(arg['size']) - jit_type = optional_type_of(arg, jit_type) - jit_type = annotated_type_of(arg, jit_type) - arg['jit_type'] = jit_type - return jit_type - - -# map from aten 'simple_type' to the function that will turn a tensor into -# that type -FROM_IVALUE = { - 'Device': '{}.toDevice()', - 'Device?': '{}.toOptional()', - 'IntArrayRef': '{}.toIntVector()', - 'IntArrayRef?': '{}.toOptionalIntArray()', - 'ArrayRef?': '{}.toOptionalDoubleArray()', - 'Layout': '{}.toLayout()', - 'Layout?': '{}.toOptional()', - 'MemoryFormat': '{}.toMemoryFormat()', - 'MemoryFormat?': '{}.toOptional()', - 'QScheme': '{}.toQScheme()', - 'Scalar': '{}.toScalar()', - 'Scalar?': '{}.toOptional()', - 'ScalarType': '{}.toScalarType()', - 'ScalarType?': '{}.toOptional()', - 'Tensor': '{}.toTensor()', - 'Tensor?': 'toOptionalTensor({})', - 'Tensor?[]': 'toListOfOptionalTensor({})', - 'TensorList': '{}.toTensorVector()', - 'ScalarList': '{}.toScalarVector()', - 'bool': '{}.toBool()', - 'bool?': '{}.toOptional()', - 'double': '{}.toDouble()', - 'double?': '{}.toOptional()', - 'int64_t': '{}.toInt()', - 'int64_t?': '{}.toOptional()', - 'std::string': '{}.toStringRef()', - 'std::string?': '{}.toOptional()', - 'Generator?': '{}.toOptional()', - 'std::array': 'as_bool_array<2>({}.toBoolList())', - 'std::array': 'as_bool_array<3>({}.toBoolList())', - 'std::array': 'as_bool_array<4>({}.toBoolList())', -} - - -def from_ivalue(arg, value): - typ = optional_type_of(arg, arg['simple_type']) - return FROM_IVALUE[typ].format(value) - - -CALL_UNBOXED_KERNEL = CodeTemplate("""\ -auto result_ = 
callUnboxedKernel<${return_type}${formals_types_with_leading_comma}>(unboxedKernel${args_with_leading_comma}); -""") -CALL_NAMESPACE = CodeTemplate("""\ -auto result_ = at::${name}( - ${args} -); -""") -CALL_METHOD = CodeTemplate("""\ -auto result_ = (${first}).${name}( - ${args} -); -""") -CALL_NAMESPACE_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); - auto result_ = torch::${name}(${args_with_tensor_options}); -""") -CALL_METHOD_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); -auto result_ = (${first}).${name}(${args_with_tensor_options}); -""") - -CONSTRUCTOR = CodeTemplate("""\ -[](OperatorKernel* unboxedKernel, const OperatorHandle&, Stack* stack) { - using namespace at; - ${lvalues} - ${call} - drop(*stack, ${num_inputs}); - pack(*stack, std::move(result_)); -} -""") - -OPERATOR = CodeTemplate("""\ - .op("${signature}", - ${op}) -""") - - -disallowed_types = { - 'Storage', - 'DimnameList?', - 'ConstQuantizerPtr', - 'Dimname', - 'DimnameList', -} - -default_only_types = {'Generator'} - - -def is_jit_arg(i, arg): - simple_type = arg['simple_type'] - if simple_type in disallowed_types: - return False - if simple_type in default_only_types and 'default' not in arg: - return False - if simple_type == 'Type': - return False - return True - - -def is_jit_op(decl): - # We currently don't support functions that return nothing - assert all(r['type'] != 'void' for r in decl['returns']) - if len(decl['returns']) == 0: - return False - - arguments = decl['arguments'] - - # there must be a single out variant - if is_out_variant(decl) and sum([not not arg.get('output') for arg in arguments]) > 1: - return False - - return (('namespace' in decl['method_of'] or 'Tensor' in decl['method_of']) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['arguments'])) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['returns']))) - - -def is_tensor_arg(arg): - return arg['simple_type'] in {'Tensor', 'TensorList'} - - -def is_sized_intlist_arg(arg): - """Returns True for arguments declared as IntArrayRef[k], but False for IntArrayRef.""" - return (arg['simple_type'] == 'IntArrayRef') and ('size' in arg) - - -def base_name(decl): - name = decl['name'] - return name[:-1] if decl.get('inplace', False) else name[:-4] if name.endswith('_out') else name - - -def is_view(decl): - return base_name(decl) in RETURNS_VIEWS_OF_INPUT - - -# Copied from ..autograd.gen_python_functions.SKIP_PYTHON_BINDINGS -BACKWARD_OP_PATTERNS = [ - '.*_backward', - '.*_backward_(out|input|weight|bias)', -] - -def is_backward_op(decl): - for pattern in BACKWARD_OP_PATTERNS: - if re.match('^' + pattern + '$', decl['name']): - return True - return False - - -# for each argument in decl, the location it should appear in the -# jit schema declaration. e.g. 
-# arguments = [x, y, z] # the order in aten -# jit_argument_order = [2, 0, 1] -# aten::my_arg(Tensor y, Tensor z, Tensor x) # the order in schema -# used to move 'out' arguments to the end of the list -def argument_order(decl): - return decl.get('jit_argument_order') or list(range(len(decl['arguments']))) - - -def format_return_type(returns): - if len(returns) == 0: - return 'void' - elif len(returns) == 1: - return returns[0]['type'] - else: - return_types = [r['type'] for r in returns] - return 'std::tuple<{}>'.format(','.join(return_types)) - - -def get_simple_type(arg): - simple_type = arg['type'] - simple_type = simple_type.replace(' &', '').replace('const ', '') - simple_type = simple_type.replace('Generator *', 'Generator') - - opt_match = re.match(r'c10::optional<(.+)>', simple_type) - if opt_match: - simple_type = '{}?'.format(opt_match.group(1)) - return simple_type - - -def load_aten_declarations(path): - with open(path, 'r') as f: - declarations = yaml.load(f, Loader=YamlLoader) - - # enrich declarations with additional information - selected_declarations = [] - for declaration in declarations: - if declaration.get('deprecated'): - continue - - for arg in declaration['arguments']: - arg['simple_type'] = get_simple_type(arg) - for arg in declaration['schema_order_arguments']: - arg['simple_type'] = get_simple_type(arg) - for ret in declaration['returns']: - ret['simple_type'] = get_simple_type(ret) - - declaration['formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['arguments']] - declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['schema_order_arguments']] - declaration['args'] = [arg['name'] for arg in declaration['arguments']] - declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - declaration['api_name'] = declaration['name'] - if declaration.get('overload_name'): - declaration['type_wrapper_name'] = "{}_{}".format( - declaration['name'], declaration['overload_name']) - else: - declaration['type_wrapper_name'] = declaration['name'] - declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] - declaration['return_type'] = format_return_type(declaration['returns']) - - declaration['base_name'] = declaration['name'] - selected_declarations.append(declaration) - - return selected_declarations - - -def gen_unboxing_wrappers( - declarations, - out, - template_path, - operator_selector: SelectiveBuilder, - disable_autograd=False, - force_schema_registration=False, -): - GENERATED_UNBOXING_WRAPPERS_CPP = CodeTemplate.from_file(template_path + '/generated_unboxing_wrappers.cpp') - - ops = [] - - def get_invocation(decl, args, num_inputs): - - # because the arg list can get lengthy we put them on a separate line - def pack_arguments(args): - return ',\n'.join(args) - is_namespace_function = 'namespace' in decl['method_of'] - tensor_options_arg_index = decl.get('tensor_options_arg_index', None) - if tensor_options_arg_index is not None: - dtype = args[tensor_options_arg_index] - layout = args[tensor_options_arg_index + 1] - device = args[tensor_options_arg_index + 2] - pin_memory = args[tensor_options_arg_index + 3] - args_with_tensor_options = args[:tensor_options_arg_index] + \ - ['options'] + args[(tensor_options_arg_index + 4):] - if is_namespace_function: - return CALL_NAMESPACE_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], 
dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options)) - else: - return CALL_METHOD_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options[1:]), - first=args_with_tensor_options[0], num_inputs=num_inputs) - else: - if is_namespace_function: - return CALL_NAMESPACE.substitute(name=decl['name'], - args=pack_arguments(args), - num_inputs=num_inputs) - else: - return CALL_METHOD.substitute( - name=decl['name'], first=args[0], - args=pack_arguments(args[1:]), num_inputs=num_inputs) - - def requires_lvalue(arg): - jit_type = jit_type_of(arg) - return jit_type.startswith('Tensor') and '!' in jit_type - - def emit_decl_variant(decl): - if ('emit_dummy_placeholder' in decl): - return "DUMMY_OPERATION" - kw_assignments = [] - - # mutable arguments in aten are passed as non const references - # these must be lvalues, so we have to put them in variables - # before calling the function - lvalues = [] - - arguments = [] - num_inputs = len(decl['arguments']) - op_capture = '' - order = argument_order(decl) - for i, arg in enumerate(decl['arguments']): - value = from_ivalue(arg, '(std::move(peek(*stack, {}, {})))'.format(order[i], num_inputs)) - if requires_lvalue(arg): - lvalues.append('auto {} = {};\n'.format(arg['name'], value)) - value = arg['name'] - arguments.append(value) - - call = get_invocation(decl, arguments, num_inputs) - - returns = decl['returns'] - - return constructor - - def filter_decls(jit_decls, disable_autograd, operator_selector: SelectiveBuilder, force_schema_registration): - result = [] - for decl in jit_decls: - if disable_autograd and is_backward_op(decl): - continue - op_name = op_name_with_overload(decl) - if operator_selector.is_root_operator(op_name): - result.append(decl) - else: - if force_schema_registration: - decl['emit_dummy_placeholder'] = True - result.append(decl) - - return result - - # This function declares an order on declarations. This is necessary because - # there is some ambiguity in the choice of overload: if an argument is overloaded - # to accept both Scalar and Tensor, the schema with the Tensor should come first - # TODO: this can (probably) be removed when we remove the implicit conversion - # from Tensor -> Number. - def sort_decls(jit_decls): - def declkey(decl): - # key = sum_{i < len(args)} {1 if arg is tensor else 2} * (3 ** i) - # This is a ternary encoding where - # 0: No argument at this position - # 1: Tensor argument at this position - # 2: Some other argument at this position. - args = decl['arguments'] - result = 0 - for i in range(len(args)): - result += (3 ** i) * (1 if args[i]['simple_type'] == 'Tensor' else 2) - return result - - # NB: itertools.groupby requires the list be sorted. 
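
For reference, a minimal sketch of how the ternary declkey above orders two overloads of the same name; the decl dicts here are invented for illustration and are not taken from Declarations.yaml:

    def declkey_sketch(decl):
        # same encoding as declkey above: 1 for a Tensor argument, 2 otherwise,
        # weighted by 3**position
        args = decl['arguments']
        return sum((3 ** i) * (1 if args[i]['simple_type'] == 'Tensor' else 2)
                   for i in range(len(args)))

    tensor_overload = {'arguments': [{'simple_type': 'Tensor'}, {'simple_type': 'Tensor'}]}
    scalar_overload = {'arguments': [{'simple_type': 'Tensor'}, {'simple_type': 'Scalar'}]}

    assert declkey_sketch(tensor_overload) == 4  # 1*3**0 + 1*3**1
    assert declkey_sketch(scalar_overload) == 7  # 1*3**0 + 2*3**1
    # within a name group, the all-Tensor overload gets the smaller key and so
    # sorts ahead of the Tensor/Scalar one, as the comment above intends
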
- sorted_decls = sorted(jit_decls, key=lambda decl: decl['name']) - grouped_decls = [list(g) for _, g in - groupby(sorted_decls, key=lambda decl: decl['name'])] - return [sorted(g, key=declkey) for g in grouped_decls] - - aten_decls = load_aten_declarations(declarations) - jit_decls = [d for d in aten_decls if is_jit_op(d)] - - # add arguments dtype and device for functions like zeros - def expand_options(decl, i, arg): - if arg['simple_type'] != 'TensorOptions': - return [arg] - assert decl.get('tensor_options_arg_index') != i - decl['tensor_options_arg_index'] = i - tensor_options_expansion = [ - # XXX - until we actually have first-class interpreter types for these - # concepts, the default values to be encoded in Tensors - # If you change this, you also need to update [TensorOptions in script] - # in the tracer code. - # dtype is specified as an int64_t of at::ScalarType - {'name': 'dtype', 'simple_type': 'ScalarType'}, - # layout is specified as an int64_t of at::Layout - {'name': 'layout', 'simple_type': 'Layout'}, - # device is specified as an IntArrayRef of { at::Device::Type, device_id } - {'name': 'device', 'simple_type': 'Device'}, - # pin_memory is specified as a boolean - {'name': 'pin_memory', 'simple_type': 'bool', 'default': False}, - ] - # TODO: Don't repack this into TensorOptions. Needs various changes in downstream code. - if 'default' in arg: - for el in tensor_options_expansion: - el['simple_type'] += '?' - el['default'] = 'None' - if 'default' in arg and arg['default'] == 'at::kLong': - tensor_options_expansion[0]['default'] = 'long' - if 'kwarg_only' in arg and arg['kwarg_only']: - for el in tensor_options_expansion: - el['kwarg_only'] = True - return tensor_options_expansion - - additional_jit_decls = [] - - for decl in jit_decls: - decl['arguments'] = [a for i, arg in enumerate(decl['arguments']) for a in expand_options(decl, i, arg)] - if is_out_variant(decl): - reorder_out_args(decl) - - jit_decls.extend(additional_jit_decls) - jit_decls = filter_decls(jit_decls, disable_autograd, operator_selector, force_schema_registration) - - # generation is deterministic - jit_decl_groups = sort_decls(jit_decls) - - # NOTE: see Note [Sharded File] at the top of the generated_unboxing_wrappers.cpp - # template regarding sharding of the generated files. - # - # If you edit the number of shards here, you will also have to - # modify generate_code.py, torch/CMakeLists.txt, and the TARGETS - # files. 
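
As a rough worked example of the hash-based shard assignment used just below (the op name 'add' is chosen only for illustration and stands in for group[0]['name']):

    num_shards = 3
    name = 'add'
    shard_index = sum(ord(c) for c in name) % num_shards  # 97 + 100 + 100 = 297
    assert shard_index == 0
    # a group whose first decl is named 'add' would therefore be emitted into
    # generated_unboxing_wrappers_0.cpp, and always into that same file across builds
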
- num_shards = 3 - shards = [[] for _ in range(num_shards)] - - # ops are assigned arbitrarily but stably to a file based on hash - for group in jit_decl_groups: - x = sum(ord(c) for c in group[0]['name']) % num_shards - - for i, shard in enumerate(shards): - env = { - 'constructors': shard, - } - write(out, 'generated_unboxing_wrappers_%d.cpp' % i, GENERATED_UNBOXING_WRAPPERS_CPP, env) - - all_shards = reduce( - lambda lhs, rhs: lhs + rhs, - shards, - ) - env = { - 'constructors': all_shards, - } - write(out, 'generated_unboxing_wrappers_everything.cpp', GENERATED_UNBOXING_WRAPPERS_CPP, env) - - -default_map = {'{}': 'None', 'nullptr': 'None', 'c10::nullopt': 'None'} - - -def reorder_out_args(decl): - first_arg = decl['arguments'][0] - assert(first_arg['output']) - # the output variant must go at the end - # note: this is an annoying side effect of using a single '*' - # to denote kwarg_only - nargs = len(decl['arguments']) - decl['jit_argument_order'] = [nargs - 1] + list(range(nargs - 1)) - - -def is_kwarg_only(a): - return a.get('kwarg_only') or a.get('output') - -def main(): - parser = argparse.ArgumentParser( - description='Generate JIT op dispatch') - parser.add_argument('declarations', metavar='DECL', - help='path to Declarations.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('template_path', metavar='TEMPLATE_PATH', - help='path to templates directory') - args = parser.parse_args() - gen_unboxing_wrappers(args.declarations, args.out, args.template_path, - SelectiveBuilder.get_nop_selector()) - - -if __name__ == '__main__': - main() diff --git a/tools/jit/templates/generated_unboxing_wrappers.cpp b/tools/jit/templates/generated_unboxing_wrappers.cpp deleted file mode 100644 index cd8d12f6b15e..000000000000 --- a/tools/jit/templates/generated_unboxing_wrappers.cpp +++ /dev/null @@ -1,132 +0,0 @@ -#include "torch/csrc/jit/runtime/operator.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/jit/frontend/function_schema_parser.h" - -#include "torch/csrc/autograd/profiler.h" -#include "torch/csrc/autograd/generated/variable_factories.h" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ${generated_comment} - -// This file contains manual unboxing wrappers for ops that aren't -// use_c10_dispatcher: full because the templated unboxing logic in c10 doesn't -// support them yet. The ultimate goal is to make all ops use the templated -// unboxing and delete this codegen file. - -// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up -// incremental rebuilds. See the comment at the top of -// templates/VariableType.cpp for an analogous, in-depth discussion. - -namespace torch { namespace jit { - -using autograd::Variable; -using autograd::variable_list; -using at::Scalar; -using at::ScalarType; -using at::Tensor; -using at::TensorOptions; -using at::DeviceGuard; -using at::MemoryFormat; - -using ::c10::fmap; -using ::c10::filter; -using c10::OperatorKernel; -using c10::OperatorHandle; -using c10::KernelFunction; -using c10::RegistrationHandleRAII; -using c10::Stack; - -namespace { - -template -Return callUnboxedKernel(OperatorKernel* unboxedKernel, Args... 
args) { - using FuncType = Return (Args...); - auto* typedUnboxedKernel = static_cast*>(unboxedKernel); - return (*typedUnboxedKernel)(std::forward(args)...); -} - -// TODO: remove the toOptionalTensor and toListOfOptionalTensor -// when we remove the undefined tensor semantic from TH - -// XXX: This function is to specialize IValue for tensor type in -// interpreter, it should only be used in this file -at::Tensor toOptionalTensor(const IValue& v) { - if (v.isNone()) { - return at::Tensor(); - } - return v.toTensor(); -} - -// XXX: This function is to specialize IValue for list of optional -// tensor type in interpreter, it should only be used in this file -std::vector toListOfOptionalTensor(const IValue& v) { - // v is a list of optional tensor, loop over as generic list - auto vlist = v.toListRef(); - std::vector res; - - for (const IValue &v: vlist) { - res.emplace_back(toOptionalTensor(v)); - } - return res; -} - -template -std::array as_bool_array(const c10::List& list) { - std::array res; - AT_ASSERT(list.size() == N); - std::copy(list.begin(), list.end(), res.begin()); - return res; -} - -KernelFunction::InternalBoxedKernelFunction *DUMMY_OPERATION = - [](c10::OperatorKernel *, const c10::OperatorHandle &, std::vector *) -> void { - TORCH_CHECK(false, "Operator has been stripped in the custom build.") - }; - -class Registerer final { -public: - Registerer&& op(const std::string& schemaStr, KernelFunction::InternalBoxedKernelFunction* boxed_kernel_wrapper) && { - static auto& dispatcher = c10::Dispatcher::singleton(); - auto schema = parseSchema(schemaStr); - schema.setAliasAnalysis(AliasAnalysisKind::FROM_SCHEMA); - c10::OperatorName name = schema.operator_name(); - RegistrationHandleRAII registration = dispatcher.registerName(name); - auto op = dispatcher.findOp(name).value(); - registrationHandles_.push_back(std::move(registration)); - dispatcher.setManuallyBoxedKernelFor_(op, boxed_kernel_wrapper); - return std::move(*this); - } - - Registerer() = default; - Registerer(const Registerer&) = delete; - Registerer& operator=(const Registerer&) = delete; - Registerer(Registerer&&) noexcept = default; - Registerer& operator=(Registerer&&) noexcept = default; -private: - std::vector registrationHandles_; -}; - -static auto registry = Registerer() - // Generated operators - ${constructors} - ; - -} // anon namespace - - -}} // namespace torch::jit diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py index 9ca843abc69f..10bbc33c352f 100644 --- a/tools/setup_helpers/generate_code.py +++ b/tools/setup_helpers/generate_code.py @@ -30,7 +30,6 @@ def generate_code(ninja_global=None, operator_selector=None): from tools.autograd.gen_autograd import gen_autograd, gen_autograd_python from tools.autograd.gen_annotated_fn_args import gen_annotated - from tools.jit.gen_unboxing_wrappers import gen_unboxing_wrappers from tools.codegen.selective_build.selector import SelectiveBuilder @@ -70,13 +69,6 @@ def generate_code(ninja_global=None, disable_autograd=disable_autograd, operator_selector=operator_selector, ) - gen_unboxing_wrappers( - declarations_path or DECLARATIONS_PATH, - jit_gen_dir, - tools_jit_templates, - disable_autograd=disable_autograd, - operator_selector=operator_selector, - force_schema_registration=force_schema_registration) if subset == "python" or not subset: gen_annotated( From 6643e9fbb3c6770fa128be93635e87725de3c839 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 29/44] Remove 
`use_c10_dispatcher: full` lines (#49259) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49259 Since `use_c10_dispatcher: full` is now the default, we can remove all those pesky lines mentioning it. Only the `use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` lines are left. ghstack-source-id: 119450485 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25506526 fbshipit-source-id: 8053618120c0b52ff7c73cacb34bec7eb38f8fe0 --- aten/src/ATen/native/native_functions.yaml | 1244 +------------------- 1 file changed, 4 insertions(+), 1240 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 215ca70bfbae..b474d435398c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7,42 +7,34 @@ # DEPRECATED. DO NOT USE - func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # Computes the gradient of current tensor w.r.t. graph leaves. @@ -59,18 +51,15 @@ # where Variables *are* Tensors (as opposed to them containing tensors, which # is what the previous interpretation was.) - func: set_data(Tensor(a!) self, Tensor new_data) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: data(Tensor self) -> Tensor - use_c10_dispatcher: full manual_kernel_registration: True variants: method # True if this `Variable` is a leaf and thus does not have a `grad_fn`. - func: is_leaf(Tensor self) -> bool - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -85,23 +74,19 @@ # assert y2.output_nr == 2 # - func: output_nr(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: _version(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) - use_c10_dispatcher: full manual_kernel_registration: True variants: method # Enables .grad attribute for non-leaf Tensors. - func: retain_grad(Tensor(a!) self) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -120,47 +105,36 @@ variants: function - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rename(Tensor(a) self, Dimname[]? 
names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: align_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool - use_c10_dispatcher: full dispatch: CUDA: _use_cudnn_ctc_loss - func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _cudnn_ctc_loss - func: _use_cudnn_rnn_flatten_weight() -> bool - use_c10_dispatcher: full - func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, int input_size, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn_flatten_weight @@ -180,71 +154,52 @@ CUDA: _cudnn_init_dropout_state - func: _debug_has_internal_overlap(Tensor self) -> int - use_c10_dispatcher: full variants: function - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) - use_c10_dispatcher: full - func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor - use_c10_dispatcher: full - func: _shape_as_tensor(Tensor self) -> Tensor - use_c10_dispatcher: full - func: dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: abs(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs - func: abs_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs_ @@ -281,18 +236,15 @@ # Absolute, alias for abs - func: absolute(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: absolute_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: angle(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: angle @@ -303,19 +255,16 @@ CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_complex - func: sgn(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sgn @@ -332,15 +281,12 @@ CPU, CUDA: sgn_out - func: real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: imag(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: conj(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -349,19 +295,16 @@ CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: _conj - func: acos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos - func: acos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos_ @@ -373,28 +316,22 @@ # arccos, alias of acos - func: arccos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor - use_c10_dispatcher: full - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor - use_c10_dispatcher: full # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full structured_delegate: add.out variants: function, method dispatch: @@ -403,7 +340,6 @@ MkldnnCPU: mkldnn_add - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method structured_delegate: add.out dispatch: @@ -422,13 +358,11 @@ MkldnnCPU: mkldnn_add_out - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: add_relu - func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU: add_relu_ @@ -441,25 +375,21 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: add - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: add_ - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv_ @@ -470,20 +400,17 @@ CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: addmv_impl_cpu CUDA: addmv_impl_cuda - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addr Math: math_addr - func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addr_ @@ -495,17 +422,14 @@ Math: math_addr_out - func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: affine_grid_generator - func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: all @@ -516,18 +440,15 @@ CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool - use_c10_dispatcher: full variants: function, method - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: any @@ -538,7 +459,6 @@ CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -568,10 +488,8 @@ # preserve tracing. Get rid of this when arange can directly take tensors for bounds # (so that it can be traced directly). - func: _dim_arange(Tensor like, int dim) -> Tensor - use_c10_dispatcher: full - func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmax @@ -582,7 +500,6 @@ CPU, CUDA: argmax_out - func: argmin(Tensor self, int? 
dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmin @@ -593,13 +510,11 @@ CPU, CUDA: argmin_out - func: acosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh - func: acosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh_ @@ -611,24 +526,20 @@ # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: asinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh - func: asinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh_ @@ -640,24 +551,20 @@ # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh - func: atanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh_ @@ -669,18 +576,15 @@ # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: as_strided_tensorimpl @@ -695,14 +599,12 @@ DefaultBackend: as_strided_ - func: asin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin SparseCPU, SparseCUDA: asin_sparse - func: asin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin_ @@ -716,24 +618,20 @@ # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan - func: atan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan_ @@ -745,55 +643,44 @@ # arctan, alias of atan - func: arctan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atleast_1d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: atleast_2d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: atleast_3d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: baddbmm_cpu CUDA: baddbmm_cuda - func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: baddbmm__cpu CUDA: baddbmm__cuda - func: _baddbmm_mkl_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function - func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -825,7 +712,6 @@ # Sample bernoulli with values in `self` as probability. - func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: bernoulli @@ -837,13 +723,11 @@ CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ @@ -852,7 +736,6 @@ # There is no default valid on `p` here because it would introduce ambiguity # with `bernoulli(Tensor self, *, Generator? generator=None)` declaration. - func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor @@ -908,11 +791,9 @@ CUDA: _bincount_cuda - func: bitwise_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -921,13 +802,11 @@ CPU, CUDA: bitwise_not_out - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ @@ -938,23 +817,19 @@ CPU, CUDA: copysign_out - func: copysign.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ - func: logical_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_not_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -963,11 +838,9 @@ CPU, CUDA: logical_not_out - func: logical_xor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -976,11 +849,9 @@ CPU, CUDA: logical_xor_out - func: logical_and(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -989,11 +860,9 @@ CPU, CUDA: logical_and_out - func: logical_or(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -1008,7 +877,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: bmm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: bmm_cpu @@ -1017,7 +885,6 @@ SparseCUDA: bmm_sparse_cuda - func: _bmm(Tensor self, Tensor mat2, *, bool deterministic=False) -> Tensor - use_c10_dispatcher: full variants: function dispatch: SparseCUDA: _bmm_sparse_cuda @@ -1038,7 +905,6 @@ SparseCUDA: _bmm_out_sparse_cuda - func: broadcast_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False - func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) @@ -1048,7 +914,6 @@ Math: broadcast_to - func: cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: cat @@ -1058,23 +923,19 @@ DefaultBackend: cat_out - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor - use_c10_dispatcher: full - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: block_diag(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: ceil(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil - func: ceil_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil_ @@ -1085,25 +946,20 @@ CPU, CUDA: ceil_out - func: chain_matmul(Tensor[] matrices) -> Tensor - use_c10_dispatcher: full variants: function - func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: chunk(Tensor(a) self, int chunks, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: tensor_split.sections(Tensor(a) self, int sections, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.indices(Tensor(a) self, int[] indices, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.tensor_indices_or_sections(Tensor(a) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[] @@ -1111,14 +967,12 @@ variants: function, method - func: clamp(Tensor self, Scalar? min=None, Scalar? 
max=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clamp QuantizedCPU: clamp_quantized_cpu - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_ @@ -1129,13 +983,11 @@ CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max_ @@ -1146,13 +998,11 @@ CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min_ @@ -1164,7 +1014,6 @@ # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) @@ -1175,11 +1024,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cudnn_is_acceptable(Tensor self) -> bool - use_c10_dispatcher: full device_guard: False - func: complex(Tensor real, Tensor imag) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: complex @@ -1190,7 +1037,6 @@ CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: polar @@ -1201,13 +1047,11 @@ CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: constant_pad_nd - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) - use_c10_dispatcher: full variants: method manual_cpp_binding: True @@ -1220,7 +1064,6 @@ DefaultBackend: convolution_overrideable - func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full dispatch: DefaultBackend: convolution_backward_overrideable @@ -1246,12 +1089,10 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: conv_tbc - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full # NB: we inherit the goofy argument order from PyTorch torch.nn.functional - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor @@ -1264,24 +1105,20 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: copy_ - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full dispatch: {} - func: cos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos - func: cos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos_ @@ -1292,13 +1129,11 @@ CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh - func: cosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh_ @@ -1309,28 +1144,23 @@ CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: count_nonzero - func: count_nonzero(Tensor self, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: count_nonzero - func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_forward # TODO: Why do I have to call this grad?! - func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_backward @@ -1351,27 +1181,22 @@ CUDA: cudnn_convolution_deprecated - func: cudnn_convolution.deprecated2(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_deprecated2 - func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution - func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_input - func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward - func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_weight @@ -1381,45 +1206,37 @@ CUDA: cudnn_convolution_transpose_deprecated - func: cudnn_convolution_transpose.deprecated2(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_deprecated2 - func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, 
int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the float # backwards - func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward - func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_input - func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_weight # NB: input is special cased in a way I don't quite understand - func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_forward - func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid) - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_backward - func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummax @@ -1430,7 +1247,6 @@ DefaultBackend: cummax_out - func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1444,7 +1260,6 @@ CUDA: cummax_helper_cuda - func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummin @@ -1455,7 +1270,6 @@ DefaultBackend: cummin_out - func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1469,18 +1283,15 @@ CUDA: cummin_helper_cuda - func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumprod - func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumprod_ @@ -1491,29 +1302,24 @@ DefaultBackend: cumprod_out - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? 
dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cumprod_backward(Tensor grad, Tensor input, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumsum - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumsum_ @@ -1524,137 +1330,111 @@ DefaultBackend: cumsum_out - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full # convenience function that converts to intlists for you - func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: ctc_loss_backward_cpu CUDA: ctc_loss_backward_gpu - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagflat(Tensor self, int offset=0) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: diagonal - func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: diagonal_backward(Tensor grad, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: div.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: div SparseCPU, SparseCUDA: div_sparse - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: div_ SparseCPU, SparseCUDA: div_sparse_ - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
- use_c10_dispatcher: full dispatch: CPU, CUDA: div_out SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor - func: div.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: div - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: div_ # divide, alias for div - func: divide.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method # true_divide, an alias for div - func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: dot(Tensor self, Tensor tensor) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: dot @@ -1666,7 +1446,6 @@ DefaultBackend: dot_out - func: vdot(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: vdot @@ -1678,30 +1457,24 @@ DefaultBackend: vdot_out - func: einsum(str equation, Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: embedding - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor - use_c10_dispatcher: full - func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor - use_c10_dispatcher: full dispatch: CPU: embedding_dense_backward_cpu CUDA: embedding_dense_backward_cuda - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) 
- use_c10_dispatcher: full dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor - use_c10_dispatcher: full # NOTE [ embedding_bag Native Functions ] # The `_embedding_bag.*` variants assume that input tensors except for `weight`, @@ -1720,11 +1493,9 @@ CUDA: _embedding_bag_forward_only_cuda - func: rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor) - use_c10_dispatcher: full # row_stack is the alias of vstack - func: row_stack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full dispatch: Math: row_stack @@ -1755,20 +1526,17 @@ CUDA: _embedding_bag_dense_backward_cuda - func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _embedding_bag_per_sample_weights_backward_cpu CUDA: _embedding_bag_per_sample_weights_backward_cuda - func: empty_meta(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: empty_cpu CUDA: empty_cuda @@ -1776,7 +1544,6 @@ SparseCPU, SparseCUDA: empty_sparse - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full variants: method - func: new_empty_strided(Tensor self, int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -1808,7 +1575,6 @@ QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized - func: resize_(Tensor(a!) self, int[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -1818,7 +1584,6 @@ Meta: resize_meta_ - func: empty_quantized(int[] size, Tensor qtensor) -> Tensor - use_c10_dispatcher: full variants: function dispatch: QuantizedCPU, QuantizedCUDA: empty_quantized @@ -1832,19 +1597,16 @@ device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda - func: erf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erf - func: erf_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erf_ @@ -1855,13 +1617,11 @@ CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erfc - func: erfc_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erfc_ @@ -1872,13 +1632,11 @@ CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp - func: exp_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp_ @@ -1889,13 +1647,11 @@ CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp2 - func: exp2_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp2_ @@ -1906,13 +1662,11 @@ CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: expm1 - func: expm1_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: expm1_ @@ -1923,14 +1677,12 @@ CPU, CUDA: expm1_out - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False dispatch: DefaultBackend: expand - func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False @@ -1953,49 +1705,39 @@ CUDA: eye_out_cuda - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: unflatten.int(Tensor(a) self, int dim, int[] sizes, Dimname[]? names=None) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: fill_ - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: fill_ - func: floor(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: floor - func: floor_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: floor_ @@ -2006,14 +1748,12 @@ CPU, CUDA: floor_out - func: floor_divide(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: floor_divide SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: floor_divide_ @@ -2026,21 +1766,17 @@ SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: frac(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: frac - func: frac_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: frac_ @@ -2074,11 +1810,9 @@ CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -2087,11 +1821,9 @@ CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method # NOTE [ grid_sampler Native Functions ] @@ -2110,37 +1842,30 @@ # Nor does it take in `align_corners` because it only supports the mode # `align_corners = True`. - func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full - func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full dispatch: CPU: grid_sampler_2d_cpu CUDA: grid_sampler_2d_cuda - func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: grid_sampler_2d_backward_cpu CUDA: grid_sampler_2d_backward_cuda # See NOTE [ grid_sample CPU fallback ] - func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _grid_sampler_2d_cpu_fallback - func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - use_c10_dispatcher: full dispatch: CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda - func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: grid_sampler_3d_backward_cpu CUDA: grid_sampler_3d_backward_cuda @@ -2173,7 +1898,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? 
bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -2191,7 +1915,6 @@ # Real to complex forward FFT - func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _fft_r2c_mkl @@ -2206,7 +1929,6 @@ # Complex to real inverse FFT - func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _fft_c2r_mkl @@ -2221,7 +1943,6 @@ # Standard complex to complex FFT (forward or backward) - func: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _fft_c2c_mkl @@ -2235,19 +1956,14 @@ CUDA: _fft_c2c_cufft_out - func: _cufft_get_plan_cache_size(int device_index) -> int - use_c10_dispatcher: full - func: _cufft_get_plan_cache_max_size(int device_index) -> int - use_c10_dispatcher: full - func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> () - use_c10_dispatcher: full - func: _cufft_clear_plan_cache(int device_index) -> () - use_c10_dispatcher: full - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: index @@ -2258,25 +1974,20 @@ # - Tensor Tensor::index(std::initializer_list indices) - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: index_copy_ - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: index_put_ @@ -2287,11 +1998,9 @@ # - Tensor & Tensor::index_put_(std::initializer_list indices, Scalar v) - func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _index_put_impl_ @@ -2301,7 +2010,6 @@ variants: function - func: inverse(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: inverse @@ -2312,18 +2020,15 @@ DefaultBackend: inverse_out - func: _inverse_helper(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _inverse_helper_cpu CUDA: _inverse_helper_cuda - func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isnan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: @@ -2331,52 +2036,42 @@ SparseCPU, SparseCUDA: isnan_sparse - func: is_distributed(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_floating_point(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_complex(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: isreal(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: is_nonzero(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_same_size(Tensor self, Tensor other) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_signed(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: kl_div - func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: kl_div_backward_cpu CUDA: kl_div_backward_cuda - func: kron(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: kron @@ -2387,7 +2082,6 @@ Math: kron_out - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: kthvalue @@ -2399,7 +2093,6 @@ CUDA: kthvalue_out_cuda - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -2422,13 +2115,11 @@ CUDA: layer_norm_backward_cuda - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nan_to_num - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nan_to_num_ @@ -2449,35 +2140,25 @@ MkldnnCPU: mkldnn_linear - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) - use_c10_dispatcher: full - func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor - use_c10_dispatcher: full - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor - use_c10_dispatcher: full - func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -2493,13 +2174,11 @@ CUDA: linspace_cuda_out - func: log(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log - func: log_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log_ @@ -2510,13 +2189,11 @@ CPU, CUDA: log_out - func: log10(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log10 - func: log10_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log10_ @@ -2527,14 +2204,12 @@ CPU, CUDA: log10_out - func: log1p(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: log1p SparseCPU, SparseCUDA: log1p_sparse - func: log1p_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: log1p_ @@ -2547,13 +2222,11 @@ SparseCPU, SparseCUDA: log1p_out_sparse - func: log2(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log2 - func: log2_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log2_ @@ -2569,7 +2242,6 @@ CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: logaddexp @@ -2580,7 +2252,6 @@ CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: logaddexp2 @@ -2636,7 +2307,6 @@ CPU, CUDA: xlogy_out - func: logdet(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logdet @@ -2652,27 +2322,22 @@ # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: CPU: log_softmax_cpu CUDA: log_softmax_cuda - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: CPU: log_softmax_backward_cpu CUDA: log_softmax_backward_cuda - func: _logcumsumexp(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _logcumsumexp_cpu CUDA: _logcumsumexp_cuda @@ -2684,7 +2349,6 @@ CUDA: _logcumsumexp_out_cuda - func: logcumsumexp(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logcumsumexp @@ -2695,14 +2359,12 @@ DefaultBackend: logcumsumexp_out - func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logsumexp @@ -2713,55 +2375,44 @@ DefaultBackend: logsumexp_out - func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: matmul(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor - use_c10_dispatcher: full - func: matrix_rank(Tensor self, bool symmetric=False) -> Tensor - use_c10_dispatcher: full - func: matrix_power(Tensor self, int n) -> Tensor - use_c10_dispatcher: full variants: function, method - func: matrix_exp(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor - use_c10_dispatcher: full - func: _aminmax(Tensor self) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _aminmax_all - func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _aminmax - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _compute_linear_combination @@ -2771,7 +2422,6 @@ CPU, CUDA: _compute_linear_combination_out - func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: max @@ -2782,19 +2432,16 @@ CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, int[] sizes, bool keepdim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: amax @@ -2806,48 +2453,38 @@ # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_max_pool2d - func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_max_pool3d - func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_max_pool1d - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_max_pool2d - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] 
padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu QuantizedCPU: mean_quantized_cpu - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu @@ -2860,21 +2497,18 @@ QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: median(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: median_cpu CUDA: median_cuda - func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: median @@ -2886,21 +2520,18 @@ CUDA: median_out_cuda - func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanmedian(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: nanmedian_cpu CUDA: nanmedian_cuda - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nanmedian @@ -2912,14 +2543,12 @@ CUDA: nanmedian_out_cuda - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: min @@ -2930,14 +2559,12 @@ CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: amin @@ -2953,13 +2580,10 @@ DefaultBackend: mkldnn_convolution - func: mkldnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> Tensor - use_c10_dispatcher: full - func: mkldnn_convolution_backward_weights(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: DefaultBackend: mkldnn_convolution_backward @@ -2979,22 +2603,18 @@ CUDA: miopen_convolution - func: miopen_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_input - func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward - func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_bias - func: miopen_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_weight @@ -3006,17 +2626,14 @@ # NB: output_padding not strictly needed here, but it's helpful for the float # backwards - func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward - func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward_input - func: miopen_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward_weight @@ -3026,17 +2643,14 @@ CUDA: miopen_depthwise_convolution - func: miopen_depthwise_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward_input - func: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool 
benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward - func: miopen_depthwise_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward_weight @@ -3051,7 +2665,6 @@ CUDA: miopen_rnn_backward - func: mm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: mm_cpu @@ -3066,7 +2679,6 @@ SparseCPU, SparseCUDA: _sparse_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor - use_c10_dispatcher: full - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -3080,7 +2692,6 @@ SparseCUDA: sparse_matrix_mask_helper_cuda - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mode @@ -3091,14 +2702,12 @@ DefaultBackend: mode_out - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: mul.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mul @@ -3106,7 +2715,6 @@ MkldnnCPU: mkldnn_mul - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: mul_ @@ -3123,39 +2731,32 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: mul.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: mul - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: mul_ # multiply, alias for mul - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: multiply.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: mv(Tensor self, Tensor vec) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mv @@ -3167,31 +2768,26 @@ DefaultBackend: mv_out - func: mvlgamma(Tensor self, int p) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: mvlgamma - func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: DefaultBackend: mvlgamma_ - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: narrow_copy_dense SparseCPU, SparseCUDA: narrow_copy_sparse - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False @@ -3208,7 +2804,6 @@ CUDA: batch_norm_cuda_out - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: batch_norm_stats_cuda @@ -3256,10 +2851,8 @@ CUDA: batch_norm_update_stats_cuda - func: is_vulkan_available() -> bool - use_c10_dispatcher: full - func: _nnpack_available() -> bool - use_c10_dispatcher: full - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -3268,15 +2861,12 @@ DefaultBackend: _nnpack_spatial_convolution - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward_input(Tensor input, Tensor grad_output, Tensor weight, int[2] padding) -> Tensor - use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward_weight(Tensor input, int[] weightsize, Tensor grad_output, int[2] padding) -> Tensor - use_c10_dispatcher: full variants: function - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -3293,64 +2883,50 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor - use_c10_dispatcher: full - func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor - use_c10_dispatcher: full - func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _euclidean_dist - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor - use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor - use_c10_dispatcher: full variants: function - func: permute(Tensor(a) self, int[] dims) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
dispatch: DefaultBackend: permute - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method # moveaxis, alias for movedim - func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method # Only exposed from C++ -- in Python, @@ -3361,45 +2937,36 @@ # behavior on Windows, for reasons I don't understand # (maybe related to capital letter collation somehow...) - func: numpy_T(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor - use_c10_dispatcher: full - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor use_c10_dispatcher: full - func: channel_shuffle(Tensor self, int groups) -> Tensor - use_c10_dispatcher: full dispatch: CPU: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu - func: is_pinned(Tensor self) -> bool - use_c10_dispatcher: full variants: method - func: pin_memory(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor - use_c10_dispatcher: full variants: function, method - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor - use_c10_dispatcher: full variants: function - func: rad2deg(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rad2deg - func: rad2deg_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rad2deg_ @@ -3410,13 +2977,11 @@ DefaultBackend: rad2deg_out - func: deg2rad(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: deg2rad - func: deg2rad_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: deg2rad_ @@ -3533,17 +3098,14 @@ CUDA: range_cuda_out - func: ravel(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: reciprocal(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: reciprocal - func: reciprocal_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: reciprocal_ @@ -3554,13 +3116,11 @@ CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: neg - func: neg_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: neg_ @@ -3574,61 +3134,50 @@ # Alias for neg - func: negative(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: negative_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: repeat(Tensor self, int[] repeats) -> Tensor - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
dispatch: DefaultBackend: repeat - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: repeat_interleave_cpu CUDA: repeat_interleave_cuda - func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: reshape(Tensor(a) self, int[] shape) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_reshape - func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False - func: round(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: round - func: round_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: round_ @@ -3640,13 +3189,10 @@ CUDA: round_out - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor - use_c10_dispatcher: full - func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full - func: relu(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: relu @@ -3654,7 +3200,6 @@ QuantizedCPU: relu_quantized_cpu - func: relu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: relu_ @@ -3662,59 +3207,50 @@ QuantizedCPU: relu_quantized_cpu_ - func: prelu(Tensor self, Tensor weight) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: prelu_cpu CUDA: prelu_cuda - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function, method dispatch: CPU: prelu_backward_cpu CUDA: prelu_backward_cuda - func: gelu(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: gelu_cpu CUDA: gelu_cuda - func: gelu_backward(Tensor grad, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: gelu_backward_cpu CUDA: gelu_backward_cuda - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn device_guard: False - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rsqrt - func: rsqrt_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rsqrt_ @@ -3725,46 +3261,37 @@ CPU, CUDA: rsqrt_out - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: select.int(Tensor(a) self, int dim, int index) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: select - func: select_backward(Tensor grad, int[] input_sizes, int dim, int index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: selu(Tensor self) -> Tensor - use_c10_dispatcher: full - func: selu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: celu - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) - use_c10_dispatcher: full dispatch: DefaultBackend: celu_ - func: silu(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: silu - func: silu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: silu_ @@ -3776,14 +3303,12 @@ CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: silu_backward Math: math_silu_backward - func: sigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sigmoid @@ -3791,7 +3316,6 @@ MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sigmoid_ @@ -3803,13 +3327,11 @@ CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: logit - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: logit_ @@ -3820,13 +3342,11 @@ CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sin - func: sin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sin_ @@ -3854,13 +3374,11 @@ CPU, CUDA: sinc_out - func: sinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sinh - func: sinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sinh_ @@ -3882,7 +3400,6 @@ # changing metadata of the detached tensor and expecting the original tensor to also # be updated. - func: detach(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: detach @@ -3891,134 +3408,112 @@ # only be called on non-view `Variable`s. You can use `is_view()` to check # this. If this `Variable` is a view, throws an `std::runtime_error()`. - func: detach_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: detach_ - func: size.int(Tensor self, int dim) -> int - use_c10_dispatcher: full variants: function device_guard: False manual_cpp_binding: True - func: size.Dimname(Tensor self, Dimname dim) -> int - use_c10_dispatcher: full variants: function, method device_guard: False - func: slice.Tensor(Tensor(a) self, int dim=0, int start=0, int end=9223372036854775807, int step=1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: slice - func: slice_backward(Tensor grad, int[] input_sizes, int dim, int start, int end, int step) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: slogdet - func: smm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: CPU: softmax_cpu CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: CPU: softmax_backward_cpu CUDA: softmax_backward_cuda - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsafe_split - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: split - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsafe_split_with_sizes - func: split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: split_with_sizes - func: squeeze(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: squeeze - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: squeeze - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: squeeze_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: squeeze_ - func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: squeeze_ - func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False - func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -4030,7 +3525,6 @@ SparseCUDA: _sspaddmm_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: stack @@ -4040,19 +3534,16 @@ DefaultBackend: stack_out - func: hstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: vstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: dstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -4070,30 +3561,25 @@ variants: function, method - func: stride.int(Tensor self, int dim) -> int - use_c10_dispatcher: full variants: function device_guard: False manual_cpp_binding: True - func: stride.Dimname(Tensor self, Dimname dim) -> int - use_c10_dispatcher: full variants: function, method device_guard: False - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -4105,13 +3591,11 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: nansum @@ -4122,18 +3606,15 @@ CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: sqrt(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sqrt - func: sqrt_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sqrt_ @@ -4144,39 +3625,32 @@ CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: square_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method - func: std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4185,20 +3659,17 @@ CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod @@ -4209,34 +3680,29 @@ CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: t(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full device_guard: False variants: function, method dispatch: DefaultBackend: t - func: t_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False variants: method dispatch: DefaultBackend: t_ - func: tan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan - func: tan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan_ @@ -4247,14 +3713,12 @@ CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: tanh QuantizedCPU: tanh_quantized_cpu - func: tanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tanh_ @@ -4265,7 +3729,6 @@ CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor - use_c10_dispatcher: full variants: function - func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!) @@ -4276,7 +3739,6 @@ # TODO: namespace threshold in 'nn' - func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold @@ -4284,7 +3746,6 @@ QuantizedCPU: threshold_quantized_cpu - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU: threshold_ @@ -4297,69 +3758,57 @@ CUDA: threshold_out_cuda - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda - func: tile(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: transpose - func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose - func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: transpose_ - func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose_ - func: one_hot(Tensor self, int num_classes=-1) -> Tensor - use_c10_dispatcher: full python_module: nn variants: function - func: flip(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, QuantizedCPU: flip_cpu CUDA: flip_cuda - func: fliplr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: flipud(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: roll_cpu @@ -4368,33 +3817,26 @@ # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rot90 - func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _trilinear - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: trunc(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc - func: trunc_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc_ @@ -4406,47 +3848,39 @@ # Alias for trunc - func: fix(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: fix_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: type_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool - use_c10_dispatcher: full variants: function - func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique_cpu CUDA: _unique_cuda - func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_cpu CUDA: unique_dim_cuda - func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_consecutive_cpu CUDA: unique_consecutive_cuda - func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_consecutive_cpu @@ -4457,42 +3891,35 @@ # Please don't rely on these two operators, they will be removed soon - func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda - func: _unsafe_view(Tensor self, int[] size) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _unsafe_view - func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsqueeze - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: unsqueeze_ - func: vander(Tensor x, int? N=None, bool increasing=False) -> Tensor - use_c10_dispatcher: full - func: var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var @@ -4503,30 +3930,25 @@ CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False @@ -4534,55 +3956,44 @@ # this allows us to implicitly calculate the broadcast derivative, while only dealing with the # _s_where derivative. 
- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where(Tensor condition) -> Tensor[] - use_c10_dispatcher: full variants: function - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function # VariableType::_weight_norm does not want to be given a gap in the autograd graph, # so we don't define "dispatch" variants for it. - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function - func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda - func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda_backward - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -4599,40 +4010,34 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _standard_gamma_grad_cpu CUDA: _standard_gamma_grad_cuda - func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_gamma_cpu CUDA: _s_gamma_cuda - func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _dirichlet_grad_cpu CUDA: _dirichlet_grad_cuda - func: _sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_dirichlet_cpu CUDA: _s_dirichlet_cuda - func: poisson(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_poisson_cpu CUDA: _s_poisson_cuda - func: binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_binomial_cpu CUDA: _s_binomial_cuda @@ -4641,96 +4046,77 @@ # complicated - func: native_norm(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse - func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? 
dtype) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_sum - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: _sparse_sum_backward_cpu SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm @@ -4746,11 +4132,9 @@ CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) 
@@ -4760,11 +4144,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: frobenius_norm(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4772,7 +4154,6 @@ variants: function - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4780,7 +4161,6 @@ variants: function - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4788,7 +4168,6 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clone @@ -4797,13 +4176,11 @@ QuantizedCPU, QuantizedCUDA: quantized_clone - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: resize_as_ - func: zero_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: zero_ @@ -4817,14 +4194,12 @@ SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sub SparseCPU, SparseCUDA: sub_sparse - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: sub_ @@ -4832,13 +4207,11 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sub - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: sub_ @@ -4848,24 +4221,19 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method # For C++ only, until we have conversion from C++ numbers to Tensor - func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: rsub @@ -4876,7 +4244,6 @@ CPU, CUDA: heaviside_out - func: heaviside(Tensor self, Tensor values) -> Tensor - use_c10_dispatcher: full variants: function, method - func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) 
@@ -4885,7 +4252,6 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: rsub @@ -4893,7 +4259,6 @@ # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. - func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_addmm @@ -4906,7 +4271,6 @@ SparseCUDA: addmm_out_sparse_dense_cuda - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: addmm_cpu @@ -4915,7 +4279,6 @@ SparseCUDA: addmm_sparse_dense_cuda - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: addmm_cpu_ @@ -5049,49 +4412,40 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () - use_c10_dispatcher: full - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_sparse - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_and_tensor_sparse - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_ - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_and_clear_ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda - func: to_dense(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: sparse_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse @@ -5099,14 +4453,12 @@ # legacy method - func: _dimI(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False - func: dense_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse @@ -5114,42 +4466,36 @@ # legacy method - func: _dimV(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False - func: _nnz(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _nnz_sparse device_guard: False - func: coalesce(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: coalesce_sparse_cpu SparseCUDA: coalesce_sparse_cuda - func: is_coalesced(Tensor self) -> bool - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: is_coalesced_sparse device_guard: False - func: _indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _indices_sparse device_guard: False - func: _values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _values_sparse @@ -5159,21 +4505,18 @@ # a bit unsafe. Similar to _indices and _values, this is useful for implementing # custom sparse operations in Python/C++ extension. - func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _coalesced_sparse_ device_guard: False - func: indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: indices_sparse device_guard: False - func: values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: values_sparse @@ -5186,196 +4529,161 @@ SparseCUDA: hspmm_out_sparse_cuda - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - use_c10_dispatcher: full variants: function dispatch: SparseCPU, SparseCUDA: copy_sparse_ - func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: unbind - func: unbind.Dimname(Tensor(a) self, Dimname dim) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_sparse(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_mkldnn(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU: dense_to_mkldnn - func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight - func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv3d_weight - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu - func: dequantize.self(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: dequantize_quant - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: QuantizedCPU: dequantize_tensors_quantized_cpu - func: q_scale(Tensor self) -> float - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_scale_quant - func: q_zero_point(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_zero_point_quant - func: q_per_channel_scales(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_scales - func: q_per_channel_zero_points(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points - func: q_per_channel_axis(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_axis - func: int_repr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU: int_repr_quantized_cpu QuantizedCUDA: int_repr_quantized_cuda - func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_tensor_quantized_tensor_cpu CUDA: make_per_tensor_quantized_tensor_cuda - func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_channel_quantized_tensor_cpu - func: qscheme(Tensor self) -> QScheme - use_c10_dispatcher: full variants: method dispatch: QuantizedCPU, QuantizedCUDA: qscheme_quant - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: 
_fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) - use_c10_dispatcher: full variants: function - func: _saturate_weight_to_fp16(Tensor weight) -> Tensor - use_c10_dispatcher: full variants: function - func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function # to(Device) must not exist because all constructors of Device also works for @@ -5387,61 +4695,47 @@ device_guard: False - func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: meshgrid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: cartesian_prod(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor - use_c10_dispatcher: full variants: function - func: item(Tensor self) -> Scalar - use_c10_dispatcher: full variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType - use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool - use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType - use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 - func: _local_scalar_dense(Tensor self) -> Scalar - use_c10_dispatcher: full dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -5467,7 +4761,6 @@ CUDA: _thnn_fused_gru_cell_cuda - func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_gru_cell_backward_cuda @@ -5476,28 +4769,20 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? 
b_hh=None) -> (Tensor, Tensor) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -5515,55 +4800,46 @@ # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # Quantized GRU layers # - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # Quantized RNN cells - func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: DefaultBackend: _pack_padded_sequence - func: _pack_padded_sequence_backward(Tensor grad, int[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor - use_c10_dispatcher: full - func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor) - use_c10_dispatcher: full # wrappers for legacy TH methods - func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5572,61 +4848,51 @@ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: set_cpu_ CUDA: set_cuda_ - func: is_set_to(Tensor self, Tensor tensor) -> bool - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: view(Tensor(a) self, int[] size) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5634,126 +4900,101 @@ MkldnnCPU: mkldnn_view - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_put_ CUDA: legacy::cuda::_th_put_ - func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: index_add_cpu_ CUDA: index_add_cuda_ - func: index_add(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: index_fill_ - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_ - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_fill_ - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_reduce_ - func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_scalar_reduce_ - func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_add_ - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ @@ -5771,35 +5012,27 @@ CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __and__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5815,35 +5048,27 @@ CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __or__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5859,181 +5084,149 @@ CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: lgamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: _lgamma__cpu CUDA: _lgamma__cuda - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: tril_cpu_ CUDA: tril_cuda_ - func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: triu_cpu_ CUDA: triu_cuda_ - func: digamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_scalar_ CUDA: lerp_cuda_scalar_ - func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_tensor_ CUDA: lerp_cuda_tensor_ - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: addbmm_ @@ -6044,61 +5237,51 @@ CPU, CUDA: addbmm_out - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: addbmm - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcdiv_ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: geometric_ @@ -6112,13 +5295,11 @@ CUDA: diag_cuda_out - func: diag(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: diag - func: diag_backward(Tensor grad, int[] input_sizes, int diagonal) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6128,7 +5309,6 @@ CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: cross @@ -6140,7 +5320,6 @@ CUDA: triu_cuda_out - func: triu(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triu @@ -6152,32 +5331,27 @@ CUDA: tril_cuda_out - func: tril(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: tril - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda - func: trace(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: trace_cpu CUDA: trace_cuda - func: trace_backward(Tensor grad, int[] sizes) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6188,7 +5362,6 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne @@ -6201,20 +5374,17 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ - func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ @@ -6224,22 +5394,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6249,7 +5415,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6262,7 +5427,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6275,7 +5439,6 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge @@ -6288,20 +5451,17 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ - func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ @@ -6311,22 +5471,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6336,7 +5492,6 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le @@ -6349,20 +5504,17 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ - func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ @@ -6372,22 +5524,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6397,7 +5545,6 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt @@ -6410,20 +5557,17 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ - func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ @@ -6433,22 +5577,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6458,7 +5598,6 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt @@ -6471,20 +5610,17 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ - func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ @@ -6494,22 +5630,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6519,14 +5651,12 @@ CUDA: take_out_cuda - func: take(Tensor self, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: take_cpu CUDA: take_cuda - func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6537,7 +5667,6 @@ CUDA: index_select_out_cuda - func: index_select(Tensor self, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: index_select_cpu_ @@ -6549,11 +5678,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function - func: index_select_backward(Tensor grad, int[] self_sizes, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6564,14 +5691,12 @@ CUDA: masked_select_out_cuda - func: masked_select(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: masked_select_cpu CUDA: masked_select_cuda - func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6582,14 +5707,12 @@ CUDA: nonzero_out_cuda - func: nonzero(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_nonzero CUDA: nonzero_cuda - func: nonzero_numpy(Tensor self) -> Tensor[] - use_c10_dispatcher: full variants: method, function - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) @@ -6599,13 +5722,11 @@ CUDA: gather_out_cpu_cuda - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gather - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6613,11 +5734,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor - use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -6625,13 +5744,11 @@ CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcmul - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcmul_ @@ -6642,7 +5759,6 @@ CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcdiv @@ -6654,7 +5770,6 @@ CUDA: legacy::cuda::_th_gels_out - func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_gels @@ -6666,13 +5781,11 @@ DefaultBackend: triangular_solve_out - func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triangular_solve - func: _triangular_solve_helper(Tensor self, Tensor A, bool upper, bool transpose, bool unitriangular) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _triangular_solve_helper_cpu @@ -6684,13 +5797,11 @@ DefaultBackend: symeig_out - func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: symeig - func: _symeig_helper(Tensor self, bool eigenvectors, bool upper) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _symeig_helper_cpu @@ -6702,7 +5813,6 @@ DefaultBackend: eig_out - func: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: eig @@ -6713,13 +5823,11 @@ DefaultBackend: svd_out - func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: svd - func: _svd_helper(Tensor self, bool some, bool compute_uv) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _svd_helper_cpu @@ -6727,23 +5835,19 @@ # swapaxes, alias for transpose - func: swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False # swapdims, alias for transpose - func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False @@ -6753,13 +5857,11 @@ DefaultBackend: cholesky_out - func: cholesky(Tensor self, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: cholesky - func: _cholesky_helper(Tensor self, bool upper) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _cholesky_helper_cpu @@ -6771,20 +5873,17 @@ DefaultBackend: cholesky_solve_out - func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: cholesky_solve - func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _cholesky_solve_helper_cpu CUDA: _cholesky_solve_helper_cuda - func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: solve @@ -6795,7 +5894,6 @@ DefaultBackend: solve_out - func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _solve_helper_cpu @@ -6808,7 +5906,6 @@ CUDA: legacy::cuda::_th_potri_out - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_potri @@ -6820,7 +5917,6 @@ Math: qr_out - func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R) - use_c10_dispatcher: full variants: method, function dispatch: Math: qr @@ -6832,7 +5928,6 @@ CUDA: legacy::cuda::_th_geqrf_out - func: geqrf(Tensor self) -> (Tensor a, Tensor tau) - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_geqrf @@ -6844,7 +5939,6 @@ CPU: legacy::cpu::_th_orgqr_out - func: orgqr(Tensor self, Tensor input2) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_orgqr @@ -6855,13 +5949,11 @@ CPU: legacy::cpu::_th_ormqr_out - func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_ormqr - func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _lu_with_info_cpu @@ -6873,13 +5965,11 @@ DefaultBackend: lu_solve_out - func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: lu_solve - func: _lu_solve_helper(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _lu_solve_helper_cpu @@ -6892,20 +5982,17 @@ CPU, CUDA: multinomial_out - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: multinomial - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: legacy::cpu::_th_multinomial_alias_setup CUDA: legacy::cuda::_th_multinomial_alias_setup - func: _multinomial_alias_draw(Tensor J, Tensor q, int num_samples, *, Generator? 
generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: legacy::cpu::_th_multinomial_alias_draw @@ -6918,7 +6005,6 @@ CUDA: _lgamma_out_cuda - func: lgamma(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lgamma @@ -6929,7 +6015,6 @@ CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: digamma @@ -6940,19 +6025,16 @@ CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: polygamma - func: erfinv(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: erfinv - func: erfinv_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: erfinv_ @@ -6963,13 +6045,11 @@ CPU, CUDA: erfinv_out - func: i0(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: i0 - func: i0_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: i0_ @@ -6980,13 +6060,11 @@ CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sign - func: sign_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: sign_ @@ -6997,7 +6075,6 @@ CPU, CUDA: sign_out - func: signbit(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -7007,7 +6084,6 @@ CUDA: signbit_out - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: dist @@ -7018,7 +6094,6 @@ CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: atan2 @@ -7036,14 +6111,12 @@ CUDA: lerp_cuda_tensor_out - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_tensor @@ -7056,7 +6129,6 @@ CUDA: _histc_out_cuda - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_histc @@ -7068,7 +6140,6 @@ CPU, CUDA: fmod_out - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7079,7 +6150,6 @@ CPU, CUDA: fmod_out - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7090,7 +6160,6 @@ CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: hypot @@ -7107,7 +6176,6 @@ CPU, CUDA: igamma_out - func: igamma(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igamma @@ -7124,13 +6192,11 @@ CPU, CUDA: igammac_out - func: igammac(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igammac - func: igammac_(Tensor(a!) 
self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: igammac_ @@ -7141,7 +6207,6 @@ CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: nextafter @@ -7158,7 +6223,6 @@ CPU, CUDA: remainder_out - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder @@ -7169,27 +6233,23 @@ CPU, CUDA: remainder_out - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder - func: min(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: min QuantizedCPU: min_quantized_cpu - func: max(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: max QuantizedCPU: max_quantized_cpu - func: maximum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: maximum @@ -7202,14 +6262,12 @@ # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max - func: max.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: minimum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: minimum @@ -7225,35 +6283,30 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: min.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) @@ -7263,7 +6316,6 @@ CUDA: legacy::cuda::_th_sort_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU: sort_cpu @@ -7274,7 +6326,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function - func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -7283,17 +6334,14 @@ Math: msort_out - func: msort(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: Math: msort - func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -7303,20 +6351,17 @@ CUDA: legacy::cuda::_th_topk_out - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: topk QuantizedCPU: topk_quantized_cpu - func: all(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: all - func: any(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: any @@ -7329,14 +6374,12 @@ CUDA: legacy::cuda::_th_renorm_out - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -7344,13 +6387,11 @@ QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: unfold_backward - func: equal(Tensor self, Tensor other) -> bool - use_c10_dispatcher: full variants: method, function dispatch: CPU: cpu_equal @@ -7363,7 +6404,6 @@ CPU, CUDA: pow_out - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: pow @@ -7374,7 +6414,6 @@ CPU, CUDA: pow_out - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: pow @@ -7385,7 +6424,6 @@ SparseCPU, SparseCUDA: pow_out_sparse_scalar - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: pow @@ -7397,7 +6435,6 @@ Math: float_power_out - func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power @@ -7408,7 +6445,6 @@ Math: float_power_out - func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: Math: float_power @@ -7418,25 +6454,21 @@ Math: float_power_out - func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power - func: 
float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: normal_ @@ -7447,7 +6479,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7457,7 +6488,6 @@ CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7467,7 +6497,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7478,19 +6507,16 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: alias(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: alias - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ - func: _cumsum(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumsum_cpu CUDA: _cumsum_cuda @@ -7502,7 +6528,6 @@ CUDA: _cumsum_out_cuda - func: _cumprod(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumprod_cpu CUDA: _cumprod_cuda @@ -7514,29 +6539,24 @@ CUDA: _cumprod_out_cuda - func: _var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_var - func: _std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_std - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ - func: _amp_update_scale(Tensor(a!) 
growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_update_scale_cuda - func: _cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cat_cpu CUDA: cat_cuda @@ -7550,644 +6570,552 @@ QuantizedCPU: cat_out_quantized_cpu - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow_ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow_ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ - func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ - func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow_ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ - func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - 
use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ - func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow CUDA: foreach_tensor_add_scalarlist_kernel_cuda - func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ - func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow CUDA: foreach_tensor_sub_scalarlist_kernel_cuda - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow_ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ - func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow CUDA: foreach_tensor_div_scalarlist_kernel_cuda - func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ - func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow CUDA: foreach_tensor_mul_scalarlist_kernel_cuda - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow_ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ - func: _foreach_exp(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow CUDA: foreach_tensor_exp_cuda - func: _foreach_zero_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_zero_slow_ CUDA: foreach_tensor_zero_cuda_ - func: _foreach_exp_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow_ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_abs(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow CUDA: foreach_tensor_abs_cuda - func: _foreach_abs_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow_ CUDA: foreach_tensor_abs_cuda_ - func: _foreach_acos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow CUDA: foreach_tensor_acos_cuda - func: _foreach_acos_(Tensor(a!)[] self) -> 
() - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow_ CUDA: foreach_tensor_acos_cuda_ - func: _foreach_asin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow CUDA: foreach_tensor_asin_cuda - func: _foreach_asin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow_ CUDA: foreach_tensor_asin_cuda_ - func: _foreach_atan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow CUDA: foreach_tensor_atan_cuda - func: _foreach_atan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow_ CUDA: foreach_tensor_atan_cuda_ - func: _foreach_ceil(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow CUDA: foreach_tensor_ceil_cuda - func: _foreach_ceil_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow_ CUDA: foreach_tensor_ceil_cuda_ - func: _foreach_cos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow CUDA: foreach_tensor_cos_cuda - func: _foreach_cos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow_ CUDA: foreach_tensor_cos_cuda_ - func: _foreach_cosh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow CUDA: foreach_tensor_cosh_cuda - func: _foreach_cosh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow_ CUDA: foreach_tensor_cosh_cuda_ - func: _foreach_erf(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow CUDA: foreach_tensor_erf_cuda - func: _foreach_erf_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow_ CUDA: foreach_tensor_erf_cuda_ - func: _foreach_erfc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erfc_slow CUDA: foreach_tensor_erfc_cuda - func: _foreach_erfc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ - func: _foreach_expm1(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow CUDA: foreach_tensor_expm1_cuda - func: _foreach_expm1_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow_ CUDA: foreach_tensor_expm1_cuda_ - func: _foreach_floor(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow CUDA: foreach_tensor_floor_cuda - func: _foreach_floor_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ - func: _foreach_log(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow CUDA: foreach_tensor_log_cuda - func: _foreach_log_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow_ CUDA: foreach_tensor_log_cuda_ - func: _foreach_log10(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: 
full variants: function dispatch: CPU: foreach_tensor_log10_slow CUDA: foreach_tensor_log10_cuda - func: _foreach_log10_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log10_slow_ CUDA: foreach_tensor_log10_cuda_ - func: _foreach_log1p(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow CUDA: foreach_tensor_log1p_cuda - func: _foreach_log1p_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow_ CUDA: foreach_tensor_log1p_cuda_ - func: _foreach_log2(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow CUDA: foreach_tensor_log2_cuda - func: _foreach_log2_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow_ CUDA: foreach_tensor_log2_cuda_ - func: _foreach_neg(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow CUDA: foreach_tensor_neg_cuda - func: _foreach_neg_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ - func: _foreach_tan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow CUDA: foreach_tensor_tan_cuda - func: _foreach_tan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow_ CUDA: foreach_tensor_tan_cuda_ - func: _foreach_tanh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow CUDA: foreach_tensor_tanh_cuda - func: _foreach_tanh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow_ CUDA: foreach_tensor_tanh_cuda_ - func: _foreach_sin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sin_slow CUDA: foreach_tensor_sin_cuda - func: _foreach_sin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sin_slow_ CUDA: foreach_tensor_sin_cuda_ - func: _foreach_sinh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow CUDA: foreach_tensor_sinh_cuda - func: _foreach_sinh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow_ CUDA: foreach_tensor_sinh_cuda_ - func: _foreach_round(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow CUDA: foreach_tensor_round_cuda - func: _foreach_round_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ - func: _foreach_lgamma(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow CUDA: foreach_tensor_lgamma_cuda - func: _foreach_lgamma_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow_ CUDA: foreach_tensor_lgamma_cuda_ - func: _foreach_frac(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_frac_slow CUDA: foreach_tensor_frac_cuda - func: _foreach_frac_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: 
function dispatch: CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ - func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow CUDA: foreach_tensor_reciprocal_cuda - func: _foreach_reciprocal_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow_ CUDA: foreach_tensor_reciprocal_cuda_ - func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow CUDA: foreach_tensor_sigmoid_cuda - func: _foreach_sigmoid_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ - func: _foreach_trunc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow CUDA: foreach_tensor_trunc_cuda - func: _foreach_trunc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow_ CUDA: foreach_tensor_trunc_cuda_ - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow_ CUDA: foreach_tensor_addcdiv_scalar_cuda_ - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow_ CUDA: foreach_tensor_addcmul_scalar_cuda_ - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow_ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow_ CUDA: foreach_tensor_addcmul_scalarlist_cuda_ - func: _foreach_addcdiv.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow CUDA: foreach_tensor_addcdiv_scalar_cuda - func: _foreach_addcmul.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow CUDA: foreach_tensor_addcmul_scalar_cuda - func: _foreach_addcdiv.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow CUDA: foreach_tensor_addcdiv_scalarlist_cuda - func: _foreach_addcmul.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow CUDA: foreach_tensor_addcmul_scalarlist_cuda - func: _foreach_maximum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_maximum_slow CUDA: foreach_tensor_maximum_cuda - func: _foreach_minimum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: 
foreach_tensor_minimum_slow CUDA: foreach_tensor_minimum_cuda - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_mode CUDA: legacy::cuda::_th_mode @@ -8199,7 +7127,6 @@ CUDA: legacy::cuda::_th_mode_out - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda @@ -8211,13 +7138,11 @@ CUDA: bucketize_out_cuda - func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda - func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8229,7 +7154,6 @@ CUDA: searchsorted_out_cuda - func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8243,7 +7167,6 @@ CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss @@ -8255,7 +7178,6 @@ CPU, CUDA: mse_loss_backward_out - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss_backward @@ -8267,7 +7189,6 @@ DefaultBackend: l1_loss_out - func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss @@ -8279,7 +7200,6 @@ CPU, CUDA: l1_loss_backward_out - func: l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss_backward @@ -8317,7 +7237,6 @@ python_module: nn - func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn - func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) 
is_target) -> (Tensor(a!), Tensor(b!)) @@ -8328,7 +7247,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward_out - func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target) - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_forward_cpu @@ -8342,7 +7260,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_backward_cpu @@ -8428,7 +7345,6 @@ CUDA: smooth_l1_loss_out - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: smooth_l1_loss @@ -8441,7 +7357,6 @@ CUDA: smooth_l1_loss_backward_out - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: smooth_l1_loss_backward @@ -8453,7 +7368,6 @@ DefaultBackend: soft_margin_loss_out - func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss @@ -8465,7 +7379,6 @@ DefaultBackend: soft_margin_loss_backward_out - func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss_backward @@ -8477,7 +7390,6 @@ CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu @@ -8489,13 +7401,11 @@ CPU, CUDA: elu_backward_out - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: elu_ @@ -8508,7 +7418,6 @@ CUDA: legacy::cuda::_thnn_glu_forward_out - func: glu(Tensor self, int dim=-1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu @@ -8522,7 +7431,6 @@ CUDA: legacy::cuda::_thnn_glu_backward_out - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu_backward @@ -8535,20 +7443,17 @@ CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid QuantizedCPU: hardsigmoid_quantized_cpu - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_backward @@ -8561,7 +7466,6 @@ QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh @@ -8574,13 +7478,11 @@ CPU, CUDA: hardtanh_backward_out - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_ @@ -8593,19 +7495,16 @@ CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_backward @@ -8618,20 +7517,17 @@ QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu QuantizedCPU: leaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_ @@ -8642,7 +7538,6 @@ python_module: nn - func: log_sigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn - func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!)) @@ -8653,7 +7548,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_forward_out - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_forward_cpu @@ -8667,7 +7561,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_backward_out - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_backward_cpu @@ -8681,20 +7574,17 @@ CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_out - func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: rrelu_with_noise_backward - func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu_ @@ -8707,7 +7597,6 @@ CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus @@ -8719,7 +7608,6 @@ CPU, CUDA: softplus_backward_out - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus_backward @@ -8731,7 +7619,6 @@ CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink @@ -8743,7 +7630,6 @@ CPU, CUDA: softshrink_backward_out - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink_backward @@ -8756,23 +7642,19 @@ MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_adaptive_avg_pool2d - func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: adaptive_avg_pool2d_quantized_cpu - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool2d_backward_cpu @@ -8787,7 +7669,6 @@ QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_cpu @@ -8802,7 +7683,6 @@ CUDA: adaptive_avg_pool3d_backward_out_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_backward_cpu @@ -8818,7 +7698,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_cpu @@ -8832,7 +7711,6 @@ CUDA: adaptive_max_pool2d_backward_out_cuda - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_backward_cpu @@ -8848,7 +7726,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_cpu @@ -8862,7 +7739,6 @@ CUDA: adaptive_max_pool3d_backward_out_cuda - func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_backward_cpu @@ -8877,7 +7753,6 @@ MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_cpu @@ -8893,7 +7768,6 @@ CUDA: avg_pool2d_backward_out_cuda - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_backward_cpu @@ -8908,7 +7782,6 @@ MkldnnCPU: mkldnn_avg_pool3d_out - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_cpu @@ -8924,7 +7797,6 @@ CUDA: avg_pool3d_backward_out_cuda - func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_backward_cpu @@ -8940,7 +7812,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_cpu @@ -8954,7 +7825,6 @@ CUDA: fractional_max_pool2d_backward_out_cuda - func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_backward_cpu @@ -8970,7 +7840,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_cpu @@ -8984,7 +7853,6 @@ CUDA: fractional_max_pool3d_backward_out_cuda - func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_backward_cpu @@ -9000,7 +7868,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_cpu @@ -9014,7 +7881,6 @@ CUDA: max_pool2d_with_indices_backward_out_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_backward_cpu @@ -9030,7 +7896,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_cpu @@ -9044,7 +7909,6 @@ CUDA: max_pool3d_with_indices_backward_out_cuda - func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_backward_cpu @@ 
-9058,7 +7922,6 @@ CUDA: max_unpooling2d_forward_out_cuda - func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_forward_cpu @@ -9072,7 +7935,6 @@ CUDA: max_unpooling2d_backward_out_cuda - func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_backward_cpu @@ -9086,7 +7948,6 @@ CUDA: max_unpooling3d_forward_out_cuda - func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_forward_cpu @@ -9100,7 +7961,6 @@ CUDA: max_unpooling3d_backward_out_cuda - func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_backward_cpu @@ -9114,7 +7974,6 @@ CUDA: reflection_pad1d_out_cuda - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad1d_cpu @@ -9128,7 +7987,6 @@ CUDA: reflection_pad1d_backward_out_cuda - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad1d_backward_cpu @@ -9142,7 +8000,6 @@ CUDA: reflection_pad2d_out_cuda - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad2d_cpu @@ -9156,7 +8013,6 @@ CUDA: reflection_pad2d_backward_out_cuda - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu @@ -9170,7 +8026,6 @@ CUDA: replication_pad1d_out_cuda - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_cpu @@ -9184,7 +8039,6 @@ CUDA: replication_pad1d_backward_out_cuda - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_backward_cpu @@ -9198,7 +8052,6 @@ CUDA: replication_pad2d_out_cuda - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_cpu @@ -9212,7 +8065,6 @@ CUDA: replication_pad2d_backward_out_cuda - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_backward_cpu @@ -9226,7 +8078,6 @@ CUDA: replication_pad3d_out_cuda - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_cpu @@ -9240,28 +8091,24 @@ CUDA: replication_pad3d_backward_out_cuda - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda - func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu CUDA: upsample_linear1d_cuda - func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda - func: upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9269,54 +8116,46 @@ QuantizedCPU: upsample_bilinear2d_quantized_cpu - func: upsample_bilinear2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu CUDA: upsample_bilinear2d_backward_cuda - func: upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu CUDA: upsample_trilinear3d_cuda - func: upsample_trilinear3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu CUDA: upsample_trilinear3d_backward_cuda - func: upsample_bicubic2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu CUDA: upsample_bicubic2d_cuda - func: upsample_bicubic2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu CUDA: upsample_bicubic2d_backward_cuda - func: upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d - func: upsample_nearest1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d_backward - func: upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9324,14 +8163,12 @@ QuantizedCPU: upsample_nearest2d_quantized_cpu - func: upsample_nearest2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu CUDA: upsample_nearest2d_backward_cuda - func: upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9339,7 +8176,6 @@ QuantizedCPU: upsample_nearest3d_quantized_cpu - func: upsample_nearest3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9354,7 +8190,6 @@ CUDA: upsample_linear1d_out_cuda - func: upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu @@ -9368,7 +8203,6 @@ CUDA: upsample_linear1d_backward_out_cuda - func: upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu @@ -9382,7 +8216,6 @@ CUDA: upsample_bilinear2d_out_cuda - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9397,7 +8230,6 @@ CUDA: upsample_bilinear2d_backward_out_cuda - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu @@ -9411,7 +8243,6 @@ CUDA: upsample_bicubic2d_out_cuda - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu @@ -9425,7 +8256,6 @@ CUDA: upsample_bicubic2d_backward_out_cuda - func: upsample_bicubic2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu @@ -9439,7 +8269,6 @@ CUDA: upsample_trilinear3d_out_cuda - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu @@ -9453,7 +8282,6 @@ CUDA: upsample_trilinear3d_backward_out_cuda - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu @@ -9468,7 +8296,6 @@ CUDA: upsample_nearest1d_out_cuda - func: upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d.out @@ -9481,7 +8308,6 @@ CUDA: upsample_nearest1d_backward_out_cuda - func: upsample_nearest1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d_backward.grad_input @@ -9493,7 +8319,6 @@ CUDA: upsample_nearest2d_out_cuda - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9508,7 +8333,6 @@ CUDA: upsample_nearest2d_backward_out_cuda - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? 
scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu @@ -9522,7 +8346,6 @@ CUDA: upsample_nearest3d_out_cuda - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9537,7 +8360,6 @@ CUDA: upsample_nearest3d_backward_out_cuda - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9550,7 +8372,6 @@ CPU, CUDA: sigmoid_backward_out - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: sigmoid_backward @@ -9562,7 +8383,6 @@ CPU, CUDA: logit_backward_out - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: logit_backward @@ -9574,7 +8394,6 @@ CPU, CUDA: tanh_backward_out - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: tanh_backward @@ -9619,7 +8438,6 @@ CUDA: slow_conv_transpose2d_backward_out_cuda - func: slow_conv_transpose2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose2d_backward_cpu @@ -9647,7 +8465,6 @@ CUDA: slow_conv_transpose3d_backward_out_cuda - func: slow_conv_transpose3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose3d_backward_cpu @@ -9683,7 +8500,6 @@ CUDA: slow_conv2d_backward_out_cuda - func: thnn_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv2d_backward_cpu @@ -9716,7 +8532,6 @@ CUDA: thnn_conv_depthwise2d_backward_out - func: thnn_conv_depthwise2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[2] output_mask) -> (Tensor grad_input, Tensor grad_weight) - use_c10_dispatcher: full python_module: nn dispatch: CUDA: thnn_conv_depthwise2d_backward @@ -9748,7 +8563,6 @@ CPU: slow_conv3d_backward_out_cpu - func: slow_conv3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv3d_backward_cpu @@ -9761,7 +8575,6 @@ CUDA: slow_conv_dilated2d_cuda - func: 
slow_conv_dilated2d_backward(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated2d_backward_cpu @@ -9775,7 +8588,6 @@ CUDA: slow_conv_dilated3d_cuda - func: slow_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated3d_backward_cpu @@ -9789,7 +8601,6 @@ CUDA: col2im_out_cuda - func: col2im(Tensor self, int[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: col2im_cpu @@ -9803,14 +8614,12 @@ CUDA: col2im_backward_out_cuda - func: col2im_backward(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: col2im_backward_cpu CUDA: col2im_backward_cuda - func: column_stack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full dispatch: Math: column_stack @@ -9827,7 +8636,6 @@ CUDA: im2col_out_cuda - func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: im2col_cpu @@ -9841,30 +8649,25 @@ CUDA: im2col_backward_out_cuda - func: im2col_backward(Tensor grad_output, int[2] input_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: im2col_backward_cpu CUDA: im2col_backward_cuda - func: isfinite(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False - func: isinf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False - func: record_stream(Tensor(a!) self, Stream s) -> () - use_c10_dispatcher: full variants: method dispatch: CUDA: record_stream_cuda - func: isposinf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9873,7 +8676,6 @@ CPU, CUDA: isposinf_out - func: isneginf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9886,12 +8688,10 @@ # of the vmap frontend API (see torch/_vmap_internals.py). They are not # user-facing, hence the leading underscore. Please don't use them them anywhere else. - func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor - use_c10_dispatcher: full variants: function # See NOTE [_add_batch_dim and _remove_batch_dim] - func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor - use_c10_dispatcher: full variants: function ## Functions related to the fast Fourier transform and the torch.fft namespace @@ -9906,7 +8706,6 @@ # torch.fft.fft # NOTE: NOT an alias for torch.fft, which has different semantics - func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9916,7 +8715,6 @@ variants: function - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? 
norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9926,7 +8724,6 @@ variants: function - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9936,7 +8733,6 @@ variants: function - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9946,7 +8742,6 @@ variants: function - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9956,7 +8751,6 @@ variants: function - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9966,7 +8760,6 @@ variants: function - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9976,7 +8769,6 @@ variants: function - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9986,7 +8778,6 @@ variants: function - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9996,7 +8787,6 @@ variants: function - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10006,7 +8796,6 @@ variants: function - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10016,7 +8805,6 @@ variants: function - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10026,7 +8814,6 @@ variants: function - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10036,7 +8823,6 @@ variants: function - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10066,12 +8852,10 @@ variants: function - func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function - func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -10085,7 +8869,6 @@ # See linalg_det as an example. 
- func: linalg_cholesky(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10100,25 +8883,21 @@ # torch.linalg.det, alias for torch.det - func: linalg_det(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: det(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: det - func: _syevd_helper(Tensor self, bool compute_eigenvectors, str uplo) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _syevd_helper_cpu CUDA: _syevd_helper_cuda - func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10131,7 +8910,6 @@ DefaultBackend: linalg_eigh_out - func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10144,7 +8922,6 @@ DefaultBackend: linalg_eigvalsh_out - func: inner(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -10152,14 +8929,12 @@ # torch.outer, alias for torch.ger - func: outer(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method - func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ger(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ger @@ -10170,12 +8945,10 @@ DefaultBackend: ger_out - func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function @@ -10190,7 +8963,6 @@ variants: function - func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10204,7 +8976,6 @@ Math: linalg_cond_out - func: linalg_cond.p_str(Tensor self, str p) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10252,7 +9023,6 @@ Math: linalg_tensorinv_out - func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10287,7 +9057,6 @@ CUDA: _linalg_qr_helper_cuda - func: linalg_matrix_rank(Tensor self, float? tol=None, bool hermitian=False) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10303,32 +9072,27 @@ ## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full # Note: this function is only for testing. - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_filled_intlist(Tensor values, int[2]? 
addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_floatlist # Note: this function is only for testing. - func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor - use_c10_dispatcher: full python_module: nn # Note: this function is only for testing. From eef5eb05bf0468ed5f840d2bf3e09c135b8760df Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Wed, 6 Jan 2021 14:14:24 -0800 Subject: [PATCH 30/44] Remove backward and requires_grad from Autograd backend key (#49613) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49613 Just following a TODO in the code base... ghstack-source-id: 119450484 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D25644597 fbshipit-source-id: 26f5fa6af480929d0468b0de3ab103813e40d78b --- torch/csrc/autograd/VariableTypeManual.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index d1f15fff3669..f6c3f23cd0f7 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -387,14 +387,6 @@ TORCH_LIBRARY_IMPL(aten, Autograd, m) { m.impl("detach", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach))); m.impl("detach_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach_))); m.impl("copy_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::copy_))); - // For backward() and requires_grad_(), we need the DefaultBackend kernel, but we also need the Autograd backend - // kernel, because when called with a VariableTensorId tensor, it goes through the variable fallback kernel, - // which calls callBoxed(), which doesn't support optional tensor arguments yet and backward() has an optional - // tensor argument. - // TODO Once callBoxed() supports optional tensor arguments, we can enable `use_c10_dispatcher: full` for backward() - // and requires_grad_(), then remove the backend Autograd kernel here, only leaving the Math kernel. - m.impl("_backward", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_backward))); - m.impl("requires_grad_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::requires_grad_))); m.impl("_fw_primal", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_fw_primal))); } From dde5b6e177ec24d34651ffd8df04b4ebdf264e6e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 14:39:42 -0800 Subject: [PATCH 31/44] [PyTorch] Reapply D25547962: Make tls_local_dispatch_key_set inlineable (reapply) (#49763) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49763 This was reverted because it landed in a stack together with D25542799 (https://github.com/pytorch/pytorch/commit/9ce1df079f6ea90dd4b7f9aa12a1a78d51a8b204), which really was broken. 
ghstack-source-id: 119063016 Test Plan: CI Reviewed By: ezyang Differential Revision: D25685959 fbshipit-source-id: 514d8076eac67c760f119cfebc2ae3d0ddcd4e04 --- c10/core/impl/LocalDispatchKeySet.cpp | 20 +++----------------- c10/core/impl/LocalDispatchKeySet.h | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/c10/core/impl/LocalDispatchKeySet.cpp b/c10/core/impl/LocalDispatchKeySet.cpp index 358e6ef7e1f7..ff3e454eda8a 100644 --- a/c10/core/impl/LocalDispatchKeySet.cpp +++ b/c10/core/impl/LocalDispatchKeySet.cpp @@ -5,10 +5,6 @@ namespace c10 { namespace impl { -C10_DEFINE_bool(disable_variable_dispatch, false, "This flag forcibly disables the Variable code paths from executing, which currently breaks profiling in the process."); - -namespace { - /// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, /// thread_local is not supported. #ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY @@ -18,25 +14,15 @@ thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; #else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) -static PODLocalDispatchKeySet raw_local_dispatch_key_set; +PODLocalDispatchKeySet raw_local_dispatch_key_set; #endif -} // anonymous namespace - +#ifdef _MSC_VER LocalDispatchKeySet tls_local_dispatch_key_set() { - // Hack until variable performance is fixed - // - // ezyang: I'm pretty unhappy about this implementation, it looks wrong - // to me, as it seems to be performing a mutation on - // raw_local_dispatch_key_set. I can't conveniently test the correct - // version though... - if (FLAGS_disable_variable_dispatch) { - raw_local_dispatch_key_set.set_excluded( - raw_local_dispatch_key_set.excluded() | autograd_dispatch_keyset); - } return raw_local_dispatch_key_set; } +#endif // _MSC_VER void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set) { raw_local_dispatch_key_set = PODLocalDispatchKeySet { diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 5262b1d4d6c0..313dc5ca3508 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -23,8 +23,6 @@ namespace c10 { namespace impl { -C10_DECLARE_bool(disable_variable_dispatch); - // POD version of LocalDispatchKeySet. Declared here just so that // we can put it in the guards. struct C10_API PODLocalDispatchKeySet { @@ -54,7 +52,24 @@ struct C10_API LocalDispatchKeySet { DispatchKeySet excluded_; }; +// thread_local variables cannot be C10_API on Windows. +#ifdef _MSC_VER C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); +#else // _MSC_VER +/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +/// thread_local is not supported. +#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY + extern C10_API thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; +#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + extern C10_API PODLocalDispatchKeySet raw_local_dispatch_key_set; +#endif + +inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { + // Don't let people fiddle with the thread_local directly just + // because they include this header. 
+ return raw_local_dispatch_key_set; +} +#endif // _MSC_VER // Internal, use ThreadLocalStateGuard C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); From 3270e661c3e885171ffd3a7fa94cb085b267be3a Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Wed, 6 Jan 2021 14:53:55 -0800 Subject: [PATCH 32/44] [PyTorch Mobile] Skip signature check when converting to typed operator handle (#49469) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49469 In Functions.cpp, there is a call to `typed<...>` that converts to a `TypedOperatorHandle`. This isn't needed at runtime since it's already been exercised during development, and for mobile, there is no possibility of operators or kernels being registered by users (from Python code the way it is possible on server side). ghstack-source-id: 118714246 Test Plan: Sandcastle ### App testing results: FBiOS fails with an error similar to this one: https://fb.workplace.com/groups/2102613013103952/permalink/3815085708523332/ Tested 2 AR effects (gren screen and colors shift) on IGiOS. ### BSB results: D25581159-V1 (https://www.internalfb.com/intern/diff/D25581159/?dest_number=118689912) **fbios: Succeeded** Change in Download Size for arm64 + 3x assets variation: -7.2 KiB Change in Uncompressed Size for arm64 + 3x assets variation: -27.1 KiB Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:135971531636706@base/bsb:135971531636706@diff/ D25581159-V1 (https://www.internalfb.com/intern/diff/D25581159/?dest_number=118689912) **fbios-pika: Succeeded** Change in Download Size for arm64 + 3x assets variation: -11.0 KiB Change in Uncompressed Size for arm64 + 3x assets variation: -7.4 KiB Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:430379774665351@base/bsb:430379774665351@diff/ 3:02 AM D25581159-V1 (https://www.internalfb.com/intern/diff/D25581159/?dest_number=118689912) **igios: Succeeded** Change in Download Size for arm64 + 3x assets variation: -5.3 KiB Change in Uncompressed Size for arm64 + 3x assets variation: -17.3 KiB Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:685843828784135@base/bsb:685843828784135@diff/ Reviewed By: iseeyuan Differential Revision: D25581159 fbshipit-source-id: 4a62982829ec42c2d3f58f47f876f2543bc0099b --- aten/src/ATen/core/dispatch/Dispatcher.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 9641dfbea0cd..d83653f75363 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -304,7 +304,9 @@ class TORCH_API OperatorHandle { // smuggle in a kernel that is typed incorrectly). For everything // in core library this won't happen, because all the static registrations // will be done by the time a typed() handle is acquired. 
+#if !defined C10_MOBILE operatorIterator_->op.assertSignatureIsCorrect(); +#endif return TypedOperatorHandle(operatorIterator_); } From dc41d1765517316073b5faa3be6a46e50005a0fe Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Wed, 6 Jan 2021 15:40:35 -0800 Subject: [PATCH 33/44] .circleci: Add option to not run build workflow (#50162) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50162 Adds an option to not run the build workflow when the `run_build` parameter is set to false Should reduce the amount of double workflows that are run by pytorch-probot Uses functionality introduced in https://github.com/pytorch/pytorch-probot/pull/18 Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: yns88 Differential Revision: D25812971 Pulled By: seemethere fbshipit-source-id: 4832170f6abcabe3f385f47a663d148b0cfe2a28 --- .circleci/config.yml | 4 ++++ .circleci/generate_config_yml.py | 5 ++++- .circleci/verbatim-sources/header-section.yml | 3 +++ .github/pytorch-circleci-labels.yml | 2 ++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0716e516518b..d19c08b2b0b6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins @@ -9762,6 +9765,7 @@ workflows: only: - postnightly executor: windows-with-nvidia-gpu + when: << pipeline.parameters.run_build >> ecr_gc: triggers: - schedule: diff --git a/.circleci/generate_config_yml.py b/.circleci/generate_config_yml.py index f1af924bd3e2..a836d2e510a6 100755 --- a/.circleci/generate_config_yml.py +++ b/.circleci/generate_config_yml.py @@ -112,7 +112,10 @@ def gen_build_workflows_tree(): "when": r"<< pipeline.parameters.run_binary_tests >>", "jobs": [f() for f in binary_build_functions], }, - "build": {"jobs": [f() for f in build_workflows_functions]}, + "build": { + "when": r"<< pipeline.parameters.run_build >>", + "jobs": [f() for f in build_workflows_functions] + }, } } diff --git a/.circleci/verbatim-sources/header-section.yml b/.circleci/verbatim-sources/header-section.yml index 26205a0cccba..43d4c94ee5ed 100644 --- a/.circleci/verbatim-sources/header-section.yml +++ b/.circleci/verbatim-sources/header-section.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins diff --git a/.github/pytorch-circleci-labels.yml b/.github/pytorch-circleci-labels.yml index ccdf2e876af1..3a9eeca0abcc 100644 --- a/.github/pytorch-circleci-labels.yml +++ b/.github/pytorch-circleci-labels.yml @@ -9,3 +9,5 @@ labels_to_circle_params: - release/.* tags: - v[0-9]+(\.[0-9]+)*-rc[0-9]+ + set_to_false: + - run_build From eb8003d8e9639369c25b16f2cec338590beb0cb8 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 6 Jan 2021 15:43:37 -0800 Subject: [PATCH 34/44] [FX] Remove extraneous newlines at end of code (#50117) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50117 Test Plan: Imported from OSS Reviewed By: ansley Differential Revision: D25791847 Pulled By: jamesr66a fbshipit-source-id: 9c0b296e117e6bcf69ed9624ad0b243fa3db0f76 --- test/test_fx.py | 5 +++++ torch/fx/graph.py | 11 ++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 65d5aa3f0101..2511adc52c62 100644 --- 
a/test/test_fx.py +++ b/test/test_fx.py @@ -861,6 +861,11 @@ def forward(self, x, w): x, w = torch.rand(3, 4), torch.rand(4, 4) self.assertTrue(any(n.target == torch.relu for n in traced.graph.nodes)) + def test_empty_graph_codegen(self): + graph = torch.fx.Graph() + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + self.assertEqual(gm(), None) + def test_sequential(self): m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)) gm = torch.fx.symbolic_trace(m) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index fd0087dca398..6e493676f8c2 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -693,13 +693,18 @@ def emit_node(node : Node): import_strs = [f'import {name}' for name in sorted(modules_used)] import_block = '\n'.join(import_strs) + if len(body) == 0: + # If the Graph has no non-placeholder nodes, no lines for the body + # have been emitted. To continue to have valid Python code, emit a + # single pass statement + body.append('pass\n') + code = ''.join(body) - code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' + code = '\n'.join(' ' + line for line in code.split('\n')) fn_code = f"""\ {import_block} def forward(self, {', '.join(free_vars)}){maybe_return_annotation[0]}: -{code} -""" +{code}""" return fn_code From e49372d460c80102bd6bae3dd2f8c1e2b61ebc1b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 6 Jan 2021 16:12:47 -0800 Subject: [PATCH 35/44] Bugfix nightly checkout tool to work on Windows (#49274) Summary: I am submitting this PR on behalf of Janne Hellsten(nurpax) from NVIDIA, for the convenience of CLA. Thanks Janne a lot for the contribution! This fixes the bug when running ` ./tools/nightly.py checkout -b my-nightly-branch` on windows. Before this fix, this command gets the following error on Windows. ``` ERROR:root:Fatal exception Traceback (most recent call last): File "./tools/nightly.py", line 166, in logging_manager yield root_logger File "./tools/nightly.py", line 644, in main install( File "./tools/nightly.py", line 552, in install spdir = _site_packages(pytdir.name, platform) File "./tools/nightly.py", line 325, in _site_packages os.path.join(pytdir.name, "Lib", "site-packages") NameError: name 'pytdir' is not defined log file: d:\pytorch\nightly\log\2020-12-11_16h10m14s_6867a21e-3c0e-11eb-878e-04ed3363a33e\nightly.log ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/49274 Reviewed By: H-Huang Differential Revision: D25808156 Pulled By: malfet fbshipit-source-id: 00778016366ab771fc3fb152710c7849210640fb --- tools/nightly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/nightly.py b/tools/nightly.py index 1fecc67e72f3..55a90e3fd9fb 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -322,10 +322,10 @@ def pytorch_install(url): def _site_packages(dirname, platform): if platform.startswith("win"): - os.path.join(pytdir.name, "Lib", "site-packages") + template = os.path.join(dirname, "Lib", "site-packages") else: template = os.path.join(dirname, "lib", "python*.*", "site-packages") - spdir = glob.glob(template)[0] + spdir = glob.glob(template)[0] return spdir From 6838ecefb6c9d138dfde7f1eaeccff6c8fc72fff Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 16:38:05 -0800 Subject: [PATCH 36/44] Clean up some type annotations in torch/jit (#49939) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49939 Upgrades type annotations from Python2 to Python3 Test Plan: Sandcastle tests Reviewed By: xush6528 Differential Revision: D25717573 fbshipit-source-id: 
7d5c98fafaa224e0504b73dc69b1e4a6410c0494 --- torch/jit/quantized.py | 60 +++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/torch/jit/quantized.py b/torch/jit/quantized.py index 615741f38da7..d853a55b3933 100644 --- a/torch/jit/quantized.py +++ b/torch/jit/quantized.py @@ -130,8 +130,7 @@ def check_forward_input(self, input): input.size(1), self.input_size)) @torch.jit.script_method - def check_forward_hidden(self, input, hx, hidden_label=''): - # type: (Tensor, Tensor, str) -> None + def check_forward_hidden(self, input: Tensor, hx: Tensor, hidden_label: str = '') -> None: if input.size(0) != hx.size(0): raise RuntimeError( "Input batch size {} doesn't match hidden{} batch size {}".format( @@ -169,8 +168,7 @@ def __init__(self, other): self.nonlinearity = other.nonlinearity @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -201,8 +199,7 @@ def __init__(self, other): super(QuantizedLSTMCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] + def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: self.check_forward_input(input) if hx is None: zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -222,8 +219,7 @@ def __init__(self, other): super(QuantizedGRUCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -236,8 +232,7 @@ def forward(self, input, hx=None): ) -def apply_permutation(tensor, permutation, dim=1): - # type: (Tensor, Tensor, int) -> Tensor +def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: return tensor.index_select(dim, permutation) @@ -303,8 +298,7 @@ def get_weight_bias(ihhh): self.all_weights.append(cell_params) @torch.jit.script_method - def check_input(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> None + def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None: expected_input_dim = 2 if batch_sizes is not None else 3 if input.dim() != expected_input_dim: raise RuntimeError( @@ -316,8 +310,7 @@ def check_input(self, input, batch_sizes): self.input_size, input.size(-1))) @torch.jit.script_method - def get_expected_hidden_size(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> Tuple[int, int, int] + def get_expected_hidden_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]: if batch_sizes is not None: mini_batch = int(batch_sizes[0]) else: @@ -328,21 +321,19 @@ def get_expected_hidden_size(self, input, batch_sizes): return expected_hidden_size @torch.jit.script_method - def check_hidden_size(self, hx, expected_hidden_size, msg='Expected hidden size {}, got {}'): - # type: (Tensor, Tuple[int, int, int], str) -> None + def check_hidden_size(self, hx: Tensor, expected_hidden_size: Tuple[int, int, int], + msg: str = 'Expected hidden size {}, got 
{}') -> None: if hx.size() != expected_hidden_size: raise RuntimeError(msg.format(expected_hidden_size, list(hx.size()))) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tensor, Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) self.check_hidden_size(hidden, expected_hidden_size, msg='Expected hidden size {}, got {}') @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tensor, Optional[Tensor]) -> Tensor + def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor: if permutation is None: return hx return apply_permutation(hx, permutation) @@ -355,8 +346,9 @@ def __init__(self, other, dtype): super(QuantizedLSTM, self).__init__(other, dtype) @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]], batch_sizes: Optional[Tensor], + max_batch_size: int, sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 zeros = torch.zeros(self.num_layers * num_directions, @@ -379,8 +371,7 @@ def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] + def forward_tensor(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -391,8 +382,8 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tuple[Tensor, Tensor]]) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]] # noqa + def forward_packed(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None + ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) @@ -404,15 +395,13 @@ def forward_packed(self, input, hx=None): @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tuple[Tensor, Tensor], Optional[Tensor]) -> Tuple[Tensor, Tensor] + def permute_hidden(self, hx: Tuple[Tensor, Tensor], permutation: Optional[Tensor]) -> Tuple[Tensor, Tensor]: if permutation is None: return hx return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tuple[Tensor, Tensor], Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor], batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) @@ -432,8 +421,9 @@ class QuantizedGRU(QuantizedRNNBase): __overloads__ = {'forward': ['forward_packed', 
'forward_tensor']} @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tensor], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tensor] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tensor], batch_sizes: Optional[Tensor], max_batch_size: int, + sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tensor]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 hx = torch.zeros(self.num_layers * num_directions, @@ -459,8 +449,7 @@ def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tuple[Tensor, Tensor] + def forward_tensor(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -470,8 +459,7 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tensor]) -> Tuple[PackedSequence, Tensor] + def forward_packed(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) From fa160d18e701f673c0af99d2e92f485b181bb789 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 18:06:31 -0800 Subject: [PATCH 37/44] [PyTorch][jit] Add Type::{castRaw,expectRef} (#50061) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50061 These are more efficient than creating an extra `shared_ptr` when you just want to access the casted value. ghstack-source-id: 119325644 Test Plan: CI Reviewed By: ezyang Differential Revision: D25766630 fbshipit-source-id: 46f11f70333b44714cab708a4850922ab7486793 --- aten/src/ATen/core/jit_type_base.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 37da9ad7ef8d..e5a6d48340cf 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -152,6 +152,20 @@ struct TORCH_API Type : std::enable_shared_from_this { return nullptr; } template + T* castRaw() { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template + const T* castRaw() const { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template std::shared_ptr expect() { auto r = cast(); AT_ASSERT(r); @@ -163,6 +177,18 @@ struct TORCH_API Type : std::enable_shared_from_this { AT_ASSERT(r); return r; } + template + T& expectRef() { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } + template + const T& expectRef() const { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } virtual ~Type() = default; virtual bool hasFreeVariables() const { return false; From ef1fa547ba015b620b7bf53aa2908ab68ea5d5d2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 6 Jan 2021 18:06:31 -0800 Subject: [PATCH 38/44] [PyTorch] Use expectRef() when calling listConstruct (#50062) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50062 Avoids creating an extra shared_ptr. 
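To make the shared_ptr saving concrete, here is a short sketch (illustrative only, not part of the patch; the function name `inspect_list_type` and its argument are hypothetical) contrasting the pre-existing `expect<T>()` accessor with the `expectRef<T>()`/`castRaw<T>()` accessors added in the previous commit:

```
// Minimal sketch assuming only the public Type API from ATen/core/jit_type.h.
#include <ATen/core/jit_type.h>

void inspect_list_type(const c10::TypePtr& type) {
  // Old pattern: expect<T>() materializes a fresh shared_ptr<ListType>
  // (an extra refcount bump) even when we only want to read a field.
  auto list_ptr = type->expect<c10::ListType>();
  auto elem_old = list_ptr->getElementType();

  // New pattern: expectRef<T>() borrows a reference to the same object,
  // asserting on a kind mismatch, without constructing a shared_ptr.
  const c10::ListType& list_ref = type->expectRef<c10::ListType>();
  auto elem_new = list_ref.getElementType();

  // castRaw<T>() is the non-asserting counterpart: it returns nullptr
  // instead of failing when the kind does not match.
  if (const c10::ListType* maybe_list = type->castRaw<c10::ListType>()) {
    (void)maybe_list;
  }
  (void)elem_old;
  (void)elem_new;
}
```

This is what lets the call sites below pass `type->expectRef<at::ListType>()` straight into `listConstruct`, whose parameter becomes `const at::ListType&` instead of a `ListTypePtr`.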
ghstack-source-id: 119325645 Test Plan: CI Reviewed By: ezyang Differential Revision: D25766631 fbshipit-source-id: f2ab8349dfea325054820fa2c1055180c740574e --- torch/csrc/jit/mobile/interpreter.cpp | 2 +- torch/csrc/jit/passes/constant_propagation.cpp | 4 +++- torch/csrc/jit/runtime/interpreter.cpp | 3 ++- torch/csrc/jit/runtime/static/ops.cpp | 2 +- torch/csrc/jit/runtime/vararg_functions.cpp | 9 +++------ torch/csrc/jit/runtime/vararg_functions.h | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 031c21474618..681eddfaa832 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -148,7 +148,7 @@ bool InterpreterState::run(Stack& stack) { case RET: return false; case LIST_CONSTRUCT: { - auto type = code_->types_[inst.X]->expect(); + const auto& type = code_->types_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++pc; } break; diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index da9d551a6c88..75be7e86acab 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -45,7 +45,9 @@ c10::optional> runNodeIfInputsAreConstant( } break; case prim::ListConstruct: { listConstruct( - stack, n->output()->type()->expect(), n->inputs().size()); + stack, + n->output()->type()->expectRef(), + n->inputs().size()); } break; case prim::DictConstruct: { dictConstruct( diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index ce4718becaf7..7d588b6d96e7 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1495,7 +1495,8 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { ++frame.pc; } break; case LIST_CONSTRUCT: { - auto type = frame.function->type_table_[inst.X]->expect(); + const auto& type = + frame.function->type_table_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++frame.pc; } break; diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 89519d3765b5..4d66c6382c2d 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -361,7 +361,7 @@ getNativeOperation(Node* n) { // run op listConstruct( stack, - p_node->get_node()->output()->type()->expect(), + p_node->get_node()->output()->type()->expectRef(), p_node->input_regs().size()); // put output back p_node->Output(0, reg) = std::move(stack[0]); diff --git a/torch/csrc/jit/runtime/vararg_functions.cpp b/torch/csrc/jit/runtime/vararg_functions.cpp index 44bc56206eaf..220a5e67f723 100644 --- a/torch/csrc/jit/runtime/vararg_functions.cpp +++ b/torch/csrc/jit/runtime/vararg_functions.cpp @@ -204,16 +204,13 @@ void namedTupleConstruct( c10::ivalue::Tuple::createNamed(std::move(elems), std::move(type))); } -void listConstruct( - Stack& stack, - const at::ListTypePtr& type, - size_t num_inputs) { +void listConstruct(Stack& stack, const at::ListType& type, size_t num_inputs) { // Structuring the implementation this way allows NRVO to avoid // move-constructing vals on its way onto the stack. Moving a List // isn't free. 
auto makeList = - [](Stack& stack, const at::ListTypePtr& type, size_t num_inputs) { - c10::List vals(type->getElementType()); + [](Stack& stack, const at::ListType& type, size_t num_inputs) { + c10::List vals(type.getElementType()); vals.reserve(num_inputs); for (size_t i = stack.size() - num_inputs; i < stack.size(); ++i) { vals.emplace_back(std::move(stack[i])); diff --git a/torch/csrc/jit/runtime/vararg_functions.h b/torch/csrc/jit/runtime/vararg_functions.h index d6eba7f5d191..e9580411212a 100644 --- a/torch/csrc/jit/runtime/vararg_functions.h +++ b/torch/csrc/jit/runtime/vararg_functions.h @@ -25,7 +25,7 @@ void namedTupleConstruct( void listConstruct( Stack& stack, - const at::ListTypePtr& list_type, + const at::ListType& list_type, size_t num_inputs); void dictConstruct( From b6b76a105511b4bf896447321caf091cdb00a507 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 6 Jan 2021 18:09:27 -0800 Subject: [PATCH 39/44] Mod lists to neutral+descriptive terms in caffe2/caffe2/opt (#49801) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49801 Per "https://fb.workplace.com/groups/e/permalink/3320810064641820/" we can no longer use the terms "whitelist" and "blacklist", and editing any file containing them results in a critical error signal. Let's embrace the change. This diff changes "blacklist" to "blocklist" in a number of non-interface contexts (interfaces would require more extensive testing and might interfere with reading stored data, so those are deferred until later). Test Plan: Sandcastle Reviewed By: xush6528 Differential Revision: D25686949 fbshipit-source-id: e07de4d228674ae61559719cbe4717f8044778d2 --- caffe2/opt/fakefp16_transform.cc | 6 +++--- caffe2/opt/glow_net_transform.cc | 22 +++++++++++----------- caffe2/opt/glow_net_transform.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/caffe2/opt/fakefp16_transform.cc b/caffe2/opt/fakefp16_transform.cc index 424056bd2c80..cbd3132dfc08 100644 --- a/caffe2/opt/fakefp16_transform.cc +++ b/caffe2/opt/fakefp16_transform.cc @@ -299,8 +299,8 @@ void fakeFp16Transform(NetDef* net) { FLAGS_fake_fp16_conversion_use_fp16_acc, FLAGS_fake_fp16_conversion_use_nnpi); - auto blacklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); - auto blacklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); + auto blocklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); // A hack to only do fakefp16 transformation for operators which will be // lowered to ONNXIFI. 
@@ -320,7 +320,7 @@ void fakeFp16Transform(NetDef* net) { auto* op = net->mutable_op(i); auto net_pos = ArgumentHelper::GetSingleArgument(*op, "net_pos", -1); - if (blacklist_pos.count(net_pos) || blacklist_type.count(op->type())) { + if (blocklist_pos.count(net_pos) || blocklist_type.count(op->type())) { continue; } auto it = kFakeFp16OpConversionMap.find(op->type()); diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index ee3ce1b27e2c..45ce9a487fbb 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -107,7 +107,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size, @@ -154,19 +154,19 @@ void onnxifi( // Before applying backlist, make sure the ops in the net all have an net_pos; caffe2::BackendTransformerBase::annotateOpIndex(net); - // Parse the blacklist - auto more_blacklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); - for (const auto& b : blacklist) { - more_blacklist.emplace(b); + // Parse the blocklist + auto more_blocklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); + for (const auto& b : blocklist) { + more_blocklist.emplace(b); } // ONNX mode will change the op order so it doesn't apply here if (!opts.use_onnx) { - auto blacklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); for (const auto& op : net->op()) { - if (blacklisted_ops.count(op.type())) { + if (blocklisted_ops.count(op.type())) { ArgumentHelper helper(op); - more_blacklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); + more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); } } } @@ -179,7 +179,7 @@ void onnxifi( // 1. for specified op, we find its input and outputs. // 2. for each input and output, we create a new copy op and attach it as an // input to the copy. - // 3. we blacklist these new copy operators from onnxification. This forces + // 3. we blocklist these new copy operators from onnxification. This forces // these intermediate tensors to also become outputs of the onnxifi op. // 4. we put the right arguments on the copy ops so TensorObserver can print // out the values. @@ -213,11 +213,11 @@ void onnxifi( AddArgument(kNetPos, pos, ©_op); AddArgument("observe_input_tensors", 1, ©_op); net->add_op()->CopyFrom(copy_op); - more_blacklist.emplace(pos); + more_blocklist.emplace(pos); } OnnxifiTransformer ts(opts); - ts.transform(ws, net, weight_names, more_shape_hints, more_blacklist); + ts.transform(ws, net, weight_names, more_shape_hints, more_blocklist); // Cleanup the input from the workspace for (const auto& i : input_names) { diff --git a/caffe2/opt/glow_net_transform.h b/caffe2/opt/glow_net_transform.h index e8d1c9b9054f..f6cd975a6e91 100644 --- a/caffe2/opt/glow_net_transform.h +++ b/caffe2/opt/glow_net_transform.h @@ -16,7 +16,7 @@ namespace caffe2 { namespace glow { /// Onnxifi transformation on the net and workspace. We also /// needed the input data/shape to populate the shape. In addition, we take a \p -/// blacklist to control and mask what ops we want to consider in onnxifi +/// blocklist to control and mask what ops we want to consider in onnxifi /// process. We can also set whether to use ONNX proto or C2 proto through /// ONNXIFI interface. 
void onnxifi( @@ -25,7 +25,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size = 0, From 4e2ab2cd734b5142622c7406b4bddc662caf7501 Mon Sep 17 00:00:00 2001 From: Qifan Lu Date: Wed, 6 Jan 2021 18:25:02 -0800 Subject: [PATCH 40/44] Move generator state APIs to ATen (#49589) Summary: ## Rationale While most of the `torch.Generator` properties and methods are implemented as a thin wrapper of the corresponding `at::Generator` methods, `torch.Generator.get_state()` and `torch.Generator.set_state()` are implemented in legacy Torch code and are not dispatched through the `c10::GeneratorImpl` interface. This is not structured well and makes implementing generators for new backends (e.g. `XLAGeneratorImpl` for the XLA backend) inconvenient. As such, this pull request seeks to move these generator state APIs to c10 and ATen. ## What is being refactored? * Interfaces - Added `c10::GeneratorImpl::set_state` and `c10::GeneratorImpl::state` for getting and setting the internal state of a random number generator. - `at::Generator::set_state` and `at::Generator::state` wraps the above-mentioned APIs, as it's basically a PIMPL. - Added helper function `at::detail::check_rng_state` for checking the validity of new RNG state tensor. * CPU Generator - Renamed and moved `THTensor_(setRNGState)` and `THTensor_(getRNGState)` to `CPUGeneratorImpl::set_state` and `CPUGenerator::state`. - Renamed and moved `THGeneratorState` and `THGeneratorStateNew` to `CPUGeneratorStateLegacy` and `CPUGeneratorState`. * CUDA Generator - Renamed and moved `THCRandom_setRNGState` and `THCRandom_getRNGState` to `CUDAGeneratorImpl::set_state` and `CUDAGeneratorImpl::state`. * PyTorch Bindings - `THPGenerator_setState` and `THPGenerator_getState` now simply forward to `at::Generator::set_state` and `at::Generator::state`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49589 Reviewed By: H-Huang Differential Revision: D25785774 Pulled By: pbelevich fbshipit-source-id: 8ed79209c4ffb1a0ae8b19952ac8871ac9e0255f --- aten/src/ATen/CPUGeneratorImpl.cpp | 160 +++++++++++++++++++++++ aten/src/ATen/CPUGeneratorImpl.h | 2 + aten/src/ATen/CUDAGeneratorImpl.h | 4 +- aten/src/ATen/core/Generator.cpp | 16 +++ aten/src/ATen/core/Generator.h | 28 ++++ aten/src/ATen/cuda/CUDAGeneratorImpl.cpp | 63 ++++++++- aten/src/ATen/test/cpu_rng_test.cpp | 2 + aten/src/TH/CMakeLists.txt | 1 - aten/src/TH/THGenerator.hpp | 39 ------ aten/src/TH/generic/THTensorRandom.cpp | 116 ---------------- aten/src/TH/generic/THTensorRandom.h | 5 - aten/src/THC/THCTensorRandom.cu | 54 -------- aten/src/THC/THCTensorRandom.h | 5 - c10/core/GeneratorImpl.h | 3 + test/cpp_extensions/rng_extension.cpp | 2 + torch/csrc/Generator.cpp | 46 +++---- 16 files changed, 294 insertions(+), 252 deletions(-) create mode 100644 aten/src/ATen/core/Generator.cpp delete mode 100644 aten/src/TH/THGenerator.hpp diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index bfa4a2a8f72f..ff4a2f1c61e2 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -6,6 +8,42 @@ namespace at { namespace detail { +/** + * CPUGeneratorImplStateLegacy is a POD class needed for memcpys + * in torch.get_rng_state() and torch.set_rng_state(). 
+ * It is a legacy class and even though it is replaced with + * at::CPUGeneratorImpl, we need this class and some of its fields + * to support backward compatibility on loading checkpoints. + */ +struct CPUGeneratorImplStateLegacy { + /* The initial seed. */ + uint64_t the_initial_seed; + int left; /* = 1; */ + int seeded; /* = 0; */ + uint64_t next; + uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ + + /********************************/ + + /* For normal distribution */ + double normal_x; + double normal_y; + double normal_rho; + int normal_is_valid; /* = 0; */ +}; + +/** + * CPUGeneratorImplState is a POD class containing + * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used + * as a helper for torch.get_rng_state() and torch.set_rng_state() + * functions. + */ +struct CPUGeneratorImplState { + CPUGeneratorImplStateLegacy legacy_pod; + float next_float_normal_sample; + bool is_next_float_normal_sample_valid; +}; + /** * PyTorch maintains a collection of default generators that get * initialized once. The purpose of these default generators is to @@ -75,6 +113,128 @@ uint64_t CPUGeneratorImpl::seed() { return random; } +/** + * Sets the internal state of CPUGeneratorImpl. The new internal state + * must be a strided CPU byte tensor and of the same size as either + * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or + * CPUGeneratorImplState (for new state). + * + * FIXME: Remove support of the legacy state in the future? + */ +void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + using detail::CPUGeneratorImplState; + using detail::CPUGeneratorImplStateLegacy; + + static_assert(std::is_pod::value, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_pod::value, "CPUGeneratorImplState is not a PODType"); + + static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); + static const size_t size_current = sizeof(CPUGeneratorImplState); + static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size"); + + detail::check_rng_state(new_state); + + at::mt19937 engine; + auto float_normal_sample = c10::optional(); + auto double_normal_sample = c10::optional(); + + // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. + CPUGeneratorImplStateLegacy* legacy_pod; + auto new_state_size = new_state.numel(); + if (new_state_size == size_legacy) { + legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); + // Note that in CPUGeneratorImplStateLegacy, we didn't have float version + // of normal sample and hence we leave the c10::optional as is + + // Update next_double_normal_sample. + // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) + // and a rho value (normal_rho). These three values were redundant and in the new + // DistributionsHelper.h, we store the actual extra normal sample, rather than three + // intermediate values. 
+ if (legacy_pod->normal_is_valid) { + auto r = legacy_pod->normal_rho; + auto theta = 2.0 * M_PI * legacy_pod->normal_x; + // we return the sin version of the normal sample when in caching mode + double_normal_sample = c10::optional(r * ::sin(theta)); + } + } else if (new_state_size == size_current) { + auto rng_state = (CPUGeneratorImplState*)new_state.data(); + legacy_pod = &rng_state->legacy_pod; + // update next_float_normal_sample + if (rng_state->is_next_float_normal_sample_valid) { + float_normal_sample = c10::optional(rng_state->next_float_normal_sample); + } + + // Update next_double_normal_sample. + // Note that in getRNGState, we now return the actual normal sample in normal_y + // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho + // are squashed to 0.0. + if (legacy_pod->normal_is_valid) { + double_normal_sample = c10::optional(legacy_pod->normal_y); + } + } else { + AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, + " or a CPUGeneratorImplState of size ", size_current, + " but found the input RNG state size to be ", new_state_size); + } + + // construct engine_ + // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our + // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are + // doing a std::copy. + at::mt19937_data_pod rng_data; + std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); + rng_data.seed_ = legacy_pod->the_initial_seed; + rng_data.left_ = legacy_pod->left; + rng_data.seeded_ = legacy_pod->seeded; + rng_data.next_ = static_cast(legacy_pod->next); + engine.set_data(rng_data); + TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state"); + this->engine_ = engine; + this->next_float_normal_sample_ = float_normal_sample; + this->next_double_normal_sample_ = double_normal_sample; +} + +/** + * Gets the current internal state of CPUGeneratorImpl. The internal + * state is returned as a CPU byte tensor. 
+ */
+c10::intrusive_ptr<c10::TensorImpl> CPUGeneratorImpl::get_state() const {
+  using detail::CPUGeneratorImplState;
+
+  static const size_t size = sizeof(CPUGeneratorImplState);
+  static_assert(std::is_pod<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");
+
+  auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
+  auto rng_state = state_tensor.data_ptr();
+
+  // accumulate generator data to be copied into byte tensor
+  auto accum_state = std::make_unique<CPUGeneratorImplState>();
+  auto rng_data = this->engine_.data();
+  accum_state->legacy_pod.the_initial_seed = rng_data.seed_;
+  accum_state->legacy_pod.left = rng_data.left_;
+  accum_state->legacy_pod.seeded = rng_data.seeded_;
+  accum_state->legacy_pod.next = rng_data.next_;
+  std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state));
+  accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy
+  accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy
+  accum_state->legacy_pod.normal_is_valid = false;
+  accum_state->legacy_pod.normal_y = 0.0;
+  accum_state->next_float_normal_sample = 0.0f;
+  accum_state->is_next_float_normal_sample_valid = false;
+  if (this->next_double_normal_sample_) {
+    accum_state->legacy_pod.normal_is_valid = true;
+    accum_state->legacy_pod.normal_y = *(this->next_double_normal_sample_);
+  }
+  if (this->next_float_normal_sample_) {
+    accum_state->is_next_float_normal_sample_valid = true;
+    accum_state->next_float_normal_sample = *(this->next_float_normal_sample_);
+  }
+
+  memcpy(rng_state, accum_state.get(), size);
+  return state_tensor.getIntrusivePtr();
+}
+
 /**
  * Gets the DeviceType of CPUGeneratorImpl.
  * Used for type checking during run time.
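
For context: the new `CPUGeneratorImpl::get_state`/`set_state` pair above is what ends up backing the Python-level `torch.Generator.get_state()` / `torch.Generator.set_state()` round trip. A minimal sketch of the expected behavior (public Python API only; the returned byte tensor is treated here as an opaque blob whose size is `sizeof(CPUGeneratorImplState)`):

```python
import torch

g = torch.Generator()                  # CPU generator backed by CPUGeneratorImpl
state = g.get_state()                  # ByteTensor holding a CPUGeneratorImplState blob
assert state.dtype == torch.uint8

before = torch.randn(3, generator=g)   # advances the generator
g.set_state(state)                     # restore the saved state
after = torch.randn(3, generator=g)
assert torch.equal(before, after)      # same draws after restoring the state
```

Per the FIXME above, `set_state` also still accepts a legacy-sized `CPUGeneratorImplStateLegacy` blob, so RNG states saved by older checkpoints keep loading.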
diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h
index eceb338966fd..f8b43a04c73c 100644
--- a/aten/src/ATen/CPUGeneratorImpl.h
+++ b/aten/src/ATen/CPUGeneratorImpl.h
@@ -17,6 +17,8 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
   void set_current_seed(uint64_t seed) override;
   uint64_t current_seed() const override;
   uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
   static DeviceType device_type();
   uint32_t random();
   uint64_t random64();
diff --git a/aten/src/ATen/CUDAGeneratorImpl.h b/aten/src/ATen/CUDAGeneratorImpl.h
index 9a9febd01f8e..1179a049aa08 100644
--- a/aten/src/ATen/CUDAGeneratorImpl.h
+++ b/aten/src/ATen/CUDAGeneratorImpl.h
@@ -129,8 +129,10 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   void set_current_seed(uint64_t seed) override;
   uint64_t current_seed() const override;
   uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
   void set_philox_offset_per_thread(uint64_t offset);
-  uint64_t philox_offset_per_thread();
+  uint64_t philox_offset_per_thread() const;
   void capture_prologue(int64_t* offset_extragraph);
   uint64_t capture_epilogue();
   PhiloxCudaState philox_cuda_state(uint64_t increment);
diff --git a/aten/src/ATen/core/Generator.cpp b/aten/src/ATen/core/Generator.cpp
new file mode 100644
index 000000000000..800f8c7c88ec
--- /dev/null
+++ b/aten/src/ATen/core/Generator.cpp
@@ -0,0 +1,16 @@
+#include <ATen/core/Generator.h>
+#include <ATen/core/Tensor.h>
+#include <c10/util/Exception.h>
+
+namespace at {
+
+void Generator::set_state(const at::Tensor& new_state) {
+  TORCH_CHECK(new_state.defined(), "Undefined tensor is not allowed");
+  this->impl_->set_state(*new_state.unsafeGetTensorImpl());
+}
+
+at::Tensor Generator::get_state() const {
+  return at::Tensor::wrap_tensor_impl(this->impl_->get_state());
+}
+
+} // namespace at
diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h
index de3f6e46f8f2..b5bbb2fe3c74 100644
--- a/aten/src/ATen/core/Generator.h
+++ b/aten/src/ATen/core/Generator.h
@@ -56,6 +56,8 @@
 
 namespace at {
 
+class Tensor;
+
 struct TORCH_API Generator {
   Generator() {}
@@ -96,6 +98,12 @@ struct TORCH_API Generator {
   uint64_t seed() { return impl_->seed(); }
 
+  // Implementation not inlined to prevent cycle reference between
+  // `ATen/core/Generator.h` and `ATen/core/Tensor.h`
+  void set_state(const at::Tensor& new_state);
+
+  at::Tensor get_state() const;
+
   std::mutex& mutex() {
     return impl_->mutex_;
   }
@@ -130,4 +138,24 @@ Generator make_generator(Args&&... args) {
   return Generator(c10::make_intrusive<Impl>(std::forward<Args>(args)...));
 }
 
+namespace detail {
+
+/**
+ * Helper function for checking the validity of new random generator
+ * state. Right now following conditions are checked:
+ *
+ * - The new state tensor must be a torch.ByteTensor
+ * - Data of the new state tensor must be contiguous
+ */
+static inline void check_rng_state(const c10::TensorImpl& new_state) {
+  TORCH_CHECK_TYPE(
+    new_state.layout() == kStrided && new_state.device().type() == kCPU && new_state.dtype() == kByte,
+    "RNG state must be a torch.ByteTensor"
+  );
+
+  TORCH_CHECK(new_state.is_contiguous(), "RNG state must be contiguous");
+}
+
+} // namespace detail
+
 } // namespace at
diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
index 8a5e4f48e0c0..f0572bb6d809 100644
--- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
@@ -130,6 +130,67 @@ uint64_t CUDAGeneratorImpl::seed() {
   return random;
 }
 
+/**
+ * Gets the current internal state of CUDAGeneratorImpl. The internal
+ * state is returned as a CPU byte tensor.
+ */
+c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
+  // The RNG state comprises the seed, and an offset used for Philox.
+  // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120.
+  // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. Hardcoding these numbers here
+  // because this is just host side code and we don't want to worry about linking with cuda
+  static const size_t states_size = 200 * sizeof(4120);
+  static const size_t seed_size = sizeof(uint64_t);
+  static const size_t offset_size = sizeof(int64_t);
+  static const size_t total_size = states_size + seed_size + offset_size;
+
+  auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
+  auto rng_state = state_tensor.data_ptr<uint8_t>();
+  // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1
+  // gen_states in THCGenerator struct was an array of curandStateMtgp32s.
+  memset(rng_state, -1, states_size);
+  auto current_seed = this->current_seed();
+  auto offset = static_cast<int64_t>(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic<int64_t>
+  memcpy(rng_state + states_size, &current_seed, seed_size);
+  memcpy(rng_state + states_size + seed_size, &offset, offset_size);
+
+  return state_tensor.getIntrusivePtr();
+}
+
+/**
+ * Sets the internal state of CUDAGeneratorImpl. The new internal state
+ * must be a strided CPU byte tensor and have appropriate size. See
+ * comments of CUDAGeneratorImpl::state for information about the layout
+ * and size of the internal state.
+ */
+void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
+  static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason
+  static const size_t seed_size = sizeof(uint64_t);
+  static const size_t offset_size = sizeof(int64_t);
+  static const size_t total_size = states_size + seed_size + offset_size;
+
+  detail::check_rng_state(new_state);
+
+  bool no_philox_seed = false;
+  auto new_state_size = new_state.numel();
+  if (new_state_size == total_size - offset_size) {
+    no_philox_seed = true;
+  } else {
+    TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size");
+  }
+
+  uint64_t input_seed;
+  auto new_rng_state = new_state.data<uint8_t>();
+  memcpy(&input_seed, new_rng_state + states_size, seed_size);
+  this->set_current_seed(input_seed);
+  int64_t philox_offset = 0;
+  if (!no_philox_seed) {
+    memcpy(&philox_offset, new_rng_state + states_size + seed_size, offset_size);
+  }
+  this->set_philox_offset_per_thread(static_cast<uint64_t>(philox_offset));
+}
+
 /**
  * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10
  *
@@ -143,7 +204,7 @@ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
 /**
  * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
  */
-uint64_t CUDAGeneratorImpl::philox_offset_per_thread() {
+uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
   at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread");
   return philox_offset_per_thread_;
 }
diff --git a/aten/src/ATen/test/cpu_rng_test.cpp b/aten/src/ATen/test/cpu_rng_test.cpp
index 6d596095d7a0..805ed40557b6 100644
--- a/aten/src/ATen/test/cpu_rng_test.cpp
+++ b/aten/src/ATen/test/cpu_rng_test.cpp
@@ -28,6 +28,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl {
   void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); }
   uint64_t current_seed() const override { throw std::runtime_error("not implemented"); }
   uint64_t seed() override { throw std::runtime_error("not implemented"); }
+  void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); }
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override { throw std::runtime_error("not implemented"); }
   TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); }
 
   static DeviceType device_type() { return DeviceType::CPU; }
diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt
index a3ed10126b93..5661a697da38 100644
--- a/aten/src/TH/CMakeLists.txt
+++ b/aten/src/TH/CMakeLists.txt
@@ -79,7 +79,6 @@ install(FILES
   THHalf.h
   THTensor.hpp
   THStorageFunctions.hpp
-  THGenerator.hpp
   DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH")
 
 install(FILES
diff --git a/aten/src/TH/THGenerator.hpp b/aten/src/TH/THGenerator.hpp
deleted file mode 100644
index 1a40611f8b5b..000000000000
--- a/aten/src/TH/THGenerator.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include <ATen/core/MT19937RNGEngine.h>
-
-/**
- * THGeneratorState is a POD class needed for memcpys
- * in torch.get_rng_state() and torch.set_rng_state().
- * It is a legacy class and even though it is replaced with
- * at::CPUGeneratorImpl, we need this class and some of its fields
- * to support backward compatibility on loading checkpoints.
- */
-struct THGeneratorState {
-  /* The initial seed.
*/ - uint64_t the_initial_seed; - int left; /* = 1; */ - int seeded; /* = 0; */ - uint64_t next; - uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ - - /********************************/ - - /* For normal distribution */ - double normal_x; - double normal_y; - double normal_rho; - int normal_is_valid; /* = 0; */ -}; - -/** - * THGeneratorStateNew is a POD class containing - * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used - * as a helper for torch.get_rng_state() and torch.set_rng_state() - * functions. - */ -struct THGeneratorStateNew { - THGeneratorState legacy_pod; - float next_float_normal_sample; - bool is_next_float_normal_sample_valid; -}; diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 399bcc38e1de..c37b0b9bb7f0 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) @@ -149,119 +148,4 @@ void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTens } } #endif - -#if defined(TH_REAL_IS_BYTE) -void THTensor_(getRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - static const size_t size = sizeof(THGeneratorStateNew); - THTensor_(resize1d)(self, size); - THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - // cast byte tensor to POD type - THGeneratorStateNew* rng_state = (THGeneratorStateNew*)self->data(); - - // accumulate generator data to be copied into byte tensor - auto accum_state = std::make_unique(); - auto cast_generator = at::check_generator(_generator); - auto rng_data = cast_generator->engine().data(); - accum_state->legacy_pod.the_initial_seed = rng_data.seed_; - accum_state->legacy_pod.left = rng_data.left_; - accum_state->legacy_pod.seeded = rng_data.seeded_; - accum_state->legacy_pod.next = rng_data.next_; - std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state)); - accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_is_valid = false; - accum_state->legacy_pod.normal_y = 0.0; - accum_state->next_float_normal_sample = 0.0f; - accum_state->is_next_float_normal_sample_valid = false; - if(cast_generator->next_double_normal_sample()) { - accum_state->legacy_pod.normal_is_valid = true; - accum_state->legacy_pod.normal_y = *(cast_generator->next_double_normal_sample()); - } - if(cast_generator->next_float_normal_sample()) { - accum_state->is_next_float_normal_sample_valid = true; - accum_state->next_float_normal_sample = *(cast_generator->next_float_normal_sample()); - } - - memcpy(rng_state, accum_state.get(), size); -} - -void THTensor_(setRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - auto cast_generator = at::check_generator(_generator); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorState is not a 
PODType"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - static const size_t size_legacy = sizeof(THGeneratorState); - static const size_t size_current = sizeof(THGeneratorStateNew); - static_assert(size_legacy != size_current, "Legacy THGeneratorState and THGeneratorStateNew can't be of the same size"); - - at::mt19937 engine; - auto float_normal_sample = c10::optional(); - auto double_normal_sample = c10::optional(); - - // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. - THGeneratorState* legacy_pod; - if (THTensor_(nElement)(self) == size_legacy) { - legacy_pod = (THGeneratorState*)self->data(); - // Note that in legacy THGeneratorState, we didn't have float version - // of normal sample and hence we leave the c10::optional as is - - // Update next_double_normal_sample. - // Note that legacy THGeneratorState stores two uniform values (normal_x, normal_y) - // and a rho value (normal_rho). These three values were redundant and in the new - // DistributionsHelper.h, we store the actual extra normal sample, rather than three - // intermediate values. - if (legacy_pod->normal_is_valid) { - auto r = legacy_pod->normal_rho; - auto theta = 2.0 * M_PI * legacy_pod->normal_x; - // we return the sin version of the normal sample when in caching mode - double_normal_sample = c10::optional(r * ::sin(theta)); - } - } else if (THTensor_(nElement)(self) == size_current) { - auto rng_state = (THGeneratorStateNew*)self->data(); - legacy_pod = &rng_state->legacy_pod; - // update next_float_normal_sample - if (rng_state->is_next_float_normal_sample_valid) { - float_normal_sample = c10::optional(rng_state->next_float_normal_sample); - } - - // Update next_double_normal_sample. - // Note that in getRNGState, we now return the actual normal sample in normal_y - // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho - // are squashed to 0.0. - if (legacy_pod->normal_is_valid) { - double_normal_sample = c10::optional(legacy_pod->normal_y); - } - } else { - AT_ERROR("Expected either a THGeneratorState of size ", size_legacy, - " or a THGeneratorStateNew of size ", size_current, - " but found the input RNG state size to be ", THTensor_(nElement)(self)); - } - - // construct engine_ - // Note that legacy THGeneratorState stored a state array of 64 bit uints, whereas in our - // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are - // doing a std::copy. 
- at::mt19937_data_pod rng_data; - std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); - rng_data.seed_ = legacy_pod->the_initial_seed; - rng_data.left_ = legacy_pod->left; - rng_data.seeded_ = legacy_pod->seeded; - rng_data.next_ = static_cast(legacy_pod->next); - engine.set_data(rng_data); - THArgCheck(engine.is_valid(), 1, "Invalid mt19937 state"); - cast_generator->set_engine(engine); - cast_generator->set_next_float_normal_sample(float_normal_sample); - cast_generator->set_next_double_normal_sample(double_normal_sample); -} -#endif #endif diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h index ffc52bc69390..ddeb905680cd 100644 --- a/aten/src/TH/generic/THTensorRandom.h +++ b/aten/src/TH/generic/THTensorRandom.h @@ -9,9 +9,4 @@ TH_API void THTensor_(multinomialAliasSetup)(THTensor *prob_dist, THLongTensor * TH_API void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTensor *J, int n_sample, c10::optional _generator); #endif -#if defined(TH_REAL_IS_BYTE) -TH_API void THTensor_(getRNGState)(at::Generator _generator, THTensor *self); -TH_API void THTensor_(setRNGState)(at::Generator _generator, THTensor *self); -#endif - #endif diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index aefb427f4e67..8655ea2fb829 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -12,60 +12,6 @@ #define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 -// NB: ROCm compiler seems to have a bug where __host__ functions must be -// explicitly specified extern "C" otherwise ROCm compiler doesn't respect it. -// See https://github.com/RadeonOpenCompute/hcc/issues/839 -__host__ void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - // The RNG state comprises the seed, and an offset used for Philox. - // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120. - // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); - // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. Hardcoding these numbers here - // because this is just host side code and we don't want to worry about linking with cuda - static const size_t states_size = 200 * sizeof(4120); - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - THByteTensor_resize1d(rng_state, total_size); - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1 - // gen_states in THCGenerator struct was an array of curandStateMtgp32s. 
- memset(THByteTensor_data(rng_state), -1, states_size); - auto current_seed = gen->current_seed(); - auto offset = static_cast(gen->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic - memcpy(THByteTensor_data(rng_state) + states_size, ¤t_seed, seed_size); - memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &offset, offset_size); -} - -__host__ void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - bool no_philox_seed = false; - if (THByteTensor_nElement(rng_state) == total_size - offset_size) { - no_philox_seed = true; - } - else { - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - } - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - uint64_t input_seed; - memcpy(&input_seed, THByteTensor_data(rng_state) + states_size, seed_size); - gen->set_current_seed(input_seed); - int64_t philox_offset = 0; - if (!no_philox_seed) { - memcpy(&philox_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size); - } - gen->set_philox_offset_per_thread(static_cast(philox_offset)); -} - #include #include diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h index b1d7f1ef1797..696e36f70bec 100644 --- a/aten/src/THC/THCTensorRandom.h +++ b/aten/src/THC/THCTensorRandom.h @@ -9,9 +9,4 @@ #include #include -#include - -TORCH_CUDA_API void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state); -TORCH_CUDA_API void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state); - #endif diff --git a/c10/core/GeneratorImpl.h b/c10/core/GeneratorImpl.h index 3af652a1a3b2..84e620e93a72 100644 --- a/c10/core/GeneratorImpl.h +++ b/c10/core/GeneratorImpl.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * Note [Generator] @@ -71,6 +72,8 @@ struct C10_API GeneratorImpl : public c10::intrusive_ptr_target { virtual void set_current_seed(uint64_t seed) = 0; virtual uint64_t current_seed() const = 0; virtual uint64_t seed() = 0; + virtual void set_state(const c10::TensorImpl& new_state) = 0; + virtual c10::intrusive_ptr get_state() const = 0; Device device() const; // See Note [Acquire lock when using random generators] diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index 4a71a526617f..f3ab91fb3cab 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -22,6 +22,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } uint64_t seed() override { throw std::runtime_error("not implemented"); } + void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); } + c10::intrusive_ptr get_state() const override { throw std::runtime_error("not implemented"); } TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); } static DeviceType device_type() { return DeviceType::CPU; } diff --git a/torch/csrc/Generator.cpp 
b/torch/csrc/Generator.cpp index 55e5abc29ef9..2bc478f36007 100644 --- a/torch/csrc/Generator.cpp +++ b/torch/csrc/Generator.cpp @@ -15,7 +15,6 @@ #include #ifdef USE_CUDA -#include #include #endif @@ -78,45 +77,32 @@ static PyObject * THPGenerator_getState(PyObject *_self, PyObject *noargs) { using namespace torch::autograd; HANDLE_TH_ERRORS - auto self = (THPGenerator*)_self; - Variable var = torch::empty({0}, at::device(at::kCPU).dtype(at::kByte)); - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } - return THPVariable_Wrap(std::move(var)); + auto& gen = ((THPGenerator*)_self)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + auto state_tensor = gen.get_state(); + + return THPVariable_Wrap(std::move(state_tensor)); END_HANDLE_TH_ERRORS } static PyObject * THPGenerator_setState(PyObject *_self, PyObject *_new_state) { using namespace torch::autograd; - auto self = (THPGenerator*)_self; + HANDLE_TH_ERRORS if (!THPVariable_Check(_new_state)) { throw torch::TypeError("expected a torch.ByteTensor, but got %s", Py_TYPE(_new_state)->tp_name); } - auto& tensor = ((THPVariable*)_new_state)->cdata; - if (tensor.layout() != kStrided || tensor.device().type() != kCPU || tensor.scalar_type() != kByte) { - auto type_name = torch::utils::options_to_string(tensor.options()); - throw torch::TypeError("expected a torch.ByteTensor, but got %s", type_name.c_str()); - } - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } + auto self = (THPGenerator*)_self; + auto& gen = self->cdata; + auto& new_state_tensor = ((THPVariable*)_new_state)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + gen.set_state(new_state_tensor); + Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS From 838e73de2042083503021ae8505e066b93d4c2d4 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 6 Jan 2021 18:35:09 -0800 Subject: [PATCH 41/44] enable alltoall_single torchscript support (#48345) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48345 Test Plan: wait for sandcastle Differential Revision: D25074475 fbshipit-source-id: 04261f8453567154b0464f8348320e936ca06384 --- .../check_backward_compatibility.py | 2 +- test/distributed/test_jit_c10d.py | 27 ++++++++++--------- torch/csrc/distributed/c10d/init.cpp | 27 ++++++++++++++++--- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 8527293189a9..2d5d50096c81 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -72,7 +72,7 @@ def allow_listed(schema, allow_list): dont_parse_list = [ ("_TorchScriptTesting.*", datetime.date(2099, 9, 
17)), ("test_backend", datetime.date(2099, 9, 17)), - ("c10d.frontend", datetime.date(2020, 12, 30)), + ("dist_c10d", datetime.date(2021, 1, 30)), ] diff --git a/test/distributed/test_jit_c10d.py b/test/distributed/test_jit_c10d.py index 85788b914059..182a405d0e78 100644 --- a/test/distributed/test_jit_c10d.py +++ b/test/distributed/test_jit_c10d.py @@ -4,6 +4,7 @@ import torch import torch.distributed as c10d import time +from datetime import timedelta from typing import List import torch.testing._internal.common_utils as common @@ -31,6 +32,14 @@ def unique_process_group_name(prefix): now = int(time.time() * 1000) return "%s_%d" % (prefix, now) +def _create_tcp_store(): + addr = "localhost" + port = common.find_free_port() + timeout = timedelta(minutes=5) + timeout_millisecond = int(timeout / timedelta(milliseconds=1)) + return torch.classes.dist_c10d.TCPStore(addr, port, 1, True, timeout_millisecond) + + @unittest.skipIf( TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment", @@ -48,19 +57,15 @@ def setUp(self): raise unittest.SkipTest("NCCL test requires 2+ GPUs") def _create_nccl_pg(self, name_prefix): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() opts = torch.classes.dist_c10d.ProcessGroupNCCLOptions(0, True) name = unique_process_group_name(name_prefix) - return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) + return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) def _create_nccl_pg_as_base_process_group(self, name): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() return torch.classes.dist_c10d.frontend().new_process_group_helper( self.world_size, self.rank, [], "nccl", tcp_store, name, 0) @@ -155,9 +160,7 @@ def test_frontend_singleton(self): frontend1 = torch.classes.dist_c10d.frontend() frontend2 = torch.classes.dist_c10d.frontend() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() pg_name = unique_process_group_name("singleton_test_process_group") @@ -180,9 +183,7 @@ def test_process_group_as_module_member(self): class TestModule(torch.nn.Module): def __init__(self): super(TestModule, self).__init__() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() name = unique_process_group_name("module_member_process_group") self.pg = torch.classes.dist_c10d.frontend().new_process_group_helper( diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 76b466c91f10..0d4250eddd13 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1260,11 +1260,25 @@ static const auto TCPStoreTorchBind = .def(torch::init([](const std::string& host_name, int64_t port, int64_t world_size, - bool is_master) { + bool is_master, + int64_t timeout) { + auto timeout_miliseconds = std::chrono::milliseconds(timeout); return c10::make_intrusive<::c10d::TCPStore>( - host_name, port, world_size, is_master); + host_name, port, world_size, is_master, timeout_miliseconds); })); +// TODO: This should really take Store as constructor argument instead of +// TCPStore, but the fact that TorchScript 
does not support polymorphism +// forced us to cast in C++ instead of automatic casting +static const auto PrefixStoreTorchBind = + torch::class_<::c10d::PrefixStore>("dist_c10d", "PrefixStore") + .def(torch::init([](const std::string& prefix, + const c10::intrusive_ptr<::c10d::TCPStore>& store) { + return c10::make_intrusive<::c10d::PrefixStore>( + prefix, store); + })); + + // Torchbind the ProcessGroup to make it available in TorchScript static const auto ProcessGroupWorkTorchBind = torch::class_<::c10d::ProcessGroup::Work>("dist_c10d", "Work") @@ -1624,7 +1638,14 @@ static const auto ProcessGroupNCCLTorchBind = outputSplitSizes, inputSplitSizes, ::c10d::AllToAllOptions()); - }); + + }) + .def("size", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getSize(); + }) + .def("rank", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getRank(); + }); #endif static const auto DistributedC10dFrontendTorchBind = From 11cdb910b4af2b4abb6a9c98a325f6f378347fba Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Wed, 6 Jan 2021 21:46:56 -0800 Subject: [PATCH 42/44] [fx] Add matrix multiplication fusion pass (#50151) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50151 **Summary** This commit adds a graph transformation pass that merges several matrix multiplications that use the same RHS operand into one large matrix multiplication. The LHS operands from all of the smaller matrix multiplications are concatenated together and used as an input in the large matrix multiply, and the result is split in order to obtain the same products as the original set of matrix multiplications. **Test Plan** This commit adds a simple unit test with two matrix multiplications that share the same RHS operand. `python test/test_fx_experimental.py -k merge_matmul -v` Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25809409 Pulled By: SplitInfinity fbshipit-source-id: fb55c044a54dea9f07b71aa60d44b7a8f3966ed0 --- test/test_fx_experimental.py | 123 ++++++++++++++ torch/fx/experimental/merge_matmul.py | 220 ++++++++++++++++++++++++++ 2 files changed, 343 insertions(+) create mode 100644 torch/fx/experimental/merge_matmul.py diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 6e9c877b8de6..ac71d6037591 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -21,6 +21,7 @@ PartitionMode ) from torch.fx.experimental.fuser import fuse +from torch.fx.experimental import merge_matmul try: from torchvision.models import resnet18 @@ -844,6 +845,128 @@ def forward(self, a): for p_name in para_list: assert p_name in node.attrs_for_lowering + def test_merge_matmuls(self): + """ + A collection of test cases for torch.fx.experimental.merge_matmul, + a graph transformation that merges matrix multiplication operations. + """ + # Utility function for counting matmuls for test assertions. + def _count_matmuls(mod): + gm = torch.fx.symbolic_trace(mod) + + num_matmuls = 0 + for node in gm.graph.nodes: + if node.target == torch.matmul: + num_matmuls += 1 + + return num_matmuls + + # Simple test case in which there are two matmuls of the same size to merge. + class SimpleMergeMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, x, y): + a = torch.matmul(x, self.rhs) + b = torch.matmul(y, self.rhs) + return a + b + + # Initialize inputs. 
+ a = torch.randn(3, 3) + b = torch.randn(3, 3) + + # Initialize RHS for matmuls. + rhs = torch.randn(3, 4) + + # Construct SimpleMergeMatmulModule and call merge_matmul on it. + module = SimpleMergeMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(a, b) + after = opt_module(a, b) + before.allclose(after) + + # Basic graph structure check; original module should have 2 matmuls + # and optimized module should have 1. + self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 1) + + # Test case in which there are multiple matmuls of different sizes to merge. + class FiveMergeMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, a, b, c, d, e): + s = torch.Tensor((0)) + matmuls = [] + + # For some reason using a list comprehension or for-loop for this + # doesn't work. + matmuls.append(torch.matmul(a, self.rhs)) + matmuls.append(torch.matmul(b, self.rhs)) + matmuls.append(torch.matmul(c, self.rhs)) + matmuls.append(torch.matmul(d, self.rhs)) + matmuls.append(torch.matmul(e, self.rhs)) + + for m in matmuls: + s += torch.sum(m) + + return s + + # Initialize inputs. + inputs = [torch.randn(2 * i + 1, 5) for i in range(5)] + + # Initialize RHS. + rhs = torch.randn(5, 4) + + # Construct FiveMergeMatmulModule and call merge_matmul on it. + module = FiveMergeMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(*inputs) + after = opt_module(*inputs) + before.allclose(after) + + # Basic graph structure check; original module should have len(inputs) matmuls + # and optimized module should have 1. + self.assertEqual(_count_matmuls(module), len(inputs)) + self.assertEqual(_count_matmuls(opt_module), 1) + + # Simple test case in which two matmuls cannot be merged due to a data dependency between + # the LHS operands. + class UnmergeableMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, x): + a = torch.matmul(x, self.rhs) + a_abs = torch.abs(a) + b = torch.matmul(a_abs.transpose(1, 0), self.rhs) + return b + + # Initialize inputs. + a = torch.randn(3, 3) + + # Initialize RHS for matmuls. + rhs = torch.randn(3, 4) + + # Construct UnmergeableMatmulModule and call merge_matmul on it. + module = UnmergeableMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(a) + after = opt_module(a) + before.allclose(after) + + # Basic graph structure check; the number of matrix multiplcations should not have changed. + self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 2) if __name__ == "__main__": run_tests() diff --git a/torch/fx/experimental/merge_matmul.py b/torch/fx/experimental/merge_matmul.py new file mode 100644 index 000000000000..b72bbe633dd9 --- /dev/null +++ b/torch/fx/experimental/merge_matmul.py @@ -0,0 +1,220 @@ +import torch + +from torch.fx.graph import Graph +from torch.fx.graph_module import GraphModule +from torch.fx.node import Node +from torch.fx.symbolic_trace import symbolic_trace + +import itertools +import operator + +from typing import Dict, List + + +def get_first_dim(t: torch.Tensor) -> int: + """ + A free function primarily for use in the merge_matmul graph transformation below + that returns the first dimension of a Tensor. 
This is necessary because torch.Tensor.shape + is an attribute (and cannot be the target of a call_function node) and also helps save + a getitem op in the graph. + + Arguments: + t: The tensor to get the first dimension of. + + Returns: + The first dimension of t. + """ + return t.shape[0] + + +def legalize_graph(gm: GraphModule): + """ + Replace the graph of the given GraphModule with one that contains the same nodes as the + original, but in topologically sorted order. + + This is used by the merge_matmul transformation below, which disturbs the topologically sorted + order of its input GraphModule, so that this order is restored before further transformation. + + Arguments: + gm: The graph module to topologically sort. It is modified in-place. + + """ + # Build an adjacency list representation of node dependencies in the graph. This also + # serves as a list of nodes that still need to be inserted into the new, topologically + # sorted graph. + dependencies = {node: node.all_input_nodes.copy() for node in gm.graph.nodes} + + # Construct a new graph that will contain all nodes in topologically sorted order. + new_graph = Graph() + value_remap: Dict[Node, Node] = {} + + # Copy over all nodes with no dependencies. + for node, deps in dependencies.items(): + if not deps: + value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) + + # Remove the copied over nodes from the adjacency list. + for copied_node in value_remap.keys(): + del dependencies[copied_node] + + # While there are still nodes to insert into the new graph: + while dependencies: + copied_this_round = [] + + # Copy over all nodes whose dependencies already exist in the new graph. + for node, deps in dependencies.items(): + all_deps_copied = True + for dep in deps: + if dep not in value_remap: + all_deps_copied = False + + if all_deps_copied: + value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) + copied_this_round.append(node) + + # Delete all nodes copied over in this iteration from dependencies. + for copied_node in copied_this_round: + del dependencies[copied_node] + + # Replace the old graph with the new, topologically sorted one. + gm.graph = new_graph + + +def may_depend_on(a: Node, b: Node, search_depth: int = 6): + """ + Determine if one node depends on another in a torch.fx.Graph. + + Arguments: + a: The node that may have a dependency on b. + b: The node that a may have a dependency on. + search_depth: In the case of an indirect dependency, this function + searches upto this many nodes away in search of a + data dependency. If none is found, the function + makes the conservative assumption that there is a + dependency. + + Returns: + True if a may depend on b, False if it definitely does not. + """ + # Equivalence is defined as dependence. + if a == b: + return True + + # If a has no inputs, it cannot depend on b. + if len(a.all_input_nodes) == 0: + return False + + # If the search depth has been exhausted and no conclusion has been + # reached, assume that there is a data dependency. + if search_depth == 0: + return True + + # Recursively check all inputs of a. + for inp in a.all_input_nodes: + if may_depend_on(inp, b, search_depth - 1): + return True + + return False + + +def are_nodes_independent(nodes: List[Node]): + """ + Check if all of the given nodes are pairwise-data independent. + + Arguments: + nodes: The nodes to check for data dependencies. + + Returns: + True if any pair in nodes has a data dependency. 
+ """ + # For each pair in nodes: + for i, j in itertools.combinations(nodes, 2): + if may_depend_on(i, j) or may_depend_on(j, i): + return False + + return True + + +def merge_matmul(in_mod: torch.nn.Module): + """ + A graph transformation that merges matrix multiplication operations that share the same right-hand + side operand into one large matrix multiplication. + ____ _________ _________ + ---- | | | | M| A * C | + M| A | T| B | * K| C | = |---------| + ---- , | | | | T| B * C | + K ---- --------- --------- + K R R + """ + gm = symbolic_trace(in_mod) + + rhs_users: Dict[Node, List[Node]] = {} + lhs_users: Dict[Node, List[Node]] = {} + + # Populate rhs_users and lhs_users - maps from LHS/RHS matrix multiply operands to + # the matmul of which they are the LHS/RHS. + for node in gm.graph.nodes: + if node.op != "call_function" or node.target is not torch.matmul: + continue + + lhs, rhs = node.args + + # TODO: Properly handle aliasing caused by get_attr. For now, + # use the attribute name as the operand if the node is a + # get_attr. + lhs = lhs.target if lhs.op == "get_attr" else lhs + rhs = rhs.target if rhs.op == "get_attr" else rhs + + lhs_users.setdefault(lhs, []).append(node) + rhs_users.setdefault(rhs, []).append(node) + + for rhs, mms in rhs_users.items(): + # There must be at least matmuls for a merge to make sense. + if len(mms) < 2: + continue + + # All matmuls must not depend on each other directly or indirectly + # in order for the merge to be possible. + if not are_nodes_independent(mms): + continue + + lhs_vals = [mm.args[0] for mm in mms] + + # Merge the matmul. + # Collect a list of LHS operands and the single RHS operand. + lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals] + rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs + + # Concatenate all the LHS operands. + merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {}) + + # Multiply the concatenated LHS operands with the one RHS. This will produce + # the same results as all the individual matmuls involving rhs in the original graph, + # but they will all be concatenated together. + merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {}) + + # Split the result of the merged matmul using the shapes of the LHS operands + # to ascertain how large each chunk should be. + merge_mm_sizes = [ + gm.graph.call_function(get_first_dim, (l,), {}) for l in lhs + ] + merge_mm_split = gm.graph.call_function( + torch.split, (merge_mm, merge_mm_sizes), {} + ) + merge_mm_res = [ + gm.graph.call_function(operator.getitem, (merge_mm_split, out), {}) + for out in range(len(lhs)) + ] + + # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul. + for old, new in zip(mms, merge_mm_res): + old.replace_all_uses_with(new) + gm.graph.erase_node(old) + + # All of the new nodes created above were inserted at the end, so we need to sort + # the nodes topologically to make sure all definitions precede uses. + legalize_graph(gm) + + gm.recompile() + gm.graph.lint(in_mod) + return gm From 968ad47b410b93d2600d163db50eb9fb45c24a2b Mon Sep 17 00:00:00 2001 From: UNO Leo Date: Wed, 6 Jan 2021 22:19:37 -0800 Subject: [PATCH 43/44] Fix error messages thrown when the padding size is not valid (#50135) Summary: Hi, I changed error messages so that they correspond to the actual implementation. Acording to the implementation, half of kernel size is valid as padding size. 
This is minor but an example that the padding size is exactly equal to the half of kernel size, Input: 5 x 5 Kernel: 4 x 4 Stride: 4 Padding: 2 ==> Output: 2 x 2 You don't get the error in the above case, like following: ```python import torch import torch.nn as nn # no error input = torch.randn(1, 1, 5, 5) pool = nn.MaxPool2d(4, 4, padding=2) print(pool(input).shape) # >>> torch.Size([1, 1, 2, 2]) ``` You get the error when you set the padding size larger then half of kernel size like: ```python # it raises error input = torch.randn(1, 1, 5, 5) pool = nn.MaxPool2d(4, 4, padding=3) print(pool(input).shape) ``` The error message is: ``` --------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) in () 1 input = torch.randn(1, 1, 5, 5) 2 pool = nn.MaxPool2d(4, 4, padding=3) ----> 3 print(pool(input).shape) 3 frames /usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in _max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode, return_indices) 584 stride = torch.jit.annotate(List[int], []) 585 return torch.max_pool2d( --> 586 input, kernel_size, stride, padding, dilation, ceil_mode) 587 588 max_pool2d = boolean_dispatch( RuntimeError: pad should be smaller than half of kernel size, but got padW = 3, padH = 3, kW = 4, kH = 4 ``` Thanks in advance. Pull Request resolved: https://github.com/pytorch/pytorch/pull/50135 Reviewed By: hl475 Differential Revision: D25815337 Pulled By: H-Huang fbshipit-source-id: 98142296fa6e6849d2e1407d2c1d4e3c2f83076d --- aten/src/ATen/native/Pool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 071460b090cd..8b5d65a8a60f 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -72,7 +72,7 @@ pool2d_shape_check( TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4), "non-empty 3D or 4D input tensor expected but got ndim: ", ndim); TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, - "pad should be smaller than half of kernel size, but got ", + "pad should be smaller than or equal to half of kernel size, but got ", "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, @@ -172,7 +172,7 @@ pool3d_shape_check( } TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, - "pad should be smaller than half of kernel size, but got " + "pad should be smaller than or equal to half of kernel size, but got " "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH); TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1, From 321b98830e17e9e1a366ababeb5475f9f202c815 Mon Sep 17 00:00:00 2001 From: Chunli Fu Date: Thu, 7 Jan 2021 02:01:25 -0800 Subject: [PATCH 44/44] [script] Validator for unsupported ops on accelerator Summary: ATT Next step: 1. integrate with dper flow. 2. Support in bento after diff is pushed to prod. 
Test Plan: buck run mode/opt-clang sigrid/predictor/scripts:check_accelerator_unsupported_ops -- --model_entity_id=232891739

I0106 17:08:36.425796 1238141 pybind_state.cc:531] Unsupported ops: Fused8BitRowwiseQuantizedToFloat

Reviewed By: khabinov

Differential Revision: D25818253

fbshipit-source-id: 8d8556b0400c1747f154b0517352f1685f1aa8b1
---
 caffe2/opt/onnxifi_transformer.cc |  5 +++--
 caffe2/opt/onnxifi_transformer.h  | 22 +++++++++++-----------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc
index 8089314c3100..2dd8c8d2d8b4 100644
--- a/caffe2/opt/onnxifi_transformer.cc
+++ b/caffe2/opt/onnxifi_transformer.cc
@@ -1195,11 +1195,11 @@ void OnnxifiTransformer::applyFilteringRules(
   blocklistCpuPartition(net, blocklisted_ops);
 }
 
-void OnnxifiTransformer::getBackendId() {
+std::vector<onnxBackendID> OnnxifiTransformer::getBackendId() {
   idx_ = 0;
 
   if (opts_.use_onnx) {
-    return;
+    return backend_ids_;
   }
   // Try to find a backend that support Caffe2 proto. Note that this is quite
   // opportunistic as we don't officially support Caffe2 proto.
@@ -1214,6 +1214,7 @@ void OnnxifiTransformer::getBackendId() {
       break;
     }
   }
+  return backend_ids_;
 }
 
 NetDef OnnxifiTransformer::TransformViaC2(
diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h
index d88eb739750c..d1af1731013d 100644
--- a/caffe2/opt/onnxifi_transformer.h
+++ b/caffe2/opt/onnxifi_transformer.h
@@ -61,6 +61,17 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase {
       const ShapeInfoMap& shape_hints,
       const std::unordered_set<int>& blocklisted_ops) override;
 
+  // Query whether an operator is supported by passing C2 protobuf
+  bool supportOpC2(
+      const caffe2::OperatorDef& op,
+      const ShapeInfoMap& shape_hints,
+      const std::unordered_set<std::string>& weights,
+      const std::unordered_set<int>& blocklisted_ops,
+      onnxBackendID backend_id) const;
+
+  // Determine backend id
+  std::vector<onnxBackendID> getBackendId();
+
  private:
   // Since we create new tensors during the conversion process, we actually need
   // to inject them into the original workspace
@@ -114,14 +125,6 @@
       ShapeInfoMap* shape_hints_max_bs,
       const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs);
 
-  // Query whether an operator is supported by passing C2 protobuf
-  bool supportOpC2(
-      const caffe2::OperatorDef& op,
-      const ShapeInfoMap& shape_hints,
-      const std::unordered_set<std::string>& weights,
-      const std::unordered_set<int>& blocklisted_ops,
-      onnxBackendID backend_id) const;
-
   // Query whether an operator is supported by passing ONNX protobuf
   bool supportOpOnnx(
       const caffe2::OperatorDef& op,
@@ -152,9 +155,6 @@
       const std::unordered_set<std::string>& weights,
       std::unordered_set<int>* blocklisted_ops) const;
 
-  // Determine backend id
-  void getBackendId();
-
   // Extract partition info from the original net
   void extractPartitionInfo(const NetDef& net);