diff --git a/aten/src/ATen/common_with_cwrap.py b/aten/src/ATen/common_with_cwrap.py index 6b8e0cd7b8211..ab50de1769b09 100644 --- a/aten/src/ATen/common_with_cwrap.py +++ b/aten/src/ATen/common_with_cwrap.py @@ -1,6 +1,7 @@ # this code should be common among cwrap and ATen preprocessing # for now, I have put it in one place but right now is copied out of cwrap +import copy def parse_arguments(args): new_args = [] @@ -50,11 +51,16 @@ def set_declaration_defaults(declaration): declaration['unqual_operator_name_with_overload'] = '' # Simulate multiple dispatch, even if it's not necessary if 'options' not in declaration: - declaration['options'] = [{'arguments': declaration['arguments']}] + declaration['options'] = [{ + 'arguments': copy.deepcopy(declaration['arguments']), + 'schema_order_arguments': copy.deepcopy(declaration['schema_order_arguments']), + }] del declaration['arguments'] + del declaration['schema_order_arguments'] # Parse arguments (some of them can be strings) for option in declaration['options']: option['arguments'] = parse_arguments(option['arguments']) + option['schema_order_arguments'] = parse_arguments(option['schema_order_arguments']) # Propagate defaults from declaration to options for option in declaration['options']: for k, v in declaration.items(): diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 84617b867d68d..f840c1ffe03f9 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -42,7 +42,10 @@ using supported_primitive_arg_types = guts::typelist::typelist< at::Tensor, at::Scalar, c10::QScheme, - c10::ScalarType + c10::ScalarType, + c10::Device, + c10::Layout, + c10::MemoryFormat >; template struct assert_is_valid_input_type { diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index d8f4a73df4271..d23f87d04a799 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1517,9 +1517,12 @@ namespace detail { template struct getTypePtr_ final { static TypePtr call() { - if (!isCustomClassRegistered()) { - throw c10::Error("Type could not be converted to any of the known types.", ""); - } + TORCH_CHECK( + isCustomClassRegistered(), + "Type ", + c10::util::get_fully_qualified_type_name(), + " could not be converted to any of the known types." + ); auto res = getCustomClassType(); return std::dynamic_pointer_cast(std::move(res)); } @@ -1557,6 +1560,24 @@ struct getTypePtr_ final { } }; template <> +struct getTypePtr_ final { + static TypePtr call() { + return DeviceObjType::get(); + } +}; +template <> +struct getTypePtr_ final { + static TypePtr call() { + return IntType::get(); + } +}; +template <> +struct getTypePtr_ final { + static TypePtr call() { + return IntType::get(); + } +}; +template <> struct getTypePtr_ final { static TypePtr call() { return BoolType::get(); diff --git a/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h b/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h new file mode 100644 index 0000000000000..719976e5ed2ee --- /dev/null +++ b/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include +#include + +// This file defines hacky_wrapper_for_legacy_signatures, which takes a kernel written in a legacy way +// (e.g. 
with TensorOptions packed) and wraps it into a kernel with the signature expected by +// the PyTorch operator library. The intention is to ultimately rewrite kernels to take the new signature +// and then delete this file. This transition process can happen kernel-by-kernel, since this wrapper +// is a no-op for kernels that already have a non-legacy signature. + +namespace c10 { +namespace impl { + +inline c10::optional process_memory_format(const TensorOptions& options, c10::optional memory_format) { + TORCH_CHECK( + !(options.has_memory_format() && memory_format.has_value()), + "Cannot set memory_format both in TensorOptions and explicit argument; please delete " + "the redundant setter."); + if (memory_format.has_value()) { + return memory_format; + } else { + return options.memory_format_opt(); + } +} + +namespace detail { + +// with_scattered_tensor_options takes a function pointer that potentially takes a TensorOptions argument. +// If it does, then it creates a new function pointer that takes scattered arguments, internally +// gathers those arguments, and then calls the underlying function pointer. If the underlying +// function pointer does not take a TensorOptions argument, it is passed through unmodified. + +template struct is_tensoroptions_arg : std::false_type {}; +template struct is_tensoroptions_arg>::value>> : std::true_type {}; +template +using is_tensoroptions_arg_t = typename is_tensoroptions_arg::type; + +template +inline constexpr bool has_tensoroptions_arg() { + using parameter_types = typename guts::infer_function_traits_t::parameter_types; + constexpr size_t num_tensoroptions_args = guts::typelist::count_if::value; + static_assert(num_tensoroptions_args <= 1, "Function has multiple TensorOptions parameters. We support at most one."); + return num_tensoroptions_args > 0; +} + +// sanity checks +static_assert(has_tensoroptions_arg(), ""); +static_assert(has_tensoroptions_arg(), ""); +static_assert(!has_tensoroptions_arg(), ""); + +template struct with_scattered_tensor_options_; + +template +struct with_scattered_tensor_options final {}; + +template +struct with_scattered_tensor_options()>> final { + // FuncType does not have TensorOptions arguments. + // Don't wrap anything but just return the base pointer. + using FuncPtr = UnderlyingFuncPtr; +}; + +template +struct with_scattered_tensor_options()>> final { +private: + // FuncType has TensorOptions arguments. + // Return a function pointer to a wrapper function that replaces those with expanded arguments. + using gathered_parameter_types = typename guts::infer_function_traits_t::parameter_types; + static constexpr size_t tensoroptions_arg_index = + guts::typelist::find_if< + gathered_parameter_types, + is_tensoroptions_arg_t + >::value; + + using parameters_before_tensoroptions = + guts::typelist::take_t; + using parameters_after_tensoroptions = + guts::typelist::drop_t; + + using wrapper = with_scattered_tensor_options_; +public: + using FuncPtr = TORCH_FN_TYPE(&wrapper::wrapper); +}; + +template +struct with_scattered_tensor_options_, guts::typelist::typelist> final { + static decltype(auto) wrapper( + ParametersBeforeTensorOptions... parameters_before, + optional scalar_type, + optional layout, + optional device, + optional pin_memory, + ParametersAfterTensorOptions... parameters_after) { + return (*FuncPtr::func_ptr())( + std::forward(parameters_before)..., + TensorOptions().dtype(scalar_type).device(device).layout(layout).pinned_memory(pin_memory), + std::forward(parameters_after)... 
+ ); + } +}; + +} + +template +constexpr auto hacky_wrapper_for_legacy_signatures(FuncPtr) { + return typename detail::with_scattered_tensor_options::FuncPtr(); +}; + +} +} diff --git a/aten/src/ATen/cwrap_parser.py b/aten/src/ATen/cwrap_parser.py index 0aaf9ec8f223b..27bbbd7140f80 100644 --- a/aten/src/ATen/cwrap_parser.py +++ b/aten/src/ATen/cwrap_parser.py @@ -1,4 +1,6 @@ import yaml +import copy + try: # use faster C loader if available from yaml import CLoader as Loader @@ -24,4 +26,13 @@ def parse(filename): declarations.append(declaration) elif in_declaration: declaration_lines.append(line) + declarations = [process_declaration(declaration) for declaration in declarations] return declarations + +def process_declaration(declaration): + declaration = copy.deepcopy(declaration) + if "arguments" in declaration: + declaration["schema_order_arguments"] = copy.deepcopy(declaration["arguments"]) + if "options" in declaration: + declaration["options"] = [process_declaration(option) for option in declaration["options"]] + return declaration diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 365e107d48251..bba3064eceeed 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -2,8 +2,10 @@ # "what has to be done to add a Operation ..." first! import re +import copy from code_template import CodeTemplate + from typing import Any, Dict, List, Optional, Set, Tuple, NamedTuple try: @@ -99,7 +101,8 @@ def TypedDict(name, attrs, total=True): # type: ignore """) DEFAULT_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", TORCH_FN(TypeDefault::${type_wrapper_name})); +m.impl("${unqual_operator_name_with_overload}", + c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(TypeDefault::${type_wrapper_name}))); """) # NB: In the ordinary, TypeDerived code generation work flow, specification @@ -119,7 +122,7 @@ def TypedDict(name, attrs, total=True): # type: ignore BACKEND_FUNCTION_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", torch::dispatch(DispatchKey::${Backend}, - TORCH_FN(${Type}::${type_wrapper_name})) + c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${Type}::${type_wrapper_name}))) ); """) @@ -129,7 +132,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add non-virtual declaration to Tensor.cpp -C10_TENSOR_METHOD_DEFINITION = CodeTemplate("""\ +TENSOR_METHOD_DEFINITION = CodeTemplate("""\ // ${schema_string} ${return_type} Tensor::${api_name}(${method_formals}) const { @@ -138,8 +141,8 @@ def TypedDict(name, attrs, total=True): # type: ignore #else static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${cpp_signature}>(); - return op.call(${method_actuals}); + .typed<${tensor_method_cpp_signature}>(); + return op.call(${tensor_method_actuals}); #endif } """) @@ -155,7 +158,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add method definition in Functions.h -C10_FUNCTION_DEFINITION = CodeTemplate("""\ +FUNCTION_DEFINITION = CodeTemplate("""\ // ${schema_string} ${return_type} ${api_name}(${formals}) { @@ -164,8 +167,8 @@ def TypedDict(name, attrs, total=True): # type: ignore #else static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${cpp_signature}>(); - return op.call(${actuals}); + .typed<${function_cpp_signature}>(); + return op.call(${function_actuals}); #endif } """) @@ 
-455,6 +458,7 @@ def __getitem__(self, x): FunctionOption = TypedDict('FunctionOption', { 'actuals': List[str], + 'schema_order_actuals': List[str], 'api_name': str, # Like api_name, but it is the name of the internal # CPUType/CUDAType/TypeDefault function that wraps @@ -462,6 +466,11 @@ def __getitem__(self, x): # visible and is mangled with the overload name 'type_wrapper_name': str, 'arguments': List[THFormal], + # 'schema_order_arguments' is like 'arguments' but keeps them in the + # order they are defined in the JIT function schema while + # 'arguments' does some modifications (e.g. reorders out arguments + # and packs TensorOptions) + 'schema_order_arguments': List[THFormal], 'backend_types': Dict[str, List[str]], 'backends': List[str], 'buffers': List[NNBuffer], @@ -490,12 +499,18 @@ def __getitem__(self, x): 'formals': List[str], 'formals_types': List[str], 'cpp_signature': str, + # 'schema_order_cpp_signature' is like 'cpp_signature' but keeps them in the + # order they are defined in the JIT function schema while + # 'cpp_signature' does some modifications (e.g. reorders out arguments + # and packs TensorOptions) + 'schema_order_cpp_signature': str, 'inplace': bool, 'matches_jit_signature': bool, # This controls whether or not we generate the interface in Type or # TypeExtendedInterface 'extended_method': bool, 'method_actuals': List[str], + 'schema_order_method_actuals': List[str], 'method_formals_with_defaults': List[str], 'method_formals': List[str], 'mode': str, @@ -527,6 +542,11 @@ def __getitem__(self, x): ('matches_jit_signature', bool), ('schema_string', str), ('arguments', List[AtFormal]), + # 'schema_order_arguments' is like 'arguments' but keeps them in the + # order they are defined in the JIT function schema while + # 'arguments' does some modifications (e.g. 
reorders out arguments + # and packs TensorOptions) + ('schema_order_arguments', List[AtFormal]), ('method_of', List[str]), ('mode', str), ('python_module', str), @@ -785,8 +805,8 @@ def translate_formal(argument, option): translated['is_nullable'] = argument['is_nullable'] return translated - def get_formals(option, include_constants=False): - # type: (FunctionOption, bool) -> List[AtFormal] + def get_formals(option, schema_order, include_constants=False): + # type: (FunctionOption, bool, bool) -> List[AtFormal] seen = set() # type: Set[str] pos_args = [] # type: List[THFormal] kwd_args = [] # type: List[THFormal] @@ -802,16 +822,20 @@ def has_output_mask(argument): # type: (THFormal) -> bool return argument.get('allocate', False) and argument.get('mask', False) - for argument in option['arguments']: + if schema_order: + arguments = copy.deepcopy(option['schema_order_arguments']) + else: + arguments = copy.deepcopy(option['arguments']) + for argument in arguments: if argument.get('output') and not argument.get('allocate', False): insert(argument) - for argument in option['arguments']: + for argument in arguments: if include_constants and argument['type'] == 'CONSTANT': insert(argument) elif is_real_argument_to_wrapper(argument): insert(argument) - if any(has_output_mask(arg) for arg in option['arguments']): - mask_size = sum(has_output_mask(arg) for arg in option['arguments']) + if any(has_output_mask(arg) for arg in arguments): + mask_size = sum(has_output_mask(arg) for arg in arguments) insert({ 'name': 'output_mask', # NB: Lack of space in comma works around parsing @@ -850,6 +874,20 @@ def format_return_type(return_types): return return_types[0]['type'] return "std::tuple<{}>".format(','.join(r['type'] for r in return_types)) + def process_schema_order_actual(schema_order_actual): + if schema_order_actual == 'dtype': + return 'optTypeMetaToScalarType(options.dtype_opt())' + elif schema_order_actual == 'layout': + return 'options.layout_opt()' + elif schema_order_actual == 'device': + return 'options.device_opt()' + elif schema_order_actual == 'pin_memory': + return 'options.pinned_memory_opt()' + elif schema_order_actual == 'memory_format': + return 'c10::impl::process_memory_format(options, memory_format)' + else: + return schema_order_actual + def process_legacy_th_option(option): # type: (FunctionOption) -> None # Mutably populate option with derived values computed from values @@ -858,7 +896,8 @@ def process_legacy_th_option(option): '(^__i|[^_]_$)', option['api_name']) is not None # print(yaml.dump(option)) - formals = get_formals(option) + formals = get_formals(option, False) + schema_order_formals = get_formals(option, True) option['formals_list'] = formals option['formals'] = [format_formal(f) for f in formals] option['formals_with_defaults'] = [formal_with_default(f) for f in formals] @@ -889,8 +928,8 @@ def process_legacy_th_option(option): assert option['extended_method'], 'Expected legacy operator to be an extended method' - def native_get_formals(option, include_constants=False): - # type: (FunctionOption, bool) -> List[AtFormal] + def native_get_formals(option, schema_order, include_constants=False): + # type: (FunctionOption, bool, bool) -> List[AtFormal] seen = set() # type: Set[str] pos_args = [] kwd_args = [] @@ -904,7 +943,11 @@ def insert(argument): else: pos_args.append(argument) - for argument in option['arguments']: + if schema_order: + arguments = option['schema_order_arguments'] + else: + arguments = option['arguments'] + for argument in arguments: 
insert(argument) # not clear we need dynamic_type translation as we can specify the correct type @@ -995,7 +1038,8 @@ def process_native(option): assert option['python_module'] == '' or option['python_module'] == 'nn', \ "Found python_module of {} for decl {}, but only \'\' string or \'nn\' are supported".format( option['python_module'], option['name']) - formals = native_get_formals(option) + formals = native_get_formals(option, False) + schema_order_formals = native_get_formals(option, True) option['formals_list'] = formals option['formals'] = [format_formal(f) for f in formals] option['formals_with_defaults'] = [formal_with_default(f) for f in formals] @@ -1003,10 +1047,14 @@ def process_native(option): option['return_type'] = format_return_type(option['returns']) option['return_call'] = 'return ' if option['return_type'] != 'void' else '' option['actuals'] = [f['name'] for f in formals] + option['schema_order_actuals'] = [f['name'] for f in schema_order_formals] option['formals_types'] = [f['type'] for f in option['formals_list']] option['cpp_signature'] = "{} ({})".format(option['return_type'], ", ".join(option['formals_types'])) + option['schema_order_cpp_signature'] = "{} ({})".format( + option['return_type'], + ", ".join([f['type'] for f in schema_order_formals])) option['method_formals'] = [format_formal(f) for f in formals if f['name'] != 'self'] @@ -1016,6 +1064,14 @@ def process_native(option): # be const_casted to be accepted as native function's non-const argument option['method_actuals'] = [ f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in formals] + option['schema_order_method_actuals'] = [ + f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in schema_order_formals] + + if find_formal_by_type('TensorOptions', formals) is not None: + option['schema_order_actuals'] = [ + process_schema_order_actual(actual) for actual in option['schema_order_actuals']] + option['schema_order_method_actuals'] = [ + process_schema_order_actual(actual) for actual in option['schema_order_method_actuals']] def gen_tensor_method(option, formals): # type: (Any, List[AtFormal]) -> FunctionCode @@ -1068,12 +1124,23 @@ def swizzle_self(f): # blegh static_dispatch_method_body = STATIC_DISPATCH_FUNCTION_DEFAULT_BODY.substitute( option, actuals=option['method_actuals']) - method_definition = C10_TENSOR_METHOD_DEFINITION + if option['use_c10_dispatcher'] == 'full': + tensor_method_actuals = option['schema_order_method_actuals'] + tensor_method_cpp_signature = option['schema_order_cpp_signature'] + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + tensor_method_actuals = option['method_actuals'] + tensor_method_cpp_signature = option['cpp_signature'] + + method_definition = TENSOR_METHOD_DEFINITION.substitute( + option, static_dispatch_method_body=static_dispatch_method_body, + tensor_method_actuals=tensor_method_actuals, + tensor_method_cpp_signature=tensor_method_cpp_signature + ) return FunctionCode( declaration=TENSOR_METHOD_DECLARATION.substitute( option, static_dispatch_method_body=static_dispatch_method_body), - definition=method_definition.substitute( - option, static_dispatch_method_body=static_dispatch_method_body)) + definition=method_definition) def gen_namespace_function(option, multidispatch_formals): # type: (Any, List[AtFormal]) -> FunctionCode @@ -1109,8 +1176,18 @@ def gen_namespace_function(option, multidispatch_formals): static_dispatch_function_body = STATIC_DISPATCH_FUNCTION_DEFAULT_BODY.substitute( option, 
actuals=option['actuals']) - fn_definition = C10_FUNCTION_DEFINITION.substitute( - option, static_dispatch_function_body=static_dispatch_function_body) + if option['use_c10_dispatcher'] == 'full': + function_actuals = option['schema_order_actuals'] + function_cpp_signature = option['schema_order_cpp_signature'] + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + function_actuals = option['actuals'] + function_cpp_signature = option['cpp_signature'] + + fn_definition = FUNCTION_DEFINITION.substitute( + option, static_dispatch_function_body=static_dispatch_function_body, + function_actuals=function_actuals, + function_cpp_signature=function_cpp_signature) return FunctionCode(definition=fn_definition, declaration=fn_declaration) @@ -1213,6 +1290,7 @@ def gen_namespace_function(option, multidispatch_formals): matches_jit_signature=option["matches_jit_signature"], schema_string=option["schema_string"], arguments=formals, + schema_order_arguments=schema_order_formals, method_of=method_of, mode=option['mode'], python_module=option['python_module'], diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index b676d61399bdb..d3605bf385aa3 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -492,6 +492,7 @@ def generate_outputs(): declarations += nn_parse.run(nn_files) declarations += native_parse.run(native_files) declarations = preprocess_declarations.run(declarations) + per_op_registrations = defaultdict(list) if options.per_op_registration else None schema_registrations = [] if options.force_schema_registration else None diff --git a/aten/src/ATen/gen_backend_select_register.py b/aten/src/ATen/gen_backend_select_register.py index faba2eb1770f0..820bdfe3ee6f5 100644 --- a/aten/src/ATen/gen_backend_select_register.py +++ b/aten/src/ATen/gen_backend_select_register.py @@ -24,21 +24,26 @@ GENERATED_COMMENT = CodeTemplate( "@" + "generated from ${filename}") -FUNCTION_REGISTRATION = CodeTemplate("""\ +UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ m.impl_UNBOXED("aten::${op_name_with_overload_name}", ${function_name}); """) +FUNCTION_REGISTRATION = CodeTemplate("""\ + m.impl("aten::${op_name_with_overload_name}", c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${function_name}))); +""") + FUNCTION_DEFINITION = CodeTemplate("""\ // ${schema_string} Tensor ${function_name}(${method_formals}) { static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${name}", "${overload_name}") - .typed<${cpp_signature}>(); + .typed<${function_cpp_signature}>(); ${dispatch_key_init} - return op.callWithDispatchKey(_dk, ${actuals}); + return op.callWithDispatchKey(_dk, ${function_actuals}); } """) + def needs_backend_select(declaration_option): # We register an op under the BackendSelect dispatch key # if a TensorOptions argument has been gathered from its declared args @@ -56,28 +61,39 @@ def register_backend_select_methods(declarations, template_path, file_manager): for decl in declarations: for option in decl["options"]: if needs_backend_select(option): - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - name = option['name'] op_name_with_overload_name = option['name'] if option.get('overload_name', '') != '': name = "{0}_{1}".format(name, option['overload_name']) op_name_with_overload_name = "{0}.{1}".format(op_name_with_overload_name, option['overload_name']) - func_reg = FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], - op_name_with_overload_name=op_name_with_overload_name, - 
function_name=name) + if option['use_c10_dispatcher'] == 'full': + func_reg = FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], + op_name_with_overload_name=op_name_with_overload_name, + function_name=name) + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + func_reg = UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], + op_name_with_overload_name=op_name_with_overload_name, + function_name=name) dispatch_key_init = gen_dispatch_key_init('_dk', option['formals_list']) + if option['use_c10_dispatcher'] == 'full': + function_cpp_signature = option['schema_order_cpp_signature'] + function_actuals = option['schema_order_actuals'] + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + function_cpp_signature = option['cpp_signature'] + function_actuals = option['actuals'] method_def = FUNCTION_DEFINITION.substitute(function_name=name, schema_string=option['schema_string'], method_formals=option['formals_with_defaults'], name=option['name'], overload_name=option['overload_name'], dispatch_key_init=dispatch_key_init, - cpp_signature=option['cpp_signature'], - actuals=option['actuals']) + function_cpp_signature=function_cpp_signature, + function_actuals=function_actuals) backend_select_function_registrations.append(func_reg) backend_select_method_definitions.append(method_def) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 3b571b78b6722..403260d795f0a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -177,6 +177,7 @@ CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) + use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) @@ -370,10 +371,13 @@ - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) @@ -491,8 +495,10 @@ CUDA: baddbmm_out_cuda - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor @@ -621,8 +627,10 @@ CPU, CUDA: logical_or_out - func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bmm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full @@ -729,6 +737,7 @@ variants: function - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) + use_c10_dispatcher: full variants: method - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor @@ -909,6 +918,7 @@ CUDA: cummin_helper_cuda - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -919,6 +929,7 @@ - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -1064,6 +1075,7 @@ device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_cpu CUDA: empty_cuda @@ -1072,16 +1084,20 @@ Vulkan: empty_vulkan - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method - func: new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method - func: new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method # other overrides are to provide a more helpful error message that dtype is required - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_affine_quantized_other_backends_stub QuantizedCPU, QuantizedCUDA: empty_affine_quantized @@ -1089,6 +1105,7 @@ # it's a factory function receiving a tensor argument, thus overriding explicitly # other overrides are to provide a more helpful error message that dtype is required - func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full category_override: factory dispatch: CPU: empty_per_channel_affine_quantized_other_backends_stub @@ -1103,9 +1120,11 @@ device_guard: False - func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? 
dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda @@ -1160,8 +1179,10 @@ device_guard: False - func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1241,12 +1262,15 @@ device_guard: False - func: full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: full.out(int[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) - func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: from_file @@ -1289,16 +1313,22 @@ CUDA: grid_sampler_3d_backward_cuda - func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1516,6 +1546,7 @@ use_c10_dispatcher: full - func: linspace(Scalar start, Scalar end, int steps=100, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: linspace.out(Scalar start, Scalar end, int steps=100, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1587,6 +1618,7 @@ variants: function, method - func: logspace(Scalar start, Scalar end, int steps=100, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: logspace.out(Scalar start, Scalar end, int steps=100, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1595,6 +1627,7 @@ # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. 
- func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -1707,12 +1740,14 @@ # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu @@ -2015,10 +2050,12 @@ device_guard: False - func: ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: ones.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor use_c10_dispatcher: full @@ -2113,6 +2150,7 @@ supports_named_tensor: True - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -2121,6 +2159,7 @@ device_guard: False - func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2129,12 +2168,15 @@ - func: rand.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint(int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.generator(int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: randint.low(int low, int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.low_generator(int low, int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2147,10 +2189,13 @@ - func: randint.low_generator_out(int low, int high, int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randn(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randn.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2165,8 +2210,10 @@ - func: randn.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2178,8 +2225,10 @@ CUDA: randperm_out_cuda - func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -2421,6 +2470,7 @@ # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -2511,9 +2561,11 @@ device_guard: False - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -2571,9 +2623,11 @@ - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -2887,10 +2941,12 @@ device_guard: False - func: zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: zeros.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full @@ -2940,11 +2996,13 @@ use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor use_c10_dispatcher: full - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full @@ -2983,6 +3041,7 @@ SparseCPU: log_softmax_backward_sparse_cpu - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor @@ -2990,6 +3049,7 @@ variants: function, method - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor @@ -3036,6 +3096,7 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clone @@ -3247,10 +3308,13 @@ - func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor dispatch: @@ -3427,16 +3491,19 @@ use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] + use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu @@ -3535,14 +3602,17 @@ device_guard: False - func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False @@ -3562,20 +3632,26 @@ variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType + use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool + use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType + use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 @@ -3645,8 +3721,10 @@ # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # Quantized GRU layers @@ -4195,11 +4273,13 @@ variants: method, function - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py index 71ab636718253..69b024c0b8349 100644 --- a/aten/src/ATen/native_parse.py +++ b/aten/src/ATen/native_parse.py @@ -147,7 +147,7 @@ def type_argument_translations(arg): return t, name, default, nullable, size, annotation -def parse_arguments(args, func_variants, declaration, func_return): +def parse_arguments(args): arguments = [] kwarg_only = False @@ -174,6 +174,9 @@ def parse_arguments(args, func_variants, declaration, func_return): argument_dict['kwarg_only'] = True arguments.append(argument_dict) + return arguments + +def process_arguments(arguments, func_variants, declaration, func_return): is_out_fn = False arguments_out = [] arguments_other = [] @@ -417,7 +420,8 @@ def run(paths): declaration['overload_name'] = func.get('overload_name', overload_name) declaration['inplace'] = re.search('(^__i|[^_]_$)', fn_name) is not None return_arguments = parse_return_arguments(return_decl, declaration['inplace'], func) - arguments = parse_arguments(arguments, func.get('variants', []), declaration, return_arguments) + schema_order_arguments = parse_arguments(arguments) + arguments = process_arguments(schema_order_arguments, func.get('variants', []), declaration, return_arguments) output_arguments = [x for x in arguments if x.get('output')] propagate_field_names(output_arguments, return_arguments) declaration['return'] = return_arguments if len(output_arguments) == 0 else output_arguments @@ -435,6 +439,7 @@ def run(paths): declaration['manual_kernel_registration'] = func.get('manual_kernel_registration', False) declaration['category_override'] = func.get('category_override', '') declaration['arguments'] = func.get('arguments', arguments) + declaration['schema_order_arguments'] = func.get('schema_order_arguments', schema_order_arguments) declaration['type_method_definition_dispatch'] = \ parse_dispatch(fn_name, func.get('dispatch', declaration['name'])) declaration['python_module'] = func.get('python_module', '') diff --git a/aten/src/ATen/nn_parse.py b/aten/src/ATen/nn_parse.py index 30e9dbc72748a..33d78abf61d75 100644 --- a/aten/src/ATen/nn_parse.py +++ b/aten/src/ATen/nn_parse.py @@ -232,6 +232,7 @@ def function_info(name, arguments, cimpls, buffers, backends, inplace, backend_t 'BFloat16' in backend_types['CUDA'] else False, 'backend_types': backend_types, 'arguments': arguments, + 'schema_order_arguments': copy.deepcopy(arguments), 'return': 'argument 0' if inplace else get_return(arguments), 'buffers': buffers, 'backends': backends, diff --git a/aten/src/ATen/templates/BackendSelectRegister.cpp b/aten/src/ATen/templates/BackendSelectRegister.cpp index bcbf25f3117ff..db7276913201e 100644 --- a/aten/src/ATen/templates/BackendSelectRegister.cpp +++ b/aten/src/ATen/templates/BackendSelectRegister.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace at { diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index a41816a3a613d..5bc88199bf424 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace at { diff --git a/aten/src/ATen/templates/PerOpRegistration.cpp b/aten/src/ATen/templates/PerOpRegistration.cpp index 53ed0ceb3bca3..72ac3d784dade 100644 --- a/aten/src/ATen/templates/PerOpRegistration.cpp +++ b/aten/src/ATen/templates/PerOpRegistration.cpp @@ -3,6 
+3,7 @@ #include #include #include +#include $extra_headers namespace at { diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 0035c1db37999..6883e0b216881 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index a85576d0ad933..89f99d85658c9 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace { diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index ea18d5ce782f2..ede79cf218612 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -29,6 +29,7 @@ #include #include +#include #include $extra_cuda_headers $legacy_th_headers diff --git a/aten/src/ATen/test/extension_backend_test.cpp b/aten/src/ATen/test/extension_backend_test.cpp index fd2d3c0045ee6..eab1fe913adf9 100644 --- a/aten/src/ATen/test/extension_backend_test.cpp +++ b/aten/src/ATen/test/extension_backend_test.cpp @@ -10,7 +10,7 @@ using namespace at; static int test_int; -Tensor empty_override(IntArrayRef size, const TensorOptions & options, c10::optional optional_memory_format) { +Tensor empty_override(IntArrayRef size, c10::optional scalar_type, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format) { test_int = 1; auto tensor_impl = c10::make_intrusive( Storage( diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 95ae2a3c0e04f..d55d3df538736 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -186,6 +186,13 @@ static inline ScalarType typeMetaToScalarType(caffe2::TypeMeta dtype) { "Unsupported TypeMeta in ATen: ", dtype, " (please report this error)"); } +inline optional optTypeMetaToScalarType(optional type_meta) { + if (!type_meta.has_value()) { + return c10::nullopt; + } + return typeMetaToScalarType(*type_meta); +} + static inline bool operator==(ScalarType t, caffe2::TypeMeta m) { if (auto mt = tryTypeMetaToScalarType(m)) { return (*mt) == t; diff --git a/c10/test/util/TypeList_test.cpp b/c10/test/util/TypeList_test.cpp index 59ce6f2b8981d..5721cb8aca888 100644 --- a/c10/test/util/TypeList_test.cpp +++ b/c10/test/util/TypeList_test.cpp @@ -152,3 +152,21 @@ namespace test_contains { static_assert(!contains, float>::value, ""); static_assert(!contains, double>::value, ""); } + +namespace test_take { + static_assert(std::is_same, take_t, 0>>::value, ""); + static_assert(std::is_same, take_t, 0>>::value, ""); + static_assert(std::is_same, take_t, 1>>::value, ""); + static_assert(std::is_same, take_t, 0>>::value, ""); + static_assert(std::is_same, take_t, 1>>::value, ""); + static_assert(std::is_same, take_t, 2>>::value, ""); +} + +namespace test_drop { + static_assert(std::is_same, drop_t, 0>>::value, ""); + static_assert(std::is_same, drop_t, 0>>::value, ""); + static_assert(std::is_same, drop_t, 1>>::value, ""); + static_assert(std::is_same, drop_t, 0>>::value, ""); + static_assert(std::is_same, drop_t, 1>>::value, ""); + static_assert(std::is_same, drop_t, 2>>::value, ""); +} diff --git a/c10/util/TypeList.h b/c10/util/TypeList.h index 7a05aaf094a28..424ede4532603 100644 --- a/c10/util/TypeList.h +++ b/c10/util/TypeList.h @@ -250,6 +250,37 @@ static_assert( 
std::is_same>>::value, ""); +/** + * Take/drop a number of arguments from a typelist. + * Example: + * typelist == take_t, 2> + * typelist == drop_t, 2> + */ +namespace detail { + template + struct take_elements final {}; + + template + struct take_elements> final { + using type = typelist::type...>; + }; +} + +template struct take final { + static_assert(is_instantiation_of::value, "In typelist::take, the T argument must be typelist<...>."); + static_assert(num <= size::value, "Tried to typelist::take more elements than there are in the list"); + using type = typename detail::take_elements>::type; +}; +template using take_t = typename take::type; + +template struct drop final { + static_assert(is_instantiation_of::value, "In typelist::drop, the T argument must be typelist<...>."); + static_assert(num <= size::value, "Tried to typelist::drop more elements than there are in the list"); + using type = typename detail::take_elements::value - num>>::type; +}; +template using drop_t = typename drop::type; + + /** * Reverses a typelist. * Example: @@ -288,8 +319,6 @@ struct find_if, Condition, std::enable_if_t, Condition>::value; }; - - /** * Maps a list of types into a list of values. * Examples: diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/msnpu_extension.cpp index a308bb0b15c12..c2dfac7d6b6ed 100644 --- a/test/cpp_extensions/msnpu_extension.cpp +++ b/test/cpp_extensions/msnpu_extension.cpp @@ -20,9 +20,13 @@ Tensor get_tensor(caffe2::TypeMeta dtype, IntArrayRef size) { return Tensor(std::move(tensor_impl)); } -Tensor empty_override(IntArrayRef size, const TensorOptions& options, c10::optional optional_memory_format) { +Tensor empty_override(IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { test_int = 0; - return get_tensor(options.dtype(), size); + caffe2::TypeMeta typeMeta; + if (dtype.has_value()) { + typeMeta = scalarTypeToTypeMeta(*dtype); + } + return get_tensor(typeMeta, size); } Tensor add_override(const Tensor & a, const Tensor & b , Scalar c) { diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index b5bd05ccdabe6..b6ac180460286 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -101,6 +101,26 @@ def get_simple_type(arg): simple_type = '{}?'.format(opt_match.group(1)) return simple_type +def has_tensoroptions_argument(declaration): + for argument in declaration['arguments']: + if 'TensorOptions' == argument['dynamic_type']: + return True + return False + +def process_schema_order_arg(schema_order_arg): + if schema_order_arg == 'dtype': + return 'optTypeMetaToScalarType(options.dtype_opt())' + elif schema_order_arg == 'layout': + return 'options.layout_opt()' + elif schema_order_arg == 'device': + return 'options.device_opt()' + elif schema_order_arg == 'pin_memory': + return 'options.pinned_memory_opt()' + elif schema_order_arg == 'memory_format': + return 'c10::impl::process_memory_format(options, memory_format)' + else: + return schema_order_arg + def load_aten_declarations(path): with open(path, 'r') as f: @@ -119,7 +139,12 @@ def load_aten_declarations(path): declaration['formals'] = [arg['type'] + ' ' + arg['name'] for arg in declaration['arguments']] + declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] + for arg in declaration['schema_order_arguments']] declaration['args'] = [arg['name'] for arg in declaration['arguments']] + declaration['schema_order_args'] = [arg['name'] 
for arg in declaration['schema_order_arguments']] + if has_tensoroptions_argument(declaration): + declaration['schema_order_args'] = [process_schema_order_arg(arg) for arg in declaration['schema_order_args']] declaration['api_name'] = declaration['name'] # NB: keep this in sync with common_with_cwrap.py if declaration.get('overload_name'): diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 774f358be8ee9..68d91a8d2b7f0 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -194,7 +194,9 @@ """) WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name})); +m.impl("${unqual_operator_name_with_overload}", + c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${class_type}::${type_wrapper_name})) +); """) UNPACK_TENSOR = CodeTemplate("""\ @@ -339,27 +341,41 @@ # Generate a file that lists all functions and their schema string. Used for XLA REGISTRATION_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${formals}); // {"schema": "${schema_string}", "compound": "${compound}"} +${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"} """) # ProfiledType templates -PROFILE_DISPATCH_UNBOXED = CodeTemplate("""\ +UNBOXED_PROFILE_DISPATCH = CodeTemplate("""\ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${return_type} (${arg_types})>(); + .typed<${return_type} (${profiled_arg_types})>(); +RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr()); +return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args}); +""") +PROFILE_DISPATCH = CodeTemplate("""\ +static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") + .typed<${return_type} (${profiled_arg_types})>(); RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr()); -return c10::Dispatcher::singleton().redispatch<${ret_and_arg_types}>(${profiled_dispatch_args}); +return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args}); """) # TraceType templates # TODO: change `redispatch` to `NoTracerDispatchMode` + regular `call`. 
diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 774f358be8ee9..68d91a8d2b7f0 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -194,7 +194,9 @@
 """)
 
 WRAPPER_REGISTRATION = CodeTemplate("""\
-m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}));
+m.impl("${unqual_operator_name_with_overload}",
+       c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${class_type}::${type_wrapper_name}))
+);
 """)
 
 UNPACK_TENSOR = CodeTemplate("""\
@@ -339,27 +341,41 @@
 
 # Generate a file that lists all functions and their schema string. Used for XLA
 REGISTRATION_DECLARATION = CodeTemplate("""\
-${return_type} ${api_name}(${formals}); // {"schema": "${schema_string}", "compound": "${compound}"}
+${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"}
 """)
 
 # ProfiledType templates
-PROFILE_DISPATCH_UNBOXED = CodeTemplate("""\
+UNBOXED_PROFILE_DISPATCH = CodeTemplate("""\
 static auto op = c10::Dispatcher::singleton()
     .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
-    .typed<${return_type} (${arg_types})>();
+    .typed<${return_type} (${profiled_arg_types})>();
+RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr());
+return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args});
+""")
+PROFILE_DISPATCH = CodeTemplate("""\
+static auto op = c10::Dispatcher::singleton()
+    .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
+    .typed<${return_type} (${profiled_arg_types})>();
 RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr());
-return c10::Dispatcher::singleton().redispatch<${ret_and_arg_types}>(${profiled_dispatch_args});
+return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args});
 """)
 
 # TraceType templates
 # TODO: change `redispatch` to `NoTracerDispatchMode` + regular `call`.
-TRACE_DISPATCH_UNBOXED = CodeTemplate("""\
+UNBOXED_TRACE_DISPATCH = CodeTemplate("""\
 static auto op = c10::Dispatcher::singleton()
     .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
     .typed<${return_type} (${arg_types})>();
 ${assign_return_values}c10::Dispatcher::singleton().redispatch<${ret_and_arg_types}>(${trace_dispatch_args});
 """)
+TRACE_DISPATCH = CodeTemplate("""\
+static auto op = c10::Dispatcher::singleton()
+    .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
+    .typed<${return_type} (${schema_order_arg_types})>();
+${assign_return_values}c10::Dispatcher::singleton()
+    .redispatch<${schema_order_ret_and_arg_types}>(${schema_order_trace_dispatch_args});
+""")
 
 FACTORY_FUNCTION_NAMES = None
 
@@ -598,10 +614,21 @@ def gen_variable_type(out, aten_declarations, template_path):
     registration_declarations = []
 
     for declaration in aten_declarations:
+        if declaration['use_c10_dispatcher'] == 'full':
+            declaration_formals = declaration['schema_order_formals']
+        else:
+            assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
+            declaration_formals = declaration['formals']
         if dispatch_strategy(declaration) == 'use_derived':
-            registration_declarations.append(REGISTRATION_DECLARATION.substitute(declaration, compound='false'))
+            registration_declarations.append(
+                REGISTRATION_DECLARATION.substitute(declaration,
+                                                    declaration_formals=declaration_formals,
+                                                    compound='false'))
         else:
-            registration_declarations.append(REGISTRATION_DECLARATION.substitute(declaration, compound='true'))
+            registration_declarations.append(
+                REGISTRATION_DECLARATION.substitute(declaration,
+                                                    declaration_formals=declaration_formals,
+                                                    compound='true'))
 
     env = {
         'registration_declarations': registration_declarations,
@@ -647,6 +674,7 @@ def gen_variable_type_shard(out, aten_declarations, template_path, suffix, heade
             profiled_wrapper_registrations.append(WRAPPER_REGISTRATION.substitute(
                 declaration, class_type='ProfiledType'))
         else:
+            assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
            profiled_wrapper_registrations.append(UNBOXEDONLY_WRAPPER_REGISTRATION.substitute(
                 declaration, class_type='ProfiledType'))
 
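To make the template changes above concrete, this is roughly what PROFILE_DISPATCH expands to for a c10-full operator: the .typed<> signature and the redispatch template arguments use the schema-order (scattered) types rather than the legacy gathered ones. Everything below is a hand-written illustration for a hypothetical operator, not literal generated output, and it omits the RECORD_FUNCTION line the real template emits:

```cpp
// Illustrative expansion of PROFILE_DISPATCH for a hypothetical
// "aten::my_empty.memory_format"; not actual codegen output.
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>

at::Tensor my_empty_profiled(
    at::IntArrayRef size,
    c10::optional<at::ScalarType> dtype,
    c10::optional<at::Layout> layout,
    c10::optional<at::Device> device,
    c10::optional<bool> pin_memory,
    c10::optional<at::MemoryFormat> memory_format) {
  // The typed handle is looked up once, with the schema-order argument types.
  static auto op = c10::Dispatcher::singleton()
      .findSchemaOrThrow("aten::my_empty", "memory_format")
      .typed<at::Tensor(at::IntArrayRef,
                        c10::optional<at::ScalarType>, c10::optional<at::Layout>,
                        c10::optional<at::Device>, c10::optional<bool>,
                        c10::optional<at::MemoryFormat>)>();
  // (The real template also emits a RECORD_FUNCTION(...) call here.)
  return c10::Dispatcher::singleton().redispatch<
      at::Tensor, at::IntArrayRef,
      c10::optional<at::ScalarType>, c10::optional<at::Layout>,
      c10::optional<at::Device>, c10::optional<bool>, c10::optional<at::MemoryFormat>>(
      op, c10::DispatchKey::Profiler, size, dtype, layout, device, pin_memory, memory_format);
}
```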
@@ -696,6 +724,9 @@ def emit_profiled_body(declaration):
     arg_types = ', '.join([a['type'] for a in declaration['arguments']])
     ret_and_arg_types = ', '.join([declaration['return_type']] + [a['type'] for a in declaration['arguments']])
+    schema_order_arg_types = ', '.join([a['type'] for a in declaration['schema_order_arguments']])
+    schema_order_ret_and_arg_types = ', '.join(
+        [declaration['return_type']] + [a['type'] for a in declaration['schema_order_arguments']])
 
     def check_record_function_input_type(simple_type):
         return simple_type in ['Tensor', 'Scalar']
@@ -706,14 +737,25 @@ def record_function_input_names():
                           if check_record_function_input_type(arg['simple_type'])])
 
     profiled_dispatch_args = ['op', 'c10::DispatchKey::Profiler'] + declaration['args']
+    schema_order_profiled_dispatch_args = ['op', 'c10::DispatchKey::Profiler'] + declaration['schema_order_args']
+
+    if declaration['use_c10_dispatcher'] == 'full':
+        profiled_arg_types = schema_order_arg_types
+        profiled_ret_and_arg_types = schema_order_ret_and_arg_types
+        profiled_dispatch_args = schema_order_profiled_dispatch_args
+    else:
+        assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
+        profiled_arg_types = arg_types
+        profiled_ret_and_arg_types = ret_and_arg_types
+        profiled_dispatch_args = profiled_dispatch_args
 
-    call = PROFILE_DISPATCH_UNBOXED.substitute(
+    call = PROFILE_DISPATCH.substitute(
         declaration,
         name=name,
         input_names=record_function_input_names(),
         return_type=declaration['return_type'],
-        arg_types=arg_types,
-        ret_and_arg_types=ret_and_arg_types,
+        profiled_arg_types=profiled_arg_types,
+        profiled_ret_and_arg_types=profiled_ret_and_arg_types,
         profiled_dispatch_args=profiled_dispatch_args,
     )
 
@@ -737,16 +779,30 @@ def emit_trace_body(declaration):
 
     arg_types = ', '.join([a['type'] for a in declaration['arguments']])
     ret_and_arg_types = ', '.join([declaration['return_type']] + [a['type'] for a in declaration['arguments']])
+    schema_order_arg_types = ', '.join([a['type'] for a in declaration['schema_order_arguments']])
+    schema_order_ret_and_arg_types = ', '.join(
+        [declaration['return_type']] + [a['type'] for a in declaration['schema_order_arguments']])
     trace_dispatch_args = ['op', 'c10::DispatchKey::Tracer'] + declaration['args']
+    schema_order_trace_dispatch_args = ['op', 'c10::DispatchKey::Tracer'] + declaration['schema_order_args']
     assign_return_values = '{} = '.format(tie_return_values) if not modifies_arguments and not returns_void else ''
 
-    call = TRACE_DISPATCH_UNBOXED.substitute(
-        declaration,
-        arg_types=arg_types,
-        ret_and_arg_types=ret_and_arg_types,
-        trace_dispatch_args=trace_dispatch_args,
-        assign_return_values=assign_return_values,
-    )
+    if declaration['use_c10_dispatcher'] == 'full':
+        call = TRACE_DISPATCH.substitute(
+            declaration,
+            schema_order_arg_types=schema_order_arg_types,
+            assign_return_values=assign_return_values,
+            schema_order_ret_and_arg_types=schema_order_ret_and_arg_types,
+            schema_order_trace_dispatch_args=schema_order_trace_dispatch_args,
+        )
+    else:
+        assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
+        call = UNBOXED_TRACE_DISPATCH.substitute(
+            declaration,
+            arg_types=arg_types,
+            ret_and_arg_types=ret_and_arg_types,
+            trace_dispatch_args=trace_dispatch_args,
+            assign_return_values=assign_return_values,
+        )
     trace_body.append(call)
     trace_body.append(post_record_trace)
     if not returns_void:
diff --git a/tools/autograd/templates/ProfiledType.cpp b/tools/autograd/templates/ProfiledType.cpp
index f2e3cae2d36a4..3c72dcabe9694 100644
--- a/tools/autograd/templates/ProfiledType.cpp
+++ b/tools/autograd/templates/ProfiledType.cpp
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include <ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h>
 
 #include "torch/csrc/autograd/function.h"
 
diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp
index 7c91f1f64ef64..1b4f8146c557d 100644
--- a/tools/autograd/templates/VariableType.cpp
+++ b/tools/autograd/templates/VariableType.cpp
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include <ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h>
 
 // ${generated_comment}
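The new include in ProfiledType.cpp and VariableType.cpp is there so that WRAPPER_REGISTRATION can route every kernel through c10::impl::hacky_wrapper_for_legacy_signatures. The sketch below mirrors that registration pattern for a backend kernel that has not been migrated yet; the function name, the MSNPU dispatch key choice, and the TORCH_LIBRARY_IMPL usage are illustrative assumptions, not taken from the patch:

```cpp
// Sketch: wrapping a legacy, gathered-TensorOptions kernel so the dispatcher
// sees the scattered, schema-order signature. Not part of the patch.
#include <ATen/ATen.h>
#include <ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h>
#include <torch/library.h>

// A kernel still written against the legacy signature.
at::Tensor legacy_empty(
    at::IntArrayRef size,
    const at::TensorOptions& options,
    c10::optional<at::MemoryFormat> memory_format) {
  return at::empty(size, options, memory_format);
}

TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
  // The wrapper is a no-op for kernels that already take scattered arguments
  // (like empty_override above), so kernels can be migrated one at a time.
  m.impl("empty.memory_format",
         c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(legacy_empty)));
}
```

Keeping the wrapper at the registration site means the operator library and the dispatcher only ever see the scattered schema, while individual kernels opt in to the new signature at their own pace.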