diff --git a/aten/src/ATen/common_with_cwrap.py b/aten/src/ATen/common_with_cwrap.py index 6b8e0cd7b8211..ab50de1769b09 100644 --- a/aten/src/ATen/common_with_cwrap.py +++ b/aten/src/ATen/common_with_cwrap.py @@ -1,6 +1,7 @@ # this code should be common among cwrap and ATen preprocessing # for now, I have put it in one place but right now is copied out of cwrap +import copy def parse_arguments(args): new_args = [] @@ -50,11 +51,16 @@ def set_declaration_defaults(declaration): declaration['unqual_operator_name_with_overload'] = '' # Simulate multiple dispatch, even if it's not necessary if 'options' not in declaration: - declaration['options'] = [{'arguments': declaration['arguments']}] + declaration['options'] = [{ + 'arguments': copy.deepcopy(declaration['arguments']), + 'schema_order_arguments': copy.deepcopy(declaration['schema_order_arguments']), + }] del declaration['arguments'] + del declaration['schema_order_arguments'] # Parse arguments (some of them can be strings) for option in declaration['options']: option['arguments'] = parse_arguments(option['arguments']) + option['schema_order_arguments'] = parse_arguments(option['schema_order_arguments']) # Propagate defaults from declaration to options for option in declaration['options']: for k, v in declaration.items(): diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 84617b867d68d..f840c1ffe03f9 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -42,7 +42,10 @@ using supported_primitive_arg_types = guts::typelist::typelist< at::Tensor, at::Scalar, c10::QScheme, - c10::ScalarType + c10::ScalarType, + c10::Device, + c10::Layout, + c10::MemoryFormat >; template struct assert_is_valid_input_type { diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index d8f4a73df4271..d23f87d04a799 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1517,9 +1517,12 @@ namespace detail { template struct getTypePtr_ final { static TypePtr call() { - if (!isCustomClassRegistered()) { - throw c10::Error("Type could not be converted to any of the known types.", ""); - } + TORCH_CHECK( + isCustomClassRegistered(), + "Type ", + c10::util::get_fully_qualified_type_name(), + " could not be converted to any of the known types." + ); auto res = getCustomClassType(); return std::dynamic_pointer_cast(std::move(res)); } @@ -1557,6 +1560,24 @@ struct getTypePtr_ final { } }; template <> +struct getTypePtr_ final { + static TypePtr call() { + return DeviceObjType::get(); + } +}; +template <> +struct getTypePtr_ final { + static TypePtr call() { + return IntType::get(); + } +}; +template <> +struct getTypePtr_ final { + static TypePtr call() { + return IntType::get(); + } +}; +template <> struct getTypePtr_ final { static TypePtr call() { return BoolType::get(); diff --git a/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h b/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h new file mode 100644 index 0000000000000..719976e5ed2ee --- /dev/null +++ b/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include +#include + +// This file defines hacky_wrapper_for_legacy_signatures, which takes a kernel written in a legacy way +// (e.g. 
with TensorOptions packed) and wraps it into a kernel with the signature expected by +// the PyTorch operator library. The intention is to ultimately rewrite kernels to take the new signature +// and then delete this file. This transition process can happen kernel-by-kernel, since this wrapper +// is a no-op for kernels that already have a non-legacy signature. + +namespace c10 { +namespace impl { + +inline c10::optional process_memory_format(const TensorOptions& options, c10::optional memory_format) { + TORCH_CHECK( + !(options.has_memory_format() && memory_format.has_value()), + "Cannot set memory_format both in TensorOptions and explicit argument; please delete " + "the redundant setter."); + if (memory_format.has_value()) { + return memory_format; + } else { + return options.memory_format_opt(); + } +} + +namespace detail { + +// with_scattered_tensor_options takes a function pointer that potentially takes a TensorOptions argument. +// If it does, then it creates a new function pointer that takes scattered arguments, internally +// gathers those arguments, and then calls the underlying function pointer. If the underlying +// function pointer does not take a TensorOptions argument, it is passed through unmodified. + +template struct is_tensoroptions_arg : std::false_type {}; +template struct is_tensoroptions_arg>::value>> : std::true_type {}; +template +using is_tensoroptions_arg_t = typename is_tensoroptions_arg::type; + +template +inline constexpr bool has_tensoroptions_arg() { + using parameter_types = typename guts::infer_function_traits_t::parameter_types; + constexpr size_t num_tensoroptions_args = guts::typelist::count_if::value; + static_assert(num_tensoroptions_args <= 1, "Function has multiple TensorOptions parameters. We support at most one."); + return num_tensoroptions_args > 0; +} + +// sanity checks +static_assert(has_tensoroptions_arg(), ""); +static_assert(has_tensoroptions_arg(), ""); +static_assert(!has_tensoroptions_arg(), ""); + +template struct with_scattered_tensor_options_; + +template +struct with_scattered_tensor_options final {}; + +template +struct with_scattered_tensor_options()>> final { + // FuncType does not have TensorOptions arguments. + // Don't wrap anything but just return the base pointer. + using FuncPtr = UnderlyingFuncPtr; +}; + +template +struct with_scattered_tensor_options()>> final { +private: + // FuncType has TensorOptions arguments. + // Return a function pointer to a wrapper function that replaces those with expanded arguments. + using gathered_parameter_types = typename guts::infer_function_traits_t::parameter_types; + static constexpr size_t tensoroptions_arg_index = + guts::typelist::find_if< + gathered_parameter_types, + is_tensoroptions_arg_t + >::value; + + using parameters_before_tensoroptions = + guts::typelist::take_t; + using parameters_after_tensoroptions = + guts::typelist::drop_t; + + using wrapper = with_scattered_tensor_options_; +public: + using FuncPtr = TORCH_FN_TYPE(&wrapper::wrapper); +}; + +template +struct with_scattered_tensor_options_, guts::typelist::typelist> final { + static decltype(auto) wrapper( + ParametersBeforeTensorOptions... parameters_before, + optional scalar_type, + optional layout, + optional device, + optional pin_memory, + ParametersAfterTensorOptions... parameters_after) { + return (*FuncPtr::func_ptr())( + std::forward(parameters_before)..., + TensorOptions().dtype(scalar_type).device(device).layout(layout).pinned_memory(pin_memory), + std::forward(parameters_after)... 
+ ); + } +}; + +} + +template +constexpr auto hacky_wrapper_for_legacy_signatures(FuncPtr) { + return typename detail::with_scattered_tensor_options::FuncPtr(); +}; + +} +} diff --git a/aten/src/ATen/cwrap_parser.py b/aten/src/ATen/cwrap_parser.py index 0aaf9ec8f223b..27bbbd7140f80 100644 --- a/aten/src/ATen/cwrap_parser.py +++ b/aten/src/ATen/cwrap_parser.py @@ -1,4 +1,6 @@ import yaml +import copy + try: # use faster C loader if available from yaml import CLoader as Loader @@ -24,4 +26,13 @@ def parse(filename): declarations.append(declaration) elif in_declaration: declaration_lines.append(line) + declarations = [process_declaration(declaration) for declaration in declarations] return declarations + +def process_declaration(declaration): + declaration = copy.deepcopy(declaration) + if "arguments" in declaration: + declaration["schema_order_arguments"] = copy.deepcopy(declaration["arguments"]) + if "options" in declaration: + declaration["options"] = [process_declaration(option) for option in declaration["options"]] + return declaration diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 365e107d48251..bba3064eceeed 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -2,8 +2,10 @@ # "what has to be done to add a Operation ..." first! import re +import copy from code_template import CodeTemplate + from typing import Any, Dict, List, Optional, Set, Tuple, NamedTuple try: @@ -99,7 +101,8 @@ def TypedDict(name, attrs, total=True): # type: ignore """) DEFAULT_FUNCTION_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", TORCH_FN(TypeDefault::${type_wrapper_name})); +m.impl("${unqual_operator_name_with_overload}", + c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(TypeDefault::${type_wrapper_name}))); """) # NB: In the ordinary, TypeDerived code generation work flow, specification @@ -119,7 +122,7 @@ def TypedDict(name, attrs, total=True): # type: ignore BACKEND_FUNCTION_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", torch::dispatch(DispatchKey::${Backend}, - TORCH_FN(${Type}::${type_wrapper_name})) + c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${Type}::${type_wrapper_name}))) ); """) @@ -129,7 +132,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add non-virtual declaration to Tensor.cpp -C10_TENSOR_METHOD_DEFINITION = CodeTemplate("""\ +TENSOR_METHOD_DEFINITION = CodeTemplate("""\ // ${schema_string} ${return_type} Tensor::${api_name}(${method_formals}) const { @@ -138,8 +141,8 @@ def TypedDict(name, attrs, total=True): # type: ignore #else static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${cpp_signature}>(); - return op.call(${method_actuals}); + .typed<${tensor_method_cpp_signature}>(); + return op.call(${tensor_method_actuals}); #endif } """) @@ -155,7 +158,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add method definition in Functions.h -C10_FUNCTION_DEFINITION = CodeTemplate("""\ +FUNCTION_DEFINITION = CodeTemplate("""\ // ${schema_string} ${return_type} ${api_name}(${formals}) { @@ -164,8 +167,8 @@ def TypedDict(name, attrs, total=True): # type: ignore #else static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${cpp_signature}>(); - return op.call(${actuals}); + .typed<${function_cpp_signature}>(); + return op.call(${function_actuals}); #endif } """) @@ 
-455,6 +458,7 @@ def __getitem__(self, x): FunctionOption = TypedDict('FunctionOption', { 'actuals': List[str], + 'schema_order_actuals': List[str], 'api_name': str, # Like api_name, but it is the name of the internal # CPUType/CUDAType/TypeDefault function that wraps @@ -462,6 +466,11 @@ def __getitem__(self, x): # visible and is mangled with the overload name 'type_wrapper_name': str, 'arguments': List[THFormal], + # 'schema_order_arguments' is like 'arguments' but keeps them in the + # order they are defined in the JIT function schema while + # 'arguments' does some modifications (e.g. reorders out arguments + # and packs TensorOptions) + 'schema_order_arguments': List[THFormal], 'backend_types': Dict[str, List[str]], 'backends': List[str], 'buffers': List[NNBuffer], @@ -490,12 +499,18 @@ def __getitem__(self, x): 'formals': List[str], 'formals_types': List[str], 'cpp_signature': str, + # 'schema_order_cpp_signature' is like 'cpp_signature' but keeps them in the + # order they are defined in the JIT function schema while + # 'cpp_signature' does some modifications (e.g. reorders out arguments + # and packs TensorOptions) + 'schema_order_cpp_signature': str, 'inplace': bool, 'matches_jit_signature': bool, # This controls whether or not we generate the interface in Type or # TypeExtendedInterface 'extended_method': bool, 'method_actuals': List[str], + 'schema_order_method_actuals': List[str], 'method_formals_with_defaults': List[str], 'method_formals': List[str], 'mode': str, @@ -527,6 +542,11 @@ def __getitem__(self, x): ('matches_jit_signature', bool), ('schema_string', str), ('arguments', List[AtFormal]), + # 'schema_order_arguments' is like 'arguments' but keeps them in the + # order they are defined in the JIT function schema while + # 'arguments' does some modifications (e.g. 
reorders out arguments + # and packs TensorOptions) + ('schema_order_arguments', List[AtFormal]), ('method_of', List[str]), ('mode', str), ('python_module', str), @@ -785,8 +805,8 @@ def translate_formal(argument, option): translated['is_nullable'] = argument['is_nullable'] return translated - def get_formals(option, include_constants=False): - # type: (FunctionOption, bool) -> List[AtFormal] + def get_formals(option, schema_order, include_constants=False): + # type: (FunctionOption, bool, bool) -> List[AtFormal] seen = set() # type: Set[str] pos_args = [] # type: List[THFormal] kwd_args = [] # type: List[THFormal] @@ -802,16 +822,20 @@ def has_output_mask(argument): # type: (THFormal) -> bool return argument.get('allocate', False) and argument.get('mask', False) - for argument in option['arguments']: + if schema_order: + arguments = copy.deepcopy(option['schema_order_arguments']) + else: + arguments = copy.deepcopy(option['arguments']) + for argument in arguments: if argument.get('output') and not argument.get('allocate', False): insert(argument) - for argument in option['arguments']: + for argument in arguments: if include_constants and argument['type'] == 'CONSTANT': insert(argument) elif is_real_argument_to_wrapper(argument): insert(argument) - if any(has_output_mask(arg) for arg in option['arguments']): - mask_size = sum(has_output_mask(arg) for arg in option['arguments']) + if any(has_output_mask(arg) for arg in arguments): + mask_size = sum(has_output_mask(arg) for arg in arguments) insert({ 'name': 'output_mask', # NB: Lack of space in comma works around parsing @@ -850,6 +874,20 @@ def format_return_type(return_types): return return_types[0]['type'] return "std::tuple<{}>".format(','.join(r['type'] for r in return_types)) + def process_schema_order_actual(schema_order_actual): + if schema_order_actual == 'dtype': + return 'optTypeMetaToScalarType(options.dtype_opt())' + elif schema_order_actual == 'layout': + return 'options.layout_opt()' + elif schema_order_actual == 'device': + return 'options.device_opt()' + elif schema_order_actual == 'pin_memory': + return 'options.pinned_memory_opt()' + elif schema_order_actual == 'memory_format': + return 'c10::impl::process_memory_format(options, memory_format)' + else: + return schema_order_actual + def process_legacy_th_option(option): # type: (FunctionOption) -> None # Mutably populate option with derived values computed from values @@ -858,7 +896,8 @@ def process_legacy_th_option(option): '(^__i|[^_]_$)', option['api_name']) is not None # print(yaml.dump(option)) - formals = get_formals(option) + formals = get_formals(option, False) + schema_order_formals = get_formals(option, True) option['formals_list'] = formals option['formals'] = [format_formal(f) for f in formals] option['formals_with_defaults'] = [formal_with_default(f) for f in formals] @@ -889,8 +928,8 @@ def process_legacy_th_option(option): assert option['extended_method'], 'Expected legacy operator to be an extended method' - def native_get_formals(option, include_constants=False): - # type: (FunctionOption, bool) -> List[AtFormal] + def native_get_formals(option, schema_order, include_constants=False): + # type: (FunctionOption, bool, bool) -> List[AtFormal] seen = set() # type: Set[str] pos_args = [] kwd_args = [] @@ -904,7 +943,11 @@ def insert(argument): else: pos_args.append(argument) - for argument in option['arguments']: + if schema_order: + arguments = option['schema_order_arguments'] + else: + arguments = option['arguments'] + for argument in arguments: 
insert(argument) # not clear we need dynamic_type translation as we can specify the correct type @@ -995,7 +1038,8 @@ def process_native(option): assert option['python_module'] == '' or option['python_module'] == 'nn', \ "Found python_module of {} for decl {}, but only \'\' string or \'nn\' are supported".format( option['python_module'], option['name']) - formals = native_get_formals(option) + formals = native_get_formals(option, False) + schema_order_formals = native_get_formals(option, True) option['formals_list'] = formals option['formals'] = [format_formal(f) for f in formals] option['formals_with_defaults'] = [formal_with_default(f) for f in formals] @@ -1003,10 +1047,14 @@ def process_native(option): option['return_type'] = format_return_type(option['returns']) option['return_call'] = 'return ' if option['return_type'] != 'void' else '' option['actuals'] = [f['name'] for f in formals] + option['schema_order_actuals'] = [f['name'] for f in schema_order_formals] option['formals_types'] = [f['type'] for f in option['formals_list']] option['cpp_signature'] = "{} ({})".format(option['return_type'], ", ".join(option['formals_types'])) + option['schema_order_cpp_signature'] = "{} ({})".format( + option['return_type'], + ", ".join([f['type'] for f in schema_order_formals])) option['method_formals'] = [format_formal(f) for f in formals if f['name'] != 'self'] @@ -1016,6 +1064,14 @@ def process_native(option): # be const_casted to be accepted as native function's non-const argument option['method_actuals'] = [ f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in formals] + option['schema_order_method_actuals'] = [ + f['name'] if f['name'] != 'self' else 'const_cast(*this)' for f in schema_order_formals] + + if find_formal_by_type('TensorOptions', formals) is not None: + option['schema_order_actuals'] = [ + process_schema_order_actual(actual) for actual in option['schema_order_actuals']] + option['schema_order_method_actuals'] = [ + process_schema_order_actual(actual) for actual in option['schema_order_method_actuals']] def gen_tensor_method(option, formals): # type: (Any, List[AtFormal]) -> FunctionCode @@ -1068,12 +1124,23 @@ def swizzle_self(f): # blegh static_dispatch_method_body = STATIC_DISPATCH_FUNCTION_DEFAULT_BODY.substitute( option, actuals=option['method_actuals']) - method_definition = C10_TENSOR_METHOD_DEFINITION + if option['use_c10_dispatcher'] == 'full': + tensor_method_actuals = option['schema_order_method_actuals'] + tensor_method_cpp_signature = option['schema_order_cpp_signature'] + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + tensor_method_actuals = option['method_actuals'] + tensor_method_cpp_signature = option['cpp_signature'] + + method_definition = TENSOR_METHOD_DEFINITION.substitute( + option, static_dispatch_method_body=static_dispatch_method_body, + tensor_method_actuals=tensor_method_actuals, + tensor_method_cpp_signature=tensor_method_cpp_signature + ) return FunctionCode( declaration=TENSOR_METHOD_DECLARATION.substitute( option, static_dispatch_method_body=static_dispatch_method_body), - definition=method_definition.substitute( - option, static_dispatch_method_body=static_dispatch_method_body)) + definition=method_definition) def gen_namespace_function(option, multidispatch_formals): # type: (Any, List[AtFormal]) -> FunctionCode @@ -1109,8 +1176,18 @@ def gen_namespace_function(option, multidispatch_formals): static_dispatch_function_body = STATIC_DISPATCH_FUNCTION_DEFAULT_BODY.substitute( option, 
actuals=option['actuals']) - fn_definition = C10_FUNCTION_DEFINITION.substitute( - option, static_dispatch_function_body=static_dispatch_function_body) + if option['use_c10_dispatcher'] == 'full': + function_actuals = option['schema_order_actuals'] + function_cpp_signature = option['schema_order_cpp_signature'] + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + function_actuals = option['actuals'] + function_cpp_signature = option['cpp_signature'] + + fn_definition = FUNCTION_DEFINITION.substitute( + option, static_dispatch_function_body=static_dispatch_function_body, + function_actuals=function_actuals, + function_cpp_signature=function_cpp_signature) return FunctionCode(definition=fn_definition, declaration=fn_declaration) @@ -1213,6 +1290,7 @@ def gen_namespace_function(option, multidispatch_formals): matches_jit_signature=option["matches_jit_signature"], schema_string=option["schema_string"], arguments=formals, + schema_order_arguments=schema_order_formals, method_of=method_of, mode=option['mode'], python_module=option['python_module'], diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index b676d61399bdb..d3605bf385aa3 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -492,6 +492,7 @@ def generate_outputs(): declarations += nn_parse.run(nn_files) declarations += native_parse.run(native_files) declarations = preprocess_declarations.run(declarations) + per_op_registrations = defaultdict(list) if options.per_op_registration else None schema_registrations = [] if options.force_schema_registration else None diff --git a/aten/src/ATen/gen_backend_select_register.py b/aten/src/ATen/gen_backend_select_register.py index faba2eb1770f0..820bdfe3ee6f5 100644 --- a/aten/src/ATen/gen_backend_select_register.py +++ b/aten/src/ATen/gen_backend_select_register.py @@ -24,21 +24,26 @@ GENERATED_COMMENT = CodeTemplate( "@" + "generated from ${filename}") -FUNCTION_REGISTRATION = CodeTemplate("""\ +UNBOXEDONLY_FUNCTION_REGISTRATION = CodeTemplate("""\ m.impl_UNBOXED("aten::${op_name_with_overload_name}", ${function_name}); """) +FUNCTION_REGISTRATION = CodeTemplate("""\ + m.impl("aten::${op_name_with_overload_name}", c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${function_name}))); +""") + FUNCTION_DEFINITION = CodeTemplate("""\ // ${schema_string} Tensor ${function_name}(${method_formals}) { static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${name}", "${overload_name}") - .typed<${cpp_signature}>(); + .typed<${function_cpp_signature}>(); ${dispatch_key_init} - return op.callWithDispatchKey(_dk, ${actuals}); + return op.callWithDispatchKey(_dk, ${function_actuals}); } """) + def needs_backend_select(declaration_option): # We register an op under the BackendSelect dispatch key # if a TensorOptions argument has been gathered from its declared args @@ -56,28 +61,39 @@ def register_backend_select_methods(declarations, template_path, file_manager): for decl in declarations: for option in decl["options"]: if needs_backend_select(option): - assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - name = option['name'] op_name_with_overload_name = option['name'] if option.get('overload_name', '') != '': name = "{0}_{1}".format(name, option['overload_name']) op_name_with_overload_name = "{0}.{1}".format(op_name_with_overload_name, option['overload_name']) - func_reg = FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], - op_name_with_overload_name=op_name_with_overload_name, - 
function_name=name) + if option['use_c10_dispatcher'] == 'full': + func_reg = FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], + op_name_with_overload_name=op_name_with_overload_name, + function_name=name) + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + func_reg = UNBOXEDONLY_FUNCTION_REGISTRATION.substitute(schema_string=option['schema_string'], + op_name_with_overload_name=op_name_with_overload_name, + function_name=name) dispatch_key_init = gen_dispatch_key_init('_dk', option['formals_list']) + if option['use_c10_dispatcher'] == 'full': + function_cpp_signature = option['schema_order_cpp_signature'] + function_actuals = option['schema_order_actuals'] + else: + assert option['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + function_cpp_signature = option['cpp_signature'] + function_actuals = option['actuals'] method_def = FUNCTION_DEFINITION.substitute(function_name=name, schema_string=option['schema_string'], method_formals=option['formals_with_defaults'], name=option['name'], overload_name=option['overload_name'], dispatch_key_init=dispatch_key_init, - cpp_signature=option['cpp_signature'], - actuals=option['actuals']) + function_cpp_signature=function_cpp_signature, + function_actuals=function_actuals) backend_select_function_registrations.append(func_reg) backend_select_method_definitions.append(method_def) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 3b571b78b6722..403260d795f0a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -177,6 +177,7 @@ CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) + use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) @@ -370,10 +371,13 @@ - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) @@ -491,8 +495,10 @@ CUDA: baddbmm_out_cuda - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor @@ -621,8 +627,10 @@ CPU, CUDA: logical_or_out - func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bmm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full @@ -729,6 +737,7 @@ variants: function - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) + use_c10_dispatcher: full variants: method - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor @@ -909,6 +918,7 @@ CUDA: cummin_helper_cuda - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -919,6 +929,7 @@ - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -1064,6 +1075,7 @@ device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_cpu CUDA: empty_cuda @@ -1072,16 +1084,20 @@ Vulkan: empty_vulkan - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method - func: new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method - func: new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method # other overrides are to provide a more helpful error message that dtype is required - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_affine_quantized_other_backends_stub QuantizedCPU, QuantizedCUDA: empty_affine_quantized @@ -1089,6 +1105,7 @@ # it's a factory function receiving a tensor argument, thus overriding explicitly # other overrides are to provide a more helpful error message that dtype is required - func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full category_override: factory dispatch: CPU: empty_per_channel_affine_quantized_other_backends_stub @@ -1103,9 +1120,11 @@ device_guard: False - func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? 
dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda @@ -1160,8 +1179,10 @@ device_guard: False - func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1241,12 +1262,15 @@ device_guard: False - func: full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: full.out(int[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) - func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: from_file @@ -1289,16 +1313,22 @@ CUDA: grid_sampler_3d_backward_cuda - func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1516,6 +1546,7 @@ use_c10_dispatcher: full - func: linspace(Scalar start, Scalar end, int steps=100, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: linspace.out(Scalar start, Scalar end, int steps=100, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1587,6 +1618,7 @@ variants: function, method - func: logspace(Scalar start, Scalar end, int steps=100, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: logspace.out(Scalar start, Scalar end, int steps=100, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1595,6 +1627,7 @@ # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. 
- func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -1707,12 +1740,14 @@ # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu @@ -2015,10 +2050,12 @@ device_guard: False - func: ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: ones.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor use_c10_dispatcher: full @@ -2113,6 +2150,7 @@ supports_named_tensor: True - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -2121,6 +2159,7 @@ device_guard: False - func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2129,12 +2168,15 @@ - func: rand.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint(int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.generator(int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: randint.low(int low, int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.low_generator(int low, int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2147,10 +2189,13 @@ - func: randint.low_generator_out(int low, int high, int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randn(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randn.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2165,8 +2210,10 @@ - func: randn.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2178,8 +2225,10 @@ CUDA: randperm_out_cuda - func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -2421,6 +2470,7 @@ # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -2511,9 +2561,11 @@ device_guard: False - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -2571,9 +2623,11 @@ - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -2887,10 +2941,12 @@ device_guard: False - func: zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: zeros.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full @@ -2940,11 +2996,13 @@ use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor use_c10_dispatcher: full - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full @@ -2983,6 +3041,7 @@ SparseCPU: log_softmax_backward_sparse_cpu - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor @@ -2990,6 +3049,7 @@ variants: function, method - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor @@ -3036,6 +3096,7 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clone @@ -3247,10 +3308,13 @@ - func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor dispatch: @@ -3427,16 +3491,19 @@ use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] + use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu @@ -3535,14 +3602,17 @@ device_guard: False - func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False @@ -3562,20 +3632,26 @@ variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType + use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool + use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType + use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 @@ -3645,8 +3721,10 @@ # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # Quantized GRU layers @@ -4195,11 +4273,13 @@ variants: method, function - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py index 71ab636718253..69b024c0b8349 100644 --- a/aten/src/ATen/native_parse.py +++ b/aten/src/ATen/native_parse.py @@ -147,7 +147,7 @@ def type_argument_translations(arg): return t, name, default, nullable, size, annotation -def parse_arguments(args, func_variants, declaration, func_return): +def parse_arguments(args): arguments = [] kwarg_only = False @@ -174,6 +174,9 @@ def parse_arguments(args, func_variants, declaration, func_return): argument_dict['kwarg_only'] = True arguments.append(argument_dict) + return arguments + +def process_arguments(arguments, func_variants, declaration, func_return): is_out_fn = False arguments_out = [] arguments_other = [] @@ -417,7 +420,8 @@ def run(paths): declaration['overload_name'] = func.get('overload_name', overload_name) declaration['inplace'] = re.search('(^__i|[^_]_$)', fn_name) is not None return_arguments = parse_return_arguments(return_decl, declaration['inplace'], func) - arguments = parse_arguments(arguments, func.get('variants', []), declaration, return_arguments) + schema_order_arguments = parse_arguments(arguments) + arguments = process_arguments(schema_order_arguments, func.get('variants', []), declaration, return_arguments) output_arguments = [x for x in arguments if x.get('output')] propagate_field_names(output_arguments, return_arguments) declaration['return'] = return_arguments if len(output_arguments) == 0 else output_arguments @@ -435,6 +439,7 @@ def run(paths): declaration['manual_kernel_registration'] = func.get('manual_kernel_registration', False) declaration['category_override'] = func.get('category_override', '') declaration['arguments'] = func.get('arguments', arguments) + declaration['schema_order_arguments'] = func.get('schema_order_arguments', schema_order_arguments) declaration['type_method_definition_dispatch'] = \ parse_dispatch(fn_name, func.get('dispatch', declaration['name'])) declaration['python_module'] = func.get('python_module', '') diff --git a/aten/src/ATen/nn_parse.py b/aten/src/ATen/nn_parse.py index 30e9dbc72748a..33d78abf61d75 100644 --- a/aten/src/ATen/nn_parse.py +++ b/aten/src/ATen/nn_parse.py @@ -232,6 +232,7 @@ def function_info(name, arguments, cimpls, buffers, backends, inplace, backend_t 'BFloat16' in backend_types['CUDA'] else False, 'backend_types': backend_types, 'arguments': arguments, + 'schema_order_arguments': copy.deepcopy(arguments), 'return': 'argument 0' if inplace else get_return(arguments), 'buffers': buffers, 'backends': backends, diff --git a/aten/src/ATen/templates/BackendSelectRegister.cpp b/aten/src/ATen/templates/BackendSelectRegister.cpp index bcbf25f3117ff..db7276913201e 100644 --- a/aten/src/ATen/templates/BackendSelectRegister.cpp +++ b/aten/src/ATen/templates/BackendSelectRegister.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace at { diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index a41816a3a613d..5bc88199bf424 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace at { diff --git a/aten/src/ATen/templates/PerOpRegistration.cpp b/aten/src/ATen/templates/PerOpRegistration.cpp index 53ed0ceb3bca3..72ac3d784dade 100644 --- a/aten/src/ATen/templates/PerOpRegistration.cpp +++ b/aten/src/ATen/templates/PerOpRegistration.cpp @@ -3,6 
+3,7 @@ #include #include #include +#include $extra_headers namespace at { diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 0035c1db37999..6883e0b216881 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index a85576d0ad933..89f99d85658c9 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace { diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index ea18d5ce782f2..ede79cf218612 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -29,6 +29,7 @@ #include #include +#include #include $extra_cuda_headers $legacy_th_headers diff --git a/aten/src/ATen/test/extension_backend_test.cpp b/aten/src/ATen/test/extension_backend_test.cpp index fd2d3c0045ee6..eab1fe913adf9 100644 --- a/aten/src/ATen/test/extension_backend_test.cpp +++ b/aten/src/ATen/test/extension_backend_test.cpp @@ -10,7 +10,7 @@ using namespace at; static int test_int; -Tensor empty_override(IntArrayRef size, const TensorOptions & options, c10::optional optional_memory_format) { +Tensor empty_override(IntArrayRef size, c10::optional scalar_type, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format) { test_int = 1; auto tensor_impl = c10::make_intrusive( Storage( diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 95ae2a3c0e04f..d55d3df538736 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -186,6 +186,13 @@ static inline ScalarType typeMetaToScalarType(caffe2::TypeMeta dtype) { "Unsupported TypeMeta in ATen: ", dtype, " (please report this error)"); } +inline optional optTypeMetaToScalarType(optional type_meta) { + if (!type_meta.has_value()) { + return c10::nullopt; + } + return typeMetaToScalarType(*type_meta); +} + static inline bool operator==(ScalarType t, caffe2::TypeMeta m) { if (auto mt = tryTypeMetaToScalarType(m)) { return (*mt) == t; diff --git a/c10/test/util/TypeList_test.cpp b/c10/test/util/TypeList_test.cpp index 59ce6f2b8981d..5721cb8aca888 100644 --- a/c10/test/util/TypeList_test.cpp +++ b/c10/test/util/TypeList_test.cpp @@ -152,3 +152,21 @@ namespace test_contains { static_assert(!contains, float>::value, ""); static_assert(!contains, double>::value, ""); } + +namespace test_take { + static_assert(std::is_same, take_t, 0>>::value, ""); + static_assert(std::is_same, take_t, 0>>::value, ""); + static_assert(std::is_same, take_t, 1>>::value, ""); + static_assert(std::is_same, take_t, 0>>::value, ""); + static_assert(std::is_same, take_t, 1>>::value, ""); + static_assert(std::is_same, take_t, 2>>::value, ""); +} + +namespace test_drop { + static_assert(std::is_same, drop_t, 0>>::value, ""); + static_assert(std::is_same, drop_t, 0>>::value, ""); + static_assert(std::is_same, drop_t, 1>>::value, ""); + static_assert(std::is_same, drop_t, 0>>::value, ""); + static_assert(std::is_same, drop_t, 1>>::value, ""); + static_assert(std::is_same, drop_t, 2>>::value, ""); +} diff --git a/c10/util/TypeList.h b/c10/util/TypeList.h index 7a05aaf094a28..424ede4532603 100644 --- a/c10/util/TypeList.h +++ b/c10/util/TypeList.h @@ -250,6 +250,37 @@ static_assert( 
std::is_same>>::value, ""); +/** + * Take/drop a number of arguments from a typelist. + * Example: + * typelist == take_t, 2> + * typelist == drop_t, 2> + */ +namespace detail { + template + struct take_elements final {}; + + template + struct take_elements> final { + using type = typelist::type...>; + }; +} + +template struct take final { + static_assert(is_instantiation_of::value, "In typelist::take, the T argument must be typelist<...>."); + static_assert(num <= size::value, "Tried to typelist::take more elements than there are in the list"); + using type = typename detail::take_elements>::type; +}; +template using take_t = typename take::type; + +template struct drop final { + static_assert(is_instantiation_of::value, "In typelist::drop, the T argument must be typelist<...>."); + static_assert(num <= size::value, "Tried to typelist::drop more elements than there are in the list"); + using type = typename detail::take_elements::value - num>>::type; +}; +template using drop_t = typename drop::type; + + /** * Reverses a typelist. * Example: @@ -288,8 +319,6 @@ struct find_if, Condition, std::enable_if_t, Condition>::value; }; - - /** * Maps a list of types into a list of values. * Examples: diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/msnpu_extension.cpp index a308bb0b15c12..c2dfac7d6b6ed 100644 --- a/test/cpp_extensions/msnpu_extension.cpp +++ b/test/cpp_extensions/msnpu_extension.cpp @@ -20,9 +20,13 @@ Tensor get_tensor(caffe2::TypeMeta dtype, IntArrayRef size) { return Tensor(std::move(tensor_impl)); } -Tensor empty_override(IntArrayRef size, const TensorOptions& options, c10::optional optional_memory_format) { +Tensor empty_override(IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { test_int = 0; - return get_tensor(options.dtype(), size); + caffe2::TypeMeta typeMeta; + if (dtype.has_value()) { + typeMeta = scalarTypeToTypeMeta(*dtype); + } + return get_tensor(typeMeta, size); } Tensor add_override(const Tensor & a, const Tensor & b , Scalar c) { diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index b5bd05ccdabe6..b6ac180460286 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -101,6 +101,26 @@ def get_simple_type(arg): simple_type = '{}?'.format(opt_match.group(1)) return simple_type +def has_tensoroptions_argument(declaration): + for argument in declaration['arguments']: + if 'TensorOptions' == argument['dynamic_type']: + return True + return False + +def process_schema_order_arg(schema_order_arg): + if schema_order_arg == 'dtype': + return 'optTypeMetaToScalarType(options.dtype_opt())' + elif schema_order_arg == 'layout': + return 'options.layout_opt()' + elif schema_order_arg == 'device': + return 'options.device_opt()' + elif schema_order_arg == 'pin_memory': + return 'options.pinned_memory_opt()' + elif schema_order_arg == 'memory_format': + return 'c10::impl::process_memory_format(options, memory_format)' + else: + return schema_order_arg + def load_aten_declarations(path): with open(path, 'r') as f: @@ -119,7 +139,12 @@ def load_aten_declarations(path): declaration['formals'] = [arg['type'] + ' ' + arg['name'] for arg in declaration['arguments']] + declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] + for arg in declaration['schema_order_arguments']] declaration['args'] = [arg['name'] for arg in declaration['arguments']] + declaration['schema_order_args'] = [arg['name'] 
for arg in declaration['schema_order_arguments']] + if has_tensoroptions_argument(declaration): + declaration['schema_order_args'] = [process_schema_order_arg(arg) for arg in declaration['schema_order_args']] declaration['api_name'] = declaration['name'] # NB: keep this in sync with common_with_cwrap.py if declaration.get('overload_name'): diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 774f358be8ee9..68d91a8d2b7f0 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -194,7 +194,9 @@ """) WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name})); +m.impl("${unqual_operator_name_with_overload}", + c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${class_type}::${type_wrapper_name})) +); """) UNPACK_TENSOR = CodeTemplate("""\ @@ -339,27 +341,41 @@ # Generate a file that lists all functions and their schema string. Used for XLA REGISTRATION_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${formals}); // {"schema": "${schema_string}", "compound": "${compound}"} +${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"} """) # ProfiledType templates -PROFILE_DISPATCH_UNBOXED = CodeTemplate("""\ +UNBOXED_PROFILE_DISPATCH = CodeTemplate("""\ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") - .typed<${return_type} (${arg_types})>(); + .typed<${return_type} (${profiled_arg_types})>(); +RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr()); +return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args}); +""") +PROFILE_DISPATCH = CodeTemplate("""\ +static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::${operator_name}", "${overload_name}") + .typed<${return_type} (${profiled_arg_types})>(); RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr()); -return c10::Dispatcher::singleton().redispatch<${ret_and_arg_types}>(${profiled_dispatch_args}); +return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args}); """) # TraceType templates # TODO: change `redispatch` to `NoTracerDispatchMode` + regular `call`. 
diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 774f358be8ee9..68d91a8d2b7f0 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -194,7 +194,9 @@
 """)
 
 WRAPPER_REGISTRATION = CodeTemplate("""\
-m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}));
+m.impl("${unqual_operator_name_with_overload}",
+       c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(${class_type}::${type_wrapper_name}))
+);
 """)
 
 UNPACK_TENSOR = CodeTemplate("""\
@@ -339,27 +341,41 @@
 
 # Generate a file that lists all functions and their schema string. Used for XLA
 REGISTRATION_DECLARATION = CodeTemplate("""\
-${return_type} ${api_name}(${formals}); // {"schema": "${schema_string}", "compound": "${compound}"}
+${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"}
 """)
 
 # ProfiledType templates
-PROFILE_DISPATCH_UNBOXED = CodeTemplate("""\
+UNBOXED_PROFILE_DISPATCH = CodeTemplate("""\
 static auto op = c10::Dispatcher::singleton()
     .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
-    .typed<${return_type} (${arg_types})>();
+    .typed<${return_type} (${profiled_arg_types})>();
+RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr());
+return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args});
+""")
+PROFILE_DISPATCH = CodeTemplate("""\
+static auto op = c10::Dispatcher::singleton()
+    .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
+    .typed<${return_type} (${profiled_arg_types})>();
 RECORD_FUNCTION("${name}", std::vector({${input_names}}), Node::peek_at_next_sequence_nr());
-return c10::Dispatcher::singleton().redispatch<${ret_and_arg_types}>(${profiled_dispatch_args});
+return c10::Dispatcher::singleton().redispatch<${profiled_ret_and_arg_types}>(${profiled_dispatch_args});
 """)
 
 # TraceType templates
 # TODO: change `redispatch` to `NoTracerDispatchMode` + regular `call`.
-TRACE_DISPATCH_UNBOXED = CodeTemplate("""\
+UNBOXED_TRACE_DISPATCH = CodeTemplate("""\
 static auto op = c10::Dispatcher::singleton()
     .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
     .typed<${return_type} (${arg_types})>();
 ${assign_return_values}c10::Dispatcher::singleton().redispatch<${ret_and_arg_types}>(${trace_dispatch_args});
 """)
+TRACE_DISPATCH = CodeTemplate("""\
+static auto op = c10::Dispatcher::singleton()
+    .findSchemaOrThrow("aten::${operator_name}", "${overload_name}")
+    .typed<${return_type} (${schema_order_arg_types})>();
+${assign_return_values}c10::Dispatcher::singleton()
+    .redispatch<${schema_order_ret_and_arg_types}>(${schema_order_trace_dispatch_args});
+""")
 
 FACTORY_FUNCTION_NAMES = None
 
@@ -598,10 +614,21 @@ def gen_variable_type(out, aten_declarations, template_path):
     registration_declarations = []
 
     for declaration in aten_declarations:
+        if declaration['use_c10_dispatcher'] == 'full':
+            declaration_formals = declaration['schema_order_formals']
+        else:
+            assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
+            declaration_formals = declaration['formals']
         if dispatch_strategy(declaration) == 'use_derived':
-            registration_declarations.append(REGISTRATION_DECLARATION.substitute(declaration, compound='false'))
+            registration_declarations.append(
+                REGISTRATION_DECLARATION.substitute(declaration,
+                                                    declaration_formals=declaration_formals,
+                                                    compound='false'))
         else:
-            registration_declarations.append(REGISTRATION_DECLARATION.substitute(declaration, compound='true'))
+            registration_declarations.append(
+                REGISTRATION_DECLARATION.substitute(declaration,
+                                                    declaration_formals=declaration_formals,
+                                                    compound='true'))
 
     env = {
         'registration_declarations': registration_declarations,
@@ -647,6 +674,7 @@ def gen_variable_type_shard(out, aten_declarations, template_path, suffix, heade
             profiled_wrapper_registrations.append(WRAPPER_REGISTRATION.substitute(
                 declaration, class_type='ProfiledType'))
         else:
+            assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
            profiled_wrapper_registrations.append(UNBOXEDONLY_WRAPPER_REGISTRATION.substitute(
                 declaration, class_type='ProfiledType'))
 
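To make the template changes above concrete, this is roughly what PROFILE_DISPATCH expands to for a c10-full operator: the .typed<> signature and the redispatch template arguments use the schema-order (scattered) types rather than the legacy gathered ones. Everything below is a hand-written illustration for a hypothetical operator, not literal generated output, and it omits the RECORD_FUNCTION line the real template emits:

```cpp
// Illustrative expansion of PROFILE_DISPATCH for a hypothetical
// "aten::my_empty.memory_format"; not actual codegen output.
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>

at::Tensor my_empty_profiled(
    at::IntArrayRef size,
    c10::optional<at::ScalarType> dtype,
    c10::optional<at::Layout> layout,
    c10::optional<at::Device> device,
    c10::optional<bool> pin_memory,
    c10::optional<at::MemoryFormat> memory_format) {
  // The typed handle is looked up once, with the schema-order argument types.
  static auto op = c10::Dispatcher::singleton()
      .findSchemaOrThrow("aten::my_empty", "memory_format")
      .typed<at::Tensor(at::IntArrayRef,
                        c10::optional<at::ScalarType>, c10::optional<at::Layout>,
                        c10::optional<at::Device>, c10::optional<bool>,
                        c10::optional<at::MemoryFormat>)>();
  // (The real template also emits a RECORD_FUNCTION(...) call here.)
  return c10::Dispatcher::singleton().redispatch<
      at::Tensor, at::IntArrayRef,
      c10::optional<at::ScalarType>, c10::optional<at::Layout>,
      c10::optional<at::Device>, c10::optional<bool>, c10::optional<at::MemoryFormat>>(
      op, c10::DispatchKey::Profiler, size, dtype, layout, device, pin_memory, memory_format);
}
```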
@@ -696,6 +724,9 @@ def emit_profiled_body(declaration):
     arg_types = ', '.join([a['type'] for a in declaration['arguments']])
     ret_and_arg_types = ', '.join([declaration['return_type']] + [a['type'] for a in declaration['arguments']])
+    schema_order_arg_types = ', '.join([a['type'] for a in declaration['schema_order_arguments']])
+    schema_order_ret_and_arg_types = ', '.join(
+        [declaration['return_type']] + [a['type'] for a in declaration['schema_order_arguments']])
 
     def check_record_function_input_type(simple_type):
         return simple_type in ['Tensor', 'Scalar']
@@ -706,14 +737,25 @@ def record_function_input_names():
                           if check_record_function_input_type(arg['simple_type'])])
 
     profiled_dispatch_args = ['op', 'c10::DispatchKey::Profiler'] + declaration['args']
+    schema_order_profiled_dispatch_args = ['op', 'c10::DispatchKey::Profiler'] + declaration['schema_order_args']
+
+    if declaration['use_c10_dispatcher'] == 'full':
+        profiled_arg_types = schema_order_arg_types
+        profiled_ret_and_arg_types = schema_order_ret_and_arg_types
+        profiled_dispatch_args = schema_order_profiled_dispatch_args
+    else:
+        assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
+        profiled_arg_types = arg_types
+        profiled_ret_and_arg_types = ret_and_arg_types
+        profiled_dispatch_args = profiled_dispatch_args
 
-    call = PROFILE_DISPATCH_UNBOXED.substitute(
+    call = PROFILE_DISPATCH.substitute(
         declaration,
         name=name,
         input_names=record_function_input_names(),
         return_type=declaration['return_type'],
-        arg_types=arg_types,
-        ret_and_arg_types=ret_and_arg_types,
+        profiled_arg_types=profiled_arg_types,
+        profiled_ret_and_arg_types=profiled_ret_and_arg_types,
         profiled_dispatch_args=profiled_dispatch_args,
     )
 
@@ -737,16 +779,30 @@ def emit_trace_body(declaration):
 
     arg_types = ', '.join([a['type'] for a in declaration['arguments']])
     ret_and_arg_types = ', '.join([declaration['return_type']] + [a['type'] for a in declaration['arguments']])
+    schema_order_arg_types = ', '.join([a['type'] for a in declaration['schema_order_arguments']])
+    schema_order_ret_and_arg_types = ', '.join(
+        [declaration['return_type']] + [a['type'] for a in declaration['schema_order_arguments']])
     trace_dispatch_args = ['op', 'c10::DispatchKey::Tracer'] + declaration['args']
+    schema_order_trace_dispatch_args = ['op', 'c10::DispatchKey::Tracer'] + declaration['schema_order_args']
     assign_return_values = '{} = '.format(tie_return_values) if not modifies_arguments and not returns_void else ''
 
-    call = TRACE_DISPATCH_UNBOXED.substitute(
-        declaration,
-        arg_types=arg_types,
-        ret_and_arg_types=ret_and_arg_types,
-        trace_dispatch_args=trace_dispatch_args,
-        assign_return_values=assign_return_values,
-    )
+    if declaration['use_c10_dispatcher'] == 'full':
+        call = TRACE_DISPATCH.substitute(
+            declaration,
+            schema_order_arg_types=schema_order_arg_types,
+            assign_return_values=assign_return_values,
+            schema_order_ret_and_arg_types=schema_order_ret_and_arg_types,
+            schema_order_trace_dispatch_args=schema_order_trace_dispatch_args,
+        )
+    else:
+        assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
+        call = UNBOXED_TRACE_DISPATCH.substitute(
+            declaration,
+            arg_types=arg_types,
+            ret_and_arg_types=ret_and_arg_types,
+            trace_dispatch_args=trace_dispatch_args,
+            assign_return_values=assign_return_values,
+        )
     trace_body.append(call)
     trace_body.append(post_record_trace)
     if not returns_void:
diff --git a/tools/autograd/templates/ProfiledType.cpp b/tools/autograd/templates/ProfiledType.cpp
index f2e3cae2d36a4..3c72dcabe9694 100644
--- a/tools/autograd/templates/ProfiledType.cpp
+++ b/tools/autograd/templates/ProfiledType.cpp
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include <ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h>
 
 #include "torch/csrc/autograd/function.h"
 
diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp
index 7c91f1f64ef64..1b4f8146c557d 100644
--- a/tools/autograd/templates/VariableType.cpp
+++ b/tools/autograd/templates/VariableType.cpp
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include <ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h>
 
 // ${generated_comment}
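The new include in ProfiledType.cpp and VariableType.cpp is there so that WRAPPER_REGISTRATION can route every kernel through c10::impl::hacky_wrapper_for_legacy_signatures. The sketch below mirrors that registration pattern for a backend kernel that has not been migrated yet; the function name, the MSNPU dispatch key choice, and the TORCH_LIBRARY_IMPL usage are illustrative assumptions, not taken from the patch:

```cpp
// Sketch: wrapping a legacy, gathered-TensorOptions kernel so the dispatcher
// sees the scattered, schema-order signature. Not part of the patch.
#include <ATen/ATen.h>
#include <ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h>
#include <torch/library.h>

// A kernel still written against the legacy signature.
at::Tensor legacy_empty(
    at::IntArrayRef size,
    const at::TensorOptions& options,
    c10::optional<at::MemoryFormat> memory_format) {
  return at::empty(size, options, memory_format);
}

TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
  // The wrapper is a no-op for kernels that already take scattered arguments
  // (like empty_override above), so kernels can be migrated one at a time.
  m.impl("empty.memory_format",
         c10::impl::hacky_wrapper_for_legacy_signatures(TORCH_FN(legacy_empty)));
}
```

Keeping the wrapper at the registration site means the operator library and the dispatcher only ever see the scattered schema, while individual kernels opt in to the new signature at their own pace.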