pytorch · zewenli98 · Oct 28, 2025 · Oct 28, 2025 · Oct 29, 2025 · Nov 5, 2025
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -107,12 +107,6 @@ void setup_input_tensors(
     TORCHTRT_CHECK(
         inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
 
-    auto expected_type =
-        util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-    TORCHTRT_CHECK(
-        inputs[i].dtype() == expected_type,
-        "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
-
     auto dims = core::util::toDims(inputs[i].sizes());
     auto shape = core::util::toVec(dims);
     LOG_DEBUG("Input Name: " << name << " Shape: " << dims);

diff --git a/examples/dynamo/autocast_example.py b/examples/dynamo/autocast_example.py
@@ -0,0 +1,75 @@
+import torch
+import torch.nn as nn
+import torch_tensorrt
+
+
+class AutocastExample(nn.Module):  # small CNN used to demonstrate the autocast compile options
+    def __init__(self):
+        super(AutocastExample, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_channels=3, out_channels=8, kernel_size=3, stride=1, padding=1
+        )
+        self.relu1 = nn.ReLU()
+        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.conv2 = nn.Conv2d(
+            in_channels=8, out_channels=16, kernel_size=3, stride=1, padding=1
+        )
+        self.relu2 = nn.ReLU()
+        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.flatten = nn.Flatten()
+        self.fc1 = nn.Linear(16 * 8 * 8, 10)  # 16 channels x 8 x 8: the 32x32 example input is halved by each of the two pools
+
+    def forward(self, x, y):  # x feeds conv1; y is subtracted from the fc1 output (dtype notes below assume the settings used in __main__)
+        x = self.conv1(x)  # fp32 because of "^conv1$" in `autocast_excluded_nodes`
+        x = self.relu1(x)  # fp32 because of "relu" in `autocast_excluded_nodes`
+        out = self.pool1(x)  # fp16
+        x = self.conv2(out)  # fp16
+        x = self.relu2(x)  # fp32 because of "relu" in `autocast_excluded_nodes`
+        x = self.pool2(x)  # fp16
+        x = self.flatten(
+            x
+        )  # fp32 because of `torch.ops.aten.flatten.using_ints` in `autocast_excluded_ops`
+        # Respect the precisions in the pytorch autocast context (the regions below set them explicitly)
+        with torch.autocast(x.device.type, enabled=True, dtype=torch.float32):
+            x = self.fc1(x)
+            with torch.autocast(x.device.type, enabled=False):  # nested region with autocast switched off
+                x = torch.sub(x.half(), y)  # x is cast to fp16 here; y is created as fp16 in __main__
+                out2 = torch.add(x, x)
+        with torch.autocast(x.device.type, enabled=True, dtype=torch.float16):
+            out2 = torch.log(out2)
+        return x, out, out2
+
+
+if __name__ == "__main__":
+    model = AutocastExample().cuda().eval()
+    inputs = (
+        torch.randn((1, 3, 32, 32), dtype=torch.float32, device="cuda"),  # `x` of forward()
+        torch.randn((1,), dtype=torch.float16, device="cuda"),  # `y` of forward(), already fp16
+    )
+
+    ep = torch.export.export(model, inputs)
+
+    with torch_tensorrt.dynamo.Debugger(
+        "graphs",
+        logging_dir=".",
+        engine_builder_monitor=False,
+    ):
+        trt_mod = torch_tensorrt.compile(
+            ep.module(),
+            arg_inputs=inputs,
+            min_block_size=1,
+            use_python_runtime=True,
+            ##### weak typing (alternative configuration, left disabled) #####
+            # use_explicit_typing=False,
+            # enabled_precisions={torch.float16},
+            ##### strong typing + autocast (configuration used by this example) #####
+            use_explicit_typing=True,
+            enable_autocast=True,
+            autocast_low_precision_type=torch.float16,
+            autocast_excluded_nodes={"^conv1$", "relu"},  # regex patterns matched against node names
+            autocast_excluded_ops={torch.ops.aten.flatten.using_ints},
+            autocast_data_max=512,  # nodes whose outputs exceed this absolute value stay in FP32
+            autocast_max_depth_of_reduction=None,  # None: no limit on reduction depth
+        )
+
+        trt_out = trt_mod(*inputs)  # run the compiled module once on the example inputs
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -141,7 +141,7 @@ def cross_compile_for_windows(
         disable_tf32 (bool): Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas
         assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
         sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
-        enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
+        enabled_precisions (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
         capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
         num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
         workspace_size (int): Maximum size of workspace given to TensorRT
@@ -434,6 +434,16 @@ def compile(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    enable_autocast: bool = _defaults.ENABLE_AUTOCAST,
+    autocast_low_precision_type: Optional[
+        Union[torch.dtype, dtype]
+    ] = _defaults.AUTOCAST_LOW_PRECISION_TYPE,
+    autocast_excluded_nodes: Collection[str] = _defaults.AUTOCAST_EXCLUDED_NODES,
+    autocast_excluded_ops: Collection[Target] = _defaults.AUTOCAST_EXCLUDED_OPS,
+    autocast_data_max: float = _defaults.AUTOCAST_DATA_MAX,
+    autocast_max_depth_of_reduction: Optional[
+        int
+    ] = _defaults.AUTOCAST_MAX_DEPTH_OF_REDUCTION,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -511,6 +521,12 @@ def compile(
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
+        autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
+        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is an empty set.
+        autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is an empty set.
+        autocast_data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
+        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -584,6 +600,10 @@ def compile(
             "\nThis feature is unimplemented in Torch-TRT Dynamo currently."
         )
 
+    if enable_autocast:
+        use_explicit_typing = True
+        logger.debug("Autocast is enabled, setting use_explicit_typing to True.")
+
     if use_explicit_typing:
         if len(enabled_precisions) != 1 or not any(
             x in enabled_precisions
@@ -593,6 +613,19 @@ def compile(
                 f"use_explicit_typing was set to True, however found that enabled_precisions was also specified (saw: {enabled_precisions}, expected: dtype.f32, dtype.f4). enabled_precisions should not be used when use_explicit_typing=True"
             )
 
+    if autocast_low_precision_type is not None:
+        if not isinstance(autocast_low_precision_type, (torch.dtype, dtype)):
+            raise ValueError(
+                f"autocast_low_precision_type must be a torch.dtype or torch_tensorrt._enums.dtype, got {type(autocast_low_precision_type)}"
+            )
+        if autocast_low_precision_type not in {
+            torch.float16,
+            torch.bfloat16,
+        } and autocast_low_precision_type not in {dtype.f16, dtype.bf16}:
+            raise ValueError(
+                f"autocast_low_precision_type must be one of torch.float16, torch.bfloat16, dtype.f16, dtype.bf16, got {autocast_low_precision_type}"
+            )
+
     if use_fp32_acc:
         logger.debug(
             "FP32 accumulation for matmul layers is enabled. This option should only be enabled if the model already has FP16 weights and has no effect if it has FP32 weights. \
@@ -622,6 +655,38 @@ def compile(
     if not isinstance(arg_inputs, collections.abc.Sequence):
         arg_inputs = [arg_inputs]  # type: ignore
 
+    # Save intermediate outputs of each node for Autocast. Gate on enable_autocast: it forces use_explicit_typing=True above, so `not use_explicit_typing` can never hold while autocast is on.
+    autocast_intermediate_node_outputs = {}
+    if enable_autocast:
+
+        class DumpInterpreter(torch.fx.Interpreter):  # type: ignore[misc]
+            """Interpreter that stores each recorded call_function output in autocast_intermediate_node_outputs, keyed by node name"""
+
+            def run_node(self, n: torch.fx.Node) -> Any:
+                if (
+                    n.op == "call_function"
+                    and n.target != torch.ops.higher_order.wrap_with_autocast  # these nodes still run but are not recorded
+                ):
+                    out = super().run_node(n)
+                    if not isinstance(out, torch.Tensor):
+                        raise ValueError(
+                            f"Please file a bug with Torch-TensorRT because it expects a torch.Tensor but got {type(out)} for node {n.name}."
+                        )
+                    autocast_intermediate_node_outputs[n.name] = out
+                    return out
+                return super().run_node(n)  # placeholders, get_attr, output, etc. are executed unchanged
+
+        def _materialize(x: Input | torch.Tensor) -> torch.Tensor:
+            """Materialize an Input object to a tensor; plain tensors are returned as-is"""
+            if isinstance(x, Input):
+                return x.torch_tensor
+            return x
+
+        with torch.no_grad():  # one eager forward pass purely to observe values; no autograd graph needed
+            mat_args = tuple(_materialize(a) for a in arg_inputs)
+            mat_kwargs = {k: _materialize(v) for k, v in kwarg_inputs.items()}
+            DumpInterpreter(exported_program.module()).run(*mat_args, **mat_kwargs)
+
     # Prepare torch_trt inputs
     trt_arg_inputs: Sequence[Input] = prepare_inputs(arg_inputs)
     trt_kwarg_inputs: Optional[dict[Any, Any]] = prepare_inputs(kwarg_inputs)
@@ -680,6 +745,13 @@ def compile(
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "enable_autocast": enable_autocast,
+        "autocast_low_precision_type": autocast_low_precision_type,
+        "autocast_excluded_nodes": autocast_excluded_nodes,
+        "autocast_excluded_ops": autocast_excluded_ops,
+        "autocast_data_max": autocast_data_max,
+        "autocast_max_depth_of_reduction": autocast_max_depth_of_reduction,
+        "autocast_intermediate_node_outputs": autocast_intermediate_node_outputs,
     }
 
     settings = CompilationSettings(**compilation_options)

diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -57,6 +57,12 @@
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
 OFFLOAD_MODULE_TO_CPU = False
+ENABLE_AUTOCAST = False
+AUTOCAST_LOW_PRECISION_TYPE = None  # None: no low-precision type is used
+AUTOCAST_EXCLUDED_NODES = set[str]()  # regex patterns for node names that should remain in FP32
+AUTOCAST_EXCLUDED_OPS = set[torch.fx.node.Target]()  # op targets that should remain in FP32
+AUTOCAST_DATA_MAX = 512  # nodes whose outputs exceed this absolute value remain in FP32
+AUTOCAST_MAX_DEPTH_OF_REDUCTION = None  # None: no limit on reduction depth (treated as infinity)
 
 if platform.system() == "Linux":
     import pwd

diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
@@ -1,17 +1,24 @@
 from dataclasses import dataclass, field
 from typing import Any, Collection, Optional, Set, Tuple, Union
 
+import torch
 from torch.fx.node import Target
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import EngineCapability, dtype
 from torch_tensorrt.dynamo._defaults import (
     ASSUME_DYNAMIC_SHAPE_SUPPORT,
+    AUTOCAST_DATA_MAX,
+    AUTOCAST_EXCLUDED_NODES,
+    AUTOCAST_EXCLUDED_OPS,
+    AUTOCAST_LOW_PRECISION_TYPE,
+    AUTOCAST_MAX_DEPTH_OF_REDUCTION,
     CACHE_BUILT_ENGINES,
     DISABLE_TF32,
     DLA_GLOBAL_DRAM_SIZE,
     DLA_LOCAL_DRAM_SIZE,
     DLA_SRAM_SIZE,
     DRYRUN,
+    ENABLE_AUTOCAST,
     ENABLE_CROSS_COMPILE_FOR_WINDOWS,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
     ENABLE_WEIGHT_STREAMING,
@@ -97,6 +104,13 @@ class CompilationSettings:
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
+        autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
+        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is an empty set.
+        autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is an empty set.
+        autocast_data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
+        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
+        autocast_intermediate_node_outputs (dict[str, torch.Tensor]): The intermediate node outputs of the graph. Default is {}.
     """
 
     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -140,6 +154,19 @@ class CompilationSettings:
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
     offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+    enable_autocast: bool = ENABLE_AUTOCAST
+    autocast_low_precision_type: Optional[Union[torch.dtype, dtype]] = AUTOCAST_LOW_PRECISION_TYPE  # may be a torch.dtype: compile() validates the value but does not convert it
+    autocast_excluded_nodes: Collection[str] = field(
+        default_factory=lambda: AUTOCAST_EXCLUDED_NODES
+    )
+    autocast_excluded_ops: Collection[Target] = field(
+        default_factory=lambda: AUTOCAST_EXCLUDED_OPS
+    )
+    autocast_data_max: float = AUTOCAST_DATA_MAX
+    autocast_max_depth_of_reduction: Optional[int] = AUTOCAST_MAX_DEPTH_OF_REDUCTION
+    autocast_intermediate_node_outputs: dict[str, torch.Tensor] = field(  # node name -> output tensor recorded during compile()
+        default_factory=lambda: {}
+    )
 
     def __getstate__(self) -> dict[str, Any]:
         from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
@@ -157,6 +184,7 @@ def __setstate__(self, state: dict[str, Any]) -> None:
         self.__dict__.update(state)
 
 
+# If any of the following settings is changed, the engine should be rebuilt.
 _SETTINGS_TO_BE_ENGINE_INVARIANT = (
     "enabled_precisions",
     "max_aux_streams",

diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
@@ -15,6 +15,13 @@
 from .remove_num_users_is_0_nodes import remove_num_users_is_0_nodes
 from .repair_input_as_output import repair_input_as_output
 from .replace_max_pool_with_indices import replace_max_pool_with_indices
+from .rule_based_autocast import rule_based_autocast
+
+pre_lowering_pass_list = [
+    remove_detach,
+    rule_based_autocast,
+    remove_assert_nodes,  # listed after rule_based_autocast because that pass might insert assert nodes
+]
 
 post_lowering_pass_list = [
     remove_input_alias_fixing_clones,
@@ -27,10 +34,6 @@
     complex_graph_detection,
 ]
 
-pre_lowering_pass_list = [
-    remove_detach,
-]
-
 if not is_tegra_platform():
     from .fuse_distributed_ops import fuse_distributed_ops