diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index c3cb8fd51f..58528c3565 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -537,7 +537,7 @@ def convert_method_to_trt_engine( module, torchtrt_arg_inputs, kwarg_inputs=torchtrt_kwarg_inputs, **kwargs ) - return dynamo_convert_exported_program_to_serialized_trt_engine( # type: ignore[no-any-return] + return dynamo_convert_exported_program_to_serialized_trt_engine( exp_program, arg_inputs=tuple(normalized_arg_inputs), kwarg_inputs=torchtrt_kwarg_inputs, @@ -1352,41 +1352,36 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs: Any) -> None def _normalize_engine_constants_to_python(exp_program: "ExportedProgram") -> None: - pass + """Replace TensorRT engine constants with picklable ``EngineSerializer`` wrappers. + The C++ runtime stores engine constants as ``torch._C.ScriptObject`` + (``torch.classes.tensorrt.Engine``) and the Python runtime as ``TRTEngine``. + Each is replaced by an ``EngineSerializer``, which pickles via ``__reduce__``. By + converting before save the artifact is portable across both runtimes. + """ + import base64 -# TODO: Uncomment this when cross serialization is enabled -# """Convert C++ ``torch.classes.tensorrt.Engine`` constants to Python ``TRTEngine``. - -# The C++ runtime stores engine constants as ``torch._C.ScriptObject`` -# (``torch.classes.tensorrt.Engine``). Python ``TRTEngine`` is registered as -# an opaque type so ``torch.export`` can serialise it with ``pickle``. By -# converting before save the artifact is portable across both runtimes. 
-# """ -# import base64 - -# from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ENGINE_IDX -# from torch_tensorrt.dynamo.runtime._TRTEngine import ( -# EngineSerializer, -# TRTEngine, -# ) - -# for fqn, constant in list(exp_program.constants.items()): -# if isinstance(constant, (torch._C.ScriptObject, TRTEngine)): + from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ENGINE_IDX + from torch_tensorrt.dynamo.runtime._TRTEngine import ( + EngineSerializer, + TRTEngine, + ) -# state = constant.__getstate__() -# if len(state) == 2 and ( -# state[1] == "TRTEngine" -# or state[1] == "__torch__.torch.classes.tensorrt.Engine" -# ): -# serialized_info = list(state[0]) -# serialized_info[ENGINE_IDX] = base64.b64decode( -# serialized_info[ENGINE_IDX] -# ) -# exp_program.constants[fqn] = EngineSerializer(serialized_info) + for fqn, constant in list(exp_program.constants.items()): + if isinstance(constant, (torch._C.ScriptObject, TRTEngine)): + + state = constant.__getstate__() + if len(state) == 2 and ( + state[1] == "TRTEngine" + or state[1] == "__torch__.torch.classes.tensorrt.Engine" + ): + serialized_info = list(state[0]) + serialized_info[ENGINE_IDX] = base64.b64decode( + serialized_info[ENGINE_IDX] + ) + exp_program.constants[fqn] = EngineSerializer(serialized_info) -# def function_overload_with_kwargs( fn: Callable[..., Any], *args: Any, **kwargs: Any ) -> Any: diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 29c2ed076a..ea7e6f95bc 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -83,7 +83,7 @@ def cross_compile_for_windows( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = False, + use_python_runtime: bool = False, # Deprecated; setting True emits DeprecationWarning. 
Kept for backward compatibility. use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -163,7 +163,7 @@ def cross_compile_for_windows( max_aux_stream (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. + use_python_runtime: (bool): **Deprecated**. Kept for backward compatibility; emits a ``DeprecationWarning`` when set to ``True``. The Python and C++ runtimes are now merged and the runtime is selected automatically based on whether the C++ Torch-TensorRT runtime is available. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -220,6 +220,16 @@ def cross_compile_for_windows( stacklevel=2, ) + if use_python_runtime: + warnings.warn( + "`use_python_runtime` is deprecated and has no effect. The Python and C++ " + "runtimes have been merged; the runtime is now selected automatically based " + "on whether the C++ Torch-TensorRT runtime is available. This argument will " + "be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + if "refit" in kwargs.keys(): warnings.warn( "`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refitted.", @@ -334,7 +344,6 @@ def cross_compile_for_windows( "dynamically_allocate_resources": dynamically_allocate_resources, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, - "use_python_runtime": use_python_runtime, } # disable the following settings is not supported for cross compilation for windows feature @@ -424,7 +433,7 @@ def compile( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = False, + use_python_runtime: bool = False, # Deprecated; setting True emits DeprecationWarning. Kept for backward compatibility. 
use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -519,7 +528,7 @@ def compile( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. + use_python_runtime: (bool): **Deprecated**. Kept for backward compatibility; emits a ``DeprecationWarning`` when set to ``True``. The Python and C++ runtimes are now merged and the runtime is selected automatically based on whether the C++ Torch-TensorRT runtime is available. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -579,6 +588,16 @@ def compile( stacklevel=2, ) + if use_python_runtime: + warnings.warn( + "`use_python_runtime` is deprecated and has no effect. The Python and C++ " + "runtimes have been merged; the runtime is now selected automatically based " + "on whether the C++ Torch-TensorRT runtime is available. This argument will " + "be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + if "refit" in kwargs.keys(): warnings.warn( "`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refitted", @@ -731,7 +750,6 @@ def compile( "dynamically_allocate_resources": dynamically_allocate_resources, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, - "use_python_runtime": use_python_runtime, } logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB") settings = CompilationSettings(**compilation_options) @@ -1218,7 +1236,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = False, + use_python_runtime: bool = False, # Deprecated; setting True emits DeprecationWarning. Kept for backward compatibility. 
use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -1294,7 +1312,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. + use_python_runtime: (bool): **Deprecated**. Kept for backward compatibility; emits a ``DeprecationWarning`` when set to ``True``. The Python and C++ runtimes are now merged and the runtime is selected automatically based on whether the C++ Torch-TensorRT runtime is available. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -1344,6 +1362,16 @@ def convert_exported_program_to_serialized_trt_engine( stacklevel=2, ) + if use_python_runtime: + warnings.warn( + "`use_python_runtime` is deprecated and has no effect. The Python and C++ " + "runtimes have been merged; the runtime is now selected automatically based " + "on whether the C++ Torch-TensorRT runtime is available. This argument will " + "be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + if "refit" in kwargs.keys(): warnings.warn( "`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refitted", @@ -1473,7 +1501,6 @@ def convert_exported_program_to_serialized_trt_engine( "use_distributed_mode_trace": use_distributed_mode_trace, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, - "use_python_runtime": use_python_runtime, } if "runtime_cache_path" in compilation_options: compilation_options.pop("runtime_cache_path") diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 784066cc75..8b2e1f24a4 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -70,7 +70,6 @@ DECOMPOSE_ATTENTION = False ATTN_BIAS_IS_CAUSAL = True DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY = "lazy" -USE_PYTHON_RUNTIME = False if platform.system() == "Linux": import pwd diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index c92fc77341..d844b8d92c 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -472,8 +472,6 @@ def inline_trt_modules( continue # Get the TRT submodule trt_module = getattr(gm, name) - if 
trt_module._use_python_runtime: - raise ValueError("Python runtime is not supported for serialization") # Ensure the trt module node in the main graph (gm) has inputs trt_module_node = [node for node in gm.graph.nodes if node.name == name] diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 3fe18e0a0d..2c3434b706 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -53,7 +53,6 @@ USE_DISTRIBUTED_MODE_TRACE, USE_FAST_PARTITIONER, USE_FP32_ACC, - USE_PYTHON_RUNTIME, VERSION_COMPATIBLE, WORKSPACE_SIZE, default_device, @@ -118,7 +117,6 @@ class CompilationSettings: dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True. attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False. - use_python_runtime (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). When ``False`` (default) the C++ runtime is used if available and the Python runtime is used as a fallback otherwise. 
""" workspace_size: int = WORKSPACE_SIZE @@ -181,7 +179,6 @@ class CompilationSettings: dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES decompose_attention: bool = DECOMPOSE_ATTENTION attn_bias_is_causal: bool = ATTN_BIAS_IS_CAUSAL - use_python_runtime: bool = USE_PYTHON_RUNTIME def __getstate__(self) -> dict[str, Any]: from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( @@ -196,6 +193,7 @@ def __getstate__(self) -> dict[str, Any]: return state def __setstate__(self, state: dict[str, Any]) -> None: + state.pop("use_python_runtime", None) self.__dict__.update(state) diff --git a/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py index c9c6f8a433..9a690b7303 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py +++ b/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py @@ -137,43 +137,42 @@ def set_runtime_states( # Pickle reconstruction — returns the right engine type for the current runtime # --------------------------------------------------------------------------- -# TODO: Uncomment this when cross serialization is enabled -# def _reconstruct_trt_engine(serialized_info: List[Any]) -> Any: -# """Reconstruct a TRT engine from its serialized info list. +def _reconstruct_trt_engine(serialized_info: List[Any]) -> Any: + """Reconstruct a TRT engine from its serialized info list. -# Called by pickle when deserializing a ``TRTEngine``. Checks which runtime -# is available and returns either a C++ ``torch.classes.tensorrt.Engine`` or -# a Python ``TRTEngine``, so a single ``.pt2`` artifact is portable across -# runtimes. 
-# """ -# serialized_info = list(serialized_info) -# engine_field = serialized_info[ENGINE_IDX] -# if isinstance(engine_field, str): -# serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) -# elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): -# serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) + Called by pickle when deserializing a ``TRTEngine``. Checks which runtime + is available and returns either a C++ ``torch.classes.tensorrt.Engine`` or + a Python ``TRTEngine``, so a single ``.pt2`` artifact is portable across + runtimes. + """ + serialized_info = list(serialized_info) + engine_field = serialized_info[ENGINE_IDX] + if isinstance(engine_field, str): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) + elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) -# if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: -# return torch.classes.tensorrt.Engine(tuple(serialized_info)) + if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: + return torch.classes.tensorrt.Engine(tuple(serialized_info)) -# return TRTEngine(serialized_info) + return TRTEngine(serialized_info) -# class EngineSerializer(OpaqueBase): # type: ignore[misc] -# def __init__(self, serialized_info: SerializedTensorRTEngineFmt) -> None: -# self.serialized_info = serialized_info +class EngineSerializer(OpaqueBase): # type: ignore[misc] + def __init__(self, serialized_info: SerializedTensorRTEngineFmt) -> None: + self.serialized_info = serialized_info -# def __reduce__(self) -> Tuple[Any, Tuple[List[Any]]]: -# """Pickle protocol: delegates to :func:`_reconstruct_trt_engine`. + def __reduce__(self) -> Tuple[Any, Tuple[List[Any]]]: + """Pickle protocol: delegates to :func:`_reconstruct_trt_engine`. 
-# The reconstruction function checks which runtime is available at -# load time and returns either a C++ ``torch.classes.tensorrt.Engine`` -# or a Python ``TRTEngine``, so a single saved artifact works on both. -# """ -# state = list(self.serialized_info) -# state[ENGINE_IDX] = base64.b64encode(state[ENGINE_IDX]).decode("utf-8") -# return (_reconstruct_trt_engine, (state,)) + The reconstruction function checks which runtime is available at + load time and returns either a C++ ``torch.classes.tensorrt.Engine`` + or a Python ``TRTEngine``, so a single saved artifact works on both. + """ + state = list(self.serialized_info) + state[ENGINE_IDX] = base64.b64encode(state[ENGINE_IDX]).decode("utf-8") + return (_reconstruct_trt_engine, (state,)) # --------------------------------------------------------------------------- @@ -935,45 +934,44 @@ def execute( return self._execute_standard(contiguous_inputs) -# register_opaque_type(EngineSerializer, typ="reference") +register_opaque_type(EngineSerializer, typ="reference") +if not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: -register_opaque_type(TRTEngine, typ="reference") + register_opaque_type(TRTEngine, typ="reference") + @torch.library.custom_op( # type: ignore[misc] + "tensorrt::execute_engine", mutates_args=() + ) + def execute_engine( + input_tensors: List[torch.Tensor], engine: TRTEngine + ) -> List[torch.Tensor]: + outputs = engine.execute(input_tensors) + return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) -@torch.library.custom_op( # type: ignore[misc] - "tensorrt::execute_engine_python", mutates_args=() -) -def execute_engine_python( - input_tensors: List[torch.Tensor], engine: TRTEngine -) -> List[torch.Tensor]: - outputs = engine.execute(input_tensors) - return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) - - -@execute_engine_python.register_fake # type: ignore[misc] -def execute_engine_python_fake( - input_tensors: List[torch.Tensor], engine: TRTEngine -) -> 
List[torch.Tensor]: - """Abstract/fake kernel for ``tensorrt::execute_engine``. - - Called by FakeTensor propagation and ``torch.export`` to infer output - shapes and dtypes without executing the real TRT engine. Output shapes - are obtained by asking the engine's execution context to propagate the - concrete input shapes symbolically; dtypes come from the engine's - pre-parsed output dtype list. - """ - input_shapes = [list(t.shape) for t in input_tensors] - try: - output_shapes = engine.infer_outputs(input_shapes) - except Exception: - # Fall back to the statically-stored shapes when shape inference is - # unavailable (e.g. engine context not yet initialised in meta mode). - output_shapes = [list(s) for s in engine.output_shapes] - - return [ - torch.empty( - shape, dtype=engine.output_dtypes[i], device=input_tensors[0].device - ) - for i, shape in enumerate(output_shapes) - ] + @execute_engine.register_fake # type: ignore[misc] + def execute_engine_fake( + input_tensors: List[torch.Tensor], engine: TRTEngine + ) -> List[torch.Tensor]: + """Abstract/fake kernel for ``tensorrt::execute_engine``. + + Called by FakeTensor propagation and ``torch.export`` to infer output + shapes and dtypes without executing the real TRT engine. Output shapes + are obtained by asking the engine's execution context to propagate the + concrete input shapes symbolically; dtypes come from the engine's + pre-parsed output dtype list. + """ + input_shapes = [list(t.shape) for t in input_tensors] + try: + output_shapes = engine.infer_outputs(input_shapes) + except Exception: + # Fall back to the statically-stored shapes when shape inference is + # unavailable (e.g. engine context not yet initialised in meta mode). 
+ output_shapes = [list(s) for s in engine.output_shapes] + + return [ + torch.empty( + shape, dtype=engine.output_dtypes[i], device=input_tensors[0].device + ) + for i, shape in enumerate(output_shapes) + ] diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 0386c97ea3..1d83bd646f 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -126,10 +126,7 @@ def __init__( self.settings = copy.deepcopy(settings) self.weight_name_map = weight_name_map self.serialized_engine = serialized_engine - self.engine: Any = None - self._use_python_runtime = settings.use_python_runtime - - self.execute_engine_op: Any = None + self.engine: Optional[Any] = None self.requires_output_allocator = requires_output_allocator self.dynamically_allocate_resources = settings.dynamically_allocate_resources self.symbolic_shape_expressions = symbolic_shape_expressions @@ -232,34 +229,45 @@ def _pack_engine_info(self) -> List[str | bytes]: return engine_info + def get_engine(self) -> torch.classes.tensorrt.Engine: + """Return the underlying engine, raising if it has not been set up. + + Used by every engine-accessing method except the hot ``forward`` path, + which intentionally skips the check to avoid per-call overhead. 
+ """ + if self.engine is None: + raise RuntimeError("Engine has not been setup yet.") + return self.engine + def get_streamable_device_memory_budget(self) -> Any: - return self.engine.streamable_device_memory_budget + return self.get_engine().streamable_device_memory_budget def get_automatic_device_memory_budget(self) -> Any: - return self.engine.automatic_device_memory_budget + return self.get_engine().automatic_device_memory_budget def get_device_memory_budget(self) -> Any: - return self.engine.device_memory_budget + return self.get_engine().device_memory_budget def set_device_memory_budget(self, budget_bytes: int) -> int: + engine = self.get_engine() if budget_bytes < 0: budget_bytes = self.get_streamable_device_memory_budget() - self.engine.device_memory_budget = budget_bytes - if self.engine.device_memory_budget != budget_bytes: + engine.device_memory_budget = budget_bytes + if engine.device_memory_budget != budget_bytes: logger.error(f"Failed to set weight streaming budget to {budget_bytes}") - budget_bytes = self.engine.device_memory_budget + budget_bytes = engine.device_memory_budget if self.get_streamable_device_memory_budget() == budget_bytes: logger.warning("Weight streaming is disabled") return budget_bytes def _reset_captured_graph(self) -> None: - self.engine.reset_captured_graph() + self.get_engine().reset_captured_graph() def use_dynamically_allocated_resources( self, dynamically_allocate_resources: bool = False ) -> None: self.dynamically_allocate_resources = dynamically_allocate_resources - self.engine.use_dynamically_allocated_resources( + self.get_engine().use_dynamically_allocated_resources( self.dynamically_allocate_resources ) @@ -275,17 +283,15 @@ def setup_engine(self) -> None: if self.engine is not None: return - if self._use_python_runtime: + if ENABLED_FEATURES.torch_tensorrt_runtime: + self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) + else: from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine self.engine 
= TRTEngine( self._pack_engine_info(), profile_execution=self.profiling_enabled, ) - self.execute_engine_op = torch.ops.tensorrt.execute_engine_python - else: - self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) - self.execute_engine_op = torch.ops.tensorrt.execute_engine # requires_native_multidevice is set by the C++ constructor from the serialized REQUIRES_NATIVE_MULTIDEVICE_IDX field. if self.engine.requires_native_multidevice: @@ -330,7 +336,7 @@ def decode_metadata(encoded_metadata: bytes) -> Any: return metadata def get_extra_state(self) -> SerializedTorchTensorRTModuleFmt: - if self.engine: + if self.engine is not None: engine_info = self._pack_engine_info() assert isinstance(engine_info[ENGINE_IDX], (bytes, bytearray)) engine_info[ENGINE_IDX] = base64.b64encode(engine_info[ENGINE_IDX]) @@ -380,28 +386,18 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.weight_name_map = metadata["weight_name_map"] self.symbolic_shape_expressions = metadata["inout_symexprs"] - # Re-resolve the runtime now that we have the loaded settings: the - # original __init__ kwarg may have been False, but a saved engine - # can still pin use_python_runtime=True via the settings blob. 
- self._use_python_runtime = ( - getattr(self.settings, "use_python_runtime", False) - or not ENABLED_FEATURES.torch_tensorrt_runtime - ) - if self._use_python_runtime: + if ENABLED_FEATURES.torch_tensorrt_runtime: + self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) + else: from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine self.engine = TRTEngine(serialized_engine_info) - self.execute_engine_op = torch.ops.tensorrt.execute_engine_python - else: - self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) - self.execute_engine_op = torch.ops.tensorrt.execute_engine self.engine.set_output_tensors_as_unowned( metadata["output_tensors_are_unowned"] ) else: self.engine = None - self.execute_engine_op = None self.settings = CompilationSettings() self.hardware_compatible = False @@ -410,7 +406,7 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.target_device = self._resolve_target_device() def set_pre_allocated_outputs(self, enable: bool) -> None: - self.engine.use_pre_allocated_outputs = enable + self.get_engine().use_pre_allocated_outputs = enable @property def pre_allocated_outputs(self) -> Any: @@ -420,13 +416,15 @@ def pre_allocated_outputs(self) -> Any: return getattr(self.engine, "pre_allocated_outputs", []) def set_use_output_allocator(self, enable: bool) -> None: - self.engine.use_output_allocator_outputs = enable + self.get_engine().use_output_allocator_outputs = enable def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: - """Run the TensorRT engine on GPU tensors (non-tensor args are cast to CUDA tensors).""" - if self.engine is None: - raise RuntimeError("Engine has not been setup yet.") + """Run the TensorRT engine on GPU tensors (non-tensor args are cast to CUDA tensors). 
+ Note: callers are responsible for ensuring the engine has been set up; + the hot path intentionally omits a ``self.engine is None`` guard so + that a properly-bound module avoids the per-call attribute check. + """ target = self.target_device binding_names = self.input_binding_names # len-check inlined (cheaper than keeping an f-string around the hot path) @@ -454,12 +452,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: else: input_tensors.append(torch.tensor(i).cuda()) - if self.execute_engine_op is None: - raise RuntimeError( - "execute_engine op has not been bound. Call setup_engine() first." - ) - - outputs = self.execute_engine_op(input_tensors, self.engine) + outputs = torch.ops.tensorrt.execute_engine(input_tensors, self.engine) if len(outputs) == 1: return outputs[0] @@ -471,28 +464,26 @@ def enable_profiling( profile_format: str = "perfetto", ) -> None: """Enable engine profiling (optional path prefix and format for tracing output).""" - if self.engine is None: - raise RuntimeError("Engine has not been initialized yet.") + engine = self.get_engine() if profiling_results_dir is not None: - self.engine.profile_path_prefix = profiling_results_dir + engine.profile_path_prefix = profiling_results_dir - self.engine.enable_profiling() - if hasattr(self.engine, "set_profile_format"): - self.engine.set_profile_format(profile_format) + engine.enable_profiling() + if hasattr(engine, "set_profile_format"): + engine.set_profile_format(profile_format) self.profiling_enabled = True def set_output_tensors_as_unowned(self, enabled: bool) -> None: - self.engine.set_output_tensors_as_unowned(enabled) + self.get_engine().set_output_tensors_as_unowned(enabled) def are_output_tensors_unowned(self) -> bool: - return bool(self.engine.are_output_tensors_unowned()) + return bool(self.get_engine().are_output_tensors_unowned()) def disable_profiling(self) -> None: """Disable engine profiling and clear the profiling flag on this module.""" - if 
self.engine is None: - raise RuntimeError("Engine has not been initialized yet.") - self.engine.disable_profiling() + engine = self.get_engine() + engine.disable_profiling() self.profiling_enabled = False def get_layer_info(self) -> str: @@ -502,15 +493,9 @@ def get_layer_info(self) -> str: str: A JSON string which contains the layer information of the engine incapsulated in this module """ - if self.engine is None: - raise RuntimeError("Engine has not been initialized yet.") - - layer_info: str = self.engine.get_engine_layer_info() + layer_info: str = self.get_engine().get_engine_layer_info() return layer_info def dump_layer_info(self) -> None: """Dump layer information encoded by the TensorRT engine in this module to STDOUT""" - if self.engine is None: - raise RuntimeError("Engine has not been initialized yet.") - - self.engine.dump_engine_layer_info() + self.get_engine().dump_engine_layer_info() diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index db3e1cea45..33595f4709 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -592,15 +592,14 @@ def parse_dynamo_kwargs( if "options" in kwargs and len(kwargs) == 1: kwargs = kwargs["options"] - # TODO: Uncomment this when cross serialization is enabled - # if "use_python_runtime" in kwargs: - # warnings.warn( - # 'torch.compile option "use_python_runtime" was removed; use ' - # "the Python runtime is now selected automatically when the C++ extension is unavailable.", - # DeprecationWarning, - # stacklevel=2, - # ) - # kwargs = {k: v for k, v in kwargs.items() if k != "use_python_runtime"} + if "use_python_runtime" in kwargs: + warnings.warn( + 'torch.compile option "use_python_runtime" was removed; ' + "the Python runtime is now selected automatically when the C++ extension is unavailable.", + DeprecationWarning, + stacklevel=2, + ) + kwargs = {k: v for k, v in kwargs.items() if k != "use_python_runtime"} if "truncate_long_and_double" in 
kwargs: if (