From d06c74af7f748ee6028eab1fed5ed3346882995e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 3 Oct 2023 11:55:48 -0700 Subject: [PATCH 01/73] chore: Switch to new export apis Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/aten_tracer.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/py/torch_tensorrt/dynamo/aten_tracer.py b/py/torch_tensorrt/dynamo/aten_tracer.py index da346635a2..def04e7057 100644 --- a/py/torch_tensorrt/dynamo/aten_tracer.py +++ b/py/torch_tensorrt/dynamo/aten_tracer.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -import unittest.mock from typing import Any, List, Tuple import torch @@ -77,12 +76,9 @@ def trace( experimental_decompositions = kwargs.get( "enable_experimental_decompositions", False ) - with unittest.mock.patch( - "torch._export.DECOMP_TABLE", get_decompositions(experimental_decompositions) - ): - graph_module = export( - model, tuple(trace_inputs), constraints=constraints - ).module() - logger.debug("Post export graph: " + str(graph_module.graph)) - return graph_module + exp_program = export( + model, tuple(trace_inputs), constraints=constraints + ).run_decompositions(get_decompositions(experimental_decompositions)) + + return exp_program From ad3b0311b33508a85ae33dfdd591962561e453ac Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 19 Oct 2023 15:16:13 -0700 Subject: [PATCH 02/73] feat: Add support for dynamic shapes and remove constraints API Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/_Input.py | 7 ++- py/torch_tensorrt/dynamo/aten_tracer.py | 53 +++++------------------ tests/py/dynamo/models/test_dyn_models.py | 2 + 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py index 6e43a23903..4dd3cf62c2 100644 --- a/py/torch_tensorrt/_Input.py +++ b/py/torch_tensorrt/_Input.py @@ -47,6 +47,7 @@ class _ShapeMode(Enum): high_tensor_domain_excl: float = low_tensor_domain_incl + DOMAIN_OFFSET torch_dtype: torch.dtype = torch.float32 torch_tensor: torch.Tensor = None + name: str = "" def __init__(self, *args: Any, **kwargs: Any) -> None: """__init__ Method for torch_tensorrt.Input @@ -68,7 +69,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: format (torch.memory_format or torch_tensorrt.TensorFormat): The expected format of the input tensor (default: torch_tensorrt.TensorFormat.NCHW) tensor_domain (Tuple(float, float), optional): The domain of allowed values for the tensor, as interval notation: [tensor_domain[0], tensor_domain[1]). Note: Entering "None" (or not specifying) will set the bound to [0, 2) - + torch_tensor (torch.Tensor): Holds a corresponding torch tensor with this Input. + name (str, optional): Name of this input in the pytorch graph. Used to specify dynamic shapes in dynamo tracer. 
Examples: - Input([1,3,32,32], dtype=torch.float32, format=torch.channel_last) - Input(shape=(1,3,32,32), dtype=torch_tensorrt.dtype.int32, format=torch_tensorrt.TensorFormat.NCHW) @@ -180,6 +182,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: else: self.torch_tensor = self.example_tensor() + if "name" in kwargs: + self.name = kwargs["name"] + def __str__(self) -> str: if self.shape_mode == Input._ShapeMode.STATIC: return "Input(shape={}, dtype={}, format={}, domain=[{}, {}))".format( diff --git a/py/torch_tensorrt/dynamo/aten_tracer.py b/py/torch_tensorrt/dynamo/aten_tracer.py index f6d0ad4625..c894ca6f3c 100644 --- a/py/torch_tensorrt/dynamo/aten_tracer.py +++ b/py/torch_tensorrt/dynamo/aten_tracer.py @@ -1,10 +1,10 @@ from __future__ import annotations import logging -from typing import Any, List, Tuple +from typing import Any, Tuple import torch -from torch._export import dynamic_dim, export +from torch.export import Dim, export from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._defaults import ( ENABLE_EXPERIMENTAL_DECOMPOSITIONS, @@ -16,20 +16,6 @@ logger = logging.getLogger(__name__) -def get_random_tensor( - shape: List[Any], dtype: torch.dtype, device: torch.device -) -> torch.Tensor: - if dtype == torch.int32 or dtype == torch.int64: - return torch.randint(2, 10, shape, dtype=dtype, device=device) - elif dtype in (torch.float64, torch.float32, torch.float16): - return torch.randn(shape, dtype=dtype, device=device) - else: - logger.critical( - "Invalid dtype detected in creating input tensors for tracing the graph." - ) - raise - - def trace( model: torch.nn.Module | torch.fx.GraphModule, inputs: Tuple[Any, ...], @@ -39,49 +25,34 @@ def trace( if "debug" in kwargs and kwargs["debug"]: set_log_level(logger.parent, logging.DEBUG) - # Determine the dynamic dimension and setup constraints to input dimensions as dictated by TensorRT - # Torch dynamo does not allow 0/1 value for dynamic dimensions - # for inputs during tracing. 
Hence we create new inputs for export device = to_torch_device(kwargs.get("device", default_device())) torch_inputs = get_torch_inputs(inputs, device) - trace_inputs = [] - constraints = [] + dynamic_shapes = {} for idx, input in enumerate(inputs): if input.shape_mode == Input._ShapeMode.DYNAMIC: min_shape = input.shape["min_shape"] opt_shape = input.shape["opt_shape"] max_shape = input.shape["max_shape"] assert len(min_shape) == len(opt_shape) == len(max_shape) - - constraint_dims = [] - new_shape = [] + dynamic_dims = {} for dim in range(len(min_shape)): if min_shape[dim] == opt_shape[dim] == max_shape[dim]: - new_shape.append(torch_inputs[idx].shape[dim]) + continue else: - constraint_dims.append(dim) - if torch_inputs[idx].shape[dim] == 1: - new_shape.append(torch_inputs[idx].shape[dim] + 1) - else: - new_shape.append(torch_inputs[idx].shape[dim]) - - trace_input = get_random_tensor(new_shape, torch_inputs[idx].dtype, device) + dynamic_dims[dim] = Dim( + input.name + "_" + str(dim), + min=min_shape[dim], + max=max_shape[dim], + ) - for dim in constraint_dims: - if min_shape[dim] > 1: - constraints.append(min_shape[dim] <= dynamic_dim(trace_input, dim)) - if max_shape[dim] > 1: - constraints.append(dynamic_dim(trace_input, dim) <= max_shape[dim]) - trace_inputs.append(trace_input) - else: - trace_inputs.append(torch_inputs[idx]) + dynamic_shapes[input.name] = dynamic_dims experimental_decompositions = kwargs.get( "enable_experimental_decompositions", ENABLE_EXPERIMENTAL_DECOMPOSITIONS ) exp_program = export( - model, tuple(trace_inputs), constraints=constraints + model, tuple(torch_inputs), dynamic_shapes=dynamic_shapes ).run_decompositions(get_decompositions(experimental_decompositions)) return exp_program diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index 057a95879d..d110845145 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -36,6 +36,7 @@ def forward(self, x): opt_shape=(4, 3, 224, 224), max_shape=(8, 3, 224, 224), dtype=torch.float32, + name="x", ) ], "device": torchtrt.Device("cuda:0"), @@ -88,6 +89,7 @@ def forward(self, x): opt_shape=(4, 3, 224, 224), max_shape=(8, 3, 224, 224), dtype=torch.float32, + name="x", ) ], "device": torchtrt.Device("cuda:0"), From 1582b72f2e1f094bda8fcb83d6a20f0e78177e39 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 23 Oct 2023 13:28:39 -0700 Subject: [PATCH 03/73] chore: add dynamic shape support for certain converters Signed-off-by: Dheeraj Peri --- .../dynamo/conversion/aten_ops_converters.py | 18 +++++++++++++ .../dynamo/conversion/impl/shape.py | 25 ++++++++++++++++++- .../dynamo/conversion/impl/shuffle.py | 19 ++++++++++++-- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 70c4574b94..149e16c939 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -279,6 +279,24 @@ def aten_ops_sigmoid( ) +@dynamo_tensorrt_converter(torch.ops.aten.sym_size.int) # type: ignore[misc] +def aten_ops_symsize_int( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, +) -> Union[TRTTensor, Sequence[TRTTensor]]: + return impl.shape.shape( + ctx, + target, + SourceIR.ATEN, + name, + args[0], + args_bounds_check(args, 1, None), + ) + + 
@dynamo_tensorrt_converter(torch.ops.aten.index.Tensor) # type: ignore[misc] @enforce_tensor_types( { diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shape.py b/py/torch_tensorrt/dynamo/conversion/impl/shape.py index ef30b186c1..f4287feaf9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shape.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shape.py @@ -8,7 +8,7 @@ from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import to_numpy +from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor, to_numpy from torch_tensorrt.dynamo.conversion.impl.elementwise.base import ( convert_binary_elementwise, ) @@ -16,6 +16,29 @@ from torch_tensorrt.fx.types import TRTTensor +def shape( + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR], + name: str, + input_val: TRTTensor, + dim: int, +) -> TRTTensor: + """ + This is the general shape layer implementation in TensorRT. + sym_size.int ops map to addShape layer in TensorRT and returns + the dynamic shape of the tensor optionally taking in a dim argument. + """ + input_shape = ctx.net.add_shape(input_val).get_output(0) + if not dim: + max_dim = len(input_val.shape) + dim = dim if dim > 0 else dim + max_dim + indices = get_trt_tensor(ctx, dim, name + "_dim") + gather_dim = ctx.net.add_gather(input_shape, indices, axis=0).get_output(0) + + return gather_dim + + def get_shape_with_dynamic_shape( ctx: ConversionContext, target: Target, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py index 3a4c160d77..2b7a658338 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py @@ -2,7 +2,7 @@ from torch.fx.node import Target from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR +from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR, get_trt_tensor from torch_tensorrt.fx.converters.converter_utils import set_layer_name from torch_tensorrt.fx.types import TRTTensor @@ -16,6 +16,21 @@ def reshape( shape: Sequence[int], ) -> TRTTensor: layer = ctx.net.add_shuffle(input) - layer.reshape_dims = tuple(shape) + if all(isinstance(s, int) for s in shape): + layer.reshape_dims = tuple(shape) + else: + # Convert all the dimensions to trt Tensors. 
+ trt_shape = [] + + for i, s in enumerate(shape): + if isinstance(s, TRTTensor): + trt_shape.append(s) + else: + a = get_trt_tensor(ctx, s, f"{name}_{i}") + trt_shape.append(a) + shape_layer = ctx.net.add_concatenation(inputs=trt_shape) + shape_layer.axis = 0 + shape_layer.name = f"{name}_output_shape" + layer.set_input(1, shape_layer.get_output(0)) set_layer_name(layer, target, name, source_ir) return layer.get_output(0) From 4d01545db8ca89b67b0d44c5279d69a3b9876ac9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 25 Oct 2023 12:46:09 -0700 Subject: [PATCH 04/73] chore: minor updates Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/aten_tracer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/aten_tracer.py b/py/torch_tensorrt/dynamo/aten_tracer.py index c894ca6f3c..a28671daf0 100644 --- a/py/torch_tensorrt/dynamo/aten_tracer.py +++ b/py/torch_tensorrt/dynamo/aten_tracer.py @@ -28,7 +28,7 @@ def trace( device = to_torch_device(kwargs.get("device", default_device())) torch_inputs = get_torch_inputs(inputs, device) dynamic_shapes = {} - for idx, input in enumerate(inputs): + for input in inputs: if input.shape_mode == Input._ShapeMode.DYNAMIC: min_shape = input.shape["min_shape"] opt_shape = input.shape["opt_shape"] From 6731a571134d69c869dfcfd38de2c12143ab8e90 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 26 Oct 2023 14:50:34 -0700 Subject: [PATCH 05/73] chore: updates Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/compile.py | 4 ++++ py/torch_tensorrt/dynamo/partitioning/__init__.py | 7 ++++++- py/torch_tensorrt/dynamo/partitioning/common.py | 10 ++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/compile.py b/py/torch_tensorrt/dynamo/compile.py index 5394c1382e..f3be9a223d 100644 --- a/py/torch_tensorrt/dynamo/compile.py +++ b/py/torch_tensorrt/dynamo/compile.py @@ -203,6 +203,10 @@ def compile_module( min_block_size=settings.min_block_size, torch_executed_ops=settings.torch_executed_ops, ) + # Run symbolic shape analysis + partitioning.fake_tensor_prop( + partitioned_module, sample_inputs, to_torch_device(settings.device) + ) # Store TRT replicas of Torch subgraphs trt_modules = {} diff --git a/py/torch_tensorrt/dynamo/partitioning/__init__.py b/py/torch_tensorrt/dynamo/partitioning/__init__.py index 1a8cc94099..8e67abda88 100644 --- a/py/torch_tensorrt/dynamo/partitioning/__init__.py +++ b/py/torch_tensorrt/dynamo/partitioning/__init__.py @@ -1,3 +1,8 @@ from ._adjacency_partitioner import partition as fast_partition from ._global_partitioner import partition as global_partition -from .common import get_graph_converter_support, get_submod_inputs, run_shape_analysis +from .common import ( + fake_tensor_prop, + get_graph_converter_support, + get_submod_inputs, + run_shape_analysis, +) diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index 8348738afa..b345a4b814 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Sequence, Set, Tuple import torch +from torch.fx.passes.fake_tensor_prop import FakeTensorProp from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._defaults import DEBUG from torch_tensorrt.dynamo.utils import get_torch_inputs, input_is_dynamic @@ -9,6 +10,15 @@ logger = logging.getLogger(__name__) +def fake_tensor_prop( + gm: torch.fx.GraphModule, inputs: 
Sequence[Input], device: torch.device +) -> None: + torch_inputs = get_torch_inputs(inputs, device) + # Propagate fake tensors and generates metadata (shape, dtype) for the nodes in the graph + fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True) + FakeTensorProp(gm, mode=fake_mode).propagate(*torch_inputs) + + def run_shape_analysis( parent_module: torch.fx.GraphModule, inputs: Sequence[Input] ) -> Tuple[Dict[Any, Sequence[Any]], Dict[Any, Sequence[Any]]]: From 0b60aae4522fe7ab04bcf0acbe29ed0d29a9bb1e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 Nov 2023 23:24:08 -0800 Subject: [PATCH 06/73] chore: add sym int converter Signed-off-by: Dheeraj Peri --- .../dynamo/conversion/aten_ops_converters.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index b05713c360..b61f887acd 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -363,6 +363,22 @@ def aten_ops_sigmoid( args[0], ) +@dynamo_tensorrt_converter(torch.ops.aten.sym_size.int) +def aten_ops_symsize_int( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, +) -> Union[TRTTensor, Sequence[TRTTensor]]: + return impl.shape.shape( + ctx, + target, + SourceIR.ATEN, + name, + args[0], + args_bounds_check(args, 1, None), + ) @dynamo_tensorrt_converter(torch.ops.aten.index.Tensor) @enforce_tensor_types( From 634612fe78e13a32e3d33233e3ba177ad732590d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 16 Nov 2023 00:38:39 -0800 Subject: [PATCH 07/73] feat: Replace the existing shape propagation with symbolic shape propagation Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/_compiler.py | 13 +-- py/torch_tensorrt/dynamo/_tracer.py | 2 +- .../dynamo/conversion/_conversion.py | 9 ++- .../dynamo/conversion/impl/shuffle.py | 9 ++- .../dynamo/partitioning/__init__.py | 1 + .../dynamo/partitioning/common.py | 80 +++++++++++++++++-- 6 files changed, 88 insertions(+), 26 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 9082e50664..a0a206091d 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -148,10 +148,10 @@ def compile( gm = exported_program.module() logger.debug("Input graph: " + str(gm.graph)) - # Apply lowering on the graph module torch_inputs = get_torch_inputs(inputs, device) gm = apply_lowering_passes(gm, torch_inputs) + logger.debug("Lowered Input graph: " + str(gm.graph)) enabled_precisions = set(enabled_precisions) @@ -263,10 +263,6 @@ def compile_module( min_block_size=settings.min_block_size, torch_executed_ops=settings.torch_executed_ops, ) - # Run symbolic shape analysis - partitioning.fake_tensor_prop( - partitioned_module, sample_inputs, to_torch_device(settings.device) - ) # Store TRT replicas of Torch subgraphs trt_modules = {} @@ -279,12 +275,7 @@ def compile_module( continue # Get the submodule inputs for min, opt, max shapes of the graph inputs - submodule_inputs = partitioning.get_submod_inputs( - partitioned_module, - submodule, - sample_inputs, - to_torch_device(settings.device), - ) + submodule_inputs = partitioning.construct_submodule_inputs(submodule) logger.debug( "Submodule name: %s\n Input shapes: %s\n %s", diff --git a/py/torch_tensorrt/dynamo/_tracer.py 
b/py/torch_tensorrt/dynamo/_tracer.py index 43812fd062..bbc68192c0 100644 --- a/py/torch_tensorrt/dynamo/_tracer.py +++ b/py/torch_tensorrt/dynamo/_tracer.py @@ -69,7 +69,7 @@ def trace( torch_inputs = get_torch_inputs(inputs, device) dynamic_shapes = {} for input in inputs: - if input.shape_mode == Input._ShapeMode.DYNAMIC: + if isinstance(input, Input) and input.shape_mode == Input._ShapeMode.DYNAMIC: min_shape = input.shape["min_shape"] opt_shape = input.shape["opt_shape"] max_shape = input.shape["max_shape"] diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 1cdea63680..2aa34952ed 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -3,6 +3,7 @@ import io from typing import Sequence +import tensorrt as trt import torch from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._settings import CompilationSettings @@ -10,8 +11,6 @@ from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule from torch_tensorrt.dynamo.utils import get_torch_inputs -import tensorrt as trt - def convert_module( module: torch.fx.GraphModule, @@ -40,6 +39,12 @@ def convert_module( # such as aten.sum - such outputs can be truncated output_dtypes = [] for output in module_outputs: + if not isinstance(output, torch.Tensor): + output = torch.tensor(output) + if isinstance(output, int): + output = output.to(torch.int32) + elif isinstance(output, float): + output = output.to(torch.float32) if settings.truncate_long_and_double and output.dtype == torch.float64: output_dtypes.append(torch.float32) elif settings.truncate_long_and_double and output.dtype == torch.int64: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py index 2b7a658338..a52995d4b7 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py @@ -28,9 +28,10 @@ def reshape( else: a = get_trt_tensor(ctx, s, f"{name}_{i}") trt_shape.append(a) - shape_layer = ctx.net.add_concatenation(inputs=trt_shape) - shape_layer.axis = 0 - shape_layer.name = f"{name}_output_shape" - layer.set_input(1, shape_layer.get_output(0)) + shape_layer = ctx.net.add_concatenation(inputs=trt_shape) + shape_layer.axis = 0 + shape_layer.name = f"{name}_output_shape" + layer.set_input(1, shape_layer.get_output(0)) + set_layer_name(layer, target, name, source_ir) return layer.get_output(0) diff --git a/py/torch_tensorrt/dynamo/partitioning/__init__.py b/py/torch_tensorrt/dynamo/partitioning/__init__.py index 8e67abda88..3c2a9ea199 100644 --- a/py/torch_tensorrt/dynamo/partitioning/__init__.py +++ b/py/torch_tensorrt/dynamo/partitioning/__init__.py @@ -1,6 +1,7 @@ from ._adjacency_partitioner import partition as fast_partition from ._global_partitioner import partition as global_partition from .common import ( + construct_submodule_inputs, fake_tensor_prop, get_graph_converter_support, get_submod_inputs, diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index b345a4b814..e892834a5b 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -2,7 +2,6 @@ from typing import Any, Dict, Optional, Sequence, Set, Tuple import torch -from torch.fx.passes.fake_tensor_prop import FakeTensorProp from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._defaults 
import DEBUG from torch_tensorrt.dynamo.utils import get_torch_inputs, input_is_dynamic @@ -10,13 +9,78 @@ logger = logging.getLogger(__name__) -def fake_tensor_prop( - gm: torch.fx.GraphModule, inputs: Sequence[Input], device: torch.device -) -> None: - torch_inputs = get_torch_inputs(inputs, device) - # Propagate fake tensors and generates metadata (shape, dtype) for the nodes in the graph - fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True) - FakeTensorProp(gm, mode=fake_mode).propagate(*torch_inputs) +def contains_sym_int(tensor: torch.Tensor) -> bool: + """ + Returns true if the given tensor has symbolic shape. + """ + for dim in tensor: + if isinstance(dim, torch.SymInt): + return True + return False + + +def construct_dynamic_input(input: Any) -> Input: + """ + Constructs a torch_tensorrt.Input based on a symbolic input + Args: + input: A symbolic shape tensor (which can have a mix of SymInt nodes and static values) + Returns: + A dynamic shaped torch_tensorrt.Input which has the properties of the symbolic shaped input. + """ + input_sym_shape = input.size() + min_shape = [] + opt_shape = [] + max_shape = [] + for dim in input_sym_shape: + if isinstance(dim, torch.SymInt): + node = dim.node + expr = node.expr + shape_env = node.shape_env + var_range = shape_env.var_to_range.get(expr, None) + var_val = shape_env.var_to_val.get(expr, None) + assert var_range, var_val + # Torchdynamo 0/1 specialization outlier + if var_range.lower == 2: + min_shape.append(1) + else: + min_shape.append(var_range.lower) + opt_shape.append(var_val) + max_shape.append(var_range.upper) + else: + min_shape.append(dim) + opt_shape.append(dim) + max_shape.append(dim) + + return Input( + min_shape=min_shape, opt_shape=opt_shape, max_shape=max_shape, dtype=input.dtype + ) + + +def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: + """ + Construct torch_tensorrt Inputs based on the module inputs. + The module inputs will have meta data which has the shape and dtype info + Args: + module: Input FX GraphModule + Returns: + Sequence of torch_tensorrt.Input's representing inputs to given module + """ + torchtrt_inputs = [] + module_inputs = [node for node in module.graph.nodes if node.op == "placeholder"] + for input in module_inputs: + if input.meta and "val" in input.meta: + input_meta = input.meta["val"] + input_shape = input_meta.size() + if contains_sym_int(input_shape): + torchtrt_inputs.append(construct_dynamic_input(input_meta)) + else: + torchtrt_inputs.append(Input(shape=input_shape, dtype=input_meta.dtype)) + else: + raise AssertionError( + f"Input {input.name} does not contain metadata. 
Please ensure you have exported the graph correctly" + ) + + return torchtrt_inputs def run_shape_analysis( From 93edba415b6e2a7109b935157faedc833315ce6e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 16 Nov 2023 08:36:27 -0800 Subject: [PATCH 08/73] chore: fix imports Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/partitioning/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/partitioning/__init__.py b/py/torch_tensorrt/dynamo/partitioning/__init__.py index 3c2a9ea199..5e5406e67c 100644 --- a/py/torch_tensorrt/dynamo/partitioning/__init__.py +++ b/py/torch_tensorrt/dynamo/partitioning/__init__.py @@ -2,7 +2,6 @@ from ._global_partitioner import partition as global_partition from .common import ( construct_submodule_inputs, - fake_tensor_prop, get_graph_converter_support, get_submod_inputs, run_shape_analysis, From 7ad927248bd4fdfea8034f345448e883c01c9d3c Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 16 Nov 2023 08:42:28 -0800 Subject: [PATCH 09/73] chore: fix imports Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index b61f887acd..09b0092e85 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -363,6 +363,7 @@ def aten_ops_sigmoid( args[0], ) + @dynamo_tensorrt_converter(torch.ops.aten.sym_size.int) def aten_ops_symsize_int( ctx: ConversionContext, @@ -380,6 +381,7 @@ def aten_ops_symsize_int( args_bounds_check(args, 1, None), ) + @dynamo_tensorrt_converter(torch.ops.aten.index.Tensor) @enforce_tensor_types( { From f444d5459632397c401083bf64af9404d040c7f2 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 21 Nov 2023 00:14:51 -0800 Subject: [PATCH 10/73] chore: updates Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/_compiler.py | 18 ++++++++++++++++-- py/torch_tensorrt/dynamo/backend/backends.py | 1 - .../dynamo/partitioning/common.py | 2 -- py/torch_tensorrt/dynamo/utils.py | 3 ++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index a0a206091d..0651e1ac42 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -40,7 +40,6 @@ prepare_inputs, set_log_level, to_torch_device, - to_torch_tensorrt_device, ) logger = logging.getLogger(__name__) @@ -144,7 +143,7 @@ def compile( # Prepare torch_trt inputs inputs = prepare_inputs(inputs) - device = to_torch_tensorrt_device(device) + device = to_torch_device(device) gm = exported_program.module() logger.debug("Input graph: " + str(gm.graph)) @@ -234,6 +233,21 @@ def compile_module( f"Detected support for {num_supported_ops} operators out of {total_ops} in subgraph." ) + def contains_metadata(gm: torch.fx.GraphModule) -> bool: + for node in gm.graph.nodes: + if (not node.meta) or "val" not in node.meta and node.op != "output": + return False + return True + + # Check if the module has metadata (shape, dtype). If not, run symbolic shape propagation. + if not contains_metadata(gm): + from torch._inductor.compile_fx import fake_tensor_prop + + torch_inputs = get_torch_inputs(sample_inputs, settings.device) + with torch.no_grad(): + # This fails if the module has data-dependent shape operators. 
+ fake_tensor_prop(gm, torch_inputs) + # Partition module into components that can be TRT-accelerated fast_partitioner_failed = False diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index 1fa2806181..bade91c553 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -74,7 +74,6 @@ def _pretraced_backend( fake_mode, "allow_non_fake_inputs", True ), fake_mode: repair_input_aliasing(gm) - # Invoke AOTAutograd to translate operators to aten gm = aot_export_joint_simple( gm, diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index e892834a5b..2cd5dfca76 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -44,8 +44,6 @@ def construct_dynamic_input(input: Any) -> Input: min_shape.append(1) else: min_shape.append(var_range.lower) - opt_shape.append(var_val) - max_shape.append(var_range.upper) else: min_shape.append(dim) opt_shape.append(dim) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 26de1fcb27..31bda92ad3 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -88,7 +88,8 @@ def get_torch_inputs( if isinstance(input, Input) ] return [ - input.torch_tensor.to(device) for input in inputs if isinstance(input, Input) + input.torch_tensor.to(device) if isinstance(input, Input) else input + for input in inputs ] From 6e5c5828095d299ae46434fe9e790385dc1ed3af Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 28 Nov 2023 11:54:13 -0800 Subject: [PATCH 11/73] chore: change device calls Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/_compiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 0651e1ac42..33ea1a9f13 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -40,6 +40,7 @@ prepare_inputs, set_log_level, to_torch_device, + to_torch_tensorrt_device, ) logger = logging.getLogger(__name__) @@ -143,7 +144,7 @@ def compile( # Prepare torch_trt inputs inputs = prepare_inputs(inputs) - device = to_torch_device(device) + device = to_torch_tensorrt_device(device) gm = exported_program.module() logger.debug("Input graph: " + str(gm.graph)) From 83791f8665530f03227e906fedac62b53d2a1b28 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 5 Dec 2023 12:24:25 -0800 Subject: [PATCH 12/73] chore: fix metadata check Signed-off-by: Dheeraj Peri --- py/torch_tensorrt/dynamo/_compiler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 33ea1a9f13..b99de1788f 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -236,7 +236,10 @@ def compile_module( def contains_metadata(gm: torch.fx.GraphModule) -> bool: for node in gm.graph.nodes: - if (not node.meta) or "val" not in node.meta and node.op != "output": + if node.op != "output" and (not node.meta) and "val" not in node.meta: + logger.debug( + f"Node {node.name} of op type {node.op} does not have metadata" + ) return False return True From 16394d91f947817cc05123e8e525a0f3e04aa471 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 7 Jan 2024 07:04:13 +0000 Subject: [PATCH 13/73] chore: minor fixes --- 
py/torch_tensorrt/dynamo/conversion/impl/shape.py | 2 +- py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py | 2 -- py/torch_tensorrt/dynamo/partitioning/common.py | 2 ++ 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shape.py b/py/torch_tensorrt/dynamo/conversion/impl/shape.py index f4287feaf9..24554b6f9a 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shape.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shape.py @@ -32,7 +32,7 @@ def shape( input_shape = ctx.net.add_shape(input_val).get_output(0) if not dim: max_dim = len(input_val.shape) - dim = dim if dim > 0 else dim + max_dim + dim = dim if dim >= 0 else dim + max_dim indices = get_trt_tensor(ctx, dim, name + "_dim") gather_dim = ctx.net.add_gather(input_shape, indices, axis=0).get_output(0) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index d6e12f5215..604eda8c96 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -11,7 +11,6 @@ from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones from .repair_input_as_output import repair_input_as_output from .replace_max_pool_with_indices import replace_max_pool_with_indices -from .view_to_reshape import view_to_reshape ATEN_LOWERING_PASSES = DynamoPassManager.build_from_passlist( [ @@ -22,7 +21,6 @@ lower_linear, fuse_prims_broadcast, replace_max_pool_with_indices, - view_to_reshape, ] ) diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index 2cd5dfca76..e892834a5b 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -44,6 +44,8 @@ def construct_dynamic_input(input: Any) -> Input: min_shape.append(1) else: min_shape.append(var_range.lower) + opt_shape.append(var_val) + max_shape.append(var_range.upper) else: min_shape.append(dim) opt_shape.append(dim) From b9a7ccd81c923c6f098bc5bf2f8f527241836c46 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 8 Jan 2024 21:48:22 +0000 Subject: [PATCH 14/73] chore: Add sym_size converter tests --- .../dynamo/conversion/aten_ops_converters.py | 11 +--- .../dynamo/conversion/impl/shape.py | 8 +-- tests/py/dynamo/models/test_dyn_models.py | 50 +++++++++++++++++++ 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 74a8427fa7..f132c62ec6 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -400,15 +400,7 @@ def aten_ops_symsize_int( kwargs: Dict[str, Argument], name: str, ) -> Union[TRTTensor, Sequence[TRTTensor]]: - return impl.shape.shape( - ctx, - target, - SourceIR.ATEN, - name, - args[0], - args_bounds_check(args, 1, None), - ) - + return impl.shape.shape(ctx, target, SourceIR.ATEN, name, args[0], kwargs["dim"]) def index_dtype_validator(node: Node) -> bool: @@ -420,6 +412,7 @@ def index_dtype_validator(node: Node) -> bool: return False return True + @dynamo_tensorrt_converter(torch.ops.aten.index.Tensor) @dynamo_tensorrt_converter( torch.ops.aten.index.Tensor, capability_validator=index_dtype_validator diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shape.py b/py/torch_tensorrt/dynamo/conversion/impl/shape.py 
index 24554b6f9a..dc17764f80 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shape.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shape.py @@ -30,13 +30,13 @@ def shape( the dynamic shape of the tensor optionally taking in a dim argument. """ input_shape = ctx.net.add_shape(input_val).get_output(0) - if not dim: + if dim is not None: max_dim = len(input_val.shape) dim = dim if dim >= 0 else dim + max_dim - indices = get_trt_tensor(ctx, dim, name + "_dim") - gather_dim = ctx.net.add_gather(input_shape, indices, axis=0).get_output(0) + dim_tensor = get_trt_tensor(ctx, dim, name + "_dim") + input_shape = ctx.net.add_gather(input_shape, dim_tensor, axis=0).get_output(0) - return gather_dim + return input_shape def get_shape_with_dynamic_shape( diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index d110845145..75c6c51dab 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -113,3 +113,53 @@ def forward(self, x): with torch.no_grad(): torch.cuda.empty_cache() + + +@pytest.mark.unit +def test_view(ir): + """ + Tests the model (which is fully convertible) with dynamic shapes + """ + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + input_shape = x.size() + y = x.view(input_shape[0], -1) + return y + + model = MyModule().eval().cuda() + input = torch.randn((6, 3, 4)).to("cuda") + + compile_spec = { + "inputs": [ + torchtrt.Input( + min_shape=(1, 3, 4), + opt_shape=(4, 3, 4), + max_shape=(8, 3, 4), + dtype=torch.float32, + name="x", + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float}, + "ir": ir, + "pass_through_build_failures": True, + "optimization_level": 1, + "min_block_size": 1, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_base_dynamic model TRT outputs don't match with the pytorch model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + with torch.no_grad(): + torch.cuda.empty_cache() From 15cc6435e960441e38408bf879848b77eab38326 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 8 Jan 2024 23:04:03 +0000 Subject: [PATCH 15/73] chore: Update test utilities --- .../dynamo/conversion/aten_ops_converters.py | 2 +- .../dynamo/conversion/impl/shape.py | 22 +++++++--- tests/py/dynamo/conversion/test_sym_size.py | 43 +++++++++++++++++++ 3 files changed, 59 insertions(+), 8 deletions(-) create mode 100644 tests/py/dynamo/conversion/test_sym_size.py diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index f132c62ec6..fcfa48ebec 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -400,7 +400,7 @@ def aten_ops_symsize_int( kwargs: Dict[str, Argument], name: str, ) -> Union[TRTTensor, Sequence[TRTTensor]]: - return impl.shape.shape(ctx, target, SourceIR.ATEN, name, args[0], kwargs["dim"]) + return impl.shape.shape(ctx, target, SourceIR.ATEN, name, args[0], args[1]) def index_dtype_validator(node: Node) -> bool: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shape.py b/py/torch_tensorrt/dynamo/conversion/impl/shape.py index dc17764f80..2d2481936b 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shape.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shape.py @@ -8,7 +8,11 @@ from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor, to_numpy +from torch_tensorrt.dynamo.conversion.converter_utils import ( + get_positive_dim, + get_trt_tensor, + to_numpy, +) from torch_tensorrt.dynamo.conversion.impl.elementwise.base import ( convert_binary_elementwise, ) @@ -29,12 +33,16 @@ def shape( sym_size.int ops map to addShape layer in TensorRT and returns the dynamic shape of the tensor optionally taking in a dim argument. 
""" - input_shape = ctx.net.add_shape(input_val).get_output(0) - if dim is not None: - max_dim = len(input_val.shape) - dim = dim if dim >= 0 else dim + max_dim - dim_tensor = get_trt_tensor(ctx, dim, name + "_dim") - input_shape = ctx.net.add_gather(input_shape, dim_tensor, axis=0).get_output(0) + shape_layer = ctx.net.add_shape(input_val) + input_shape = shape_layer.get_output(0) + set_layer_name(shape_layer, target, name + "_shape", source_ir) + + n_dims = len(input_val.shape) + dim = get_positive_dim(dim, n_dims) + dim_tensor = get_trt_tensor(ctx, dim, name + "_dim") + gather_layer = ctx.net.add_gather(input_shape, dim_tensor, axis=0) + set_layer_name(gather_layer, target, name + "_gather", source_ir) + input_shape = gather_layer.get_output(0) return input_shape diff --git a/tests/py/dynamo/conversion/test_sym_size.py b/tests/py/dynamo/conversion/test_sym_size.py new file mode 100644 index 0000000000..5952122247 --- /dev/null +++ b/tests/py/dynamo/conversion/test_sym_size.py @@ -0,0 +1,43 @@ +import torch +import torch.nn as nn +from parameterized import parameterized +from torch.testing._internal.common_utils import run_tests + +from .harness import DispatchTestCase + + +class TestSymSizeConverter(DispatchTestCase): + @parameterized.expand( + [ + ((3, 2, 4),), + ] + ) + def test_sym_size_batch(self, input_shape): + class BatchDim(nn.Module): + def forward(self, x): + return torch.ops.aten.sym_size.int(x, 0) + + inputs = [torch.randn(*input_shape)] + self.run_test( + BatchDim(), + inputs, + ) + + @parameterized.expand( + [ + ((3, 2, 4),), + ] + ) + def test_sym_size_non_batch(self, input_shape): + class NonBatchDim(nn.Module): + def forward(self, x): + return torch.ops.aten.sym_size.int(x, 1) + + inputs = [torch.randn(*input_shape)] + self.run_test( + NonBatchDim(), + inputs, + ) + +if __name__ == "__main__": + run_tests() From 5234d74af917f3f286631163ce7dbd7b3ddd819d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 8 Jan 2024 23:09:02 +0000 Subject: [PATCH 16/73] chore: add testcase for sym_size.int --- tests/py/dynamo/conversion/test_sym_size.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/py/dynamo/conversion/test_sym_size.py b/tests/py/dynamo/conversion/test_sym_size.py index 5952122247..35bf75a509 100644 --- a/tests/py/dynamo/conversion/test_sym_size.py +++ b/tests/py/dynamo/conversion/test_sym_size.py @@ -39,5 +39,6 @@ def forward(self, x): inputs, ) + if __name__ == "__main__": run_tests() From 51e8bb7d0f80f0cee1b248ab0ea3950611bf979e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 26 Jan 2024 01:47:29 -0800 Subject: [PATCH 17/73] chore: revert output type change --- py/torch_tensorrt/dynamo/conversion/_conversion.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 5cc07913bb..844cb6789a 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -39,12 +39,6 @@ def interpret_module_to_result( # such as aten.sum - such outputs can be truncated output_dtypes = [] for output in module_outputs: - if not isinstance(output, torch.Tensor): - output = torch.tensor(output) - if isinstance(output, int): - output = output.to(torch.int32) - elif isinstance(output, float): - output = output.to(torch.float32) if settings.truncate_long_and_double and output.dtype == torch.float64: output_dtypes.append(torch.float32) elif settings.truncate_long_and_double and output.dtype == torch.int64: From 
19c3fad9c87ef02b175fa5385be70a7d5d50652e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 26 Jan 2024 19:57:17 -0800 Subject: [PATCH 18/73] chore: add update_metadata utility --- .../lowering/passes/_aten_lowering_pass.py | 2 + .../dynamo/lowering/passes/pass_utils.py | 19 +++++++++- .../dynamo/lowering/passes/view_to_reshape.py | 37 +++++++++---------- .../dynamo/partitioning/common.py | 6 +-- tests/py/dynamo/models/test_dyn_models.py | 3 +- 5 files changed, 41 insertions(+), 26 deletions(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index 24fca9b2f3..489805cb43 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -11,6 +11,7 @@ from .remove_input_alias_fixing_clones import remove_input_alias_fixing_clones from .repair_input_as_output import repair_input_as_output from .replace_max_pool_with_indices import replace_max_pool_with_indices +from .view_to_reshape import view_to_reshape ATEN_LOWERING_PASSES = DynamoPassManager.build_from_passlist( [ @@ -21,6 +22,7 @@ lower_linear, fuse_prims_broadcast, replace_max_pool_with_indices, + view_to_reshape, ] ) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py b/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py index 31a55099c2..ecb614e355 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Any, Dict, List import torch @@ -29,3 +29,20 @@ def get_tensor_placeholders( ] return placeholders + + +def update_metadata( + gm: torch.fx.GraphModule, target_op: Any, metadata: Dict[int, torch._ops.OpOverload] +) -> None: + """ + Given a graph and a node which has target_op in the graph, + a) If the node has metadata, store it in the map + b) If the node does not have metadata, retrieve it from the map + and assign to the node. 
+ """ + for idx, node in enumerate(gm.graph.nodes): + if node.target == target_op: + if idx not in metadata and node.meta: + metadata[idx] = node.meta + elif idx in metadata and not node.meta: + node.meta = metadata[idx] diff --git a/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py b/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py index efc836814f..3308f84c58 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py @@ -1,9 +1,10 @@ import logging -from typing import Callable, List, Sequence, Tuple +from typing import Dict, List, Sequence import torch from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( clean_up_graph_after_modifications, + update_metadata, ) logger = logging.getLogger(__name__) @@ -13,29 +14,25 @@ def view_to_reshape( gm: torch.fx.GraphModule, sample_inputs: Sequence[torch.Tensor] ) -> torch.fx.GraphModule: """Replace aten.view with an equivalent implementation which avoids Tensor memory issues""" - orig, replacement = view_replacement() - - if torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement): - gm = clean_up_graph_after_modifications(gm) - logger.debug(f"Graph after replacing view with reshape:\n{gm.graph}") - - return gm - - -def view_replacement() -> ( - Tuple[ - torch.fx.GraphModule, - Callable[[torch.Tensor, List[torch.SymInt]], torch.Tensor], - ] -): - """Constructs the original and replacement functions for view""" + orig_op = torch.ops.aten.view.default + replacement_op = torch.ops.aten.reshape.default # Original graph def orig(input: torch.Tensor, shape: List[torch.SymInt]) -> torch.Tensor: - return torch.ops.aten.view.default(input, shape) + return orig_op(input, shape) # Replacement graph def replacement(input: torch.Tensor, shape: List[torch.SymInt]) -> torch.Tensor: - return torch.ops.aten.reshape.default(input, shape) + return replacement_op(input, shape) - return orig, replacement + # Store metadata of the orig_op and copy it to the replacement op + meta_map: Dict[int, torch._ops.OpOverload] = {} + update_metadata(gm, orig_op, meta_map) + + if torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement): + gm = clean_up_graph_after_modifications(gm) + logger.debug(f"Graph after replacing view with reshape:\n{gm.graph}") + + update_metadata(gm, replacement_op, meta_map) + + return gm diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index e892834a5b..26d2e22b7a 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -43,9 +43,9 @@ def construct_dynamic_input(input: Any) -> Input: if var_range.lower == 2: min_shape.append(1) else: - min_shape.append(var_range.lower) - opt_shape.append(var_val) - max_shape.append(var_range.upper) + min_shape.append(int(var_range.lower)) + opt_shape.append(int(var_val)) + max_shape.append(int(var_range.upper)) else: min_shape.append(dim) opt_shape.append(dim) diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index 51f84e3684..f9f1d02c02 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -3,9 +3,8 @@ import pytest import timm import torch -from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity - import torch_tensorrt as torchtrt +from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity assertions = unittest.TestCase() From 
ed48551e11b4b9df9bbd329f6dd7f9be396ecbe9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 26 Jan 2024 21:06:39 -0800 Subject: [PATCH 19/73] chore: change debug to warning if the graph does not have metadata --- py/torch_tensorrt/dynamo/_compiler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 13c997337e..9bf1002e20 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -5,6 +5,7 @@ from typing import Any, Collection, List, Optional, Sequence, Set, Tuple, Union import torch +import torch_tensorrt from torch.export import ExportedProgram from torch.fx.node import Target from torch_tensorrt import _enums @@ -66,8 +67,6 @@ to_torch_tensorrt_device, ) -import torch_tensorrt - logger = logging.getLogger(__name__) @@ -305,8 +304,8 @@ def compile_module( def contains_metadata(gm: torch.fx.GraphModule) -> bool: for node in gm.graph.nodes: if node.op != "output" and (not node.meta) and "val" not in node.meta: - logger.debug( - f"Node {node.name} of op type {node.op} does not have metadata" + logger.warning( + f"Node {node.name} of op type {node.op} does not have metadata. This could sometimes lead to undefined behavior." ) return False return True From 18b7e11cf7dbd684f5b4431542ba91ae3d1db0bf Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 29 Jan 2024 16:56:33 -0800 Subject: [PATCH 20/73] feat: add lowering passes to support dynamic shapes for torch.compile --- py/torch_tensorrt/dynamo/backend/backends.py | 14 +++- py/torch_tensorrt/dynamo/lowering/__init__.py | 1 + py/torch_tensorrt/dynamo/lowering/_fusers.py | 82 ------------------- .../dynamo/lowering/_remove_sym_nodes.py | 30 +++++++ 4 files changed, 42 insertions(+), 85 deletions(-) delete mode 100644 py/torch_tensorrt/dynamo/lowering/_fusers.py create mode 100644 py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index bade91c553..9ccaaab198 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -13,6 +13,7 @@ from torch_tensorrt.dynamo.lowering import ( apply_lowering_passes, get_decompositions, + remove_sym_nodes, repair_input_aliasing, ) from torch_tensorrt.dynamo.utils import ( @@ -74,10 +75,17 @@ def _pretraced_backend( fake_mode, "allow_non_fake_inputs", True ), fake_mode: repair_input_aliasing(gm) + + # Remove sym_int placeholders and inputs + remove_sym_nodes(gm) + torch_inputs = [ + input for input in sample_inputs if isinstance(input, torch.Tensor) + ] + # Invoke AOTAutograd to translate operators to aten gm = aot_export_joint_simple( gm, - sample_inputs, + torch_inputs, trace_joint=False, decompositions=get_decompositions( settings.enable_experimental_decompositions @@ -86,10 +94,10 @@ def _pretraced_backend( logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph)) - gm = apply_lowering_passes(gm, sample_inputs) + gm = apply_lowering_passes(gm, torch_inputs) torchtrt_inputs = prepare_inputs( - sample_inputs, disable_memory_format_check=True + torch_inputs, disable_memory_format_check=True ) trt_compiled = compile_module( gm, diff --git a/py/torch_tensorrt/dynamo/lowering/__init__.py b/py/torch_tensorrt/dynamo/lowering/__init__.py index 7c4e9fdd2d..587d3bb68c 100644 --- a/py/torch_tensorrt/dynamo/lowering/__init__.py +++ b/py/torch_tensorrt/dynamo/lowering/__init__.py @@ -4,5 +4,6 @@ ) from 
._decompositions import get_decompositions # noqa: F401 from ._fusers import * # noqa: F401 +from ._remove_sym_nodes import remove_sym_nodes from ._repair_input_aliasing import repair_input_aliasing from .passes import apply_lowering_passes diff --git a/py/torch_tensorrt/dynamo/lowering/_fusers.py b/py/torch_tensorrt/dynamo/lowering/_fusers.py deleted file mode 100644 index 720e4ab030..0000000000 --- a/py/torch_tensorrt/dynamo/lowering/_fusers.py +++ /dev/null @@ -1,82 +0,0 @@ -import torch -from torch_tensorrt.fx.tracer.acc_tracer import acc_ops - - -def check_permute(node: torch.fx.Node) -> bool: - ranks = len(node.meta["tensor_meta"].shape) - permutation = [i % ranks for i in node.kwargs["permutation"]] - allowed_permutation = list(range(ranks)) - allowed_permutation[-1] = ranks - 2 - allowed_permutation[-2] = ranks - 1 - return permutation == allowed_permutation - - -def trt_transposed_matmul( - lhs: torch.Tensor, rhs: torch.Tensor, lhs_transposed: bool, rhs_transposed: bool -) -> torch.Tensor: - if lhs_transposed: - lhs = lhs.transpose(-1, -2) - if rhs_transposed: - rhs = rhs.transpose(-1, -2) - return torch.matmul(lhs, rhs) - - -def fuse_permute_matmul(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: - """ - Fuse pattern like permute + matmul if permute is transposing the last two dimension. - """ - for node in gm.graph.nodes: - if node.target == acc_ops.matmul: - lhs, rhs = node.kwargs["input"], node.kwargs["other"] - lhs_transposed = rhs_tranposed = False - skip = False - - if lhs.target == acc_ops.permute and check_permute(lhs): - lhs_transposed = True - lhs = lhs.kwargs["input"] - - if rhs.target == acc_ops.permute and check_permute(rhs): - rhs_tranposed = True - rhs = rhs.kwargs["input"] - - if (not skip) and (lhs_transposed or rhs_tranposed): - with gm.graph.inserting_before(node): - fused_node = gm.graph.call_function( - trt_transposed_matmul, - args=(lhs, rhs, lhs_transposed, rhs_tranposed), - ) - node.replace_all_uses_with(fused_node) - - gm.graph.eliminate_dead_code() - gm.graph.lint() - gm.recompile() - return gm - - -def trt_transposed_linear( - input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor -) -> torch.Tensor: - return torch.matmul(input.transpose(-1, -2), weight.t()) + bias - - -def fuse_permute_linear(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: - """ - Fuse pattern like permute + linear if permute is transposing the last two dimension. 
- """ - for node in gm.graph.nodes: - if node.target == acc_ops.linear: - inp = node.kwargs["input"] - if inp.target == acc_ops.permute and check_permute(inp): - inp = inp.kwargs["input"] - weight = node.kwargs["weight"] - bias = node.kwargs["bias"] - with gm.graph.inserting_before(node): - fused_node = gm.graph.call_function( - trt_transposed_linear, args=(inp, weight, bias) - ) - node.replace_all_uses_with(fused_node) - - gm.graph.eliminate_dead_code() - gm.graph.lint() - gm.recompile() - return gm diff --git a/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py b/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py new file mode 100644 index 0000000000..e85117a423 --- /dev/null +++ b/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py @@ -0,0 +1,30 @@ +import logging + +import torch + +logger = logging.getLogger(__name__) + + +def remove_sym_nodes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: + """Remove sym_int placeholders which get inserted due to torch.compile's + dynamic=True behavior + """ + # Extract SymInt placeholder Tensors + placeholders = [ + node + for node in gm.graph.nodes + if ( + node.op == "placeholder" + and isinstance(node.type, type) + and issubclass(node.type, torch.SymInt) + ) + ] + + for node in placeholders: + gm.graph.erase_node(node) + + gm.graph.lint() + gm.recompile() + logger.debug(f"Removed SymInt placeholders:\n{gm.graph}") + + return gm From 3a39d2723d875e854b82d18915374446d0c6a5df Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 29 Jan 2024 17:20:17 -0800 Subject: [PATCH 21/73] chore: add test case --- py/torch_tensorrt/dynamo/lowering/__init__.py | 1 - tests/py/dynamo/models/test_dyn_compile.py | 53 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/py/dynamo/models/test_dyn_compile.py diff --git a/py/torch_tensorrt/dynamo/lowering/__init__.py b/py/torch_tensorrt/dynamo/lowering/__init__.py index 587d3bb68c..a89780ded4 100644 --- a/py/torch_tensorrt/dynamo/lowering/__init__.py +++ b/py/torch_tensorrt/dynamo/lowering/__init__.py @@ -3,7 +3,6 @@ torch_enabled_decompositions, ) from ._decompositions import get_decompositions # noqa: F401 -from ._fusers import * # noqa: F401 from ._remove_sym_nodes import remove_sym_nodes from ._repair_input_aliasing import repair_input_aliasing from .passes import apply_lowering_passes diff --git a/tests/py/dynamo/models/test_dyn_compile.py b/tests/py/dynamo/models/test_dyn_compile.py new file mode 100644 index 0000000000..3b8e452e32 --- /dev/null +++ b/tests/py/dynamo/models/test_dyn_compile.py @@ -0,0 +1,53 @@ +import unittest + +import pytest +import torch +import torch_tensorrt as torchtrt +from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity + +assertions = unittest.TestCase() + + +@pytest.mark.unit +def test_dyn_full_compile(ir): + """ + Tests the model (which is fully convertible) with dynamic shapes + """ + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) + self.relu = torch.nn.ReLU() + + def forward(self, x): + torch._check(x.size()[0] >= 1) + torch._check(x.size()[0] <= 8) + out = self.conv(x) + out = self.relu(out) + return out + + model = MyModule().eval().cuda() + input_bs4 = torch.randn((4, 3, 224, 224)).to("cuda") + torch._dynamo.mark_dynamic(input_bs4, 0) + compile_spec = { + "inputs": [input_bs4], + "min_block_size": 1, + "debug": True, + } + # Compile the model + trt_model = torch.compile(model, backend="tensorrt", 
options=compile_spec) + trt_model(input_bs4) + + input_bs6 = torch.randn((6, 3, 224, 224)).to("cuda") + cos_sim = cosine_similarity(model(input_bs6), trt_model(input_bs6)) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_dyn_full_compile model TRT outputs don't match with the pytorch model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + with torch.no_grad(): + torch.cuda.empty_cache() From abb2677b0daf9fff15b05df7b7bbb2f19e7dc86e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 2 Feb 2024 10:57:11 -0800 Subject: [PATCH 22/73] chore: add view test case --- tests/py/dynamo/models/test_dyn_compile.py | 39 ++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/py/dynamo/models/test_dyn_compile.py b/tests/py/dynamo/models/test_dyn_compile.py index 3b8e452e32..b8db6e14aa 100644 --- a/tests/py/dynamo/models/test_dyn_compile.py +++ b/tests/py/dynamo/models/test_dyn_compile.py @@ -51,3 +51,42 @@ def forward(self, x): with torch.no_grad(): torch.cuda.empty_cache() + + +@pytest.mark.unit +def test_dyn_view(ir): + """ + Tests the model (which is fully convertible) with dynamic shapes + """ + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + torch._check(x.size()[0] <= 8) + input_shape = x.size() + y = x.view(input_shape[0], -1) + return y + + model = MyModule().eval().cuda() + input_bs4 = torch.randn((4, 3, 4)).to("cuda") + torch._dynamo.mark_dynamic(input_bs4, 0) + compile_spec = {"inputs": [input_bs4], "min_block_size": 1, "debug": True} + + # Compile the model + trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) + trt_model(input_bs4) + + input_bs6 = torch.randn((6, 3, 4)).to("cuda") + cos_sim = cosine_similarity(model(input_bs6), trt_model(input_bs6)) + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"test_base_dynamic model TRT outputs don't match with the pytorch model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + with torch.no_grad(): + torch.cuda.empty_cache() From 9aff04bad406c36ca3d2a5cc9f1f058903fe41f2 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 6 Feb 2024 16:00:41 -0800 Subject: [PATCH 23/73] chore: gpt2 changes + linting --- .../dynamo/conversion/converter_utils.py | 2 +- .../dynamo/conversion/impl/grid.py | 6 ++-- .../dynamo/conversion/impl/select.py | 12 ++++---- .../dynamo/conversion/impl/upsample.py | 2 +- .../dynamo/conversion/ops_evaluators.py | 14 ++++++++-- .../dynamo/lowering/passes/pass_utils.py | 28 ++++++++++--------- .../dynamo/lowering/passes/view_to_reshape.py | 14 ++++++---- versions.py | 7 ++--- 8 files changed, 48 insertions(+), 37 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index f90c869c15..1378f5da17 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -270,7 +270,7 @@ def create_constant( """ numpy_value = to_numpy(value, dtype) constant = ctx.net.add_constant( - (1,) if isinstance(value, (int, float, bool)) else value.shape, + trt.Dims() if isinstance(value, (int, float, bool)) else value.shape, numpy_value.copy() if isinstance(numpy_value, np.ndarray) else numpy_value, ) constant.name = name diff --git a/py/torch_tensorrt/dynamo/conversion/impl/grid.py b/py/torch_tensorrt/dynamo/conversion/impl/grid.py index 672fc97351..63ff93b0c7 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/grid.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/grid.py @@ -1,13 +1,11 @@ -from typing import Optional, Sequence +from typing import Optional import tensorrt as trt -import torch from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import cast_trt_tensor from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTNetwork, TRTTensor +from torch_tensorrt.fx.types import TRTTensor # nearest, linear, cubic GridSamplerInterpolationMode = { diff --git a/py/torch_tensorrt/dynamo/conversion/impl/select.py b/py/torch_tensorrt/dynamo/conversion/impl/select.py index db586be65f..dc33129d24 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/select.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/select.py @@ -90,7 +90,7 @@ def index( # is_numpy is a flag to specify if all the indices are numpy or torchTensor. 
# If any is not this flag will be set to False _LOGGER.debug( - f"Determining whether aten.index constant-index optimization can be invoked" + "Determining whether aten.index constant-index optimization can be invoked" ) is_numpy = all( isinstance(ind, (torch.Tensor, np.ndarray)) for ind in index if ind is not None @@ -123,7 +123,7 @@ def index( return identity_layer.get_output(0) elif len(tensor_indices) == 1: indices_tensor = get_trt_tensor( - ctx, tensor_indices[0], name + f"_parameter_to_fp32_tensor" + ctx, tensor_indices[0], name + "_parameter_to_fp32_tensor" ) index = adv_indx_indices[0] _LOGGER.debug(f"The advanced index indices is {adv_indx_indices}") @@ -204,7 +204,7 @@ def index( cum_adv_index = cum_adv_index + adv_index multiplier = multiplier * input_shape[adv_indx_indices[i]] cum_adv_index = get_trt_tensor( - ctx, cum_adv_index, name + f"_index_sum_intermediate" + ctx, cum_adv_index, name + "_index_sum_intermediate" ) else: multiplier = get_trt_tensor( @@ -263,7 +263,7 @@ def index( adv_indx_count == adv_indx_indices[adv_indx_count - 1] - adv_indx_indices[0] + 1 ): - _LOGGER.debug(f"The indices are continuous in this case") + _LOGGER.debug("The indices are continuous in this case") concat_tensor_reshape.append( get_trt_tensor(ctx, -1, name + "_dynamic_concat") ) @@ -287,7 +287,7 @@ def index( source_ir, ) unfold_tensor = regular_index_shuffle_layer.get_output(0) - _LOGGER.debug(f"The tensor is unfolded now") + _LOGGER.debug("The tensor is unfolded now") _LOGGER.debug(f"The unfolded tensor shape is {unfold_tensor.shape}") # Transpose folded advanced indexed axis to its original location. @@ -342,7 +342,7 @@ def index( reshape_output = unfold_advanced_shuffle_layer.get_output(0) else: - _LOGGER.debug(f"The indices are not continuous in this case") + _LOGGER.debug("The indices are not continuous in this case") concat_final_tensor = [] concat_final_tensor.append(cum_adv_index_shape_tensor) for i in range(0, rank): diff --git a/py/torch_tensorrt/dynamo/conversion/impl/upsample.py b/py/torch_tensorrt/dynamo/conversion/impl/upsample.py index 3313730ec3..594bb4167c 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/upsample.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/upsample.py @@ -29,7 +29,7 @@ def upsample( resize_layer.scales = [1.0, 1.0] + list(scale_factors) else: raise RuntimeError( - f"At least one of out_shape and scale_factors should be specified." + "At least one of out_shape and scale_factors should be specified." 
) # interpolate mode diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index f83e0e5008..b35f198028 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -2,7 +2,7 @@ import operator from typing import Dict, Sequence, Tuple, Union -import numpy as np +import tensorrt as trt import torch from torch.fx.node import Argument, Node, Target from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -46,4 +46,14 @@ def aten_ops_arange_start_step( kwargs: Dict[str, Argument], name: str, ) -> Union[TRTTensor, Sequence[TRTTensor]]: - return np.arange(*args) + # breakpoint() + fill_layer = ctx.net.add_fill(trt.Dims(), trt.FillOperation.LINSPACE) + fill_layer.set_input(0, args[1]) + fill_layer.set_output_type(0, trt.DataType.INT32) + # fill_layer.set_input(1, 0) + # fill_layer.set_input(2, 1) + # start_tensor = get_trt_tensor(ctx, 0, "_start_tensor") + # fill_layer.set_input(1, start_tensor) + # delta_tensor = get_trt_tensor(ctx, torch.tensor([0], dtype=torch.int32), "_delta_tensor") + # fill_layer.set_input(2, delta_tensor) + return fill_layer.get_output(0) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py b/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py index ecb614e355..e3c0f46e9f 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, List import torch @@ -31,18 +31,20 @@ def get_tensor_placeholders( return placeholders -def update_metadata( - gm: torch.fx.GraphModule, target_op: Any, metadata: Dict[int, torch._ops.OpOverload] +def get_metadata(gm: torch.fx.GraphModule, target_op: Any) -> List[torch._ops.OpOverload]: + """ + Return the list which has the metadata of all the target_op nodes present in the graph. + """ + return [node.meta for node in gm.graph.nodes if node.target == target_op] + + +def set_metadata( + gm: torch.fx.GraphModule, target_op: Any, metadata: List[torch._ops.OpOverload] ) -> None: """ - Given a graph and a node which has target_op in the graph, - a) If the node has metadata, store it in the map - b) If the node does not have metadata, retrieve it from the map - and assign to the node. + Return the list which has the metadata of all the target_op nodes present in the graph. 
""" - for idx, node in enumerate(gm.graph.nodes): - if node.target == target_op: - if idx not in metadata and node.meta: - metadata[idx] = node.meta - elif idx in metadata and not node.meta: - node.meta = metadata[idx] + target_nodes = [node for node in gm.graph.nodes if node.target == target_op] + assert len(target_nodes) == len(metadata) + for idx, node in enumerate(target_nodes): + node.meta = metadata[idx] diff --git a/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py b/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py index 3308f84c58..db0346348b 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py @@ -1,10 +1,11 @@ import logging -from typing import Dict, List, Sequence +from typing import List, Sequence import torch from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( clean_up_graph_after_modifications, - update_metadata, + get_metadata, + set_metadata, ) logger = logging.getLogger(__name__) @@ -25,14 +26,15 @@ def orig(input: torch.Tensor, shape: List[torch.SymInt]) -> torch.Tensor: def replacement(input: torch.Tensor, shape: List[torch.SymInt]) -> torch.Tensor: return replacement_op(input, shape) - # Store metadata of the orig_op and copy it to the replacement op - meta_map: Dict[int, torch._ops.OpOverload] = {} - update_metadata(gm, orig_op, meta_map) + # Store metadata of the orig_op + metadata = get_metadata(gm, orig_op) + # breakpoint() if torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement): gm = clean_up_graph_after_modifications(gm) logger.debug(f"Graph after replacing view with reshape:\n{gm.graph}") - update_metadata(gm, replacement_op, meta_map) + # Copy the orig_op's metadata to the replacement op + set_metadata(gm, replacement_op, metadata) return gm diff --git a/versions.py b/versions.py index 772737aab7..db418a06d2 100644 --- a/versions.py +++ b/versions.py @@ -1,11 +1,10 @@ -import yaml -import re import os +import re import subprocess - from datetime import datetime from pathlib import Path -from typing import List + +import yaml __version__ = "0.0.0" __cuda_version__ = "0.0" From 440fcd5deb24101bb3b1c5856329dcaac7576c88 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 6 Feb 2024 16:01:01 -0800 Subject: [PATCH 24/73] chore: gpt2 changes + linting --- py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py b/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py index e3c0f46e9f..0ffc6d3c76 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py @@ -31,7 +31,9 @@ def get_tensor_placeholders( return placeholders -def get_metadata(gm: torch.fx.GraphModule, target_op: Any) -> List[torch._ops.OpOverload]: +def get_metadata( + gm: torch.fx.GraphModule, target_op: Any +) -> List[torch._ops.OpOverload]: """ Return the list which has the metadata of all the target_op nodes present in the graph. 
""" From 002db3c36e14c32c1d6e6308238e4de7646bf671 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 6 Feb 2024 19:20:12 -0800 Subject: [PATCH 25/73] chore: add fallback option if val is missing in metadata --- .../dynamo/conversion/aten_ops_converters.py | 1 - .../dynamo/partitioning/common.py | 37 +++++++++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 7e833e8b81..7f187e7134 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -413,7 +413,6 @@ def index_dtype_validator(node: Node) -> bool: return True -@dynamo_tensorrt_converter(torch.ops.aten.index.Tensor) @dynamo_tensorrt_converter( torch.ops.aten.index.Tensor, capability_validator=index_dtype_validator ) diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index 26d2e22b7a..109bda275f 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -19,19 +19,18 @@ def contains_sym_int(tensor: torch.Tensor) -> bool: return False -def construct_dynamic_input(input: Any) -> Input: +def construct_dynamic_input(input_shape: torch.Size, input_dtype: torch.dtype) -> Input: """ Constructs a torch_tensorrt.Input based on a symbolic input Args: - input: A symbolic shape tensor (which can have a mix of SymInt nodes and static values) + input_shape: A symbolic shape / regular shape of a tensor (which can have a mix of SymInt nodes and static values) Returns: A dynamic shaped torch_tensorrt.Input which has the properties of the symbolic shaped input. """ - input_sym_shape = input.size() min_shape = [] opt_shape = [] max_shape = [] - for dim in input_sym_shape: + for dim in input_shape: if isinstance(dim, torch.SymInt): node = dim.node expr = node.expr @@ -52,10 +51,20 @@ def construct_dynamic_input(input: Any) -> Input: max_shape.append(dim) return Input( - min_shape=min_shape, opt_shape=opt_shape, max_shape=max_shape, dtype=input.dtype + min_shape=min_shape, opt_shape=opt_shape, max_shape=max_shape, dtype=input_dtype ) +def get_input(input_shape: torch.Size, input_dtype: torch.dtype) -> Input: + """ + Based on type of dimensions in the input_shape, construct regular or dynamic shaped inputs + """ + if contains_sym_int(input_shape): + return construct_dynamic_input(input_shape, input_dtype) + else: + return Input(shape=input_shape, dtype=input_dtype) + + def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: """ Construct torch_tensorrt Inputs based on the module inputs. 
@@ -68,13 +77,19 @@ def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: torchtrt_inputs = [] module_inputs = [node for node in module.graph.nodes if node.op == "placeholder"] for input in module_inputs: - if input.meta and "val" in input.meta: - input_meta = input.meta["val"] - input_shape = input_meta.size() - if contains_sym_int(input_shape): - torchtrt_inputs.append(construct_dynamic_input(input_meta)) + if input.meta: + if "val" in input.meta: + input_meta = input.meta["val"] + input_shape = input_meta.size() + torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) + elif "tensor_meta" in input.meta: + input_meta = input.meta["tensor_meta"] + input_shape = input_meta.shape + torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) else: - torchtrt_inputs.append(Input(shape=input_shape, dtype=input_meta.dtype)) + raise AssertionError( + f"Input {input.name} does not contain val and tensor_meta fields in the metadata. Please ensure you have exported the graph correctly" + ) else: raise AssertionError( f"Input {input.name} does not contain metadata. Please ensure you have exported the graph correctly" From 00cd17b973c2ebf6d32622f126f6b6158ce9a8d1 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 12 Feb 2024 17:27:30 -0800 Subject: [PATCH 26/73] chore: tmp changes --- .../dynamo/conversion/impl/slice/base.py | 17 ++++++++++++++++- .../dynamo/conversion/impl/slice/ops.py | 14 ++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py index 018ac63b8c..21a38a290d 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py @@ -8,8 +8,23 @@ has_dynamic_shape, set_layer_name, ) +from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor from torch_tensorrt.fx.types import Shape, TRTTensor +def get_dynamic_shape(ctx, target, source_ir, name, shape, input): + trt_shape = [] + shape = input.shape + for i, s in enumerate(shape): + if isinstance(s, TRTTensor): + trt_shape.append(s) + else: + a = get_trt_tensor(ctx, s, f"{name}_{i}") + trt_shape.append(a) + shape_layer = ctx.net.add_concatenation(inputs=trt_shape) + shape_layer.axis = 0 + shape_layer.name = f"{name}_output_shape" + + return shape_layer.get_output(0) def slice( ctx: ConversionContext, @@ -23,7 +38,7 @@ def slice( ) -> TRTTensor: dynamic_shape = has_dynamic_shape(input.shape) if dynamic_shape: - shape = get_shape_with_dynamic_shape(ctx, target, source_ir, name, shape, input) + shape = get_dynamic_shape(ctx, target, source_ir, name, shape, input) layer = ctx.net.add_slice( input, start=start, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 5f1db00f33..dba4ad52a5 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -69,7 +69,6 @@ def expand( ) -> TRTTensor: shape_rank = len(shape) initial_tensor_rank = len(input_t.shape) - # If the rank of the input tensor is less than the shape's rank, pad with ones if initial_tensor_rank < shape_rank: input_t = prepend_ones( @@ -99,9 +98,16 @@ def expand( stride = tuple( [int(i == o) for i, o in zip(input_tensor_shape, shape)] ) # stride == 1 if dimensions match, 0 otherwise - layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride) - set_layer_name(layer, target, 
name, source_ir) - return layer.get_output(0) + # layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride) + + # set_layer_name(layer, target, name, source_ir) + # return layer.get_output(0) + breakpoint() + expand_output = slice( + ctx, target, source_ir, name, input_t, start, shape, stride + ) + return expand_output + def chunk( From 6ac70cd2cb2c219dd5f99ffa02faa8ae89f466cb Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 12 Feb 2024 17:27:37 -0800 Subject: [PATCH 27/73] chore: tmp changes --- py/torch_tensorrt/dynamo/conversion/impl/slice/base.py | 6 ++++-- py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py | 5 +---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py index 21a38a290d..64225227aa 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py @@ -3,14 +3,15 @@ from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext +from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape from torch_tensorrt.fx.converters.converter_utils import ( has_dynamic_shape, set_layer_name, ) -from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor from torch_tensorrt.fx.types import Shape, TRTTensor + def get_dynamic_shape(ctx, target, source_ir, name, shape, input): trt_shape = [] shape = input.shape @@ -23,9 +24,10 @@ def get_dynamic_shape(ctx, target, source_ir, name, shape, input): shape_layer = ctx.net.add_concatenation(inputs=trt_shape) shape_layer.axis = 0 shape_layer.name = f"{name}_output_shape" - + return shape_layer.get_output(0) + def slice( ctx: ConversionContext, target: Target, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index dba4ad52a5..70badd796c 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -103,11 +103,8 @@ def expand( # set_layer_name(layer, target, name, source_ir) # return layer.get_output(0) breakpoint() - expand_output = slice( - ctx, target, source_ir, name, input_t, start, shape, stride - ) + expand_output = slice(ctx, target, source_ir, name, input_t, start, shape, stride) return expand_output - def chunk( From 39615a276067188209deb0a296f4e5f5f3382234 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 26 Feb 2024 11:50:49 -0800 Subject: [PATCH 28/73] chore: fixes --- py/torch_tensorrt/dynamo/conversion/converter_utils.py | 2 +- py/torch_tensorrt/dynamo/conversion/impl/shape.py | 4 ++-- py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 7d5e59367b..f9d14917f1 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -270,7 +270,7 @@ def create_constant( """ numpy_value = to_numpy(value, dtype) constant = ctx.net.add_constant( - trt.Dims() if isinstance(value, (int, float, bool)) else value.shape, + (1,) if isinstance(value, (int, float, bool)) else value.shape, numpy_value.copy() if 
isinstance(numpy_value, np.ndarray) else numpy_value, ) constant.name = name diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shape.py b/py/torch_tensorrt/dynamo/conversion/impl/shape.py index 2d2481936b..bd48351916 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shape.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shape.py @@ -11,7 +11,6 @@ from torch_tensorrt.dynamo.conversion.converter_utils import ( get_positive_dim, get_trt_tensor, - to_numpy, ) from torch_tensorrt.dynamo.conversion.impl.elementwise.base import ( convert_binary_elementwise, @@ -87,8 +86,9 @@ def get_shape_with_dynamic_shape( scale_res = scale_layer.get_output(0) length = input_shape.shape[0] + zero_layer = ctx.net.add_constant( - input_shape.shape, to_numpy(torch.zeros((length), dtype=torch.int32)) + input_shape.shape, np.zeros((length), dtype=np.int32) ) set_layer_name(zero_layer, target, f"{name}_zeros") diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 70badd796c..2dc3efe77b 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -102,7 +102,6 @@ def expand( # set_layer_name(layer, target, name, source_ir) # return layer.get_output(0) - breakpoint() expand_output = slice(ctx, target, source_ir, name, input_t, start, shape, stride) return expand_output From cd866609a18fb0dffe1e798a1831bcf698095de5 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 14 Mar 2024 02:01:07 -0700 Subject: [PATCH 29/73] feat: Add save API for torch-trt compiled models --- .github/scripts/install-torch-tensorrt.sh | 3 +- py/torch_tensorrt/_compile.py | 67 +++++++++++++++++++++ py/torch_tensorrt/dynamo/_compiler.py | 9 +-- py/torch_tensorrt/dynamo/_defaults.py | 1 - py/torch_tensorrt/dynamo/_exporter.py | 17 +----- py/torch_tensorrt/dynamo/_settings.py | 3 - tests/py/dynamo/models/test_export_serde.py | 58 +++++++++--------- 7 files changed, 104 insertions(+), 54 deletions(-) diff --git a/.github/scripts/install-torch-tensorrt.sh b/.github/scripts/install-torch-tensorrt.sh index 2930421d5b..9757fadeb4 100644 --- a/.github/scripts/install-torch-tensorrt.sh +++ b/.github/scripts/install-torch-tensorrt.sh @@ -2,7 +2,8 @@ set -eou pipefail # Source conda so it's available to the script environment source ${BUILD_ENV_FILE} -${CONDA_RUN} ${PIP_INSTALL_TORCH} torchvision pyyaml +${CONDA_RUN} ${PIP_INSTALL_TORCH} torchvision --extra-index-url https://pypi.python.org/simple +${CONDA_RUN} python -m pip install pyyaml mpmath==1.3.0 export TRT_VERSION=$(${CONDA_RUN} python -c "import versions; versions.tensorrt_version()") ${CONDA_RUN} python -m pip install /opt/torch-tensorrt-builds/torch_tensorrt*+${CU_VERSION}*.whl tensorrt~=${TRT_VERSION} tensorrt-bindings~=${TRT_VERSION} --extra-index-url=https://pypi.ngc.nvidia.com diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 9dd816e633..aa1bc53a0a 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -6,6 +6,7 @@ import torch import torch.fx +import torch_tensorrt.dynamo import torch_tensorrt.ts from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input @@ -29,6 +30,7 @@ __all__ = [ "compile", "convert_method_to_trt_engine", + "save", ] @@ -332,3 +334,68 @@ def convert_method_to_trt_engine( ) else: raise RuntimeError("Module is an unknown format or the ir requested is unknown") + + +def save( + module: Any, + file_path: str = "", + *, + output_format: str = 
"exported_program", + inputs: Optional[Sequence[torch.Tensor]] = None, + retrace: bool = False, +) -> None: + """ + Save the model to disk in the specified output format. + Arguments: + module : Compiled Torch-TensorRT module (Options include torch.jit.ScriptModule | torch.export.ExportedProgram | torch.fx.GraphModule) + inputs (torch.Tensor): Torch input tensors + """ + module_type = _parse_module_type(module) + accepted_formats = {"exported_program", "torchscript"} + if inputs and not all(isinstance(input, torch.Tensor) for input in inputs): + raise ValueError( + "Not all inputs provided are torch.tensors. Please provide torch.tensors as inputs" + ) + if output_format not in accepted_formats: + raise ValueError( + f"Provided output_format {output_format} is not supported. Supported options are exported_program | torchscript" + ) + if not file_path: + raise ValueError("File path cannot be empty. Please provide a valid file path") + + if module_type == _ModuleType.nn: + raise ValueError( + "Input model is of type nn.Module. Saving nn.Module directly is not supported. Supported model types torch.jit.ScriptModule | torch.fx.GraphModule | torch.export.ExportedProgram." + ) + elif module_type == _ModuleType.ts: + if output_format == "exported_program": + raise ValueError( + "Provided model is a torch.jit.ScriptModule but the output_format specified is exported_program. Please verify the output_format" + ) + else: + torch.jit.save(module, file_path) + elif module_type == _ModuleType.ep: + if output_format == "torchscript": + raise ValueError( + "Provided model is a torch.export.ExportedProgram but the output_format specified is torchscript. Please verify the output_format" + ) + else: + torch.export.save(module, file_path) + elif module_type == _ModuleType.fx: + if not inputs: + raise ValueError( + "Provided model is a torch.fx.GraphModule however the inputs are empty. 
Please provide valid torch.tensors as inputs to trace and save the model" + ) + # The module type is torch.fx.GraphModule + if output_format == "torchscript": + module_ts = torch.jit.trace(module, inputs) + torch.jit.save(module_ts, file_path) + else: + if not retrace: + from torch_tensorrt.dynamo._exporter import export + + exp_program = export(module, inputs) + torch.export.save(exp_program, file_path) + else: + exp_program = torch.export.export(module, tuple(inputs), strict=False) + torch.export.save(exp_program, file_path) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 6312532f1c..b321eabcb2 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -30,7 +30,6 @@ MIN_BLOCK_SIZE, NUM_AVG_TIMING_ITERS, OPTIMIZATION_LEVEL, - OUTPUT_FORMAT, PASS_THROUGH_BUILD_FAILURES, PRECISION, REFIT, @@ -48,7 +47,6 @@ dryrun_stats_display, parse_non_trt_nodes, ) -from torch_tensorrt.dynamo._exporter import export from torch_tensorrt.dynamo.conversion import ( CompilationSettings, UnsupportedOperatorException, @@ -102,9 +100,8 @@ def compile( enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = DRYRUN, hardware_compatible: bool = HARDWARE_COMPATIBLE, - output_format: str = OUTPUT_FORMAT, **kwargs: Any, -) -> Union[ExportedProgram, torch.jit.ScriptModule, torch.fx.GraphModule]: +) -> torch.fx.GraphModule: """Compile a TorchScript module for NVIDIA GPUs using TensorRT Takes a existing TorchScript module and a set of settings to configure the compiler @@ -246,14 +243,12 @@ def compile( "dla_global_dram_size": dla_global_dram_size, "dryrun": dryrun, "hardware_compatible": hardware_compatible, - "output_format": output_format, } settings = CompilationSettings(**compilation_options) logger.info("Compilation Settings: %s\n", settings) trt_gm = compile_module(gm, inputs, settings) - trt_result = export(trt_gm, torch_inputs, output_format) - return trt_result + return trt_gm def compile_module( diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index ec038c0dba..3d48ab3def 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -26,7 +26,6 @@ REQUIRE_FULL_COMPILATION = False DRYRUN = False HARDWARE_COMPATIBLE = False -OUTPUT_FORMAT = "exported_program" def default_device() -> Device: diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index c7e2f37795..bae20ac235 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -18,27 +18,16 @@ def export( gm: torch.fx.GraphModule, inputs: Sequence[torch.Tensor], - output_format: str, ) -> ExportedProgram: """Export the result of TensorRT compilation into the desired output format. Arguments: gm (torch.fx.GraphModule): Compiled Torch-TensorRT module, generated by ``torch_tensorrt.dynamo.compile`` inputs (torch.Tensor): Torch input tensors - output_format (str): Output format of the result of TRT compilation. Options include "exported_program" (or) "ep" | "torchscript" (or) "ts" | "graph_module" (or) "fx". 
Default is "exported_program" """ - if output_format == "torchscript" or output_format == "ts": - return torch.jit.trace(gm, inputs) - elif output_format == "exported_program" or output_format == "ep": - patched_module = transform(gm, inputs) - exp_program = create_trt_exp_program(patched_module) - return exp_program - elif output_format == "graph_module" or output_format == "fx": - return gm - else: - raise ValueError( - f"Invalid output format {output_format} specified. Supported options include exported_program (or) ep | torchscript (or) ts | graph_module (or) fx" - ) + patched_module = transform(gm, inputs) + exp_program = create_trt_exp_program(patched_module) + return exp_program def transform( diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index c00b049f45..2420a227d8 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -19,7 +19,6 @@ MIN_BLOCK_SIZE, NUM_AVG_TIMING_ITERS, OPTIMIZATION_LEVEL, - OUTPUT_FORMAT, PASS_THROUGH_BUILD_FAILURES, PRECISION, REFIT, @@ -71,7 +70,6 @@ class CompilationSettings: TRT Engines. Prints detailed logs of the graph structure and nature of partitioning. Optionally saves the ouptut to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) - output_format (str): Output format of the result of TRT compilation. Options include "exported_program" (or) "ep" | "torchscript" (or) "ts" | "graph_module" (or) "fx". Default is "exported_program" """ precision: torch.dtype = PRECISION @@ -99,4 +97,3 @@ class CompilationSettings: dla_global_dram_size: int = DLA_GLOBAL_DRAM_SIZE dryrun: Union[bool, str] = DRYRUN hardware_compatible: bool = HARDWARE_COMPATIBLE - output_format: str = OUTPUT_FORMAT diff --git a/tests/py/dynamo/models/test_export_serde.py b/tests/py/dynamo/models/test_export_serde.py index efa593890e..0905c5e859 100644 --- a/tests/py/dynamo/models/test_export_serde.py +++ b/tests/py/dynamo/models/test_export_serde.py @@ -42,18 +42,18 @@ def forward(self, x): } exp_program = torchtrt.dynamo.trace(model, **compile_spec) - trt_exp_program = torchtrt.dynamo.compile(exp_program, **compile_spec) - torch.export.save(trt_exp_program, "/tmp/trt.ep") + trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + torchtrt.save(trt_module, "/tmp/trt.ep", inputs=[input]) deser_trt_exp_program = torch.export.load("/tmp/trt.ep") - + deser_trt_module = deser_trt_exp_program.module() # Check Pyt and TRT exported program outputs - cos_sim = cosine_similarity(model(input), trt_exp_program(input)[0]) + cos_sim = cosine_similarity(model(input), trt_module(input)[0]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, msg=f"test_base_model_full_compile TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) # Check Pyt and deserialized TRT exported program outputs - cos_sim = cosine_similarity(model(input), deser_trt_exp_program(input)[0]) + cos_sim = cosine_similarity(model(input), deser_trt_module(input)[0]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, msg=f"test_base_model_full_compile TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", @@ -93,12 +93,13 @@ def forward(self, x): } exp_program = torchtrt.dynamo.trace(model, **compile_spec) - trt_exp_program = torchtrt.dynamo.compile(exp_program, **compile_spec) - torch.export.save(trt_exp_program, "/tmp/trt.ep") + trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + torchtrt.save(trt_module, "/tmp/trt.ep", inputs=[input]) deser_trt_exp_program = torch.export.load("/tmp/trt.ep") + deser_trt_module = deser_trt_exp_program.module() # Check Pyt and TRT exported program outputs outputs_pyt = model(input) - outputs_trt = trt_exp_program(input) + outputs_trt = trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt[idx]) assertions.assertTrue( @@ -107,7 +108,7 @@ def forward(self, x): ) # Check Pyt and deserialized TRT exported program outputs - outputs_trt_deser = deser_trt_exp_program(input) + outputs_trt_deser = deser_trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt_deser[idx]) assertions.assertTrue( @@ -149,12 +150,13 @@ def forward(self, x): } exp_program = torchtrt.dynamo.trace(model, **compile_spec) - trt_exp_program = torchtrt.dynamo.compile(exp_program, **compile_spec) - torch.export.save(trt_exp_program, "/tmp/trt.ep") + trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + torchtrt.save(trt_module, "/tmp/trt.ep", inputs=[input]) deser_trt_exp_program = torch.export.load("/tmp/trt.ep") + deser_trt_module = deser_trt_exp_program.module() # Check Pyt and TRT exported program outputs outputs_pyt = model(input) - outputs_trt = trt_exp_program(input) + outputs_trt = trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt[idx]) assertions.assertTrue( @@ -163,7 +165,7 @@ def forward(self, x): ) # Check Pyt and deserialized TRT exported program outputs - outputs_trt_deser = deser_trt_exp_program(input) + outputs_trt_deser = deser_trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt_deser[idx]) assertions.assertTrue( @@ -207,12 +209,12 @@ def forward(self, x): } exp_program = torchtrt.dynamo.trace(model, **compile_spec) - trt_exp_program = torchtrt.dynamo.compile(exp_program, **compile_spec) - torch.export.save(trt_exp_program, "/tmp/trt.ep") + trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + torchtrt.save(trt_module, "/tmp/trt.ep", inputs=[input]) deser_trt_exp_program = torch.export.load("/tmp/trt.ep") - + deser_trt_module = deser_trt_exp_program.module() outputs_pyt = model(input) - outputs_trt = trt_exp_program(input) + outputs_trt = trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt[idx]) assertions.assertTrue( @@ -220,7 +222,7 @@ def forward(self, x): msg=f"test_hybrid_relu_fallback TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) - outputs_trt_deser = deser_trt_exp_program(input) + outputs_trt_deser = deser_trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt_deser[idx]) assertions.assertTrue( @@ -248,19 +250,19 @@ def test_resnet18(ir): } exp_program = torchtrt.dynamo.trace(model, **compile_spec) - trt_exp_program = torchtrt.dynamo.compile(exp_program, **compile_spec) - torch.export.save(trt_exp_program, "/tmp/trt.ep") + trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + torchtrt.save(trt_module, "/tmp/trt.ep", inputs=[input]) deser_trt_exp_program = torch.export.load("/tmp/trt.ep") - + deser_trt_module = deser_trt_exp_program.module() outputs_pyt = model(input) - outputs_trt = trt_exp_program(input) + outputs_trt = trt_module(input) cos_sim = cosine_similarity(outputs_pyt, outputs_trt[0]) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, msg=f"test_resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) - outputs_trt_deser = deser_trt_exp_program(input) + outputs_trt_deser = deser_trt_module(input) cos_sim = cosine_similarity(outputs_pyt, outputs_trt_deser[0]) assertions.assertTrue( @@ -303,12 +305,12 @@ def forward(self, x): } exp_program = torchtrt.dynamo.trace(model, **compile_spec) - trt_exp_program = torchtrt.dynamo.compile(exp_program, **compile_spec) - torch.export.save(trt_exp_program, "/tmp/trt.ep") + trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + torchtrt.save(trt_module, "/tmp/trt.ep", inputs=[input]) deser_trt_exp_program = torch.export.load("/tmp/trt.ep") - + deser_trt_module = deser_trt_exp_program.module() outputs_pyt = model(input) - outputs_trt = trt_exp_program(input) + outputs_trt = trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt[idx]) @@ -317,7 +319,7 @@ def forward(self, x): msg=f"test_hybrid_conv_fallback TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", ) - outputs_trt_deser = deser_trt_exp_program(input) + outputs_trt_deser = deser_trt_module(input) for idx in range(len(outputs_pyt)): cos_sim = cosine_similarity(outputs_pyt[idx], outputs_trt_deser[idx]) assertions.assertTrue( From eab0dba2955a87550fe12e7b67ae092597b8c453 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 18 Mar 2024 12:43:45 -0700 Subject: [PATCH 30/73] chore: Fix save failures --- core/runtime/TRTEngine.cpp | 2 +- py/torch_tensorrt/_compile.py | 9 +- py/torch_tensorrt/dynamo/_exporter.py | 135 +++++++++++++------- tests/py/dynamo/models/test_export_serde.py | 2 +- 4 files changed, 100 insertions(+), 48 deletions(-) diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp index 92e5d7a8ff..7a046f6d94 100644 --- a/core/runtime/TRTEngine.cpp +++ b/core/runtime/TRTEngine.cpp @@ -241,7 +241,7 @@ std::string TRTEngine::to_str() const { exec_ctx->getEngine().getTensorDataType(out_binding_names[o].c_str())) << std::endl; } - ss << " }" << std::endl; + ss << " ]" << std::endl; ss << " Device: " << device_info << std::endl; ss << " Hardware Compatibility: " << (hardware_compatible ? 
"Enabled" : "Disabled") << std::endl; // clang-format on diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index aa1bc53a0a..443dec8869 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -397,5 +397,10 @@ def save( exp_program = export(module, inputs) torch.export.save(exp_program, file_path) else: - exp_program = torch.export.export(module, tuple(inputs), strict=False) - torch.export.save(exp_program, file_path) + from torch._higher_order_ops.torchbind import enable_torchbind_tracing + + with enable_torchbind_tracing(): + exp_program = torch.export.export( + module, tuple(inputs), strict=False + ) + torch.export.save(exp_program, file_path) diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index bae20ac235..cf06bc4531 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -1,3 +1,4 @@ +import copy import operator from typing import Any, Dict, Sequence, Tuple, cast @@ -6,8 +7,11 @@ from torch._subclasses.fake_tensor import FakeTensor from torch.export import ExportedProgram, ExportGraphSignature from torch.export.exported_program import ( + CustomObjArgument, InputKind, InputSpec, + ModuleCallEntry, + ModuleCallSignature, OutputKind, OutputSpec, TensorArgument, @@ -44,24 +48,27 @@ def transform( Returns an inlined torch.fx.GraphModule """ + gm_export = copy.deepcopy(gm) # Run shape analysis - _, outputs_map = partitioning.run_shape_analysis(gm, inputs) + _, outputs_map = partitioning.run_shape_analysis(gm_export, inputs) # Inline TensorRT submodules - inline_trt_modules(gm, outputs_map) + inline_trt_modules(gm_export, outputs_map) # Inline pytorch submodules - inline_torch_modules(gm) + inline_torch_modules(gm_export) # Clean the graph - gm.delete_all_unused_submodules() - gm.graph.eliminate_dead_code() - gm.graph.lint() + gm_export.delete_all_unused_submodules() + gm_export.graph.eliminate_dead_code() + gm_export.graph.lint() - return gm + return gm_export -def lift(gm: torch.fx.GraphModule, graph_signature: Any) -> torch.fx.GraphModule: +def lift( + gm: torch.fx.GraphModule, graph_signature: Any +) -> Tuple[torch.fx.GraphModule, ExportGraphSignature, Dict[str, Any], Dict[str, Any]]: """ Given an unlifted fx.GraphModule, lift all parameters, buffers into placeholders. Arguments: @@ -75,6 +82,7 @@ def lift(gm: torch.fx.GraphModule, graph_signature: Any) -> torch.fx.GraphModule # exp_program.state_dict contains parameters and buffers whereas a graph_module's state_dict # has all parameters registered as torch.tensors. state_dict = gm.state_dict() + constants = {} fake_mode = detect_fake_mode( tuple(node.meta["val"] for node in gm.graph.nodes if node.op == "placeholder") @@ -89,52 +97,68 @@ def lift(gm: torch.fx.GraphModule, graph_signature: Any) -> torch.fx.GraphModule break # At first the user_inputs are only present in the graph_signature.input_specs and hence non_user_input_idx=0 - # The input_specs should be of the form [params, buffers, constant_tensors, user_inputs] + # The input_specs should be of the form [params, buffers, constant_tensors, custom_obj, user_inputs] non_user_input_idx = 0 for node in gm.graph.nodes: if node.op == "get_attr": - if node.target not in state_dict: - raise ValueError( - f"The get_attr node : {node.name} with target: {node.target} value could not be found in state_dict. Please check the input exported_program's graphmodule parameters." 
- ) - constant_tensor = state_dict[node.target] - input_kind = InputKind.CONSTANT_TENSOR + lift_val = None + input_kind = None - # state_dict has these parameters/buffers as torch.Tensors. We override them as torch.nn.Parameter/torch.Tensors respectively. - for name, _ in gm.named_parameters(): - if node.target == name: - input_kind = InputKind.PARAMETER - state_dict[name] = torch.nn.Parameter(state_dict[name]) - break - for name, _ in gm.named_buffers(): - if node.target == name: - input_kind = InputKind.BUFFER - break + if node.target not in state_dict: + constants[node.target] = getattr(gm, node.target) + input_kind = InputKind.CUSTOM_OBJ + lift_val = constants[node.target] + else: + lift_val = state_dict[node.target] + + input_kind = InputKind.CONSTANT_TENSOR + + # state_dict has these parameters/buffers as torch.Tensors. We override them as torch.nn.Parameter/torch.Tensors respectively. + for name, _ in gm.named_parameters(): + if node.target == name: + input_kind = InputKind.PARAMETER + state_dict[name] = torch.nn.Parameter(state_dict[name]) + break + for name, _ in gm.named_buffers(): + if node.target == name: + input_kind = InputKind.BUFFER + break + + assert lift_val is not None and input_kind is not None # Replace get_attr nodes with placeholder nodes and copy metadata. with gm.graph.inserting_before(first_user_input): - const_placeholder_node = gm.graph.placeholder(node.target) + const_placeholder_node = gm.graph.placeholder( + node.target.replace(".", "_") + ) # Copy the node meta into this new placeholder node const_placeholder_node.meta = node.meta - const_placeholder_node.meta["val"] = cast( - FakeTensor, - torch.empty_strided( - tuple(constant_tensor.shape), - tuple([1] * len(constant_tensor.shape)), - ), - ) + + if isinstance(lift_val, torch.Tensor): + const_placeholder_node.meta["val"] = cast( + FakeTensor, + torch.empty_strided( + tuple(lift_val.shape), + tuple([1] * len(lift_val.shape)), + ), + ) node.replace_all_uses_with(const_placeholder_node) gm.graph.erase_node(node) # Add these parameters/buffers/constants to the existing graph signature # before user inputs. These specs are looked up in the state_dict during ExportedProgram creation. 
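
For illustration (an editorial sketch, not lines from this patch), the spec inserted for a lifted parameter would look roughly like the following; the attribute and placeholder names are hypothetical:

    from torch.export.exported_program import InputKind, InputSpec, TensorArgument

    # Hypothetical spec for a get_attr target "conv.weight" lifted to a
    # placeholder named "conv_weight"; it is inserted ahead of the user inputs
    # so ExportedProgram can look the value up in state_dict when reloaded.
    example_spec = InputSpec(
        kind=InputKind.PARAMETER,
        arg=TensorArgument(name="conv_weight"),
        target="conv.weight",
    )
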
+ input_spec_arg = TensorArgument(name=const_placeholder_node.name) + if input_kind == InputKind.CUSTOM_OBJ: + input_spec_arg = CustomObjArgument( + name=const_placeholder_node.name, class_fqn="" + ) graph_signature.input_specs.insert( non_user_input_idx, InputSpec( kind=input_kind, - arg=TensorArgument(name=const_placeholder_node.name), + arg=input_spec_arg, target=node.target, ), ) @@ -143,7 +167,7 @@ def lift(gm: torch.fx.GraphModule, graph_signature: Any) -> torch.fx.GraphModule gm.graph.eliminate_dead_code() gm.graph.lint() - return gm, graph_signature, state_dict + return gm, graph_signature, state_dict, constants def get_duplicate_nodes( @@ -281,18 +305,30 @@ def create_trt_exp_program( input_specs=input_specs, output_specs=output_specs ) + module_call_graph = [ + ModuleCallEntry( + "", + ModuleCallSignature( + inputs=[], + outputs=[], + in_spec=gm.graph._codegen.pytree_info.in_spec, + out_spec=gm.graph._codegen.pytree_info.out_spec, + ), + ) + ] + # Lift parameters/buffers/constants in the graph # torch.export serialization expects them to be lifted - gm, trt_graph_signature, state_dict = lift(gm, trt_graph_signature) + gm, trt_graph_signature, state_dict, constants = lift(gm, trt_graph_signature) trt_exp_program = ExportedProgram( - gm, - gm.graph, - trt_graph_signature, - state_dict, - {}, - [], - [], + root=gm, + graph=gm.graph, + graph_signature=trt_graph_signature, + state_dict=state_dict, + range_constraints={}, + module_call_graph=module_call_graph, + constants=constants, ) return trt_exp_program @@ -319,9 +355,13 @@ def inline_trt_modules( num_outputs = len(outputs_map[trt_module_node.name]) # Insert a call_function node to perform inference on TRT engine with gm.graph.inserting_before(trt_module_node): + engine_name = f"{name}_engine" + setattr(gm, engine_name, trt_module.engine) + engine_node = gm.graph.get_attr(engine_name) + trt_node = gm.graph.call_function( torch.ops.tensorrt.execute_engine.default, - (trt_module_node.args, trt_module.engine), + (trt_module_node.args, engine_node), ) trt_node.meta["val"] = [] assert num_outputs > 0 @@ -337,6 +377,13 @@ def inline_trt_modules( ) ) + # meta["val"] should be a lighter version of a tensor. For eg: it should be a FakeTensor (with output shape and dtype properties) + # Lighter version of a custom_obj is not defined clearly. 
meta["val"] does not have any type expectations but + # for custom object nodes, it should be CustomObjArgument + engine_node.meta["val"] = CustomObjArgument( + name=engine_node.name, class_fqn="" + ) + if num_outputs == 1: # Insert getitem nodes as outputs (for export serialization to work) with gm.graph.inserting_after(trt_node): diff --git a/tests/py/dynamo/models/test_export_serde.py b/tests/py/dynamo/models/test_export_serde.py index 0905c5e859..40fa01c2c9 100644 --- a/tests/py/dynamo/models/test_export_serde.py +++ b/tests/py/dynamo/models/test_export_serde.py @@ -146,7 +146,6 @@ def forward(self, x): ) ], "ir": ir, - "debug": True, } exp_program = torchtrt.dynamo.trace(model, **compile_spec) @@ -306,6 +305,7 @@ def forward(self, x): exp_program = torchtrt.dynamo.trace(model, **compile_spec) trt_module = torchtrt.dynamo.compile(exp_program, **compile_spec) + torchtrt.save(trt_module, "/tmp/trt.ep", inputs=[input]) deser_trt_exp_program = torch.export.load("/tmp/trt.ep") deser_trt_module = deser_trt_exp_program.module() From b191d62bafac9740657cb9dc67ccafb213d4914c Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 18 Mar 2024 16:29:04 -0700 Subject: [PATCH 31/73] chore: update to 2.3 rc build --- py/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py/requirements.txt b/py/requirements.txt index cd52d32436..419c325653 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -1,9 +1,9 @@ numpy packaging pybind11==2.6.2 ---extra-index-url https://download.pytorch.org/whl/nightly/cu121 -torch>=2.3.0.dev,<2.4.0 -torchvision>=0.18.0.dev,<0.19.0 +--index-url https://download.pytorch.org/whl/test/cu121 +torch>=2.3.0,<2.4.0 +torchvision>=0.18.0,<0.19.0 --extra-index-url https://pypi.ngc.nvidia.com tensorrt==8.6.1 pyyaml From 5f34d4fe7231167e91fae32581fabc420c79904b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 19 Mar 2024 13:36:37 -0700 Subject: [PATCH 32/73] chore: minor fixes --- py/torch_tensorrt/dynamo/conversion/converter_utils.py | 2 +- tests/py/dynamo/models/test_dyn_models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 7d5e59367b..f9d14917f1 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -270,7 +270,7 @@ def create_constant( """ numpy_value = to_numpy(value, dtype) constant = ctx.net.add_constant( - trt.Dims() if isinstance(value, (int, float, bool)) else value.shape, + (1,) if isinstance(value, (int, float, bool)) else value.shape, numpy_value.copy() if isinstance(numpy_value, np.ndarray) else numpy_value, ) constant.name = name diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index f9f1d02c02..e4675b41be 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -152,7 +152,7 @@ def forward(self, x): } trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) + cos_sim = cosine_similarity(model(input), trt_mod(input)) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, msg=f"test_base_dynamic model TRT outputs don't match with the pytorch model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", From 8674a3c437d767c6e2b09db3a574af9a99c318d3 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 19 Mar 2024 16:18:17 -0700 Subject: [PATCH 33/73] chore: minor fixes --- py/torch_tensorrt/dynamo/_exporter.py | 1 + .../lowering/test_aten_lowering_passes.py | 12 ++++++++---- tests/py/dynamo/models/test_models_export.py | 19 ++++++++++--------- tests/py/dynamo/testing_utilities.py | 1 + 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index cf06bc4531..d4a9fd3584 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -129,6 +129,7 @@ def lift( # Replace get_attr nodes with placeholder nodes and copy metadata. with gm.graph.inserting_before(first_user_input): + # Ensure name doesn't contain period as it is used for submodules const_placeholder_node = gm.graph.placeholder( node.target.replace(".", "_") ) diff --git a/tests/py/dynamo/lowering/test_aten_lowering_passes.py b/tests/py/dynamo/lowering/test_aten_lowering_passes.py index bc75a8aa3d..3afc5e5923 100644 --- a/tests/py/dynamo/lowering/test_aten_lowering_passes.py +++ b/tests/py/dynamo/lowering/test_aten_lowering_passes.py @@ -1,9 +1,12 @@ import torch -from torch.testing._internal.common_utils import TestCase, run_tests - import torch_tensorrt +from torch.testing._internal.common_utils import TestCase, run_tests -from ..testing_utilities import DECIMALS_OF_AGREEMENT, lower_graph_testing +from ..testing_utilities import ( + DECIMALS_OF_AGREEMENT, + DECIMALS_OF_AGREEMENT_3, + lower_graph_testing, +) class TestInputAsOutput(TestCase): @@ -444,10 +447,11 @@ def forward(self, input, weight, bias): max_diff = float( torch.max(torch.abs(optimized_model_results - torch_model_results)) ) + self.assertAlmostEqual( max_diff, 0, - DECIMALS_OF_AGREEMENT, + DECIMALS_OF_AGREEMENT_3, msg=f"Linear TRT outputs don't match with the original model.", ) torch._dynamo.reset() diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index fd7b40592a..bc8bf12c95 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -159,11 +159,11 @@ def test_bert_base_uncased(ir): model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() input = torch.randint(0, 1, (1, 14), dtype=torch.int32).to("cuda") input2 = torch.randint(0, 1, (1, 14), dtype=torch.int32).to("cuda") - model = ( - transformers_trace(model, input_names=["input_ids", "attention_mask"]) - .eval() - .cuda() - ) + # model = ( + # transformers_trace(model, input_names=["input_ids", "attention_mask"]) + # .eval() + # .cuda() + # ) compile_spec = { "inputs": [ @@ -182,8 +182,8 @@ def test_bert_base_uncased(ir): "enabled_precisions": {torch.float}, "truncate_long_and_double": True, "ir": ir, - "min_block_size": 10, - "torch_executed_ops": {"torch.ops.aten.gelu.default"}, + "min_block_size": 15, + "debug": True, } trt_mod = torchtrt.compile(model, **compile_spec) model_outputs = model(input, input2) @@ -192,8 +192,9 @@ def test_bert_base_uncased(ir): len(model_outputs) == len(trt_model_outputs), msg=f"Number of outputs for BERT model compilation is different with Pytorch {len(model_outputs)} and TensorRT {len(trt_model_outputs)}. 
Please check the compilation.", ) - for index, key in enumerate(model_outputs): - out, trt_out = model_outputs[key], trt_model_outputs[index] + + for key, _ in model_outputs.items(): + out, trt_out = model_outputs[key], trt_model_outputs[key] cos_sim = cosine_similarity(out, trt_out) assertions.assertTrue( cos_sim > COSINE_THRESHOLD, diff --git a/tests/py/dynamo/testing_utilities.py b/tests/py/dynamo/testing_utilities.py index 742b9fc1a3..c815d2fde4 100644 --- a/tests/py/dynamo/testing_utilities.py +++ b/tests/py/dynamo/testing_utilities.py @@ -14,6 +14,7 @@ ) DECIMALS_OF_AGREEMENT = 4 +DECIMALS_OF_AGREEMENT_3 = 3 def fx_dynamo_testing_backend( From f4e8fe9bc3f114dd8da3760dca510f78a8f58a0d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 19 Mar 2024 17:04:38 -0700 Subject: [PATCH 34/73] chore: remove duplicate bert test case --- tests/py/dynamo/models/test_models_export.py | 53 -------------------- 1 file changed, 53 deletions(-) diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index bc8bf12c95..4d0f4e2e7f 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -105,55 +105,6 @@ def test_efficientnet_b0(ir): torch._dynamo.reset() -@pytest.mark.unit -def test_bert_base_uncased(ir): - model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() - input = torch.randint(0, 1, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 1, (1, 14), dtype=torch.int32).to("cuda") - model = ( - transformers_trace(model, input_names=["input_ids", "attention_mask"]) - .eval() - .cuda() - ) - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "truncate_long_and_double": True, - "ir": ir, - "min_block_size": 10, - } - trt_mod = torchtrt.compile(model, **compile_spec) - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - assertions.assertTrue( - len(model_outputs) == len(trt_model_outputs), - msg=f"Number of outputs for BERT model compilation is different with Pytorch {len(model_outputs)} and TensorRT {len(trt_model_outputs)}. Please check the compilation.", - ) - for index, key in enumerate(model_outputs): - out, trt_out = model_outputs[key], trt_model_outputs[index] - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - @pytest.mark.unit def test_bert_base_uncased(ir): model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() @@ -183,7 +134,6 @@ def test_bert_base_uncased(ir): "truncate_long_and_double": True, "ir": ir, "min_block_size": 15, - "debug": True, } trt_mod = torchtrt.compile(model, **compile_spec) model_outputs = model(input, input2) @@ -204,9 +154,6 @@ def test_bert_base_uncased(ir): # Clean up model env torch._dynamo.reset() - with torch.no_grad(): - torch.cuda.empty_cache() - @pytest.mark.unit def test_resnet18_half(ir): From 4ae6ab95a98f5b1f644c3bfd3824ac76df442dc7 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 19 Mar 2024 17:05:51 -0700 Subject: [PATCH 35/73] chore: remove comments --- tests/py/dynamo/models/test_models_export.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index 4d0f4e2e7f..84f6bf7a36 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -110,11 +110,6 @@ def test_bert_base_uncased(ir): model = BertModel.from_pretrained("bert-base-uncased").cuda().eval() input = torch.randint(0, 1, (1, 14), dtype=torch.int32).to("cuda") input2 = torch.randint(0, 1, (1, 14), dtype=torch.int32).to("cuda") - # model = ( - # transformers_trace(model, input_names=["input_ids", "attention_mask"]) - # .eval() - # .cuda() - # ) compile_spec = { "inputs": [ From 78f7eb550603134666b4e8f646d5dee43358e699 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 2 Apr 2024 05:45:27 -0700 Subject: [PATCH 36/73] chore: updates --- .../dynamo/conversion/aten_ops_converters.py | 5 ++ .../dynamo/conversion/impl/slice/ops.py | 4 - .../dynamo/conversion/ops_evaluators.py | 7 -- .../dynamo/lowering/passes/view_to_reshape.py | 1 - .../dynamo/partitioning/__init__.py | 1 - .../dynamo/partitioning/common.py | 75 ------------------- tests/py/dynamo/models/test_dyn_models.py | 2 +- 7 files changed, 6 insertions(+), 89 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 3f547f9d40..72998e1917 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -392,6 +392,11 @@ def aten_ops_sigmoid( ) +@enforce_tensor_types( + { + 0: (TRTTensor,), + } +) @dynamo_tensorrt_converter(torch.ops.aten.sym_size.int) def aten_ops_symsize_int( ctx: ConversionContext, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 70badd796c..e578ebee54 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -98,11 +98,7 @@ def expand( stride = tuple( [int(i == o) for i, o in zip(input_tensor_shape, shape)] ) # stride == 1 if dimensions match, 0 otherwise - # layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride) - # set_layer_name(layer, target, name, source_ir) - # return layer.get_output(0) - breakpoint() expand_output = slice(ctx, target, source_ir, name, input_t, start, shape, stride) return expand_output diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index b35f198028..5ddd8c5e3a 100644 --- 
a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -46,14 +46,7 @@ def aten_ops_arange_start_step( kwargs: Dict[str, Argument], name: str, ) -> Union[TRTTensor, Sequence[TRTTensor]]: - # breakpoint() fill_layer = ctx.net.add_fill(trt.Dims(), trt.FillOperation.LINSPACE) fill_layer.set_input(0, args[1]) fill_layer.set_output_type(0, trt.DataType.INT32) - # fill_layer.set_input(1, 0) - # fill_layer.set_input(2, 1) - # start_tensor = get_trt_tensor(ctx, 0, "_start_tensor") - # fill_layer.set_input(1, start_tensor) - # delta_tensor = get_trt_tensor(ctx, torch.tensor([0], dtype=torch.int32), "_delta_tensor") - # fill_layer.set_input(2, delta_tensor) return fill_layer.get_output(0) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py b/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py index db0346348b..b2da354122 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py @@ -28,7 +28,6 @@ def replacement(input: torch.Tensor, shape: List[torch.SymInt]) -> torch.Tensor: # Store metadata of the orig_op metadata = get_metadata(gm, orig_op) - # breakpoint() if torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement): gm = clean_up_graph_after_modifications(gm) diff --git a/py/torch_tensorrt/dynamo/partitioning/__init__.py b/py/torch_tensorrt/dynamo/partitioning/__init__.py index 5e5406e67c..25487da065 100644 --- a/py/torch_tensorrt/dynamo/partitioning/__init__.py +++ b/py/torch_tensorrt/dynamo/partitioning/__init__.py @@ -3,6 +3,5 @@ from .common import ( construct_submodule_inputs, get_graph_converter_support, - get_submod_inputs, run_shape_analysis, ) diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index 109bda275f..270973c8c3 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -4,7 +4,6 @@ import torch from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._defaults import DEBUG -from torch_tensorrt.dynamo.utils import get_torch_inputs, input_is_dynamic logger = logging.getLogger(__name__) @@ -135,80 +134,6 @@ def get_submodule_io( return submod_inputs_shape_map, submod_outputs_shape_map -def get_submod_inputs( - mod: torch.fx.GraphModule, - submod: torch.fx.GraphModule, - inputs: Sequence[Input], - device: torch.device, -) -> Optional[Sequence[torch.Tensor]]: - """Helper function to get inputs to a Torch submodule - - Args: - mod: Parent FX GraphModule - submod: Child FX GraphModule - inputs: Sample inputs to parent module - Returns: - Sequence of Tensors representing inputs to child module - """ - acc_inputs: Any = None - - def get_input(self: Any, inputs: Sequence[torch.Tensor]) -> None: - nonlocal acc_inputs - acc_inputs = inputs - return - - # Register a hook to capture submodule input - handle = submod.register_forward_pre_hook(get_input) - # Iterate over min, opt, max shapes for dynamic inputs - inputs_map = {} - - if input_is_dynamic(inputs): - for mode in ["min_shape", "opt_shape", "max_shape"]: - torch_inputs = get_torch_inputs(inputs, device, mode) - mod(*torch_inputs) - inputs_map[mode] = acc_inputs - handle.remove() - else: - torch_inputs = get_torch_inputs(inputs, device) - mod(*torch_inputs) - handle.remove() - assert isinstance(acc_inputs, tuple) - return [ - Input(shape=acc_input.shape, dtype=acc_input.dtype) - for acc_input in acc_inputs - ] - - 
num_submodule_inputs = ( - len(inputs_map["min_shape"]) if inputs_map["min_shape"] else 0 - ) - submodule_inputs = [] - for idx in range(num_submodule_inputs): - if not isinstance(inputs_map["min_shape"][idx], torch.Tensor): - input_val = torch.tensor(inputs_map["opt_shape"][idx], dtype=torch.int32) - logger.warning( - "Detected a zero-dimensional input. This might be a shape tensor input which is not currently supported. This might result in undefined behavior" - ) - submodule_inputs.append( - Input( - shape=[1], - torch_tensor=input_val, - dtype=input_val.dtype, - ) - ) - else: - submodule_inputs.append( - Input( - min_shape=inputs_map["min_shape"][idx].shape, - opt_shape=inputs_map["opt_shape"][idx].shape, - max_shape=inputs_map["max_shape"][idx].shape, - torch_tensor=inputs_map["opt_shape"][idx], - dtype=inputs_map["opt_shape"][idx].dtype, - ) - ) - - return submodule_inputs - - def get_graph_converter_support( graph_module: torch.fx.GraphModule, verbose: bool = DEBUG, diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index e4675b41be..822ee468a9 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -64,7 +64,7 @@ def forward(self, x): @pytest.mark.unit def test_base_dynamic_fallback(ir): """ - Tests the model (which is fully convertible) with dynamic shapes + Tests the model with dynamic shapes where torch.abs op is forced to run in PyTorch """ class MyModule(torch.nn.Module): From e9b649d2466b87bd3200fe2bbd26b2de61d57f66 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 4 Apr 2024 21:46:19 -0700 Subject: [PATCH 37/73] chore: revert changes --- .../dynamo/conversion/impl/slice/base.py | 19 +------------------ .../dynamo/conversion/impl/slice/ops.py | 5 +++-- .../dynamo/conversion/ops_evaluators.py | 7 ++----- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py index 64225227aa..018ac63b8c 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py @@ -3,7 +3,6 @@ from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape from torch_tensorrt.fx.converters.converter_utils import ( has_dynamic_shape, @@ -12,22 +11,6 @@ from torch_tensorrt.fx.types import Shape, TRTTensor -def get_dynamic_shape(ctx, target, source_ir, name, shape, input): - trt_shape = [] - shape = input.shape - for i, s in enumerate(shape): - if isinstance(s, TRTTensor): - trt_shape.append(s) - else: - a = get_trt_tensor(ctx, s, f"{name}_{i}") - trt_shape.append(a) - shape_layer = ctx.net.add_concatenation(inputs=trt_shape) - shape_layer.axis = 0 - shape_layer.name = f"{name}_output_shape" - - return shape_layer.get_output(0) - - def slice( ctx: ConversionContext, target: Target, @@ -40,7 +23,7 @@ def slice( ) -> TRTTensor: dynamic_shape = has_dynamic_shape(input.shape) if dynamic_shape: - shape = get_dynamic_shape(ctx, target, source_ir, name, shape, input) + shape = get_shape_with_dynamic_shape(ctx, target, source_ir, name, shape, input) layer = ctx.net.add_slice( input, start=start, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py 
b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index e578ebee54..61d71fe9a0 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -99,8 +99,9 @@ def expand( [int(i == o) for i, o in zip(input_tensor_shape, shape)] ) # stride == 1 if dimensions match, 0 otherwise - expand_output = slice(ctx, target, source_ir, name, input_t, start, shape, stride) - return expand_output + layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride) + set_layer_name(layer, target, name, source_ir) + return layer.get_output(0) def chunk( diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index 5ddd8c5e3a..f83e0e5008 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -2,7 +2,7 @@ import operator from typing import Dict, Sequence, Tuple, Union -import tensorrt as trt +import numpy as np import torch from torch.fx.node import Argument, Node, Target from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -46,7 +46,4 @@ def aten_ops_arange_start_step( kwargs: Dict[str, Argument], name: str, ) -> Union[TRTTensor, Sequence[TRTTensor]]: - fill_layer = ctx.net.add_fill(trt.Dims(), trt.FillOperation.LINSPACE) - fill_layer.set_input(0, args[1]) - fill_layer.set_output_type(0, trt.DataType.INT32) - return fill_layer.get_output(0) + return np.arange(*args) From 5a627619e8aa7f09f56d3a255bc75d86e9730d29 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 5 Apr 2024 19:52:39 +0000 Subject: [PATCH 38/73] chore: patches for llamav2 --- .../dynamo/conversion/converter_utils.py | 11 ++- .../dynamo/conversion/impl/slice/ops.py | 93 ++++++++++++++----- .../dynamo/conversion/ops_evaluators.py | 25 +++++ 3 files changed, 106 insertions(+), 23 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index f9d14917f1..8353bc0970 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -254,6 +254,7 @@ def create_constant( value: Union[int, float, bool, np.ndarray, torch.Tensor], name: str, dtype: Optional[Union[torch.dtype, np.dtype, TRTDataType]], + rank: Optional[int] = 1, ) -> TRTTensor: """ Add a TensorRT constant layer whose value is `value` to `ctx.net`. @@ -269,8 +270,13 @@ def create_constant( A TensorRT ITensor that represents the given value. """ numpy_value = to_numpy(value, dtype) + shape = (1,) + # Rank 0 constant is required in IFillLayer inputs. + if rank == 0: + shape = trt.Dims() + constant = ctx.net.add_constant( - (1,) if isinstance(value, (int, float, bool)) else value.shape, + shape if isinstance(value, (int, float, bool)) else value.shape, numpy_value.copy() if isinstance(numpy_value, np.ndarray) else numpy_value, ) constant.name = name @@ -282,6 +288,7 @@ def get_trt_tensor( input_val: Any, name: str, dtype: Optional[Union[torch.dtype, np.dtype, TRTDataType]] = None, + rank: int = 1, ) -> TRTTensor: """ Given a value of random type, we try to convert it to a TensorRT ITensor. 
@@ -316,7 +323,7 @@ def get_trt_tensor( input_val = input_val.astype(np.float32) if isinstance(input_val, (torch.Tensor, np.ndarray, int, float, bool)): - return create_constant(ctx, input_val, name, dtype) + return create_constant(ctx, input_val, name, dtype, rank) elif isinstance(input_val, TRTTensor): return input_val else: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 61d71fe9a0..38d5e2a0ea 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -8,10 +8,13 @@ from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( + cast_trt_tensor, get_positive_dim, get_trt_tensor, ) +from torch_tensorrt.dynamo.conversion.impl.elementwise import div, sub from torch_tensorrt.dynamo.conversion.impl.slice.base import slice +from torch_tensorrt.dynamo.conversion.impl.unary import ceil from torch_tensorrt.fx.converters.converter_utils import ( has_dynamic_shape, prepend_ones, @@ -31,6 +34,7 @@ def slice_op( # TODO: This should be slice not whatever is in base stop: Optional[int], step: int, ) -> TRTTensor: + # Special case for start being None if start is None: start = 0 @@ -39,24 +43,72 @@ def slice_op( # TODO: This should be slice not whatever is in base if stop is None: stop = input.shape[dim] - dim = get_positive_dim(dim, len(input.shape)) - start = get_positive_dim(start, input.shape[dim]) - stop = get_positive_dim(stop, input.shape[dim]) - - if has_dynamic_shape(input.shape): - # Check whether slice target dim is dynamic shape dim - assert input.shape[dim] != -1, "Can't slice on dynamic shape dimension!" - - start_slice = [0] * len(input.shape) - start_slice[dim] = start - stride_slice = [1] * len(input.shape) - stride_slice[dim] = step - output_shape = list(input.shape) - output_shape[dim] = math.ceil((stop - start) / step) - - return slice( - ctx, target, source_ir, name, input, start_slice, output_shape, stride_slice - ) + is_slice_dynamic = False + if ( + isinstance(start, TRTTensor) + or isinstance(step, TRTTensor) + or isinstance(stop, TRTTensor) + ): + is_slice_dynamic = True + + if not is_slice_dynamic: + dim = get_positive_dim(dim, len(input.shape)) + start = get_positive_dim(start, input.shape[dim]) + stop = get_positive_dim(stop, input.shape[dim]) + + if has_dynamic_shape(input.shape): + # Check whether slice target dim is dynamic shape dim + assert input.shape[dim] != -1, "Can't slice on dynamic shape dimension!" 
+ + start_slice = [0] * len(input.shape) + start_slice[dim] = start + stride_slice = [1] * len(input.shape) + stride_slice[dim] = step + output_shape = list(input.shape) + output_shape[dim] = math.ceil((stop - start) / step) + + return slice( + ctx, target, source_ir, name, input, start_slice, output_shape, stride_slice + ) + else: + dim = get_positive_dim(dim, len(input.shape)) + # Make start, stop, step an ITensor + start = get_trt_tensor(ctx, start, name + "_start") + stop = get_trt_tensor(ctx, stop, name + "_stop") + stop_casted = cast_trt_tensor(ctx, stop, trt.float32, name + "_casted") + step = get_trt_tensor(ctx, step, name + "_step") + # Calculate size for ISlice Layer = ceil((stop-start)/step) + shape = sub( + ctx, + target, + SourceIR.ATEN, + name + "_sub", + stop_casted, + start, + ) + shape = div( + ctx, + target, + SourceIR.ATEN, + name + "_div", + shape, + step, + ) + shape = ceil( + ctx, + target, + SourceIR.ATEN, + name + "_shape", + shape, + ) + shape = cast_trt_tensor(ctx, shape, trt.int32, name + "_shape_casted") + slice_layer = ctx.net.add_slice( + input, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() + ) + slice_layer.set_input(1, start) + slice_layer.set_input(2, shape) + slice_layer.set_input(3, step) + return slice_layer.get_output(0) def expand( @@ -99,9 +151,8 @@ def expand( [int(i == o) for i, o in zip(input_tensor_shape, shape)] ) # stride == 1 if dimensions match, 0 otherwise - layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride) - set_layer_name(layer, target, name, source_ir) - return layer.get_output(0) + expand_output = slice(ctx, target, source_ir, name, input_t, start, shape, stride) + return expand_output def chunk( diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index f83e0e5008..f283266ac0 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -3,13 +3,17 @@ from typing import Dict, Sequence, Tuple, Union import numpy as np +import tensorrt as trt import torch from torch.fx.node import Argument, Node, Target +from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( ConverterRegistry, dynamo_tensorrt_converter, ) +from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor +from torch_tensorrt.dynamo.conversion.impl.elementwise import sub from torch_tensorrt.fx.types import TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -46,4 +50,25 @@ def aten_ops_arange_start_step( kwargs: Dict[str, Argument], name: str, ) -> Union[TRTTensor, Sequence[TRTTensor]]: + # Case where inputs to arange are dynamic + if np.any([isinstance(tensor, TRTTensor) for tensor in args]): + start = get_trt_tensor(ctx, args[0], name + "_start", rank=0) + end = get_trt_tensor(ctx, args[1], name + "_end", rank=0) + # Calculate shape = (end-start) / 1 (in this case) + shape = sub( + ctx, + target, + SourceIR.ATEN, + name + "_shape", + end, + start, + ) + + fill_layer = ctx.net.add_fill(trt.Dims(), trt.FillOperation.LINSPACE) + fill_layer.set_input(0, shape) + # Set start index + fill_layer.set_input(1, start) + # Set output type to INT32 + fill_layer.set_output_type(0, trt.DataType.INT32) + return fill_layer.get_output(0) return np.arange(*args) From c1574be78645cead5e123f3cfb0722fce2ee7a2b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri 
Date: Wed, 24 Apr 2024 18:49:22 +0000 Subject: [PATCH 39/73] chore: updates --- .../lowering/passes/fuse_prims_broadcast.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py b/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py index 312926e870..56fad67a48 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py @@ -2,7 +2,8 @@ from typing import Sequence import torch -from torch.fx.passes.shape_prop import ShapeProp + +# from torch.fx.passes.shape_prop import ShapeProp from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( clean_up_graph_after_modifications, ) @@ -17,15 +18,15 @@ def fuse_prims_broadcast( """Fuses prim nodes which are effectively the ATen equivalents with keep_dim=True""" modified_graph = False - # Propagate shapes through the graph to determine if broadcast can be resolved - try: - ShapeProp(gm).propagate(*sample_inputs) - except (RuntimeError, AssertionError): - logger.warning( - "Shape Propagation Failed on Graph, skipping fuse_prims_broadcast lowering pass", - exc_info=True, - ) - return gm + # # Propagate shapes through the graph to determine if broadcast can be resolved + # try: + # ShapeProp(gm).propagate(*sample_inputs) + # except (RuntimeError, AssertionError): + # logger.warning( + # "Shape Propagation Failed on Graph, skipping fuse_prims_broadcast lowering pass", + # exc_info=True, + # ) + # return gm for node in gm.graph.nodes: # If the node is a sum prims operator, with broadcast_in_dim being the only consumer From 8c68359af2f7ae74a82f5ae32f3f533de3dcf922 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 25 Apr 2024 16:59:19 -0700 Subject: [PATCH 40/73] chore: add consistent graph log --- py/torch_tensorrt/dynamo/backend/backends.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index 66a9729cc0..dbb900009a 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -96,6 +96,8 @@ def _pretraced_backend( gm = apply_lowering_passes(gm, torch_inputs) + logger.debug("Lowered Input graph:\n " + str(gm.graph)) + torchtrt_inputs = prepare_inputs( torch_inputs, disable_memory_format_check=True ) From 1481ad37e0e2ed8a7fef951f0fdb94f9723f4132 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 26 Apr 2024 17:29:30 -0700 Subject: [PATCH 41/73] chore: updates --- py/torch_tensorrt/dynamo/conversion/impl/cat.py | 2 +- py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cat.py b/py/torch_tensorrt/dynamo/conversion/impl/cat.py index d6ffc77377..2f43f925ba 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/cat.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/cat.py @@ -27,7 +27,7 @@ def cat( each_input = get_trt_tensor(ctx, each_input, f"{name}_tensor_{i}") trt_inputs.append(each_input) concat_layer = ctx.net.add_concatenation(trt_inputs) - dim = get_positive_dim(dim, len(input[0].shape)) + dim = get_positive_dim(dim, len(trt_inputs[0].shape)) concat_layer.axis = dim set_layer_name(concat_layer, target, f"{name}_gather", source_ir) return concat_layer.get_output(0) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py 
b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 847c5587b3..e1d08203f4 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -12,6 +12,7 @@ get_positive_dim, get_trt_tensor, ) +from torch_tensorrt.dynamo.conversion.impl.cat import cat from torch_tensorrt.dynamo.conversion.impl.elementwise import div, sub from torch_tensorrt.dynamo.conversion.impl.slice.base import slice from torch_tensorrt.dynamo.conversion.impl.unary import ceil @@ -151,7 +152,15 @@ def expand( [int(i == o) for i, o in zip(input_tensor_shape, shape)] ) # stride == 1 if dimensions match, 0 otherwise - layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride) + shape_ = shape + # Handle dynamic shapes case where shape has dynamic dimension + if any(isinstance(ele, TRTTensor) for ele in shape): + shape_ = cat(ctx, target, source_ir, name + "_shape_concat", shape, 0) + layer = ctx.net.add_slice(input_t, start=start, shape=trt.Dims(), stride=stride) + layer.set_input(2, shape_) + else: + layer = ctx.net.add_slice(input_t, start=start, shape=shape_, stride=stride) + set_layer_name(layer, target, name, source_ir) return layer.get_output(0) From 7a59e639addf18875716ab3c7223d1d5d20e11de Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 29 Apr 2024 15:45:43 -0700 Subject: [PATCH 42/73] feat: Add validators for dynamic shapes in converter registration --- .../dynamo/conversion/_ConverterRegistry.py | 20 ++++++++++++++++++- .../dynamo/conversion/aten_ops_converters.py | 6 ++++-- .../dynamo/conversion/converter_utils.py | 13 ++++++++++-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 050a62ef3e..9967198772 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -79,6 +79,7 @@ class ConverterSupport: converter_implementation: ConverterImplSignature capability_validator: Callable[[Node], bool] = field(default=lambda node: True) + dynamic: bool = False # Dictionary representing Dynamo aten-only converters @@ -88,9 +89,11 @@ class ConverterSupport: def dynamo_tensorrt_converter( key: Target, + *, enabled: bool = True, capability_validator: Optional[Callable[[Node], bool]] = None, priority: ConverterPriority = ConverterPriority.STANDARD, + dynamic: bool = False, ) -> Callable[[ConverterImplSignature], ConverterImplSignature]: """Decorator for Dynamo TensorRT Converter @@ -116,7 +119,9 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat # If no capability_validator function is specified, use the default function - always return true if capability_validator is None: - converter_support = ConverterSupport(converter_implementation=converter) + converter_support = ConverterSupport( + converter_implementation=converter, dynamic=dynamic + ) else: assert callable( capability_validator @@ -124,6 +129,7 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat converter_support = ConverterSupport( converter_implementation=converter, capability_validator=capability_validator, + dynamic=dynamic, ) # OpOverloadPackets are only valid if they have a single overload, or @@ -323,6 +329,18 @@ def __getitem__( if isinstance(converters, (list, tuple)): for candidate in converters: + # TODO: Importing this here avoids circular import issue. 
One potential fix is moving this function into _ConverterRegistry file. + from torch_tensorrt.dynamo.conversion.converter_utils import ( + dynamic_unsupported, + ) + + has_static_inputs = dynamic_unsupported(node) + # If there are dynamic inputs but the converter doesn't support it explicitly, throw a warning. + if not has_static_inputs and not candidate.dynamic: + logger.warning( + f"The converter for node {node.target} received dynamic shaped inputs but the static version of the converter is being used. Please report this issue at https://github.com/pytorch/TensorRT/issues" + ) + if candidate.capability_validator(node): return ( candidate.converter_implementation, diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 72998e1917..e6f14c34e7 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -358,7 +358,7 @@ def aten_ops_grid( ) -@dynamo_tensorrt_converter(torch.ops.aten.relu.default) +@dynamo_tensorrt_converter(torch.ops.aten.relu.default, dynamic=True) def aten_ops_relu( ctx: ConversionContext, target: Target, @@ -2080,7 +2080,9 @@ def conv_param_validator(conv_node: Node) -> bool: @dynamo_tensorrt_converter( - torch.ops.aten.convolution.default, capability_validator=conv_param_validator + torch.ops.aten.convolution.default, + capability_validator=conv_param_validator, + dynamic=True, ) @enforce_tensor_types( { diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 7e55110459..44a08fa8c8 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -82,9 +82,18 @@ def _dynamic_unsupported( def _is_subnode_dynamic(subnode: torch.fx.Node) -> bool: """Checks if a node itself has Dynamic properties""" - return getattr( + _has_symbolic_sizes_strides = getattr( subnode.meta["val"], "_has_symbolic_sizes_strides", False - ) or isinstance(subnode.meta["val"], (SymFloat, SymInt, SymBool)) + ) + + is_shape_dynamic = False + if "val" in subnode.meta: + shape = subnode.meta["val"].size() + is_shape_dynamic = any( + isinstance(dim, (SymFloat, SymInt, SymBool)) for dim in shape + ) + + return _has_symbolic_sizes_strides or is_shape_dynamic # Check node value itself if arg_positions_to_check is None and _is_subnode_dynamic(node): From f55d41ae02913cf76477bbb550f573aa365597b2 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 29 Apr 2024 19:41:10 -0700 Subject: [PATCH 43/73] chore: updates --- .../dynamo/conversion/_ConverterRegistry.py | 91 ++++++++++++++++--- .../dynamo/conversion/aten_ops_converters.py | 12 +-- .../dynamo/conversion/converter_utils.py | 66 -------------- 3 files changed, 83 insertions(+), 86 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 9967198772..9deee3d250 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import logging from dataclasses import dataclass, field from enum import Enum, auto @@ -17,6 +18,8 @@ cast, ) +import torch +from torch import SymBool, SymFloat, SymInt from torch._ops import OpOverloadPacket from torch.fx.node import Argument, Node, Target, _get_qualified_name from 
torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -75,11 +78,12 @@ class ConverterSupport: capability_validator: Function which takes in a Node and returns a bool indicating whether that node can be supported by its companion converter. Note that this function must not modify the node or its graph + supports_dynamic_shapes: Boolean flag indicating if the converter has support for dynamic inputs. """ converter_implementation: ConverterImplSignature capability_validator: Callable[[Node], bool] = field(default=lambda node: True) - dynamic: bool = False + supports_dynamic_shapes: bool = False # Dictionary representing Dynamo aten-only converters @@ -87,13 +91,78 @@ class ConverterSupport: DYNAMO_ATEN_CONVERTERS: Dict[Target, Sequence[ConverterSupport]] = {} +def has_dynamic_shapes(node: torch.fx.Node) -> bool: + """Returns True if a node has dynamic args, kwargs, or outputs""" + return _has_dynamic_shapes(node=node) + + +def has_dynamic_shapes_in_args( + arg_positions_to_check: Optional[List[int]] = None, +) -> Callable[[torch.fx.Node], bool]: + """Returns True if a node has dynamic inputs in node.args at specified positions""" + return functools.partial( + _has_dynamic_shapes, arg_positions_to_check=arg_positions_to_check + ) + + +def _has_dynamic_shapes( + node: torch.fx.Node, arg_positions_to_check: Optional[List[int]] = None +) -> bool: + # Validate that none of the inputs to the node have Dynamic shapes + assert isinstance( + node, torch.fx.Node + ), "Inputs to validator functions must be FX Nodes" + + def _is_subnode_dynamic(subnode: torch.fx.Node) -> bool: + """Checks if a node itself has Dynamic properties""" + _has_symbolic_sizes_strides, is_shape_dynamic = False, False + if "val" in subnode.meta: + _has_symbolic_sizes_strides = getattr( + subnode.meta["val"], "_has_symbolic_sizes_strides", False + ) + + shape = subnode.meta["val"].size() + is_shape_dynamic = any( + isinstance(dim, (SymFloat, SymInt, SymBool)) for dim in shape + ) + + return _has_symbolic_sizes_strides or is_shape_dynamic + + # Check node value itself + if arg_positions_to_check is None and _is_subnode_dynamic(node): + return True + + # Check node arguments individually + if arg_positions_to_check is None and any( + _is_subnode_dynamic(arg) for arg in node.args if isinstance(arg, torch.fx.Node) + ): + return True + # Check specific arg positions if the caller has specified positions to check + elif arg_positions_to_check is not None and any( + _is_subnode_dynamic(node.args[i]) + for i in arg_positions_to_check + if isinstance(node.args[i], torch.fx.Node) + ): + return True + + # Check node keyword arguments individually + if arg_positions_to_check is None and any( + _is_subnode_dynamic(kwarg) + for kwarg in node.kwargs.values() + if isinstance(kwarg, torch.fx.Node) + ): + return True + + return False + + def dynamo_tensorrt_converter( key: Target, *, enabled: bool = True, capability_validator: Optional[Callable[[Node], bool]] = None, priority: ConverterPriority = ConverterPriority.STANDARD, - dynamic: bool = False, + supports_dynamic_shapes: bool = False, ) -> Callable[[ConverterImplSignature], ConverterImplSignature]: """Decorator for Dynamo TensorRT Converter @@ -120,7 +189,8 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat # If no capability_validator function is specified, use the default function - always return true if capability_validator is None: converter_support = ConverterSupport( - converter_implementation=converter, dynamic=dynamic + 
converter_implementation=converter, + supports_dynamic_shapes=supports_dynamic_shapes, ) else: assert callable( @@ -129,7 +199,7 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat converter_support = ConverterSupport( converter_implementation=converter, capability_validator=capability_validator, - dynamic=dynamic, + supports_dynamic_shapes=supports_dynamic_shapes, ) # OpOverloadPackets are only valid if they have a single overload, or @@ -329,16 +399,13 @@ def __getitem__( if isinstance(converters, (list, tuple)): for candidate in converters: - # TODO: Importing this here avoids circular import issue. One potential fix is moving this function into _ConverterRegistry file. - from torch_tensorrt.dynamo.conversion.converter_utils import ( - dynamic_unsupported, - ) - - has_static_inputs = dynamic_unsupported(node) # If there are dynamic inputs but the converter doesn't support it explicitly, throw a warning. - if not has_static_inputs and not candidate.dynamic: + if ( + not candidate.supports_dynamic_shapes + and has_dynamic_shapes(node) + ): logger.warning( - f"The converter for node {node.target} received dynamic shaped inputs but the static version of the converter is being used. Please report this issue at https://github.com/pytorch/TensorRT/issues" + f"The converter for node {node.target} received dynamic shaped inputs although it was designed for static inputs. This shouldn't likely cause issues unless there are some dimensions which are dynamic (excluding the batch). If you encounter any issues, please post at https://github.com/pytorch/TensorRT/issues" ) if candidate.capability_validator(node): diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index e6f14c34e7..f35d1ec444 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -12,7 +12,6 @@ dynamo_tensorrt_converter, ) from torch_tensorrt.dynamo.conversion.converter_utils import ( - dynamic_unsupported_with_args, enforce_tensor_types, is_only_operator_on_placeholder, ) @@ -358,7 +357,7 @@ def aten_ops_grid( ) -@dynamo_tensorrt_converter(torch.ops.aten.relu.default, dynamic=True) +@dynamo_tensorrt_converter(torch.ops.aten.relu.default, supports_dynamic_shapes=True) def aten_ops_relu( ctx: ConversionContext, target: Target, @@ -645,14 +644,11 @@ def aten_ops_softmax( @dynamo_tensorrt_converter( - torch.ops.aten.split.Tensor, capability_validator=dynamic_unsupported_with_args([1]) -) -@dynamo_tensorrt_converter( - torch.ops.aten.split.sizes, capability_validator=dynamic_unsupported_with_args([1]) + torch.ops.aten.split.Tensor, ) +@dynamo_tensorrt_converter(torch.ops.aten.split.sizes) @dynamo_tensorrt_converter( torch.ops.aten.split_with_sizes.default, - capability_validator=dynamic_unsupported_with_args([1]), ) def aten_ops_split( ctx: ConversionContext, @@ -2082,7 +2078,7 @@ def conv_param_validator(conv_node: Node) -> bool: @dynamo_tensorrt_converter( torch.ops.aten.convolution.default, capability_validator=conv_param_validator, - dynamic=True, + supports_dynamic_shapes=True, ) @enforce_tensor_types( { diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 44a08fa8c8..949e047e38 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -6,7 +6,6 @@ import numpy as np import tensorrt as trt 
import torch -from torch import SymBool, SymFloat, SymInt from torch.fx.node import Argument, Target from torch_tensorrt import _enums from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -58,71 +57,6 @@ def is_only_operator_on_placeholder(node: torch.fx.Node) -> bool: ) -def dynamic_unsupported(node: torch.fx.Node) -> bool: - """Validates that a node has no dynamic args, kwargs, or outputs""" - return _dynamic_unsupported(node=node) - - -def dynamic_unsupported_with_args( - arg_positions_to_check: Optional[List[int]] = None, -) -> Callable[[torch.fx.Node], bool]: - """Returns a validator that a node has no dynamic args at specific positions""" - return functools.partial( - _dynamic_unsupported, arg_positions_to_check=arg_positions_to_check - ) - - -def _dynamic_unsupported( - node: torch.fx.Node, arg_positions_to_check: Optional[List[int]] = None -) -> bool: - # Validate that none of the inputs to the node have Dynamic shapes - assert isinstance( - node, torch.fx.Node - ), "Inputs to validator functions must be FX Nodes" - - def _is_subnode_dynamic(subnode: torch.fx.Node) -> bool: - """Checks if a node itself has Dynamic properties""" - _has_symbolic_sizes_strides = getattr( - subnode.meta["val"], "_has_symbolic_sizes_strides", False - ) - - is_shape_dynamic = False - if "val" in subnode.meta: - shape = subnode.meta["val"].size() - is_shape_dynamic = any( - isinstance(dim, (SymFloat, SymInt, SymBool)) for dim in shape - ) - - return _has_symbolic_sizes_strides or is_shape_dynamic - - # Check node value itself - if arg_positions_to_check is None and _is_subnode_dynamic(node): - return False - - # Check node arguments individually - if arg_positions_to_check is None and any( - _is_subnode_dynamic(arg) for arg in node.args if isinstance(arg, torch.fx.Node) - ): - return False - # Check specific arg positions if the caller has specified positions to check - elif arg_positions_to_check is not None and any( - _is_subnode_dynamic(node.args[i]) - for i in arg_positions_to_check - if isinstance(node.args[i], torch.fx.Node) - ): - return False - - # Check node keyword arguments individually - if arg_positions_to_check is None and any( - _is_subnode_dynamic(kwarg) - for kwarg in node.kwargs.values() - if isinstance(kwarg, torch.fx.Node) - ): - return False - - return True - - def cast_trt_tensor( ctx: ConversionContext, input_val: TRTTensor, From 87da1c183c61f972f535f9549d6a42df62847c89 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 30 Apr 2024 10:43:45 -0700 Subject: [PATCH 44/73] chore: updates --- .../dynamo/conversion/_ConverterRegistry.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 9deee3d250..81a079145a 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -120,11 +120,22 @@ def _is_subnode_dynamic(subnode: torch.fx.Node) -> bool: _has_symbolic_sizes_strides = getattr( subnode.meta["val"], "_has_symbolic_sizes_strides", False ) - - shape = subnode.meta["val"].size() - is_shape_dynamic = any( - isinstance(dim, (SymFloat, SymInt, SymBool)) for dim in shape - ) + meta_val = subnode.meta["val"] + if isinstance(meta_val, (list, tuple)): + for val in meta_val: + shape = val.size() + if any( + isinstance(dim, (SymFloat, SymInt, SymBool)) for dim in shape + ): + is_shape_dynamic = True + break + elif isinstance(meta_val, (SymFloat, SymInt, 
SymBool)): + is_shape_dynamic = True + else: + shape = subnode.meta["val"].size() + is_shape_dynamic = any( + isinstance(dim, (SymFloat, SymInt, SymBool)) for dim in shape + ) return _has_symbolic_sizes_strides or is_shape_dynamic From e3e7927de406ee38c89d36245f2dfa2185b53fed Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 1 May 2024 15:57:49 -0700 Subject: [PATCH 45/73] chore: updates --- .../dynamo/conversion/_ConverterRegistry.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 4bd4706f2b..0484d04442 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -18,6 +18,7 @@ cast, ) +import tensorrt as trt import torch from torch import SymBool, SymFloat, SymInt from torch._ops import OpOverloadPacket @@ -25,8 +26,6 @@ from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.fx.converter_registry import CONVERTERS as FX_CONVERTERS -import tensorrt as trt - logger = logging.getLogger(__name__) LegacyConverterImplSignature = Callable[ @@ -411,22 +410,28 @@ def __getitem__( if isinstance(converters, (list, tuple)): for candidate in converters: - # If there are dynamic inputs but the converter doesn't support it explicitly, throw a warning. if ( - not candidate.supports_dynamic_shapes + candidate.capability_validator(node) and has_dynamic_shapes(node) + and candidate.supports_dynamic_shapes ): - logger.warning( - f"The converter for node {node.target} received dynamic shaped inputs although it was designed for static inputs. This shouldn't likely cause issues unless there are some dimensions which are dynamic (excluding the batch). 
If you encounter any issues, please post at https://github.com/pytorch/TensorRT/issues" + # If node has dynamic inputs and the converter supports dynamic shapes, it is enabled + return ( + candidate.converter_implementation, + calling_convention, ) - - if candidate.capability_validator(node): + elif candidate.capability_validator( + node + ) and not has_dynamic_shapes(node): + # For static shapes all converters are turned on based on capability_validator check return ( candidate.converter_implementation, calling_convention, ) else: - return converters, calling_convention + # Assuming FX converters don't have dynamic shapes supported + if not has_dynamic_shapes(node): + return converters, calling_convention raise KeyError( f"None of the converter registries have a validated entry for {key}, with node {node}" From 8ec68dae49635bfdc10f9c0d484fd6abfd6513a1 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 1 May 2024 19:22:04 -0700 Subject: [PATCH 46/73] chore: address failures and implement flag to enable all converters --- py/torch_tensorrt/dynamo/_compiler.py | 5 +++ py/torch_tensorrt/dynamo/_defaults.py | 1 + py/torch_tensorrt/dynamo/_settings.py | 3 ++ .../dynamo/conversion/_ConverterRegistry.py | 16 +++++--- .../dynamo/conversion/aten_ops_converters.py | 39 ++++++++++++------- 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 32b0ca65d7..1a2297f84c 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -47,6 +47,7 @@ def compile( *, device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE, disable_tf32: bool = _defaults.DISABLE_TF32, + disable_dynamic_converter_checks: bool = _defaults.DISABLE_DYNAMIC_CONVERTER_CHECKS, sparse_weights: bool = _defaults.SPARSE_WEIGHTS, enabled_precisions: ( Set[torch.dtype | dtype] | Tuple[torch.dtype | dtype] @@ -189,6 +190,7 @@ def compile( ), "debug": debug, "device": device, + "disable_dynamic_converter_checks": disable_dynamic_converter_checks, "workspace_size": workspace_size, "min_block_size": min_block_size, "torch_executed_ops": ( @@ -239,6 +241,9 @@ def compile_module( """ dryrun_tracker = DryRunTracker() + # Disable dynamic_shapes support checks for converters + CONVERTERS.disable_dynamic_checks(settings.disable_dynamic_converter_checks) + # Set torch-executed ops CONVERTERS.set_disallowed_targets(settings.torch_executed_ops) diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 97430137c0..a57c20dcd6 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -6,6 +6,7 @@ DEBUG = False DEVICE = None DISABLE_TF32 = False +DISABLE_DYNAMIC_CONVERTER_CHECKS = False DLA_LOCAL_DRAM_SIZE = 1073741824 DLA_GLOBAL_DRAM_SIZE = 536870912 DLA_SRAM_SIZE = 1048576 diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 9592bc1fd5..1c29344d49 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -6,6 +6,7 @@ from torch_tensorrt._enums import EngineCapability, dtype from torch_tensorrt.dynamo._defaults import ( DEBUG, + DISABLE_DYNAMIC_CONVERTER_CHECKS, DISABLE_TF32, DLA_GLOBAL_DRAM_SIZE, DLA_LOCAL_DRAM_SIZE, @@ -57,6 +58,7 @@ class CompilationSettings: device (Device): GPU to compile the model on require_full_compilation (bool): Whether to require the graph is fully compiled in TensorRT. 
Only applicable for `ir="dynamo"`; has no effect for `torch.compile` path + disable_dynamic_converter_checks (bool): Setting this to true enables the converters work for both dynamic and static shapes. disable_tf32 (bool): Whether to disable TF32 computation for TRT layers sparse_weights (bool): Whether to allow the builder to use sparse weights refit (bool): Whether to build a refittable engine @@ -87,6 +89,7 @@ class CompilationSettings: device: Device = field(default_factory=default_device) require_full_compilation: bool = REQUIRE_FULL_COMPILATION disable_tf32: bool = DISABLE_TF32 + disable_dynamic_converter_checks: bool = DISABLE_DYNAMIC_CONVERTER_CHECKS sparse_weights: bool = SPARSE_WEIGHTS refit: bool = REFIT engine_capability: EngineCapability = field( diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 0484d04442..7af25ae552 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -280,6 +280,7 @@ def __init__( ], registry_names: Optional[Sequence[str]] = None, registry_calling_conventions: Optional[Sequence[CallingConvention]] = None, + disable_dynamic_converter_checks: bool = False, ): # Copy reference to each dictionary object into attribute list self.registries = list(registries) @@ -301,9 +302,12 @@ def __init__( ] self.disallowed_targets: Collection[Target] = set() - + self.disable_dynamic_converter_checks = disable_dynamic_converter_checks self.validate_invariants() + def disable_dynamic_checks(self, disable_dynamic_converter_checks: bool) -> None: + self.disable_dynamic_converter_checks = disable_dynamic_converter_checks + def set_disallowed_targets(self, torch_executed_ops: Collection[Target]) -> None: self.disallowed_targets = torch_executed_ops @@ -410,10 +414,12 @@ def __getitem__( if isinstance(converters, (list, tuple)): for candidate in converters: - if ( - candidate.capability_validator(node) - and has_dynamic_shapes(node) - and candidate.supports_dynamic_shapes + if candidate.capability_validator(node) and ( + self.disable_dynamic_converter_checks + or ( + has_dynamic_shapes(node) + and candidate.supports_dynamic_shapes + ) ): # If node has dynamic inputs and the converter supports dynamic shapes, it is enabled return ( diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 5c851d75b5..c17649540d 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -58,8 +58,10 @@ def one_user_validator(node: Node) -> bool: @dynamo_tensorrt_converter( torch.ops.aten.native_batch_norm.default, capability_validator=one_user_validator ) -@dynamo_tensorrt_converter(torch.ops.aten.batch_norm.default) -@dynamo_tensorrt_converter(torch.ops.aten.batch_norm) +@dynamo_tensorrt_converter( + torch.ops.aten.batch_norm.default, supports_dynamic_shapes=True +) +@dynamo_tensorrt_converter(torch.ops.aten.batch_norm, supports_dynamic_shapes=True) @enforce_tensor_types( { 0: (TRTTensor,), @@ -93,6 +95,7 @@ def aten_ops_batch_norm( @dynamo_tensorrt_converter( torch.ops.aten._native_batch_norm_legit_no_training.default, capability_validator=one_user_validator, + supports_dynamic_shapes=True, ) def aten_ops_batch_norm_legit_no_training( ctx: ConversionContext, @@ -378,7 +381,7 @@ def aten_ops_relu( ) -@dynamo_tensorrt_converter(torch.ops.aten.sigmoid.default) 
+@dynamo_tensorrt_converter(torch.ops.aten.sigmoid.default, supports_dynamic_shapes=True) def aten_ops_sigmoid( ctx: ConversionContext, target: Target, @@ -400,7 +403,7 @@ def aten_ops_sigmoid( 0: (TRTTensor,), } ) -@dynamo_tensorrt_converter(torch.ops.aten.sym_size.int) +@dynamo_tensorrt_converter(torch.ops.aten.sym_size.int, supports_dynamic_shapes=True) def aten_ops_symsize_int( ctx: ConversionContext, target: Target, @@ -1116,8 +1119,8 @@ def aten_ops_min( ) -@dynamo_tensorrt_converter(torch.ops.aten.mean.default) -@dynamo_tensorrt_converter(torch.ops.aten.mean.dim) +@dynamo_tensorrt_converter(torch.ops.aten.mean.default, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.mean.dim, supports_dynamic_shapes=True) def aten_ops_mean( ctx: ConversionContext, target: Target, @@ -1221,7 +1224,7 @@ def aten_ops_recip( ) -@dynamo_tensorrt_converter(torch.ops.aten.abs.default) +@dynamo_tensorrt_converter(torch.ops.aten.abs.default, supports_dynamic_shapes=True) def aten_ops_abs( ctx: ConversionContext, target: Target, @@ -1568,8 +1571,8 @@ def aten_ops_isnan( ) -@dynamo_tensorrt_converter(torch.ops.aten.add.Tensor) -@dynamo_tensorrt_converter(torch.ops.aten.add.Scalar) +@dynamo_tensorrt_converter(torch.ops.aten.add.Tensor, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.add.Scalar, supports_dynamic_shapes=True) def aten_ops_add( ctx: ConversionContext, target: Target, @@ -2329,13 +2332,19 @@ def max_pool_param_validator(pool_node: Node) -> bool: # Note: MaxPool1d uses max_pool2d as it converts to 2D first. @dynamo_tensorrt_converter( - torch.ops.aten.max_pool1d.default, capability_validator=max_pool_param_validator + torch.ops.aten.max_pool1d.default, + capability_validator=max_pool_param_validator, + supports_dynamic_shapes=True, ) @dynamo_tensorrt_converter( - torch.ops.aten.max_pool2d.default, capability_validator=max_pool_param_validator + torch.ops.aten.max_pool2d.default, + capability_validator=max_pool_param_validator, + supports_dynamic_shapes=True, ) @dynamo_tensorrt_converter( - torch.ops.aten.max_pool3d.default, capability_validator=max_pool_param_validator + torch.ops.aten.max_pool3d.default, + capability_validator=max_pool_param_validator, + supports_dynamic_shapes=True, ) def aten_ops_max_pool( ctx: ConversionContext, @@ -2380,8 +2389,8 @@ def tensorrt_scaled_dot_product_attention( ) -@dynamo_tensorrt_converter(torch.ops.aten.reshape.default) -@dynamo_tensorrt_converter(torch.ops.aten.view.default) +@dynamo_tensorrt_converter(torch.ops.aten.reshape.default, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.view.default, supports_dynamic_shapes=True) @enforce_tensor_types( { 0: (TRTTensor,), @@ -2490,7 +2499,7 @@ def aten_ops_argmin( ) -@dynamo_tensorrt_converter(torch.ops.aten.addmm.default) +@dynamo_tensorrt_converter(torch.ops.aten.addmm.default, supports_dynamic_shapes=True) @enforce_tensor_types( { 0: (TRTTensor,), From 151fc40b824615e8adb1b4e6907d9cfe6352e0ea Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 1 May 2024 19:27:16 -0700 Subject: [PATCH 47/73] chore: update docstring --- py/torch_tensorrt/dynamo/_compiler.py | 1 + py/torch_tensorrt/dynamo/_settings.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 1a2297f84c..d189865366 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -107,6 +107,7 @@ def compile( device=torch_tensorrt.Device("dla:1", 
allow_gpu_fallback=True)

        disable_tf32 (bool): Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas
+        disable_dynamic_converter_checks (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
        sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
        enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
        refit (bool): Enable refitting
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
index 1c29344d49..f3b6d89dcd 100644
--- a/py/torch_tensorrt/dynamo/_settings.py
+++ b/py/torch_tensorrt/dynamo/_settings.py
@@ -58,7 +58,7 @@ class CompilationSettings:
        device (Device): GPU to compile the model on
        require_full_compilation (bool): Whether to require the graph is fully compiled in TensorRT.
            Only applicable for `ir="dynamo"`; has no effect for `torch.compile` path
-        disable_dynamic_converter_checks (bool): Setting this to true enables the converters work for both dynamic and static shapes.
+        disable_dynamic_converter_checks (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
        disable_tf32 (bool): Whether to disable TF32 computation for TRT layers
        sparse_weights (bool): Whether to allow the builder to use sparse weights
        refit (bool): Whether to build a refittable engine

From a2ed092e597b25bee95b12560279e6e70038a3d9 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Wed, 1 May 2024 20:07:07 -0700
Subject: [PATCH 48/73] chore: add testcase

---
 .../partitioning/test_dynamic_partitioning.py | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 tests/py/dynamo/partitioning/test_dynamic_partitioning.py

diff --git a/tests/py/dynamo/partitioning/test_dynamic_partitioning.py b/tests/py/dynamo/partitioning/test_dynamic_partitioning.py
new file mode 100644
index 0000000000..08283ebffb
--- /dev/null
+++ b/tests/py/dynamo/partitioning/test_dynamic_partitioning.py
@@ -0,0 +1,109 @@
+from copy import deepcopy
+
+import numpy as np
+import torch
+import torch_tensorrt
+from torch.testing._internal.common_utils import TestCase, run_tests
+from torch_tensorrt.dynamo import partitioning
+
+from ..testing_utilities import lower_graph_testing
+
+# This testcase assumes that the torch.ops.aten.clamp.default converter doesn't support
+# dynamic shapes. One should remove this testcase when the support is added.
+# This testcase tests if the graph is partitioned correctly into a TRT segment
+# and a PyTorch segment when the torch.ops.aten.clamp.default converter gets disabled
+# due to lack of dynamic shape support.
+
+
+class TestDynamicPartitioning(TestCase):
+    def test_partition_dynamic_clamp(self):
+        class Clamp(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                x = self.relu(x)
+                return torch.ops.aten.clamp.default(x, min=2.5, max=6.5)
+
+        model = Clamp().eval().cuda()
+        trt_model = torch_tensorrt.compile(
+            model,
+            inputs=[
+                torch_tensorrt.Input(
+                    min_shape=(1, 3, 8, 8),
+                    opt_shape=(4, 3, 8, 8),
+                    max_shape=(8, 3, 8, 8),
+                    dtype=torch.float32,
+                    name="x",
+                )
+            ],
+            dryrun=True,
+            min_block_size=1,
+        )
+        trt_segments, pyt_segments = 0, 0
+        for submod in list(trt_model.named_children()):
+            if "_run_on_acc" in submod[0]:
+                trt_segments += 1
+            elif "_run_on_gpu" in submod[0]:
+                pyt_segments += 1
+
+        self.assertEquals(
+            trt_segments,
+            1,
+            f"Number of TRT segments should be 1 but got {trt_segments}",
+        )
+        self.assertEquals(
+            pyt_segments,
+            1,
+            f"Number of PyTorch segments should be 1 but got {pyt_segments}",
+        )
+
+    def test_disable_dynamic_converter_checks(self):
+        class Clamp(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                x = self.relu(x)
+                return torch.ops.aten.clamp.default(x, min=2.5, max=6.5)
+
+        model = Clamp().eval().cuda()
+        trt_model = torch_tensorrt.compile(
+            model,
+            inputs=[
+                torch_tensorrt.Input(
+                    min_shape=(1, 3, 8, 8),
+                    opt_shape=(4, 3, 8, 8),
+                    max_shape=(8, 3, 8, 8),
+                    dtype=torch.float32,
+                    name="x",
+                )
+            ],
+            dryrun=True,
+            disable_dynamic_converter_checks=True,
+            min_block_size=1,
+        )
+
+        trt_segments, pyt_segments = 0, 0
+        for submod in list(trt_model.named_children()):
+            if "_run_on_acc" in submod[0]:
+                trt_segments += 1
+            elif "_run_on_gpu" in submod[0]:
+                pyt_segments += 1
+
+        self.assertEquals(
+            trt_segments,
+            1,
+            f"Number of TRT segments should be 1 but got {trt_segments}",
+        )
+        self.assertEquals(
+            pyt_segments,
+            0,
+            f"Number of PyTorch segments should be 0 but got {pyt_segments}",
+        )
+
+
+if __name__ == "__main__":
+    run_tests()

From c1f5d15209cef0e1bff1f4ce23b5456cc4a593af Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Thu, 2 May 2024 10:26:30 -0700
Subject: [PATCH 49/73] chore: updates

---
 py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py | 4 +++-
 py/torch_tensorrt/dynamo/conversion/ops_evaluators.py | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
index c17649540d..6b83c37eeb 100644
--- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
+++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py
@@ -56,7 +56,9 @@ def one_user_validator(node: Node) -> bool:
 @dynamo_tensorrt_converter(
-    torch.ops.aten.native_batch_norm.default, capability_validator=one_user_validator
+    torch.ops.aten.native_batch_norm.default,
+    capability_validator=one_user_validator,
+    supports_dynamic_shapes=True,
 )
 @dynamo_tensorrt_converter(
     torch.ops.aten.batch_norm.default, supports_dynamic_shapes=True
diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py
index f83e0e5008..ea2f1c4d89 100644
--- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py
+++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py
@@ -23,7 +23,11 @@ def getitem_validator(getitem_node: Node) -> bool:
 # TODO: Subsequent evaluators should be
registered here with their own validators -@dynamo_tensorrt_converter(operator.getitem, capability_validator=getitem_validator) +@dynamo_tensorrt_converter( + operator.getitem, + capability_validator=getitem_validator, + supports_dynamic_shapes=True, +) @dynamo_tensorrt_converter(torch.ops.aten.detach.default) def generic_evaluator( ctx: ConversionContext, From 649b79da38b24f9981372730210ed2ee5efe9fe9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 2 May 2024 12:08:27 -0700 Subject: [PATCH 50/73] chore: rename disable_dynamic_converter_checks to assume_dynamic_shape_support --- py/torch_tensorrt/dynamo/_compiler.py | 16 +++++++++++----- py/torch_tensorrt/dynamo/_defaults.py | 2 +- py/torch_tensorrt/dynamo/_settings.py | 6 +++--- .../dynamo/conversion/_ConverterRegistry.py | 10 +++++----- .../partitioning/test_dynamic_partitioning.py | 4 ++-- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index d189865366..c3cca50f65 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -47,7 +47,7 @@ def compile( *, device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE, disable_tf32: bool = _defaults.DISABLE_TF32, - disable_dynamic_converter_checks: bool = _defaults.DISABLE_DYNAMIC_CONVERTER_CHECKS, + assume_dynamic_shape_support: bool = _defaults.ASSUME_DYNAMIC_SHAPE_SUPPORT, sparse_weights: bool = _defaults.SPARSE_WEIGHTS, enabled_precisions: ( Set[torch.dtype | dtype] | Tuple[torch.dtype | dtype] @@ -107,7 +107,7 @@ def compile( device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True) disable_tf32 (bool): Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas - disable_dynamic_converter_checks (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False + assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False sparse_weights (bool): Enable sparsity for convolution and fully connected layers. 
enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels refit (bool): Enable refitting @@ -191,7 +191,7 @@ def compile( ), "debug": debug, "device": device, - "disable_dynamic_converter_checks": disable_dynamic_converter_checks, + "assume_dynamic_shape_support": assume_dynamic_shape_support, "workspace_size": workspace_size, "min_block_size": min_block_size, "torch_executed_ops": ( @@ -242,8 +242,8 @@ def compile_module( """ dryrun_tracker = DryRunTracker() - # Disable dynamic_shapes support checks for converters - CONVERTERS.disable_dynamic_checks(settings.disable_dynamic_converter_checks) + # Assume converters support dynamic shapes and disable validation + CONVERTERS.set_dynamic_shape_support(settings.assume_dynamic_shape_support) # Set torch-executed ops CONVERTERS.set_disallowed_targets(settings.torch_executed_ops) @@ -449,6 +449,7 @@ def convert_module_to_trt_engine( Set[torch.dtype | dtype] | Tuple[torch.dtype | dtype] ) = _defaults.ENABLED_PRECISIONS, debug: bool = _defaults.DEBUG, + assume_dynamic_shape_support: bool = _defaults.ASSUME_DYNAMIC_SHAPE_SUPPORT, workspace_size: int = _defaults.WORKSPACE_SIZE, min_block_size: int = _defaults.MIN_BLOCK_SIZE, torch_executed_ops: Optional[Set[str]] = None, @@ -556,6 +557,7 @@ def convert_module_to_trt_engine( enabled_precisions = {dtype._from(e) for e in enabled_precisions} compilation_options = { + "assume_dynamic_shape_support": assume_dynamic_shape_support, "enabled_precisions": enabled_precisions, "debug": debug, "workspace_size": workspace_size, @@ -595,6 +597,10 @@ def convert_module_to_trt_engine( settings = CompilationSettings(**compilation_options) logger.info("Compilation Settings: %s\n", settings) + + # Assume converters support dynamic shapes and disable validation + CONVERTERS.set_dynamic_shape_support(settings.assume_dynamic_shape_support) + try: interpreter_result = interpret_module_to_result(gm, input_list, settings) except UnsupportedOperatorException: diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index a57c20dcd6..a621efcc16 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -6,7 +6,7 @@ DEBUG = False DEVICE = None DISABLE_TF32 = False -DISABLE_DYNAMIC_CONVERTER_CHECKS = False +ASSUME_DYNAMIC_SHAPE_SUPPORT = False DLA_LOCAL_DRAM_SIZE = 1073741824 DLA_GLOBAL_DRAM_SIZE = 536870912 DLA_SRAM_SIZE = 1048576 diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index f3b6d89dcd..e13d4b5e22 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -5,8 +5,8 @@ from torch_tensorrt._Device import Device from torch_tensorrt._enums import EngineCapability, dtype from torch_tensorrt.dynamo._defaults import ( + ASSUME_DYNAMIC_SHAPE_SUPPORT, DEBUG, - DISABLE_DYNAMIC_CONVERTER_CHECKS, DISABLE_TF32, DLA_GLOBAL_DRAM_SIZE, DLA_LOCAL_DRAM_SIZE, @@ -58,7 +58,7 @@ class CompilationSettings: device (Device): GPU to compile the model on require_full_compilation (bool): Whether to require the graph is fully compiled in TensorRT. Only applicable for `ir="dynamo"`; has no effect for `torch.compile` path - disable_dynamic_converter_checks (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False + assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. 
Default: False disable_tf32 (bool): Whether to disable TF32 computation for TRT layers sparse_weights (bool): Whether to allow the builder to use sparse weights refit (bool): Whether to build a refittable engine @@ -89,7 +89,7 @@ class CompilationSettings: device: Device = field(default_factory=default_device) require_full_compilation: bool = REQUIRE_FULL_COMPILATION disable_tf32: bool = DISABLE_TF32 - disable_dynamic_converter_checks: bool = DISABLE_DYNAMIC_CONVERTER_CHECKS + assume_dynamic_shape_support: bool = ASSUME_DYNAMIC_SHAPE_SUPPORT sparse_weights: bool = SPARSE_WEIGHTS refit: bool = REFIT engine_capability: EngineCapability = field( diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 7af25ae552..8069b9b9c0 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -280,7 +280,7 @@ def __init__( ], registry_names: Optional[Sequence[str]] = None, registry_calling_conventions: Optional[Sequence[CallingConvention]] = None, - disable_dynamic_converter_checks: bool = False, + assume_dynamic_shape_support: bool = False, ): # Copy reference to each dictionary object into attribute list self.registries = list(registries) @@ -302,11 +302,11 @@ def __init__( ] self.disallowed_targets: Collection[Target] = set() - self.disable_dynamic_converter_checks = disable_dynamic_converter_checks + self.assume_dynamic_shape_support = assume_dynamic_shape_support self.validate_invariants() - def disable_dynamic_checks(self, disable_dynamic_converter_checks: bool) -> None: - self.disable_dynamic_converter_checks = disable_dynamic_converter_checks + def set_dynamic_shape_support(self, assume_dynamic_shape_support: bool) -> None: + self.assume_dynamic_shape_support = assume_dynamic_shape_support def set_disallowed_targets(self, torch_executed_ops: Collection[Target]) -> None: self.disallowed_targets = torch_executed_ops @@ -415,7 +415,7 @@ def __getitem__( if isinstance(converters, (list, tuple)): for candidate in converters: if candidate.capability_validator(node) and ( - self.disable_dynamic_converter_checks + self.assume_dynamic_shape_support or ( has_dynamic_shapes(node) and candidate.supports_dynamic_shapes diff --git a/tests/py/dynamo/partitioning/test_dynamic_partitioning.py b/tests/py/dynamo/partitioning/test_dynamic_partitioning.py index 08283ebffb..9b18c1fc2f 100644 --- a/tests/py/dynamo/partitioning/test_dynamic_partitioning.py +++ b/tests/py/dynamo/partitioning/test_dynamic_partitioning.py @@ -59,7 +59,7 @@ def forward(self, x): f"Number of PyTorch segments should be 1 but got {pyt_segments}", ) - def test_disable_dynamic_converter_checks(self): + def test_assume_dynamic_shape_support_converters(self): class Clamp(torch.nn.Module): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -82,7 +82,7 @@ def forward(self, x): ) ], dryrun=True, - disable_dynamic_converter_checks=True, + assume_dynamic_shape_support=True, min_block_size=1, ) From 6f945fa4c7743849d15dbcb2cc9b2e6a3e287ad9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 3 May 2024 10:08:32 -0700 Subject: [PATCH 51/73] chore: updates --- .../dynamo/conversion/aten_ops_converters.py | 69 ++++++++++++------- .../dynamo/conversion/ops_evaluators.py | 22 ++++-- py/torch_tensorrt/dynamo/utils.py | 4 ++ .../py/dynamo/conversion/test_arange_aten.py | 13 ++++ 4 files changed, 77 insertions(+), 31 deletions(-) diff --git 
a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 6b83c37eeb..8417c394f9 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -127,10 +127,14 @@ def aten_ops_batch_norm_legit_no_training( @dynamo_tensorrt_converter( - torch.ops.aten.native_layer_norm.default, capability_validator=one_user_validator + torch.ops.aten.native_layer_norm.default, + capability_validator=one_user_validator, + supports_dynamic_shapes=True, ) -@dynamo_tensorrt_converter(torch.ops.aten.layer_norm.default) -@dynamo_tensorrt_converter(torch.ops.aten.layer_norm) +@dynamo_tensorrt_converter( + torch.ops.aten.layer_norm.default, supports_dynamic_shapes=True +) +@dynamo_tensorrt_converter(torch.ops.aten.layer_norm, supports_dynamic_shapes=True) @enforce_tensor_types( { 0: (TRTTensor,), @@ -253,7 +257,9 @@ def embedding_param_validator(embedding_node: Node) -> bool: @dynamo_tensorrt_converter( - torch.ops.aten.embedding.default, capability_validator=embedding_param_validator + torch.ops.aten.embedding.default, + capability_validator=embedding_param_validator, + supports_dynamic_shapes=True, ) def aten_ops_embedding( ctx: ConversionContext, @@ -451,7 +457,7 @@ def aten_ops_index( ) -@dynamo_tensorrt_converter(torch.ops.aten.tanh.default) +@dynamo_tensorrt_converter(torch.ops.aten.tanh.default, supports_dynamic_shapes=True) def aten_ops_tanh( ctx: ConversionContext, target: Target, @@ -542,10 +548,10 @@ def aten_ops_hard_sigmoid( ) -@dynamo_tensorrt_converter(torch.ops.aten.matmul) -@dynamo_tensorrt_converter(torch.ops.aten.mm.default) -@dynamo_tensorrt_converter(torch.ops.aten.mv.default) -@dynamo_tensorrt_converter(torch.ops.aten.bmm.default) +@dynamo_tensorrt_converter(torch.ops.aten.matmul, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.mm.default, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.mv.default, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.bmm.default, supports_dynamic_shapes=True) def aten_ops_matmul( ctx: ConversionContext, target: Target, @@ -626,7 +632,9 @@ def aten_ops_erf( ) -@dynamo_tensorrt_converter(torch.ops.aten.unsqueeze.default) +@dynamo_tensorrt_converter( + torch.ops.aten.unsqueeze.default, supports_dynamic_shapes=True +) def aten_ops_unsqueeze( ctx: ConversionContext, target: Target, @@ -639,7 +647,9 @@ def aten_ops_unsqueeze( ) -@dynamo_tensorrt_converter(torch.ops.aten._softmax.default) +@dynamo_tensorrt_converter( + torch.ops.aten._softmax.default, supports_dynamic_shapes=True +) def aten_ops_softmax( ctx: ConversionContext, target: Target, @@ -654,10 +664,12 @@ def aten_ops_softmax( @dynamo_tensorrt_converter( torch.ops.aten.split.Tensor, + supports_dynamic_shapes=True, ) -@dynamo_tensorrt_converter(torch.ops.aten.split.sizes) +@dynamo_tensorrt_converter(torch.ops.aten.split.sizes, supports_dynamic_shapes=True) @dynamo_tensorrt_converter( torch.ops.aten.split_with_sizes.default, + supports_dynamic_shapes=True, ) def aten_ops_split( ctx: ConversionContext, @@ -731,7 +743,7 @@ def aten_ops_select( ) -@dynamo_tensorrt_converter(torch.ops.aten.slice.Tensor) +@dynamo_tensorrt_converter(torch.ops.aten.slice.Tensor, supports_dynamic_shapes=True) @enforce_tensor_types( { 0: (TRTTensor,), @@ -827,7 +839,7 @@ def aten_ops_tile( ) -@dynamo_tensorrt_converter(torch.ops.aten.permute.default) +@dynamo_tensorrt_converter(torch.ops.aten.permute.default, 
supports_dynamic_shapes=True) @enforce_tensor_types( { 0: (TRTTensor,), @@ -898,10 +910,12 @@ def validator(to_copy_node: Node) -> bool: @dynamo_tensorrt_converter( torch.ops.aten.clone.default, capability_validator=lambda node: not is_only_operator_on_placeholder(node), + supports_dynamic_shapes=True, ) @dynamo_tensorrt_converter( torch.ops.aten._to_copy.default, capability_validator=to_copy_dtype_validator(placeholder_only=False), + supports_dynamic_shapes=True, ) def aten_ops_clone_copy_dtype( ctx: ConversionContext, @@ -950,7 +964,7 @@ def aten_ops_clone_copy_placeholder( ) -@dynamo_tensorrt_converter(torch.ops.aten.expand.default) +@dynamo_tensorrt_converter(torch.ops.aten.expand.default, supports_dynamic_shapes=True) @enforce_tensor_types( { 0: (TRTTensor,), @@ -1573,6 +1587,7 @@ def aten_ops_isnan( ) +@dynamo_tensorrt_converter(operator.add, supports_dynamic_shapes=True) @dynamo_tensorrt_converter(torch.ops.aten.add.Tensor, supports_dynamic_shapes=True) @dynamo_tensorrt_converter(torch.ops.aten.add.Scalar, supports_dynamic_shapes=True) def aten_ops_add( @@ -1605,8 +1620,8 @@ def aten_ops_add( ) -@dynamo_tensorrt_converter(torch.ops.aten.mul.Tensor) -@dynamo_tensorrt_converter(torch.ops.aten.mul.Scalar) +@dynamo_tensorrt_converter(torch.ops.aten.mul.Tensor, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.mul.Scalar, supports_dynamic_shapes=True) def aten_ops_mul( ctx: ConversionContext, target: Target, @@ -1692,11 +1707,11 @@ def aten_ops_sub( ) -@dynamo_tensorrt_converter(torch.ops.aten.div.Tensor) -@dynamo_tensorrt_converter(torch.ops.aten.div.Tensor_mode) -@dynamo_tensorrt_converter(torch.ops.aten.div.Scalar) -@dynamo_tensorrt_converter(torch.ops.aten.div.Scalar_mode) -@dynamo_tensorrt_converter(torch.ops.prims.div.default) +@dynamo_tensorrt_converter(torch.ops.aten.div.Tensor, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.div.Tensor_mode, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.div.Scalar, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.div.Scalar_mode, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.prims.div.default, supports_dynamic_shapes=True) def aten_ops_div( ctx: ConversionContext, target: Target, @@ -1739,9 +1754,13 @@ def aten_ops_div( ) -@dynamo_tensorrt_converter(torch.ops.aten.pow.Tensor_Tensor) -@dynamo_tensorrt_converter(torch.ops.aten.pow.Scalar) -@dynamo_tensorrt_converter(torch.ops.aten.pow.Tensor_Scalar) +@dynamo_tensorrt_converter( + torch.ops.aten.pow.Tensor_Tensor, supports_dynamic_shapes=True +) +@dynamo_tensorrt_converter(torch.ops.aten.pow.Scalar, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter( + torch.ops.aten.pow.Tensor_Scalar, supports_dynamic_shapes=True +) def aten_ops_pow( ctx: ConversionContext, target: Target, diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index d8f6bc7433..c0b7bdad67 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -32,7 +32,7 @@ def getitem_validator(getitem_node: Node) -> bool: capability_validator=getitem_validator, supports_dynamic_shapes=True, ) -@dynamo_tensorrt_converter(torch.ops.aten.detach.default) +@dynamo_tensorrt_converter(torch.ops.aten.detach.default, supports_dynamic_shapes=True) def generic_evaluator( ctx: ConversionContext, target: Target, @@ -46,7 +46,9 @@ def generic_evaluator( return target(*args) 
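The arange change that follows makes torch.ops.aten.arange.start_step usable when its bounds arrive as runtime tensors: the element count is computed in-network and fed to a TensorRT LINSPACE fill layer. A condensed sketch of that pattern (assuming the usual converter context of ctx, target, args and name; simplified, not the literal diff below):

    start_scalar = get_trt_tensor(ctx, args[0], name + "_start_rank_0", rank=0)
    start_vec = get_trt_tensor(ctx, args[0], name + "_start_rank_1", rank=1)
    end_vec = get_trt_tensor(ctx, args[1], name + "_end", rank=1)
    step_vec = get_trt_tensor(ctx, args[2] if len(args) > 2 else 1, name + "_step", rank=1)
    # rank-1 tensor holding the number of elements to generate
    count = sub(ctx, target, SourceIR.ATEN, name + "_shape", end_vec, start_vec)
    fill = ctx.net.add_fill(count.shape, trt.FillOperation.LINSPACE)
    fill.set_input(0, count)         # dynamic output shape
    fill.set_input(1, start_scalar)  # scalar start value
    fill.set_input(2, step_vec)      # per-element delta
    return fill.get_output(0)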
-@dynamo_tensorrt_converter(torch.ops.aten.arange.start_step) +@dynamo_tensorrt_converter( + torch.ops.aten.arange.start_step, supports_dynamic_shapes=True +) def aten_ops_arange_start_step( ctx: ConversionContext, target: Target, @@ -56,8 +58,14 @@ def aten_ops_arange_start_step( ) -> Union[TRTTensor, Sequence[TRTTensor]]: # Case where inputs to arange are dynamic if np.any([isinstance(tensor, TRTTensor) for tensor in args]): - start = get_trt_tensor(ctx, args[0], name + "_start", rank=0) + start_rank_0 = get_trt_tensor(ctx, args[0], name + "_start_rank_0", rank=0) + start_rank_1 = get_trt_tensor(ctx, args[0], name + "_start_rank_1", rank=1) end = get_trt_tensor(ctx, args[1], name + "_end", rank=0) + if len(args) > 2: + step = args[2] + else: + step = 1 + step = get_trt_tensor(ctx, step, name + "_step", rank=1) # Calculate shape = (end-start) / 1 (in this case) shape = sub( ctx, @@ -65,13 +73,15 @@ def aten_ops_arange_start_step( SourceIR.ATEN, name + "_shape", end, - start, + start_rank_1, ) - fill_layer = ctx.net.add_fill(trt.Dims(), trt.FillOperation.LINSPACE) + fill_layer = ctx.net.add_fill(shape.shape, trt.FillOperation.LINSPACE) fill_layer.set_input(0, shape) # Set start index - fill_layer.set_input(1, start) + fill_layer.set_input(1, start_rank_0) + # Set delta/step + fill_layer.set_input(2, step) # Set output type to INT32 fill_layer.set_output_type(0, trt.DataType.INT32) return fill_layer.get_output(0) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 6ea9503b84..be0019159a 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -158,6 +158,10 @@ def parse_complex_tensor_structs( """ if isinstance(inputs, (torch.Tensor, Input)): return apply_fn(getattr(inputs, attribute_to_extract, None)) + elif isinstance(inputs, (int, float, bool)): + # inputs is a python scalar value + inputs_torch = torch.tensor(inputs) + return apply_fn(getattr(inputs_torch, attribute_to_extract, None)) elif isinstance(inputs, (list, tuple)): torchtrt_input_list = [] diff --git a/tests/py/dynamo/conversion/test_arange_aten.py b/tests/py/dynamo/conversion/test_arange_aten.py index e06239eb4e..4c2317366d 100644 --- a/tests/py/dynamo/conversion/test_arange_aten.py +++ b/tests/py/dynamo/conversion/test_arange_aten.py @@ -33,6 +33,19 @@ def forward(self, x): use_dynamo_tracer=True, ) + def test_arange_dynamic(self): + class Arange(nn.Module): + def forward(self, end_tensor): + return torch.ops.aten.arange.start_step(0, end_tensor, 1) + + inputs = [torch.tensor(7, dtype=torch.int32)] + self.run_test( + Arange(), + inputs, + check_dtype=False, # Turned off as end argument doesn't accept tensors + # use_dynamo_tracer=True, + ) + if __name__ == "__main__": run_tests() From b5315730062721126d5547f9a01eded67a0fd98e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 7 May 2024 00:14:13 -0700 Subject: [PATCH 52/73] chore: updates --- py/torch_tensorrt/_Input.py | 10 ++- py/torch_tensorrt/dynamo/_compiler.py | 51 +++++++------ .../dynamo/conversion/_TRTInterpreter.py | 33 +++++--- .../dynamo/conversion/_conversion.py | 16 ++-- .../dynamo/conversion/impl/slice/ops.py | 10 ++- .../dynamo/partitioning/common.py | 76 ++++++++++++------- .../dynamo/runtime/_OutputAllocator.py | 29 +++++++ .../runtime/_PythonTorchTensorRTModule.py | 24 ++++-- py/torch_tensorrt/dynamo/runtime/__init__.py | 1 + py/torch_tensorrt/dynamo/utils.py | 23 ++++++ tests/py/dynamo/conversion/harness.py | 17 +++-- .../py/dynamo/conversion/test_arange_aten.py | 19 ++++- 12 files 
changed, 220 insertions(+), 89 deletions(-) create mode 100644 py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py index 32f19ce1f0..18636f8114 100644 --- a/py/torch_tensorrt/_Input.py +++ b/py/torch_tensorrt/_Input.py @@ -47,6 +47,7 @@ class _ShapeMode(Enum): high_tensor_domain_excl: float = low_tensor_domain_incl + DOMAIN_OFFSET torch_tensor: torch.Tensor = None name: str = "" + is_shape_tensor: bool = False def __init__(self, *args: Any, **kwargs: Any) -> None: """__init__ Method for torch_tensorrt.Input @@ -161,6 +162,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: else: self._explicit_set_dtype = False + if "is_shape_tensor" in kwargs: + self.is_shape_tensor = kwargs["is_shape_tensor"] + if "format" in kwargs: self.format = memory_format._from(kwargs["format"]) @@ -174,7 +178,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: if "torch_tensor" in kwargs: self.torch_tensor = kwargs["torch_tensor"] else: - if self.shape_mode == Input._ShapeMode.DYNAMIC: + if self.is_shape_tensor: + self.torch_tensor = torch.tensor( + kwargs["opt_shape"], dtype=kwargs["dtype"] + ) + elif self.shape_mode == Input._ShapeMode.DYNAMIC: self.torch_tensor = self.example_tensor("opt_shape") else: self.torch_tensor = self.example_tensor() diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index d189865366..9eaebf0050 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -387,18 +387,23 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: submodule_inputs, "dtype", lambda t: t.to(torch.dtype) ) - submodule_outputs = submodule( - *get_torch_inputs(submodule_inputs, to_torch_device(settings.device)) - ) - - subgraph_data.subgraph_output_shapes = parse_complex_tensor_structs( - submodule_outputs, - "shape", - lambda x: dict(x) if isinstance(x, dict) else tuple(x), - ) - subgraph_data.subgraph_output_dtypes = parse_complex_tensor_structs( - submodule_outputs, "dtype" - ) + # subgraph_output_shapes = [get_node_shape(node) for node in submodule.graph.nodes if node.op=="output"] + # try: + # submodule_outputs = submodule( + # *get_torch_inputs(submodule_inputs, to_torch_device(settings.device)) + # ) + # except: + # breakpoint() + # print("done") + + # subgraph_data.subgraph_output_shapes = parse_complex_tensor_structs( + # submodule_outputs, + # "shape", + # lambda x: dict(x) if isinstance(x, dict) else tuple(x), + # ) + # subgraph_data.subgraph_output_dtypes = parse_complex_tensor_structs( + # submodule_outputs, "dtype" + # ) dryrun_tracker.tensorrt_graph_count += 1 dryrun_tracker.per_subgraph_data.append(subgraph_data) @@ -414,19 +419,19 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: trt_modules[name] = trt_module - sample_outputs = gm( - *get_torch_inputs(sample_inputs, to_torch_device(settings.device)) - ) + # sample_outputs = gm( + # *get_torch_inputs(sample_inputs, to_torch_device(settings.device)) + # ) - if not isinstance(sample_outputs, (list, tuple)): - sample_outputs = [sample_outputs] + # if not isinstance(sample_outputs, (list, tuple)): + # sample_outputs = [sample_outputs] - dryrun_tracker.graph_output_shapes = parse_complex_tensor_structs( - sample_outputs, "shape", lambda x: dict(x) if isinstance(x, dict) else tuple(x) - ) - dryrun_tracker.graph_output_dtypes = parse_complex_tensor_structs( - sample_outputs, "dtype" - ) + # dryrun_tracker.graph_output_shapes = parse_complex_tensor_structs( + # sample_outputs, 
"shape", lambda x: dict(x) if isinstance(x, dict) else tuple(x) + # ) + # dryrun_tracker.graph_output_dtypes = parse_complex_tensor_structs( + # sample_outputs, "dtype" + # ) # Replace all FX Modules with TRT Modules for name, trt_module in trt_modules.items(): diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 59d2c5d6c0..23e1e8d47a 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set import numpy as np +import tensorrt as trt import torch import torch.fx from torch.fx.node import _get_qualified_name @@ -25,7 +26,6 @@ from torch_tensorrt.fx.observer import Observer from torch_tensorrt.logging import TRT_LOGGER -import tensorrt as trt from packaging import version _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -365,18 +365,27 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor: max_shape = current_input.shape["max_shape"] # TODO: Does not support disjoint optimization profiles? assert self.optimization_profiles is not None - self.optimization_profiles[0].set_shape( - target, min_shape, opt_shape, max_shape - ) + if current_input.is_shape_tensor: + self.optimization_profiles[0].set_shape_input( + target, min_shape, opt_shape, max_shape + ) + shape.append(1) + else: + self.optimization_profiles[0].set_shape( + target, min_shape, opt_shape, max_shape + ) - assert len(min_shape) == len(opt_shape) == len(max_shape) - for i in range(len(min_shape)): - if min_shape[i] == opt_shape[i] == max_shape[i]: - shape.append(min_shape[i]) - else: - # -1 to represent the dynamic dimension - shape.append(-1) - elif current_input.shape_mode == Input._ShapeMode.STATIC: + assert len(min_shape) == len(opt_shape) == len(max_shape) + for i in range(len(min_shape)): + if min_shape[i] == opt_shape[i] == max_shape[i]: + shape.append(min_shape[i]) + else: + # -1 to represent the dynamic dimension + shape.append(-1) + elif ( + not current_input.is_shape_tensor + and current_input.shape_mode == Input._ShapeMode.STATIC + ): assert isinstance(current_input.shape, tuple) shape = list(current_input.shape) else: diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 64bb14ad21..1de955f680 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -4,7 +4,9 @@ import logging from typing import List, Sequence +import tensorrt as trt import torch +from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype from torch_tensorrt._features import ENABLED_FEATURES @@ -17,8 +19,6 @@ from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule from torch_tensorrt.dynamo.utils import get_torch_inputs -import tensorrt as trt - logger = logging.getLogger(__name__) @@ -28,12 +28,12 @@ def infer_module_output_dtypes( device: Device, truncate_double: bool = False, ) -> List[dtype]: - torch_inputs = get_torch_inputs(inputs, device) - module = module.to(device.to(torch.device)) - module_outputs = module(*torch_inputs) - - if not isinstance(module_outputs, (list, tuple)): - module_outputs = [module_outputs] + with maybe_disable_fake_tensor_mode(): + torch_inputs = get_torch_inputs(inputs, device) + 
module = module.to(device.to(torch.device)) + module_outputs = module(*torch_inputs) + if not isinstance(module_outputs, (list, tuple)): + module_outputs = [module_outputs] # Int64 outputs can sometimes be generated from within other operators # such as aten.sum - such outputs can be truncated diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index e1d08203f4..3a9f4bcebf 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -156,8 +156,16 @@ def expand( # Handle dynamic shapes case where shape has dynamic dimension if any(isinstance(ele, TRTTensor) for ele in shape): shape_ = cat(ctx, target, source_ir, name + "_shape_concat", shape, 0) - layer = ctx.net.add_slice(input_t, start=start, shape=trt.Dims(), stride=stride) + start_tensor = cat(ctx, target, source_ir, name + "_start_concat", start, 0) + stride_tensor = cat(ctx, target, source_ir, name + "_stride_concat", stride, 0) + # start_tensor = get_trt_tensor(ctx, np.array(start, dtype=np.int32), name + "_start") + # stride_tensor = get_trt_tensor(ctx, np.array(stride, dtype=np.int32), name + "_stride") + layer = ctx.net.add_slice( + input_t, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() + ) + layer.set_input(1, start_tensor) layer.set_input(2, shape_) + layer.set_input(3, stride_tensor) else: layer = ctx.net.add_slice(input_t, start=start, shape=shape_, stride=stride) diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index 270973c8c3..e014078f7c 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -2,6 +2,8 @@ from typing import Any, Dict, Optional, Sequence, Set, Tuple import torch +from torch._subclasses.fake_tensor import FakeTensor +from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode from torch_tensorrt._Input import Input from torch_tensorrt.dynamo._defaults import DEBUG @@ -12,13 +14,12 @@ def contains_sym_int(tensor: torch.Tensor) -> bool: """ Returns true if the given tensor has symbolic shape. 
""" - for dim in tensor: - if isinstance(dim, torch.SymInt): - return True - return False + return any(isinstance(dim, torch.SymInt) for dim in tensor) -def construct_dynamic_input(input_shape: torch.Size, input_dtype: torch.dtype) -> Input: +def construct_dynamic_input( + input_shape: torch.Size, input_dtype: torch.dtype, is_shape_tensor: bool = False +) -> Input: """ Constructs a torch_tensorrt.Input based on a symbolic input Args: @@ -50,18 +51,26 @@ def construct_dynamic_input(input_shape: torch.Size, input_dtype: torch.dtype) - max_shape.append(dim) return Input( - min_shape=min_shape, opt_shape=opt_shape, max_shape=max_shape, dtype=input_dtype + min_shape=min_shape, + opt_shape=opt_shape, + max_shape=max_shape, + dtype=input_dtype, + is_shape_tensor=is_shape_tensor, ) -def get_input(input_shape: torch.Size, input_dtype: torch.dtype) -> Input: +def get_input( + input_shape: torch.Size, dtype: torch.dtype, is_shape_tensor: bool = False +) -> Input: """ Based on type of dimensions in the input_shape, construct regular or dynamic shaped inputs """ if contains_sym_int(input_shape): - return construct_dynamic_input(input_shape, input_dtype) + return construct_dynamic_input( + input_shape, dtype, is_shape_tensor=is_shape_tensor + ) else: - return Input(shape=input_shape, dtype=input_dtype) + return Input(shape=input_shape, dtype=dtype, is_shape_tensor=is_shape_tensor) def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: @@ -73,28 +82,41 @@ def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: Returns: Sequence of torch_tensorrt.Input's representing inputs to given module """ - torchtrt_inputs = [] - module_inputs = [node for node in module.graph.nodes if node.op == "placeholder"] - for input in module_inputs: - if input.meta: - if "val" in input.meta: - input_meta = input.meta["val"] - input_shape = input_meta.size() - torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) - elif "tensor_meta" in input.meta: - input_meta = input.meta["tensor_meta"] - input_shape = input_meta.shape - torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) + with maybe_disable_fake_tensor_mode(): + torchtrt_inputs = [] + module_inputs = [ + node for node in module.graph.nodes if node.op == "placeholder" + ] + for input in module_inputs: + if input.meta: + if "val" in input.meta: + input_meta = input.meta["val"] + if isinstance(input_meta, (FakeTensor, torch.Tensor)): + input_shape = input_meta.size() + torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) + elif isinstance(input_meta, torch.SymInt): + torchtrt_inputs.append( + get_input([input_meta], torch.int32, is_shape_tensor=True) + ) + else: + raise ValueError( + f"The meta val for input node {input.target} is of type : {type(input_meta)}. Supported types: torch.Tensor|FakeTensor|torch.SymInt" + ) + + elif "tensor_meta" in input.meta: + input_meta = input.meta["tensor_meta"] + input_shape = input_meta.shape + torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) + else: + raise AssertionError( + f"Input {input.name} does not contain val and tensor_meta fields in the metadata. Please ensure you have exported the graph correctly" + ) else: raise AssertionError( - f"Input {input.name} does not contain val and tensor_meta fields in the metadata. Please ensure you have exported the graph correctly" + f"Input {input.name} does not contain metadata. 
Please ensure you have exported the graph correctly" ) - else: - raise AssertionError( - f"Input {input.name} does not contain metadata. Please ensure you have exported the graph correctly" - ) - return torchtrt_inputs + return torchtrt_inputs def run_shape_analysis( diff --git a/py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py b/py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py new file mode 100644 index 0000000000..575b32fafe --- /dev/null +++ b/py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py @@ -0,0 +1,29 @@ +import logging + +import tensorrt as trt +import torch + +logger = logging.getLogger(__name__) + + +class OutputAllocator(trt.IOutputAllocator): + def __init__(self): + trt.IOutputAllocator.__init__(self) + self.buffers = {} + self.shapes = {} + + def reallocate_output(self, tensor_name, memory, size, alignment): + shape = (size,) + if tensor_name not in self.buffers: + self.buffers[tensor_name] = torch.empty( + shape, dtype=torch.uint8, device="cuda" + ) + else: + self.buffers[tensor_name] = self.buffers[tensor_name].resize_(shape) + logger.debug( + f"Reallocated output tensor: {tensor_name} to: {self.buffers[tensor_name]}" + ) + return self.buffers[tensor_name].data_ptr() + + def notify_shape(self, tensor_name, shape): + self.shapes[tensor_name] = tuple(shape) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 0c152e15f1..ac6f0cbdc1 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -10,6 +10,7 @@ from torch.nn import Module from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype +from torch_tensorrt.dynamo.runtime import OutputAllocator from torch_tensorrt.dynamo.runtime.tools import ( _is_switch_required, _select_rt_device, @@ -52,6 +53,7 @@ def __init__( self.profiling_enabled = ( profiling_enabled if profiling_enabled is not None else False ) + self.output_allocator = OutputAllocator() self._initialize() def _initialize(self) -> None: @@ -210,14 +212,18 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . for i, output_name in enumerate(self.output_names): shape = tuple(self.context.get_tensor_shape(output_name)) - - output = torch.empty( - size=shape, - dtype=self.output_dtypes[i].to(torch.dtype), - device=torch.cuda.current_device(), - ) - bindings.append(output.data_ptr()) - outputs.append(output) + if -1 in shape: + self.context.set_output_allocator( + output_name, self.output_allocator + ) + else: + output = torch.empty( + size=shape, + dtype=self.output_dtypes[i].to(torch.dtype), + device=torch.cuda.current_device(), + ) + bindings.append(output.data_ptr()) + outputs.append(output) # Assign tensor address appropriately for idx in range(self.engine.num_io_tensors): @@ -233,6 +239,8 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
else nullcontext() ): self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream) + raw_array = self.output_allocator.buffers[self.output_names[0]] + shape = self.output_allocator.shapes[self.output_names[0]] if len(outputs) == 1: return outputs[0] diff --git a/py/torch_tensorrt/dynamo/runtime/__init__.py b/py/torch_tensorrt/dynamo/runtime/__init__.py index ee1caab972..9862e85767 100644 --- a/py/torch_tensorrt/dynamo/runtime/__init__.py +++ b/py/torch_tensorrt/dynamo/runtime/__init__.py @@ -1,2 +1,3 @@ +from ._OutputAllocator import OutputAllocator # noqa: F401 from ._PythonTorchTensorRTModule import PythonTorchTensorRTModule # noqa: F401 from ._TorchTensorRTModule import TorchTensorRTModule # noqa: F401 diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index be0019159a..f0ad9e19c1 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -5,6 +5,7 @@ from typing import Any, Callable, Dict, Optional, Sequence, Union import torch +from torch._subclasses.fake_tensor import FakeTensor from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input @@ -72,6 +73,28 @@ def input_is_dynamic(inputs: Sequence[Union[Input, torch.Tensor]]) -> bool: ) +def get_node_shape(node: torch.fx.Node) -> Sequence[int]: + """ + Return the shape of the output of this node as recorded in the node.meta["val"] + """ + shapes = [] + if node.meta and "val" in node.meta: + metadata = node.meta["val"] + if isinstance(metadata, (list, tuple)): + for output in metadata: + shapes.append(output.size()) + elif isinstance(metadata, (torch.Tensor, FakeTensor)): + shapes.append(metadata.size()) + elif isinstance(metadata, torch.SymInt): + shapes.append(metadata) + else: + logger.warning( + f"Requested output shape for node {node.target} but it is missing metadata" + ) + + return shapes + + def get_torch_inputs( inputs: Sequence[Input], device: Union[Device, torch.device, str], mode: str = "" ) -> Sequence[torch.tensor]: diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 7ce3939371..c3648fb8b1 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -55,11 +55,12 @@ def run_test( rtol, atol, check_dtype=True, + pyt_inputs=None, ): with torch.no_grad(): cuda_inputs = [] for i in inputs: - cuda_inputs.append(i.cuda()) + cuda_inputs.append(i) mod.eval() start = time.perf_counter() @@ -71,9 +72,11 @@ def run_test( interpreter_result.input_names, interpreter_result.output_names, ) - mod = mod.cuda() - ref_outputs = mod(*cuda_inputs) + if pyt_inputs is not None: + ref_outputs = mod(*pyt_inputs) + else: + ref_outputs = mod(*cuda_inputs) torch.cuda.synchronize() start_event = torch.cuda.Event(enable_timing=True) @@ -279,6 +282,8 @@ def run_test_with_dynamic_shape( output_dtypes=None, use_dynamo_tracer=False, enable_passes=False, + use_example_tensors=True, + pyt_inputs=None, ): mod.eval() inputs = [spec.example_tensor("opt_shape") for spec in input_specs] @@ -291,7 +296,7 @@ def run_test_with_dynamic_shape( # Previous instance of the interpreter auto-casted 64-bit inputs # We replicate this behavior here - compilation_settings = CompilationSettings(truncate_double=True) + compilation_settings = CompilationSettings(truncate_double=True, debug=True) interp = TRTInterpreter( mod, @@ -302,4 +307,6 @@ def run_test_with_dynamic_shape( # Since the lowering is based on optimal shape. 
We need to test with # different shape(for ex. max shape) for testing dynamic shape inputs_max = [spec.example_tensor("max_shape") for spec in input_specs] - super().run_test(mod, inputs_max, interp, rtol, atol) + if not use_example_tensors: + inputs_max = [spec.torch_tensor for spec in input_specs] + super().run_test(mod, inputs_max, interp, rtol, atol, pyt_inputs=pyt_inputs) diff --git a/tests/py/dynamo/conversion/test_arange_aten.py b/tests/py/dynamo/conversion/test_arange_aten.py index 4c2317366d..9e2e882097 100644 --- a/tests/py/dynamo/conversion/test_arange_aten.py +++ b/tests/py/dynamo/conversion/test_arange_aten.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn +import torch_tensorrt from parameterized import parameterized from torch.testing._internal.common_utils import run_tests @@ -38,12 +39,22 @@ class Arange(nn.Module): def forward(self, end_tensor): return torch.ops.aten.arange.start_step(0, end_tensor, 1) - inputs = [torch.tensor(7, dtype=torch.int32)] - self.run_test( + pyt_input = [7] + inputs = [ + torch_tensorrt.Input( + min_shape=(5,), + opt_shape=(7,), + max_shape=(10,), + dtype=torch.int32, + torch_tensor=torch.tensor(pyt_input, dtype=torch.int32).cuda(), + is_shape_tensor=True, + ) + ] + self.run_test_with_dynamic_shape( Arange(), inputs, - check_dtype=False, # Turned off as end argument doesn't accept tensors - # use_dynamo_tracer=True, + use_example_tensors=False, + pyt_inputs=pyt_input, ) From e0415a500f22bd2cd3cbe40bdc30abc00b5f1d49 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 13 May 2024 08:28:26 -0700 Subject: [PATCH 53/73] chore: updates --- core/runtime/execute_engine.cpp | 18 +++++-- .../dynamo/conversion/_TRTInterpreter.py | 6 ++- .../dynamo/conversion/impl/shuffle.py | 15 +++++- .../dynamo/conversion/ops_evaluators.py | 11 ++-- .../dynamo/partitioning/common.py | 3 +- .../runtime/_PythonTorchTensorRTModule.py | 54 +++++++++++-------- .../py/dynamo/conversion/test_arange_aten.py | 8 +-- 7 files changed, 74 insertions(+), 41 deletions(-) diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index a1ee30e994..11021ebe8d 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -142,12 +142,22 @@ std::vector execute_engine(std::vector inputs, c10::intr auto dims = core::util::toDims(inputs[i].sizes()); auto shape = core::util::toVec(dims); LOG_DEBUG("Input Name: " << name << " Shape: " << dims); - compiled_engine->exec_ctx->setInputShape(name.c_str(), dims); - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].view(shape).contiguous().data_ptr()); + if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].view(shape).contiguous().cpu().data_ptr()); + } else { + compiled_engine->exec_ctx->setInputShape(name.c_str(), dims); + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].view(shape).contiguous().data_ptr()); + } } - TORCHTRT_CHECK( - compiled_engine->exec_ctx->allInputShapesSpecified(), "Not enough inputs provided (runtime.RunCudaEngine)"); + // Check if input shapes can be inferred. + char const** unInferredInputNames; + if (compiled_engine->exec_ctx->inferShapes(inputs.size(), unInferredInputNames)) { + LOG_WARNING( + "The shapes of the inputs: " + << unInferredInputNames + << " cannot be inferred and could lead to undefined behavior. 
This could happen if the input tensor addresses/shapes haven't been configured correctly"); + } } std::vector outputs(compiled_engine->num_io.second); diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 23e1e8d47a..4f81fc6c6b 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -24,7 +24,6 @@ get_trt_tensor, ) from torch_tensorrt.fx.observer import Observer -from torch_tensorrt.logging import TRT_LOGGER from packaging import version @@ -57,7 +56,7 @@ def __init__( ): super().__init__(module) - self.logger = TRT_LOGGER + self.logger = trt.Logger(trt.Logger.VERBOSE) self.builder = trt.Builder(self.logger) flag = 0 @@ -366,6 +365,8 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor: # TODO: Does not support disjoint optimization profiles? assert self.optimization_profiles is not None if current_input.is_shape_tensor: + # For shape_tensors, min/opt/max_shapes correspond to actual values + # of the shapes provided during runtime self.optimization_profiles[0].set_shape_input( target, min_shape, opt_shape, max_shape ) @@ -397,6 +398,7 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor: _LOGGER.debug( f"Adding input to in-progress INetwork: {target} [shape={shape}, dtype={trt_input_dtype}]" ) + return self.ctx.net.add_input( name=target, shape=tuple(shape), diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py index b2a79af5cb..0264ca2b20 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py @@ -2,8 +2,13 @@ import torch_tensorrt.dynamo.conversion.impl as impl from torch.fx.node import Target +from torch_tensorrt import _enums from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR, get_trt_tensor +from torch_tensorrt.dynamo.conversion.converter_utils import ( + SourceIR, + cast_trt_tensor, + get_trt_tensor, +) from torch_tensorrt.fx.converters.converter_utils import set_layer_name from torch_tensorrt.fx.types import TRTTensor @@ -25,7 +30,13 @@ def reshape( for i, s in enumerate(shape): if isinstance(s, TRTTensor): - trt_shape.append(s) + dim_int32 = cast_trt_tensor( + ctx, + s, + _enums.dtype.int32, + name + "_int32_casted", + ) + trt_shape.append(dim_int32) else: a = get_trt_tensor(ctx, s, f"{name}_{i}") trt_shape.append(a) diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index c0b7bdad67..fc5d210831 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -60,13 +60,13 @@ def aten_ops_arange_start_step( if np.any([isinstance(tensor, TRTTensor) for tensor in args]): start_rank_0 = get_trt_tensor(ctx, args[0], name + "_start_rank_0", rank=0) start_rank_1 = get_trt_tensor(ctx, args[0], name + "_start_rank_1", rank=1) - end = get_trt_tensor(ctx, args[1], name + "_end", rank=0) + end = get_trt_tensor(ctx, args[1], name + "_end", rank=1) if len(args) > 2: step = args[2] else: step = 1 step = get_trt_tensor(ctx, step, name + "_step", rank=1) - # Calculate shape = (end-start) / 1 (in this case) + # Calculate shape = (end-start) / step shape = sub( ctx, target, @@ -75,14 +75,13 @@ def aten_ops_arange_start_step( end, 
start_rank_1, ) - - fill_layer = ctx.net.add_fill(shape.shape, trt.FillOperation.LINSPACE) + fill_layer = ctx.net.add_fill( + shape.shape, trt.FillOperation.LINSPACE, shape.dtype + ) fill_layer.set_input(0, shape) # Set start index fill_layer.set_input(1, start_rank_0) # Set delta/step fill_layer.set_input(2, step) - # Set output type to INT32 - fill_layer.set_output_type(0, trt.DataType.INT32) return fill_layer.get_output(0) return np.arange(*args) diff --git a/py/torch_tensorrt/dynamo/partitioning/common.py b/py/torch_tensorrt/dynamo/partitioning/common.py index e014078f7c..9ac677484f 100644 --- a/py/torch_tensorrt/dynamo/partitioning/common.py +++ b/py/torch_tensorrt/dynamo/partitioning/common.py @@ -95,8 +95,9 @@ def construct_submodule_inputs(module: torch.fx.GraphModule) -> Sequence[Input]: input_shape = input_meta.size() torchtrt_inputs.append(get_input(input_shape, input_meta.dtype)) elif isinstance(input_meta, torch.SymInt): + # Assuming sym_integers | shape inputs always have torch.int64 dtype torchtrt_inputs.append( - get_input([input_meta], torch.int32, is_shape_tensor=True) + get_input([input_meta], torch.int64, is_shape_tensor=True) ) else: raise ValueError( diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index ac6f0cbdc1..6d766bd718 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -19,6 +19,7 @@ from torch_tensorrt.logging import TRT_LOGGER logger = logging.getLogger(__name__) +DYNAMIC_DIM = -1 class PythonTorchTensorRTModule(Module): # type: ignore[misc] @@ -176,7 +177,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(inputs)}." contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs] - bindings = [] for i, input_name in enumerate(self.input_names): if not contiguous_inputs[i].is_cuda: logger.warning( @@ -195,9 +195,26 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . contiguous_inputs[i].dtype == self.input_dtypes[i] ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." - bindings.append(contiguous_inputs[i].data_ptr()) - self.context.set_input_shape( - input_name, tuple(contiguous_inputs[i].shape) + if self.engine.is_shape_inference_io(input_name): + # TODO: Sometimes addresses are getting corrupted + input_clone = contiguous_inputs[i].cpu() + self.context.set_tensor_address( + input_name, input_clone.data_ptr() + ) + else: + self.context.set_input_shape( + input_name, tuple(contiguous_inputs[i].shape) + ) + self.context.set_tensor_address( + input_name, contiguous_inputs[i].data_ptr() + ) + + # Check if input shapes can be inferred. + uninferred_input_names = self.context.infer_shapes() + if uninferred_input_names: + logger.warning( + f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \ + This could happen if the input tensor addresses/shapes haven't been configured correctly" ) with ( @@ -212,24 +229,19 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
for i, output_name in enumerate(self.output_names): shape = tuple(self.context.get_tensor_shape(output_name)) - if -1 in shape: - self.context.set_output_allocator( - output_name, self.output_allocator - ) - else: - output = torch.empty( - size=shape, - dtype=self.output_dtypes[i].to(torch.dtype), - device=torch.cuda.current_device(), + + if DYNAMIC_DIM in shape: + raise ValueError( + "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported." ) - bindings.append(output.data_ptr()) - outputs.append(output) - # Assign tensor address appropriately - for idx in range(self.engine.num_io_tensors): - self.context.set_tensor_address( - self.engine.get_tensor_name(idx), bindings[idx] - ) + output = torch.empty( + size=shape, + dtype=self.output_dtypes[i].to(torch.dtype), + device=torch.cuda.current_device(), + ) + self.context.set_tensor_address(output_name, output.data_ptr()) + outputs.append(output) with ( torch.autograd.profiler.record_function( @@ -239,8 +251,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . else nullcontext() ): self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream) - raw_array = self.output_allocator.buffers[self.output_names[0]] - shape = self.output_allocator.shapes[self.output_names[0]] if len(outputs) == 1: return outputs[0] diff --git a/tests/py/dynamo/conversion/test_arange_aten.py b/tests/py/dynamo/conversion/test_arange_aten.py index 9e2e882097..32a243330f 100644 --- a/tests/py/dynamo/conversion/test_arange_aten.py +++ b/tests/py/dynamo/conversion/test_arange_aten.py @@ -39,14 +39,14 @@ class Arange(nn.Module): def forward(self, end_tensor): return torch.ops.aten.arange.start_step(0, end_tensor, 1) - pyt_input = [7] + pyt_input = 7 inputs = [ torch_tensorrt.Input( min_shape=(5,), opt_shape=(7,), max_shape=(10,), - dtype=torch.int32, - torch_tensor=torch.tensor(pyt_input, dtype=torch.int32).cuda(), + dtype=torch.int64, + torch_tensor=torch.tensor(pyt_input, dtype=torch.int64).cuda(), is_shape_tensor=True, ) ] @@ -54,7 +54,7 @@ def forward(self, end_tensor): Arange(), inputs, use_example_tensors=False, - pyt_inputs=pyt_input, + pyt_inputs=[pyt_input], ) From 6dd2c90ed82480fbd71ed551e06d35622b9789ce Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 May 2024 08:06:18 -0700 Subject: [PATCH 54/73] chore: updates --- examples/gpt2_tc.py | 68 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 examples/gpt2_tc.py diff --git a/examples/gpt2_tc.py b/examples/gpt2_tc.py new file mode 100644 index 0000000000..2f0da8f2e9 --- /dev/null +++ b/examples/gpt2_tc.py @@ -0,0 +1,68 @@ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList +from transformers.generation.stopping_criteria import ( + EosTokenCriteria, + MaxLengthCriteria, +) + +# Define tokenizer and model +torch_device = "cuda" if torch.cuda.is_available() else "cpu" +tokenizer = AutoTokenizer.from_pretrained("gpt2") +model = ( + AutoModelForCausalLM.from_pretrained( + "gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False + ) + .eval() + .to(torch_device) +) + +# Input prompt +model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to( + torch_device +) +input_ids = model_inputs["input_ids"] +max_tokens = 40 + +# Pyt model outputs +greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) +print( + "Pytorch model 
generated text: ", + tokenizer.decode(greedy_output[0], skip_special_tokens=True), +) + +# Compile Torch-TRT model +torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) +model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "debug": False, + "enabled_precisions": {torch.float}, + "torch_executed_ops": {"torch.ops.aten.slice.Tensor"}, + "use_python_runtime": True, + }, +) + +# Auto-regressive generation loop for greedy search +stopping_criteria = StoppingCriteriaList( + [ + MaxLengthCriteria(max_length=max_tokens), + EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), + ] +) +while True: + trt_outputs = model(input_ids) + logits = trt_outputs.logits + next_token_logits = logits[:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if stopping_criteria(input_ids, logits).item(): + break + +# Decode the sentence +print( + "TensorRT model generated text: ", + tokenizer.decode(input_ids[0], skip_special_tokens=True), +) From 798aa302e25a60f18e613d1d98499271180bad54 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 May 2024 10:22:33 -0700 Subject: [PATCH 55/73] chore: updates --- py/torch_tensorrt/dynamo/_compiler.py | 51 +++++----- .../dynamo/conversion/_TRTInterpreter.py | 3 +- .../dynamo/conversion/impl/slice/ops.py | 2 - .../dynamo/conversion/ops_evaluators.py | 17 ++-- .../lowering/passes/fuse_prims_broadcast.py | 10 -- .../dynamo/runtime/_OutputAllocator.py | 29 ------ .../runtime/_PythonTorchTensorRTModule.py | 2 - py/torch_tensorrt/dynamo/runtime/__init__.py | 1 - tests/py/dynamo/conversion/harness.py | 4 +- tests/py/dynamo/models/test_dyn_compile.py | 92 ------------------- tests/py/dynamo/models/test_dyn_models.py | 50 ---------- 11 files changed, 38 insertions(+), 223 deletions(-) delete mode 100644 py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py delete mode 100644 tests/py/dynamo/models/test_dyn_compile.py diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 9eaebf0050..d189865366 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -387,23 +387,18 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: submodule_inputs, "dtype", lambda t: t.to(torch.dtype) ) - # subgraph_output_shapes = [get_node_shape(node) for node in submodule.graph.nodes if node.op=="output"] - # try: - # submodule_outputs = submodule( - # *get_torch_inputs(submodule_inputs, to_torch_device(settings.device)) - # ) - # except: - # breakpoint() - # print("done") - - # subgraph_data.subgraph_output_shapes = parse_complex_tensor_structs( - # submodule_outputs, - # "shape", - # lambda x: dict(x) if isinstance(x, dict) else tuple(x), - # ) - # subgraph_data.subgraph_output_dtypes = parse_complex_tensor_structs( - # submodule_outputs, "dtype" - # ) + submodule_outputs = submodule( + *get_torch_inputs(submodule_inputs, to_torch_device(settings.device)) + ) + + subgraph_data.subgraph_output_shapes = parse_complex_tensor_structs( + submodule_outputs, + "shape", + lambda x: dict(x) if isinstance(x, dict) else tuple(x), + ) + subgraph_data.subgraph_output_dtypes = parse_complex_tensor_structs( + submodule_outputs, "dtype" + ) dryrun_tracker.tensorrt_graph_count += 1 dryrun_tracker.per_subgraph_data.append(subgraph_data) @@ -419,19 +414,19 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool: trt_modules[name] = trt_module - # sample_outputs = gm( - # 
*get_torch_inputs(sample_inputs, to_torch_device(settings.device)) - # ) + sample_outputs = gm( + *get_torch_inputs(sample_inputs, to_torch_device(settings.device)) + ) - # if not isinstance(sample_outputs, (list, tuple)): - # sample_outputs = [sample_outputs] + if not isinstance(sample_outputs, (list, tuple)): + sample_outputs = [sample_outputs] - # dryrun_tracker.graph_output_shapes = parse_complex_tensor_structs( - # sample_outputs, "shape", lambda x: dict(x) if isinstance(x, dict) else tuple(x) - # ) - # dryrun_tracker.graph_output_dtypes = parse_complex_tensor_structs( - # sample_outputs, "dtype" - # ) + dryrun_tracker.graph_output_shapes = parse_complex_tensor_structs( + sample_outputs, "shape", lambda x: dict(x) if isinstance(x, dict) else tuple(x) + ) + dryrun_tracker.graph_output_dtypes = parse_complex_tensor_structs( + sample_outputs, "dtype" + ) # Replace all FX Modules with TRT Modules for name, trt_module in trt_modules.items(): diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 4f81fc6c6b..5fe14043b8 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -24,6 +24,7 @@ get_trt_tensor, ) from torch_tensorrt.fx.observer import Observer +from torch_tensorrt.logging import TRT_LOGGER from packaging import version @@ -56,7 +57,7 @@ def __init__( ): super().__init__(module) - self.logger = trt.Logger(trt.Logger.VERBOSE) + self.logger = TRT_LOGGER self.builder = trt.Builder(self.logger) flag = 0 diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 3a9f4bcebf..83aa7822bd 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -158,8 +158,6 @@ def expand( shape_ = cat(ctx, target, source_ir, name + "_shape_concat", shape, 0) start_tensor = cat(ctx, target, source_ir, name + "_start_concat", start, 0) stride_tensor = cat(ctx, target, source_ir, name + "_stride_concat", stride, 0) - # start_tensor = get_trt_tensor(ctx, np.array(start, dtype=np.int32), name + "_start") - # stride_tensor = get_trt_tensor(ctx, np.array(stride, dtype=np.int32), name + "_stride") layer = ctx.net.add_slice( input_t, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() ) diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index fc5d210831..530c75cc42 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -13,7 +13,7 @@ dynamo_tensorrt_converter, ) from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor -from torch_tensorrt.dynamo.conversion.impl.elementwise import sub +from torch_tensorrt.dynamo.conversion.impl.elementwise import div, sub from torch_tensorrt.fx.types import TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -61,20 +61,25 @@ def aten_ops_arange_start_step( start_rank_0 = get_trt_tensor(ctx, args[0], name + "_start_rank_0", rank=0) start_rank_1 = get_trt_tensor(ctx, args[0], name + "_start_rank_1", rank=1) end = get_trt_tensor(ctx, args[1], name + "_end", rank=1) - if len(args) > 2: - step = args[2] - else: - step = 1 + step = args[2] if len(args) > 2 else 1 step = get_trt_tensor(ctx, step, name + "_step", rank=1) # Calculate shape = (end-start) / step shape = sub( ctx, target, SourceIR.ATEN, - name + "_shape", + 
name + "_sub", end, start_rank_1, ) + shape = div( + ctx, + target, + SourceIR.ATEN, + name + "_sub", + shape, + step, + ) fill_layer = ctx.net.add_fill( shape.shape, trt.FillOperation.LINSPACE, shape.dtype ) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py b/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py index 56fad67a48..a6a2eaedb1 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py @@ -18,16 +18,6 @@ def fuse_prims_broadcast( """Fuses prim nodes which are effectively the ATen equivalents with keep_dim=True""" modified_graph = False - # # Propagate shapes through the graph to determine if broadcast can be resolved - # try: - # ShapeProp(gm).propagate(*sample_inputs) - # except (RuntimeError, AssertionError): - # logger.warning( - # "Shape Propagation Failed on Graph, skipping fuse_prims_broadcast lowering pass", - # exc_info=True, - # ) - # return gm - for node in gm.graph.nodes: # If the node is a sum prims operator, with broadcast_in_dim being the only consumer # it is a candidate for fusing diff --git a/py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py b/py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py deleted file mode 100644 index 575b32fafe..0000000000 --- a/py/torch_tensorrt/dynamo/runtime/_OutputAllocator.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging - -import tensorrt as trt -import torch - -logger = logging.getLogger(__name__) - - -class OutputAllocator(trt.IOutputAllocator): - def __init__(self): - trt.IOutputAllocator.__init__(self) - self.buffers = {} - self.shapes = {} - - def reallocate_output(self, tensor_name, memory, size, alignment): - shape = (size,) - if tensor_name not in self.buffers: - self.buffers[tensor_name] = torch.empty( - shape, dtype=torch.uint8, device="cuda" - ) - else: - self.buffers[tensor_name] = self.buffers[tensor_name].resize_(shape) - logger.debug( - f"Reallocated output tensor: {tensor_name} to: {self.buffers[tensor_name]}" - ) - return self.buffers[tensor_name].data_ptr() - - def notify_shape(self, tensor_name, shape): - self.shapes[tensor_name] = tuple(shape) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 6d766bd718..0633719e97 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -10,7 +10,6 @@ from torch.nn import Module from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype -from torch_tensorrt.dynamo.runtime import OutputAllocator from torch_tensorrt.dynamo.runtime.tools import ( _is_switch_required, _select_rt_device, @@ -54,7 +53,6 @@ def __init__( self.profiling_enabled = ( profiling_enabled if profiling_enabled is not None else False ) - self.output_allocator = OutputAllocator() self._initialize() def _initialize(self) -> None: diff --git a/py/torch_tensorrt/dynamo/runtime/__init__.py b/py/torch_tensorrt/dynamo/runtime/__init__.py index 9862e85767..ee1caab972 100644 --- a/py/torch_tensorrt/dynamo/runtime/__init__.py +++ b/py/torch_tensorrt/dynamo/runtime/__init__.py @@ -1,3 +1,2 @@ -from ._OutputAllocator import OutputAllocator # noqa: F401 from ._PythonTorchTensorRTModule import PythonTorchTensorRTModule # noqa: F401 from ._TorchTensorRTModule import TorchTensorRTModule # noqa: F401 diff --git a/tests/py/dynamo/conversion/harness.py 
b/tests/py/dynamo/conversion/harness.py index c3648fb8b1..3444edbaa7 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -60,7 +60,7 @@ def run_test( with torch.no_grad(): cuda_inputs = [] for i in inputs: - cuda_inputs.append(i) + cuda_inputs.append(i.cuda()) mod.eval() start = time.perf_counter() @@ -296,7 +296,7 @@ def run_test_with_dynamic_shape( # Previous instance of the interpreter auto-casted 64-bit inputs # We replicate this behavior here - compilation_settings = CompilationSettings(truncate_double=True, debug=True) + compilation_settings = CompilationSettings(truncate_double=True) interp = TRTInterpreter( mod, diff --git a/tests/py/dynamo/models/test_dyn_compile.py b/tests/py/dynamo/models/test_dyn_compile.py deleted file mode 100644 index b8db6e14aa..0000000000 --- a/tests/py/dynamo/models/test_dyn_compile.py +++ /dev/null @@ -1,92 +0,0 @@ -import unittest - -import pytest -import torch -import torch_tensorrt as torchtrt -from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity - -assertions = unittest.TestCase() - - -@pytest.mark.unit -def test_dyn_full_compile(ir): - """ - Tests the model (which is fully convertible) with dynamic shapes - """ - - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) - self.relu = torch.nn.ReLU() - - def forward(self, x): - torch._check(x.size()[0] >= 1) - torch._check(x.size()[0] <= 8) - out = self.conv(x) - out = self.relu(out) - return out - - model = MyModule().eval().cuda() - input_bs4 = torch.randn((4, 3, 224, 224)).to("cuda") - torch._dynamo.mark_dynamic(input_bs4, 0) - compile_spec = { - "inputs": [input_bs4], - "min_block_size": 1, - "debug": True, - } - # Compile the model - trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) - trt_model(input_bs4) - - input_bs6 = torch.randn((6, 3, 224, 224)).to("cuda") - cos_sim = cosine_similarity(model(input_bs6), trt_model(input_bs6)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"test_dyn_full_compile model TRT outputs don't match with the pytorch model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - with torch.no_grad(): - torch.cuda.empty_cache() - - -@pytest.mark.unit -def test_dyn_view(ir): - """ - Tests the model (which is fully convertible) with dynamic shapes - """ - - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - torch._check(x.size()[0] <= 8) - input_shape = x.size() - y = x.view(input_shape[0], -1) - return y - - model = MyModule().eval().cuda() - input_bs4 = torch.randn((4, 3, 4)).to("cuda") - torch._dynamo.mark_dynamic(input_bs4, 0) - compile_spec = {"inputs": [input_bs4], "min_block_size": 1, "debug": True} - - # Compile the model - trt_model = torch.compile(model, backend="tensorrt", options=compile_spec) - trt_model(input_bs4) - - input_bs6 = torch.randn((6, 3, 4)).to("cuda") - cos_sim = cosine_similarity(model(input_bs6), trt_model(input_bs6)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"test_base_dynamic model TRT outputs don't match with the pytorch model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - with torch.no_grad(): - torch.cuda.empty_cache() diff --git a/tests/py/dynamo/models/test_dyn_models.py b/tests/py/dynamo/models/test_dyn_models.py index f2da38d746..a6302db7be 100644 --- a/tests/py/dynamo/models/test_dyn_models.py +++ b/tests/py/dynamo/models/test_dyn_models.py @@ -233,53 +233,3 @@ def test_resnet_dynamic(ir): with torch.no_grad(): torch.cuda.empty_cache() - - -@pytest.mark.unit -def test_view(ir): - """ - Tests the model (which is fully convertible) with dynamic shapes - """ - - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - input_shape = x.size() - y = x.view(input_shape[0], -1) - return y - - model = MyModule().eval().cuda() - input = torch.randn((6, 3, 4)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - min_shape=(1, 3, 4), - opt_shape=(4, 3, 4), - max_shape=(8, 3, 4), - dtype=torch.float32, - name="x", - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 1, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"test_base_dynamic model TRT outputs don't match with the pytorch model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - with torch.no_grad(): - torch.cuda.empty_cache() From f9475093a37d9d11085701d850dbf1d255cf8f48 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 May 2024 10:25:02 -0700 Subject: [PATCH 56/73] chore: remove dyn shape support for split converter --- py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 8417c394f9..ea1e2569a6 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -664,12 +664,10 @@ def aten_ops_softmax( @dynamo_tensorrt_converter( torch.ops.aten.split.Tensor, - supports_dynamic_shapes=True, ) -@dynamo_tensorrt_converter(torch.ops.aten.split.sizes, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.split.sizes) @dynamo_tensorrt_converter( torch.ops.aten.split_with_sizes.default, - supports_dynamic_shapes=True, ) def aten_ops_split( ctx: ConversionContext, From 477a49bf0fb557cabe3737ad66caf2880eeb00f3 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 May 2024 11:15:13 -0700 Subject: [PATCH 57/73] chore: updates --- tests/py/dynamo/models/test_export_serde.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/py/dynamo/models/test_export_serde.py b/tests/py/dynamo/models/test_export_serde.py index b6519815a4..de216de916 100644 --- a/tests/py/dynamo/models/test_export_serde.py +++ b/tests/py/dynamo/models/test_export_serde.py @@ -1,7 +1,6 @@ import unittest import pytest -import timm import torch import torch_tensorrt as torchtrt import torchvision.models as models From 1828d5b8dd779e9839761380ed6e0ff665bbb748 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 May 2024 16:19:20 -0700 Subject: [PATCH 58/73] chore: updates --- .github/workflows/build-test.yml | 10 +--------- 
.../dynamo/conversion/aten_ops_converters.py | 8 +++----- .../dynamo/conversion/ops_evaluators.py | 12 ++++++++---- tests/modules/requirements.txt | 5 ++--- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 4e0153dcc3..e878a5f163 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -77,8 +77,6 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/modules # Don't use requirements.txt here as it contains tensorrt and torch which should have been installed by now. @@ -87,7 +85,7 @@ jobs: popd pushd . cd tests/py/ts - ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install pytest parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ @@ -115,7 +113,6 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -144,7 +141,6 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -174,7 +170,6 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -203,7 +198,6 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -234,7 +228,6 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -264,7 +257,6 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . 
cd tests/py/core ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index ea1e2569a6..cd88f7b866 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -662,12 +662,10 @@ def aten_ops_softmax( ) +@dynamo_tensorrt_converter(torch.ops.aten.split.Tensor, supports_dynamic_shapes=True) +@dynamo_tensorrt_converter(torch.ops.aten.split.sizes, supports_dynamic_shapes=True) @dynamo_tensorrt_converter( - torch.ops.aten.split.Tensor, -) -@dynamo_tensorrt_converter(torch.ops.aten.split.sizes) -@dynamo_tensorrt_converter( - torch.ops.aten.split_with_sizes.default, + torch.ops.aten.split_with_sizes.default, supports_dynamic_shapes=True ) def aten_ops_split( ctx: ConversionContext, diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index 530c75cc42..5e0231be90 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -12,8 +12,11 @@ ConverterRegistry, dynamo_tensorrt_converter, ) -from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor -from torch_tensorrt.dynamo.conversion.impl.elementwise import div, sub +from torch_tensorrt.dynamo.conversion.converter_utils import ( + cast_trt_tensor, + get_trt_tensor, +) +from torch_tensorrt.dynamo.conversion.impl.elementwise import sub, trunc_div from torch_tensorrt.fx.types import TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -72,14 +75,15 @@ def aten_ops_arange_start_step( end, start_rank_1, ) - shape = div( + shape = trunc_div( ctx, target, SourceIR.ATEN, - name + "_sub", + name + "_shape", shape, step, ) + shape = cast_trt_tensor(ctx, shape, trt.int32, name + "_shape_casted") fill_layer = ctx.net.add_fill( shape.shape, trt.FillOperation.LINSPACE, shape.dtype ) diff --git a/tests/modules/requirements.txt b/tests/modules/requirements.txt index da63a6dad1..03e51d0b61 100644 --- a/tests/modules/requirements.txt +++ b/tests/modules/requirements.txt @@ -1,3 +1,2 @@ -timm -transformers -torchvision +timm==0.9.12 +transformers==4.40.2 From 66c7b19352999232ff0ae0c453fc29660241c462 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 May 2024 21:09:04 -0700 Subject: [PATCH 59/73] chore: updates --- .../dynamo/runtime/_PythonTorchTensorRTModule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index 0633719e97..a35b8ca8c9 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -195,9 +195,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . 
if self.engine.is_shape_inference_io(input_name): # TODO: Sometimes addresses are getting corrupted - input_clone = contiguous_inputs[i].cpu() + inputs_cpu = contiguous_inputs[i].cpu() self.context.set_tensor_address( - input_name, input_clone.data_ptr() + input_name, inputs_cpu.data_ptr() ) else: self.context.set_input_shape( From afa85fc033cc6f96cc8565763338a1b2b73e9820 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 14 May 2024 22:35:24 -0700 Subject: [PATCH 60/73] chore: updates --- .../dynamo/lowering/passes/fuse_prims_broadcast.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py b/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py index a6a2eaedb1..aa7403f94e 100644 --- a/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/fuse_prims_broadcast.py @@ -2,8 +2,6 @@ from typing import Sequence import torch - -# from torch.fx.passes.shape_prop import ShapeProp from torch_tensorrt.dynamo.lowering.passes.pass_utils import ( clean_up_graph_after_modifications, ) From ac4feba7a7bdb2d0efd65121fa51c70ed91c76d1 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 15 May 2024 14:33:45 -0700 Subject: [PATCH 61/73] chore: roll back GHA changes --- .github/workflows/build-test.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index e878a5f163..4e0153dcc3 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -77,6 +77,8 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 + export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/modules # Don't use requirements.txt here as it contains tensorrt and torch which should have been installed by now. @@ -85,7 +87,7 @@ jobs: popd pushd . cd tests/py/ts - ${CONDA_RUN} python -m pip install pytest parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ @@ -113,6 +115,7 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 + export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -141,6 +144,7 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 + export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -170,6 +174,7 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 + export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . 
cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -198,6 +203,7 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 + export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -228,6 +234,7 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 + export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver @@ -257,6 +264,7 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 + export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/core ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver From aa8e0b50b7291d3888a3e0644c5c844fc71310fa Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 15 May 2024 17:22:58 -0700 Subject: [PATCH 62/73] chore: updates --- core/runtime/execute_engine.cpp | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 11021ebe8d..809b6a0bea 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -124,6 +124,8 @@ std::vector execute_engine(std::vector inputs, c10::intr } } + // this is a buffer to store shape tensor input addresses throughout the runtime scope + std::list> inputShapeTensorValues; { std::unique_ptr input_profiler_guard; if (compiled_engine->profile_execution) { @@ -143,7 +145,11 @@ std::vector execute_engine(std::vector inputs, c10::intr auto shape = core::util::toVec(dims); LOG_DEBUG("Input Name: " << name << " Shape: " << dims); if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { - compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].view(shape).contiguous().cpu().data_ptr()); + auto input_cpu = inputs[i].clone().contiguous().cpu(); + std::vector inputs_cpu_vec( + input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); + inputShapeTensorValues.emplace_back(inputs_cpu_vec); + compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()); } else { compiled_engine->exec_ctx->setInputShape(name.c_str(), dims); compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].view(shape).contiguous().data_ptr()); @@ -151,13 +157,14 @@ std::vector execute_engine(std::vector inputs, c10::intr } // Check if input shapes can be inferred. - char const** unInferredInputNames; - if (compiled_engine->exec_ctx->inferShapes(inputs.size(), unInferredInputNames)) { - LOG_WARNING( - "The shapes of the inputs: " - << unInferredInputNames - << " cannot be inferred and could lead to undefined behavior. 
This could happen if the input tensor addresses/shapes haven't been configured correctly"); - } + int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()}; + std::vector names(io_size); + int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data()); + TORCHTRT_CHECK( + nbNames == 0, + "The shapes of the inputs: " + << names + << " cannot be inferred. This could happen if the input tensor addresses/shapes haven't been configured correctly"); } std::vector outputs(compiled_engine->num_io.second); From 5b9891588627732a60fe8cbf5436c3435064c4f2 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 15 May 2024 21:48:16 -0700 Subject: [PATCH 63/73] chore: updates --- py/torch_tensorrt/dynamo/utils.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index f0ad9e19c1..be0019159a 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, Optional, Sequence, Union import torch -from torch._subclasses.fake_tensor import FakeTensor from torch_tensorrt._Device import Device from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input @@ -73,28 +72,6 @@ def input_is_dynamic(inputs: Sequence[Union[Input, torch.Tensor]]) -> bool: ) -def get_node_shape(node: torch.fx.Node) -> Sequence[int]: - """ - Return the shape of the output of this node as recorded in the node.meta["val"] - """ - shapes = [] - if node.meta and "val" in node.meta: - metadata = node.meta["val"] - if isinstance(metadata, (list, tuple)): - for output in metadata: - shapes.append(output.size()) - elif isinstance(metadata, (torch.Tensor, FakeTensor)): - shapes.append(metadata.size()) - elif isinstance(metadata, torch.SymInt): - shapes.append(metadata) - else: - logger.warning( - f"Requested output shape for node {node.target} but it is missing metadata" - ) - - return shapes - - def get_torch_inputs( inputs: Sequence[Input], device: Union[Device, torch.device, str], mode: str = "" ) -> Sequence[torch.tensor]: From 40dbbff57c596be4e062cfdc3a4287e7d38d9eed Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 15 May 2024 22:28:13 -0700 Subject: [PATCH 64/73] chore: updates --- .github/workflows/build-test.yml | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 4e0153dcc3..cbaed6551e 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -78,16 +78,15 @@ jobs: script: | export USE_HOST_DEPS=1 export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/modules # Don't use requirements.txt here as it contains tensorrt and torch which should have been installed by now. - ${CONDA_RUN} python -m pip install numpy packaging pyyaml transformers timm pybind11==2.6.2 + ${CONDA_RUN} python -m pip install numpy packaging pyyaml transformers==4.40.2 timm==0.9.16 pybind11==2.6.2 ${CONDA_RUN} python hub.py popd pushd . 
cd tests/py/ts - ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ @@ -115,10 +114,9 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 10 conversion/ popd @@ -144,10 +142,9 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py popd @@ -174,10 +171,9 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py popd @@ -203,10 +199,9 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . 
cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_comple_be_e2e_test_results.xml --ir torch_compile models/test_models.py ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -234,10 +229,9 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -264,9 +258,8 @@ jobs: pre-script: ${{ matrix.pre-script }} script: | export USE_HOST_DEPS=1 - export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH pushd . cd tests/py/core - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . 
popd From 80a2e9eb8b9a7ae10ff830b01e897551c364a17f Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 15 May 2024 23:05:40 -0700 Subject: [PATCH 65/73] chore: updates --- py/torch_tensorrt/dynamo/conversion/ops_evaluators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index 5e0231be90..0f2581d0d2 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -83,7 +83,7 @@ def aten_ops_arange_start_step( shape, step, ) - shape = cast_trt_tensor(ctx, shape, trt.int32, name + "_shape_casted") + shape = cast_trt_tensor(ctx, shape, end.dtype, name + "_shape_casted") fill_layer = ctx.net.add_fill( shape.shape, trt.FillOperation.LINSPACE, shape.dtype ) From 382ea09d998179195313295c5b8a271bcf93efeb Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 16 May 2024 11:22:39 -0700 Subject: [PATCH 66/73] chore: updates --- .../dynamo/conversion/_ConverterRegistry.py | 37 ++++++++++++------- .../dynamo/conversion/aten_ops_converters.py | 8 +++- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py index 8069b9b9c0..1afb9749c6 100644 --- a/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py +++ b/py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py @@ -91,6 +91,11 @@ class ConverterSupport: DYNAMO_ATEN_CONVERTERS: Dict[Target, Sequence[ConverterSupport]] = {} +def has_static_shapes(node: torch.fx.Node) -> bool: + """Returns True if a node has static args, kwargs, or outputs""" + return not _has_dynamic_shapes(node=node) + + def has_dynamic_shapes(node: torch.fx.Node) -> bool: """Returns True if a node has dynamic args, kwargs, or outputs""" return _has_dynamic_shapes(node=node) @@ -105,6 +110,18 @@ def has_dynamic_shapes_in_args( ) +def has_static_shapes_in_args( + arg_positions_to_check: Optional[List[int]] = None, +) -> Callable[[torch.fx.Node], bool]: + """Returns True if a node has static inputs in node.args at specified positions""" + _has_static_shapes = lambda node, arg_positions_to_check: not _has_dynamic_shapes( + node, arg_positions_to_check + ) + return functools.partial( + _has_static_shapes, arg_positions_to_check=arg_positions_to_check + ) + + def _has_dynamic_shapes( node: torch.fx.Node, arg_positions_to_check: Optional[List[int]] = None ) -> bool: @@ -414,22 +431,16 @@ def __getitem__( if isinstance(converters, (list, tuple)): for candidate in converters: + # We enable the converter under 4 conditions + # 1) capability validator is True + # 2) Assume dynamic_shape support is True + # 3) Node only has static shaped inputs + # 4) Node has dynamic inputs and the converter has supports_dynamic_shapes=True if candidate.capability_validator(node) and ( self.assume_dynamic_shape_support - or ( - has_dynamic_shapes(node) - and candidate.supports_dynamic_shapes - ) + or not has_dynamic_shapes(node) + or candidate.supports_dynamic_shapes ): - # If node has dynamic inputs and the converter supports dynamic shapes, it is enabled - return ( - candidate.converter_implementation, - calling_convention, - ) - elif candidate.capability_validator( - node - ) and not has_dynamic_shapes(node): - # For static shapes all converters are turned on based on capability_validator check return ( candidate.converter_implementation, calling_convention, diff --git 
a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index b31617247a..23483986a5 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -12,6 +12,7 @@ from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( dynamo_tensorrt_converter, + has_static_shapes_in_args, ) from torch_tensorrt.dynamo.conversion.converter_utils import ( enforce_tensor_types, @@ -627,11 +628,14 @@ def aten_ops_softmax( @dynamo_tensorrt_converter( - torch.ops.aten.split.Tensor, + torch.ops.aten.split.Tensor, capability_validator=has_static_shapes_in_args([1]) +) +@dynamo_tensorrt_converter( + torch.ops.aten.split.sizes, capability_validator=has_static_shapes_in_args([1]) ) -@dynamo_tensorrt_converter(torch.ops.aten.split.sizes) @dynamo_tensorrt_converter( torch.ops.aten.split_with_sizes.default, + capability_validator=has_static_shapes_in_args([1]), ) def aten_ops_split( ctx: ConversionContext, From 31bf8ed9f7f6c3d03f4767f2ca4aa9a01b676798 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 16 May 2024 18:27:55 -0700 Subject: [PATCH 67/73] chore: updates --- core/runtime/execute_engine.cpp | 11 ++++--- .../dynamo/conversion/impl/cat.py | 8 +++++ .../dynamo/conversion/impl/slice/ops.py | 30 +++++++++++++++++-- .../runtime/_PythonTorchTensorRTModule.py | 5 ++-- 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp index 809b6a0bea..0267f9ce04 100644 --- a/core/runtime/execute_engine.cpp +++ b/core/runtime/execute_engine.cpp @@ -125,7 +125,7 @@ std::vector execute_engine(std::vector inputs, c10::intr } // this is a buffer to store shape tensor input addresses throughout the runtime scope - std::list> inputShapeTensorValues; + std::list> inputShapeTensorValues; { std::unique_ptr input_profiler_guard; if (compiled_engine->profile_execution) { @@ -145,9 +145,12 @@ std::vector execute_engine(std::vector inputs, c10::intr auto shape = core::util::toVec(dims); LOG_DEBUG("Input Name: " << name << " Shape: " << dims); if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) { - auto input_cpu = inputs[i].clone().contiguous().cpu(); - std::vector inputs_cpu_vec( - input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); + // Shape tensor inputs are casted to int32 explicitly. 
+ // Refer to + // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 + auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt32); + std::vector inputs_cpu_vec( + input_cpu.data_ptr(), input_cpu.data_ptr() + input_cpu.numel()); inputShapeTensorValues.emplace_back(inputs_cpu_vec); compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()); } else { diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cat.py b/py/torch_tensorrt/dynamo/conversion/impl/cat.py index 2f43f925ba..346a5ac727 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/cat.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/cat.py @@ -1,11 +1,14 @@ from typing import Optional, Sequence, Union import numpy as np +import tensorrt as trt import torch from torch.fx.node import Target +from torch_tensorrt import _enums from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( + cast_trt_tensor, get_positive_dim, get_trt_tensor, ) @@ -20,11 +23,16 @@ def cat( name: str, input: Sequence[Union[TRTTensor, torch.Tensor, np.ndarray]], dim: int, + cast_dtype: Union[_enums.dtype, trt.DataType, np.dtype] = None, ) -> Union[TRTTensor, Sequence[TRTTensor]]: trt_inputs = [] for i, each_input in enumerate(input): if not isinstance(each_input, TRTTensor): each_input = get_trt_tensor(ctx, each_input, f"{name}_tensor_{i}") + if cast_dtype: + each_input = cast_trt_tensor( + ctx, each_input, cast_dtype, f"{name}_tensor_int32_cast{i}" + ) trt_inputs.append(each_input) concat_layer = ctx.net.add_concatenation(trt_inputs) dim = get_positive_dim(dim, len(trt_inputs[0].shape)) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 83aa7822bd..7a1a86d6b9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -155,9 +155,33 @@ def expand( shape_ = shape # Handle dynamic shapes case where shape has dynamic dimension if any(isinstance(ele, TRTTensor) for ele in shape): - shape_ = cat(ctx, target, source_ir, name + "_shape_concat", shape, 0) - start_tensor = cat(ctx, target, source_ir, name + "_start_concat", start, 0) - stride_tensor = cat(ctx, target, source_ir, name + "_stride_concat", stride, 0) + shape_ = cat( + ctx, + target, + source_ir, + name + "_shape_concat", + shape, + 0, + cast_dtype=trt.int32, + ) + start_tensor = cat( + ctx, + target, + source_ir, + name + "_start_concat", + start, + 0, + cast_dtype=trt.int32, + ) + stride_tensor = cat( + ctx, + target, + source_ir, + name + "_stride_concat", + stride, + 0, + cast_dtype=trt.int32, + ) layer = ctx.net.add_slice( input_t, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() ) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index a35b8ca8c9..a391179696 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -194,8 +194,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}." 
if self.engine.is_shape_inference_io(input_name): - # TODO: Sometimes addresses are getting corrupted - inputs_cpu = contiguous_inputs[i].cpu() + # Shape tensor inputs are casted to int32 explicitly. + # Refer to https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435 + inputs_cpu = contiguous_inputs[i].cpu().to(torch.int32) self.context.set_tensor_address( input_name, inputs_cpu.data_ptr() ) From 18c0b4b422acb28f11ef094e79df875a5bc62324 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 17 May 2024 10:05:42 -0700 Subject: [PATCH 68/73] chore: fix tests --- py/torch_tensorrt/dynamo/_tracer.py | 10 ++--- tests/py/dynamo/conversion/harness.py | 41 +++++++++++++------ .../conversion/test_bitwise_and_aten.py | 3 ++ .../conversion/test_bitwise_not_aten.py | 1 + .../dynamo/conversion/test_bitwise_or_aten.py | 3 ++ .../conversion/test_bitwise_xor_aten.py | 3 ++ .../conversion/test_convolution_aten.py | 5 +-- .../conversion/test_deconvolution_aten.py | 5 +-- .../conversion/test_embedding_bag_aten.py | 3 ++ .../test_convert_module_to_trt_engine.py | 2 +- 10 files changed, 50 insertions(+), 26 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_tracer.py b/py/torch_tensorrt/dynamo/_tracer.py index 11c0f6b3ac..6bc334f427 100644 --- a/py/torch_tensorrt/dynamo/_tracer.py +++ b/py/torch_tensorrt/dynamo/_tracer.py @@ -58,13 +58,9 @@ def trace( device = to_torch_device(kwargs.get("device", default_device())) torch_inputs = get_torch_inputs(inputs, device) - dynamic_shapes = {} + dynamic_shapes = [] for input in inputs: if isinstance(input, Input) and input.shape_mode == Input._ShapeMode.DYNAMIC: - if not input.name: - raise AssertionError( - f"Expected a name for a dynamic input with shape {input.shape} but found none" - ) min_shape = input.shape["min_shape"] opt_shape = input.shape["opt_shape"] max_shape = input.shape["max_shape"] @@ -80,8 +76,8 @@ def trace( max=max_shape[dim], ) - dynamic_shapes[input.name] = dynamic_dims + dynamic_shapes.append(dynamic_dims) - exp_program = export(mod, tuple(torch_inputs), dynamic_shapes=dynamic_shapes) + exp_program = export(mod, tuple(torch_inputs), dynamic_shapes=tuple(dynamic_shapes)) return exp_program diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 3444edbaa7..6bb9b0c500 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -6,16 +6,20 @@ from typing import Callable, List, Optional, Set, Tuple import torch +import torch_tensorrt +from torch.fx.passes.shape_prop import ShapeProp from torch.testing._internal.common_utils import TestCase from torch_tensorrt import Input from torch_tensorrt._enums import dtype +from torch_tensorrt.dynamo import _defaults from torch_tensorrt.dynamo._settings import CompilationSettings # Use interpreter, input spec, and test case from fx_ts_compat to test Dynamo Converter Registry from torch_tensorrt.dynamo.conversion import TRTInterpreter from torch_tensorrt.dynamo.conversion._conversion import infer_module_output_dtypes -from torch_tensorrt.dynamo.lowering import apply_lowering_passes +from torch_tensorrt.dynamo.lowering import apply_lowering_passes, get_decompositions from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule +from torch_tensorrt.dynamo.utils import get_torch_inputs _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -62,7 +66,6 @@ def run_test( for i in inputs: cuda_inputs.append(i.cuda()) - mod.eval() start = time.perf_counter() 
interpreter_result = interpreter.run() sec = time.perf_counter() - start @@ -201,19 +204,30 @@ def generate_graph( original_inputs: List[torch.Tensor], use_dynamo_tracer: bool, enable_passes: bool, + propagate_shapes: bool = False, ): + mod = mod.eval() + torch_inputs = get_torch_inputs(original_inputs, _defaults.DEVICE) if use_dynamo_tracer: - fx_module = torch._dynamo.export( - mod, - *original_inputs, - aten_graph=True, - assume_static_by_default=True, - tracing_mode="real", - ).graph_module + exported_program = torch_tensorrt.dynamo.trace(mod, tuple(original_inputs)) + exported_program = exported_program.run_decompositions( + get_decompositions(False) + ) + fx_module = exported_program.module() else: fx_module = torch.fx.symbolic_trace(mod) if enable_passes: - fx_module = apply_lowering_passes(fx_module, original_inputs) + fx_module = apply_lowering_passes(fx_module, torch_inputs) + + if propagate_shapes: + # TODO: This is currently being used to test embedding_bag_aten due to https://github.com/pytorch/TensorRT/issues/2843 + try: + ShapeProp(fx_module).propagate(*torch_inputs) + except (RuntimeError, AssertionError): + logger.warning( + "Shape Propagation failed on Graph, skipping it", + exc_info=False, + ) return fx_module def run_test( @@ -226,6 +240,7 @@ def run_test( check_dtype=True, use_dynamo_tracer=False, enable_passes=False, + propagate_shapes=False, ): mod.eval() mod = self.generate_graph( @@ -233,6 +248,7 @@ def run_test( inputs, use_dynamo_tracer=use_dynamo_tracer, enable_passes=enable_passes, + propagate_shapes=propagate_shapes, ) # Previous instance of the interpreter auto-casted 64-bit inputs @@ -284,14 +300,15 @@ def run_test_with_dynamic_shape( enable_passes=False, use_example_tensors=True, pyt_inputs=None, + propagate_shapes=False, ): mod.eval() - inputs = [spec.example_tensor("opt_shape") for spec in input_specs] mod = self.generate_graph( mod, - inputs, + input_specs, use_dynamo_tracer=use_dynamo_tracer, enable_passes=enable_passes, + propagate_shapes=propagate_shapes, ) # Previous instance of the interpreter auto-casted 64-bit inputs diff --git a/tests/py/dynamo/conversion/test_bitwise_and_aten.py b/tests/py/dynamo/conversion/test_bitwise_and_aten.py index 5c2a78a18a..8e7d8cef73 100644 --- a/tests/py/dynamo/conversion/test_bitwise_and_aten.py +++ b/tests/py/dynamo/conversion/test_bitwise_and_aten.py @@ -26,6 +26,7 @@ def forward(self, lhs_val, rhs_val): bitwise_and(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) @parameterized.expand( @@ -46,6 +47,7 @@ def forward(self, tensor): bitwise_and(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) @parameterized.expand( @@ -66,6 +68,7 @@ def forward(self, tensor): bitwise_and(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) diff --git a/tests/py/dynamo/conversion/test_bitwise_not_aten.py b/tests/py/dynamo/conversion/test_bitwise_not_aten.py index b811f1e51a..33d8629aff 100644 --- a/tests/py/dynamo/conversion/test_bitwise_not_aten.py +++ b/tests/py/dynamo/conversion/test_bitwise_not_aten.py @@ -25,6 +25,7 @@ def forward(self, val): bitwise_not(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) diff --git a/tests/py/dynamo/conversion/test_bitwise_or_aten.py b/tests/py/dynamo/conversion/test_bitwise_or_aten.py index b5e0200734..e912a9c473 100644 --- a/tests/py/dynamo/conversion/test_bitwise_or_aten.py +++ b/tests/py/dynamo/conversion/test_bitwise_or_aten.py @@ -26,6 +26,7 @@ def forward(self, lhs_val, rhs_val): bitwise_or(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) 
@parameterized.expand( @@ -46,6 +47,7 @@ def forward(self, tensor): bitwise_or(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) @parameterized.expand( @@ -66,6 +68,7 @@ def forward(self, tensor): bitwise_or(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) diff --git a/tests/py/dynamo/conversion/test_bitwise_xor_aten.py b/tests/py/dynamo/conversion/test_bitwise_xor_aten.py index 8c1a8136ef..4bd2790bf9 100644 --- a/tests/py/dynamo/conversion/test_bitwise_xor_aten.py +++ b/tests/py/dynamo/conversion/test_bitwise_xor_aten.py @@ -26,6 +26,7 @@ def forward(self, lhs_val, rhs_val): bitwise_xor(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) @parameterized.expand( @@ -46,6 +47,7 @@ def forward(self, tensor): bitwise_xor(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) @parameterized.expand( @@ -66,6 +68,7 @@ def forward(self, tensor): bitwise_xor(), inputs, enable_passes=True, + use_dynamo_tracer=True, ) diff --git a/tests/py/dynamo/conversion/test_convolution_aten.py b/tests/py/dynamo/conversion/test_convolution_aten.py index 7d69c871a9..95f4de92b5 100644 --- a/tests/py/dynamo/conversion/test_convolution_aten.py +++ b/tests/py/dynamo/conversion/test_convolution_aten.py @@ -1,7 +1,6 @@ import torch from parameterized import param, parameterized from torch.testing._internal.common_utils import run_tests - from torch_tensorrt import Input from .harness import DispatchTestCase @@ -138,7 +137,7 @@ def forward(self, x): Input( shape=(-1, 3, -1, -1), dtype=torch.float32, - shape_ranges=[((1, 3, 1, 1), (1, 3, 4, 4), (32, 3, 128, 128))], + shape_ranges=[((1, 3, 1, 1), (2, 3, 4, 4), (32, 3, 128, 128))], ), ] self.run_test_with_dynamic_shape( @@ -201,7 +200,7 @@ def forward(self, x): Input( shape=(-1, 3, -1, -1, -1), dtype=torch.float32, - shape_ranges=[((1, 3, 1, 1, 1), (1, 3, 4, 4, 4), (8, 3, 32, 32, 32))], + shape_ranges=[((1, 3, 1, 1, 1), (2, 3, 4, 4, 4), (8, 3, 32, 32, 32))], ), ] self.run_test_with_dynamic_shape( diff --git a/tests/py/dynamo/conversion/test_deconvolution_aten.py b/tests/py/dynamo/conversion/test_deconvolution_aten.py index 6024b6946e..307275dba1 100644 --- a/tests/py/dynamo/conversion/test_deconvolution_aten.py +++ b/tests/py/dynamo/conversion/test_deconvolution_aten.py @@ -1,7 +1,6 @@ import torch from parameterized import param, parameterized from torch.testing._internal.common_utils import run_tests - from torch_tensorrt import Input from .harness import DispatchTestCase @@ -152,7 +151,7 @@ def forward(self, x): Input( shape=(-1, 3, -1, -1), dtype=torch.float32, - shape_ranges=[((1, 3, 1, 1), (1, 3, 4, 4), (32, 3, 128, 128))], + shape_ranges=[((1, 3, 1, 1), (2, 3, 4, 4), (32, 3, 128, 128))], ), ] self.run_test_with_dynamic_shape( @@ -221,7 +220,7 @@ def forward(self, x): Input( shape=(-1, 3, -1, -1, -1), dtype=torch.float32, - shape_ranges=[((1, 3, 1, 1, 1), (1, 3, 4, 4, 4), (8, 3, 32, 32, 32))], + shape_ranges=[((1, 3, 1, 1, 1), (2, 3, 4, 4, 4), (8, 3, 32, 32, 32))], ), ] self.run_test_with_dynamic_shape( diff --git a/tests/py/dynamo/conversion/test_embedding_bag_aten.py b/tests/py/dynamo/conversion/test_embedding_bag_aten.py index 2154937b43..9664e1be58 100644 --- a/tests/py/dynamo/conversion/test_embedding_bag_aten.py +++ b/tests/py/dynamo/conversion/test_embedding_bag_aten.py @@ -144,6 +144,7 @@ def forward(self, weight, indices): inputs=[weight, indices], precision=weight.dtype, enable_passes=True, + propagate_shapes=True, ) @parameterized.expand( @@ -340,6 +341,7 @@ def forward(self, weight, indices, offsets): inputs=[weight, indices, 
offsets], precision=weight.dtype, enable_passes=True, + propagate_shapes=True, ) @parameterized.expand( @@ -403,6 +405,7 @@ def forward(self, weight, indices, offsets): inputs=[weight, indices, offsets], precision=weight.dtype, enable_passes=True, + propagate_shapes=True, ) diff --git a/tests/py/dynamo/runtime/test_convert_module_to_trt_engine.py b/tests/py/dynamo/runtime/test_convert_module_to_trt_engine.py index c23684646a..975f0b7ffa 100644 --- a/tests/py/dynamo/runtime/test_convert_module_to_trt_engine.py +++ b/tests/py/dynamo/runtime/test_convert_module_to_trt_engine.py @@ -27,7 +27,7 @@ def forward(self, a, b): # Inference on TRT Engine py_trt_module = PythonTorchTensorRTModule( - trt_engine_str, ["a", "b"], ["output0"] + trt_engine_str, ["arg0_1", "arg1_1"], ["output0"] ) trt_output = py_trt_module(input_data_0, input_data_1).cpu() From c6f7b4a0f886e62e895a851fb7acefb58a7bbaa8 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 17 May 2024 11:16:11 -0700 Subject: [PATCH 69/73] chore: updates --- .../dynamo/conversion/aten_ops_converters.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index 3a5a84f90b..69b29cf400 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -240,26 +240,8 @@ def aten_ops_cat( ) -def embedding_param_validator(embedding_node: Node) -> bool: - scale_grad_by_freq = args_bounds_check(embedding_node.args, 3) - sparse = args_bounds_check(embedding_node.args, 4) - - if scale_grad_by_freq is not None: - _LOGGER.debug( - f"Currently we don't support specifying scale gradient by word frequency, got {scale_grad_by_freq}." 
- ) - return False - - if sparse is not None: - _LOGGER.debug(f"Currently we don't support sparse gradient, got {sparse}.") - return False - - return True - - @dynamo_tensorrt_converter( torch.ops.aten.embedding.default, - capability_validator=embedding_param_validator, supports_dynamic_shapes=True, ) def aten_ops_embedding( From bb5d30d97ad937a94a423407f5b78cc40feb9af0 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 17 May 2024 13:14:57 -0700 Subject: [PATCH 70/73] chore: updates --- .../dynamo/conversion/_TRTInterpreter.py | 2 +- .../dynamo/conversion/impl/slice/ops.py | 129 +++--------------- 2 files changed, 20 insertions(+), 111 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 5fe14043b8..3f2c017f6e 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -371,7 +371,7 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor: self.optimization_profiles[0].set_shape_input( target, min_shape, opt_shape, max_shape ) - shape.append(1) + shape.append(len(opt_shape)) else: self.optimization_profiles[0].set_shape( target, min_shape, opt_shape, max_shape diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 7a1a86d6b9..61d71fe9a0 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -8,14 +8,10 @@ from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( - cast_trt_tensor, get_positive_dim, get_trt_tensor, ) -from torch_tensorrt.dynamo.conversion.impl.cat import cat -from torch_tensorrt.dynamo.conversion.impl.elementwise import div, sub from torch_tensorrt.dynamo.conversion.impl.slice.base import slice -from torch_tensorrt.dynamo.conversion.impl.unary import ceil from torch_tensorrt.fx.converters.converter_utils import ( has_dynamic_shape, prepend_ones, @@ -35,7 +31,6 @@ def slice_op( # TODO: This should be slice not whatever is in base stop: Optional[int], step: int, ) -> TRTTensor: - # Special case for start being None if start is None: start = 0 @@ -44,72 +39,24 @@ def slice_op( # TODO: This should be slice not whatever is in base if stop is None: stop = input.shape[dim] - is_slice_dynamic = False - if ( - isinstance(start, TRTTensor) - or isinstance(step, TRTTensor) - or isinstance(stop, TRTTensor) - ): - is_slice_dynamic = True - - if not is_slice_dynamic: - dim = get_positive_dim(dim, len(input.shape)) - start = get_positive_dim(start, input.shape[dim]) - stop = get_positive_dim(stop, input.shape[dim]) - - if has_dynamic_shape(input.shape): - # Check whether slice target dim is dynamic shape dim - assert input.shape[dim] != -1, "Can't slice on dynamic shape dimension!" 
- - start_slice = [0] * len(input.shape) - start_slice[dim] = start - stride_slice = [1] * len(input.shape) - stride_slice[dim] = step - output_shape = list(input.shape) - output_shape[dim] = math.ceil((stop - start) / step) - - return slice( - ctx, target, source_ir, name, input, start_slice, output_shape, stride_slice - ) - else: - dim = get_positive_dim(dim, len(input.shape)) - # Make start, stop, step an ITensor - start = get_trt_tensor(ctx, start, name + "_start") - stop = get_trt_tensor(ctx, stop, name + "_stop") - stop_casted = cast_trt_tensor(ctx, stop, trt.float32, name + "_casted") - step = get_trt_tensor(ctx, step, name + "_step") - # Calculate size for ISlice Layer = ceil((stop-start)/step) - shape = sub( - ctx, - target, - SourceIR.ATEN, - name + "_sub", - stop_casted, - start, - ) - shape = div( - ctx, - target, - SourceIR.ATEN, - name + "_div", - shape, - step, - ) - shape = ceil( - ctx, - target, - SourceIR.ATEN, - name + "_shape", - shape, - ) - shape = cast_trt_tensor(ctx, shape, trt.int32, name + "_shape_casted") - slice_layer = ctx.net.add_slice( - input, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() - ) - slice_layer.set_input(1, start) - slice_layer.set_input(2, shape) - slice_layer.set_input(3, step) - return slice_layer.get_output(0) + dim = get_positive_dim(dim, len(input.shape)) + start = get_positive_dim(start, input.shape[dim]) + stop = get_positive_dim(stop, input.shape[dim]) + + if has_dynamic_shape(input.shape): + # Check whether slice target dim is dynamic shape dim + assert input.shape[dim] != -1, "Can't slice on dynamic shape dimension!" + + start_slice = [0] * len(input.shape) + start_slice[dim] = start + stride_slice = [1] * len(input.shape) + stride_slice[dim] = step + output_shape = list(input.shape) + output_shape[dim] = math.ceil((stop - start) / step) + + return slice( + ctx, target, source_ir, name, input, start_slice, output_shape, stride_slice + ) def expand( @@ -152,45 +99,7 @@ def expand( [int(i == o) for i, o in zip(input_tensor_shape, shape)] ) # stride == 1 if dimensions match, 0 otherwise - shape_ = shape - # Handle dynamic shapes case where shape has dynamic dimension - if any(isinstance(ele, TRTTensor) for ele in shape): - shape_ = cat( - ctx, - target, - source_ir, - name + "_shape_concat", - shape, - 0, - cast_dtype=trt.int32, - ) - start_tensor = cat( - ctx, - target, - source_ir, - name + "_start_concat", - start, - 0, - cast_dtype=trt.int32, - ) - stride_tensor = cat( - ctx, - target, - source_ir, - name + "_stride_concat", - stride, - 0, - cast_dtype=trt.int32, - ) - layer = ctx.net.add_slice( - input_t, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims() - ) - layer.set_input(1, start_tensor) - layer.set_input(2, shape_) - layer.set_input(3, stride_tensor) - else: - layer = ctx.net.add_slice(input_t, start=start, shape=shape_, stride=stride) - + layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride) set_layer_name(layer, target, name, source_ir) return layer.get_output(0) From da72508b0ed29732542a37524dee9de674214df0 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 21 May 2024 15:29:56 -0700 Subject: [PATCH 71/73] chore: address review comments --- py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py | 5 +++-- py/torch_tensorrt/dynamo/conversion/converter_utils.py | 10 ++++++---- py/torch_tensorrt/dynamo/conversion/impl/cat.py | 2 +- py/torch_tensorrt/dynamo/conversion/impl/shuffle.py | 2 +- py/torch_tensorrt/dynamo/conversion/ops_evaluators.py | 10 +++++----- 
.../dynamo/runtime/_PythonTorchTensorRTModule.py | 2 +- py/torch_tensorrt/dynamo/utils.py | 1 + 7 files changed, 18 insertions(+), 14 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 3f2c017f6e..4de6aeb98f 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -23,6 +23,7 @@ get_node_name, get_trt_tensor, ) +from torch_tensorrt.dynamo.utils import DYNAMIC_DIM from torch_tensorrt.fx.observer import Observer from torch_tensorrt.logging import TRT_LOGGER @@ -365,6 +366,7 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor: max_shape = current_input.shape["max_shape"] # TODO: Does not support disjoint optimization profiles? assert self.optimization_profiles is not None + assert len(min_shape) == len(opt_shape) == len(max_shape) if current_input.is_shape_tensor: # For shape_tensors, min/opt/max_shapes correspond to actual values # of the shapes provided during runtime @@ -377,13 +379,12 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor: target, min_shape, opt_shape, max_shape ) - assert len(min_shape) == len(opt_shape) == len(max_shape) for i in range(len(min_shape)): if min_shape[i] == opt_shape[i] == max_shape[i]: shape.append(min_shape[i]) else: # -1 to represent the dynamic dimension - shape.append(-1) + shape.append(DYNAMIC_DIM) elif ( not current_input.is_shape_tensor and current_input.shape_mode == Input._ShapeMode.STATIC diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 87761d984b..d1bdf72d21 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -194,7 +194,7 @@ def create_constant( value: Union[int, float, bool, np.ndarray, torch.Tensor], name: str, dtype: Optional[Union[torch.dtype, np.dtype, TRTDataType, _enums.dtype]], - rank: Optional[int] = 1, + min_rank: Optional[int] = 1, ) -> TRTTensor: """ Add a TensorRT constant layer whose value is `value` to `ctx.net`. @@ -206,12 +206,13 @@ def create_constant( name (str): Name of the added TensorRT Constant layer. dtype (Optional[Union[torch.dtype, np.dtype, TRTDataType]]): If a dtype is given, we will convert the type of the given `value` to this dtype. + min_rank (int): minimum rank of the constant tensor. Returns: A TensorRT ITensor that represents the given value. """ shape = (1,) # Rank 0 constant is required in IFillLayer inputs. - if rank == 0: + if min_rank == 0: shape = trt.Dims() numpy_value = to_numpy( value, _enums.dtype._from(dtype).to(np.dtype) if dtype is not None else None @@ -229,7 +230,7 @@ def get_trt_tensor( input_val: Any, name: str, dtype: Optional[Union[torch.dtype, np.dtype, TRTDataType, _enums.dtype]] = None, - rank: int = 1, + min_rank: int = 1, ) -> TRTTensor: """ Given a value of random type, we try to convert it to a TensorRT ITensor. @@ -242,6 +243,7 @@ def get_trt_tensor( one. dtype (Optional[Union[torch.dtype, np.dtype, TRTDataType]]): If dtype is provided, the given value will be converted to this dtype. + min_rank (int): minimum rank of the constant tensor. Returns: A TensorRT ITensor that represents the given value. 
""" @@ -254,7 +256,7 @@ def get_trt_tensor( input_val = input_val.astype(np.float32) if isinstance(input_val, (torch.Tensor, np.ndarray, int, float, bool)): - return create_constant(ctx, input_val, name, dtype, rank) + return create_constant(ctx, input_val, name, dtype, min_rank) elif isinstance(input_val, TRTTensor): return input_val else: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cat.py b/py/torch_tensorrt/dynamo/conversion/impl/cat.py index 346a5ac727..d7d32b5bb0 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/cat.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/cat.py @@ -31,7 +31,7 @@ def cat( each_input = get_trt_tensor(ctx, each_input, f"{name}_tensor_{i}") if cast_dtype: each_input = cast_trt_tensor( - ctx, each_input, cast_dtype, f"{name}_tensor_int32_cast{i}" + ctx, each_input, cast_dtype, f"{name}_tensor_int32_cast_{i}" ) trt_inputs.append(each_input) concat_layer = ctx.net.add_concatenation(trt_inputs) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py index 0264ca2b20..b2d005b175 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py @@ -34,7 +34,7 @@ def reshape( ctx, s, _enums.dtype.int32, - name + "_int32_casted", + name + f"_int32_casted_{i}", ) trt_shape.append(dim_int32) else: diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index 0f2581d0d2..43009c306a 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -60,12 +60,12 @@ def aten_ops_arange_start_step( name: str, ) -> Union[TRTTensor, Sequence[TRTTensor]]: # Case where inputs to arange are dynamic - if np.any([isinstance(tensor, TRTTensor) for tensor in args]): - start_rank_0 = get_trt_tensor(ctx, args[0], name + "_start_rank_0", rank=0) - start_rank_1 = get_trt_tensor(ctx, args[0], name + "_start_rank_1", rank=1) - end = get_trt_tensor(ctx, args[1], name + "_end", rank=1) + if any(isinstance(tensor, TRTTensor) for tensor in args): + start_rank_0 = get_trt_tensor(ctx, args[0], name + "_start_rank_0", min_rank=0) + start_rank_1 = get_trt_tensor(ctx, args[0], name + "_start_rank_1", min_rank=1) + end = get_trt_tensor(ctx, args[1], name + "_end", min_rank=1) step = args[2] if len(args) > 2 else 1 - step = get_trt_tensor(ctx, step, name + "_step", rank=1) + step = get_trt_tensor(ctx, step, name + "_step", min_rank=1) # Calculate shape = (end-start) / step shape = sub( ctx, diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index a391179696..4aa520542d 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -15,10 +15,10 @@ _select_rt_device, multi_gpu_device_check, ) +from torch_tensorrt.dynamo.utils import DYNAMIC_DIM from torch_tensorrt.logging import TRT_LOGGER logger = logging.getLogger(__name__) -DYNAMIC_DIM = -1 class PythonTorchTensorRTModule(Module): # type: ignore[misc] diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index be0019159a..4ea8687016 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -16,6 +16,7 @@ logger = logging.getLogger(__name__) COSINE_THRESHOLD = 0.99 +DYNAMIC_DIM = -1 def use_python_runtime_parser(use_python_runtime: Optional[bool] = None) -> 
bool: From e33976a5271f5fb292f45c14e387646f18c9288e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 21 May 2024 15:34:06 -0700 Subject: [PATCH 72/73] chore: remove gpt2 example --- examples/gpt2_tc.py | 68 --------------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 examples/gpt2_tc.py diff --git a/examples/gpt2_tc.py b/examples/gpt2_tc.py deleted file mode 100644 index 2f0da8f2e9..0000000000 --- a/examples/gpt2_tc.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList -from transformers.generation.stopping_criteria import ( - EosTokenCriteria, - MaxLengthCriteria, -) - -# Define tokenizer and model -torch_device = "cuda" if torch.cuda.is_available() else "cpu" -tokenizer = AutoTokenizer.from_pretrained("gpt2") -model = ( - AutoModelForCausalLM.from_pretrained( - "gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False - ) - .eval() - .to(torch_device) -) - -# Input prompt -model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to( - torch_device -) -input_ids = model_inputs["input_ids"] -max_tokens = 40 - -# Pyt model outputs -greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens) -print( - "Pytorch model generated text: ", - tokenizer.decode(greedy_output[0], skip_special_tokens=True), -) - -# Compile Torch-TRT model -torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) -model.forward = torch.compile( - model.forward, - backend="tensorrt", - dynamic=None, - options={ - "debug": False, - "enabled_precisions": {torch.float}, - "torch_executed_ops": {"torch.ops.aten.slice.Tensor"}, - "use_python_runtime": True, - }, -) - -# Auto-regressive generation loop for greedy search -stopping_criteria = StoppingCriteriaList( - [ - MaxLengthCriteria(max_length=max_tokens), - EosTokenCriteria(eos_token_id=tokenizer.eos_token_id), - ] -) -while True: - trt_outputs = model(input_ids) - logits = trt_outputs.logits - next_token_logits = logits[:, -1, :] - next_tokens = torch.argmax(next_token_logits, dim=-1) - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if stopping_criteria(input_ids, logits).item(): - break - -# Decode the sentence -print( - "TensorRT model generated text: ", - tokenizer.decode(input_ids[0], skip_special_tokens=True), -) From f28684c1714135b2a95ca3ee985e7126591126af Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 22 May 2024 10:54:37 -0700 Subject: [PATCH 73/73] chore: updates --- .github/workflows/build-test-windows.yml | 8 ++++---- .github/workflows/build-test.yml | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index c7f1ba1d6b..66a81c1fd4 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -72,7 +72,7 @@ jobs: export USE_HOST_DEPS=1 pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 10 conversion/ popd @@ -98,7 +98,7 @@ jobs: export USE_HOST_DEPS=1 pushd . 
cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py popd @@ -125,7 +125,7 @@ jobs: export USE_HOST_DEPS=1 pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_comple_be_e2e_test_results.xml --ir torch_compile models/test_models.py popd @@ -152,7 +152,7 @@ jobs: export USE_HOST_DEPS=1 pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index cbaed6551e..99b0aa2674 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -81,12 +81,12 @@ jobs: pushd . cd tests/modules # Don't use requirements.txt here as it contains tensorrt and torch which should have been installed by now. - ${CONDA_RUN} python -m pip install numpy packaging pyyaml transformers==4.40.2 timm==0.9.16 pybind11==2.6.2 + ${CONDA_RUN} python -m pip install numpy packaging pyyaml transformers==4.39.3 timm==0.9.16 pybind11==2.6.2 ${CONDA_RUN} python hub.py popd pushd . cd tests/py/ts - ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/ ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/ @@ -116,7 +116,7 @@ jobs: export USE_HOST_DEPS=1 pushd . 
cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 10 conversion/ popd @@ -144,7 +144,7 @@ jobs: export USE_HOST_DEPS=1 pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py popd @@ -173,7 +173,7 @@ jobs: export USE_HOST_DEPS=1 pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py popd @@ -201,7 +201,7 @@ jobs: export USE_HOST_DEPS=1 pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_comple_be_e2e_test_results.xml --ir torch_compile models/test_models.py ${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -231,7 +231,7 @@ jobs: export USE_HOST_DEPS=1 pushd . cd tests/py/dynamo - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -260,6 +260,6 @@ jobs: export USE_HOST_DEPS=1 pushd . 
cd tests/py/core - ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver + ${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.39.3 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver ${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml . popd
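
For reference, the simplified static slice_op path in PATCH 70 above sizes its output dimension as ceil((stop - start) / step) after normalizing negative dim/start/stop indices and defaulting start to 0 and stop to the full dimension. Below is a minimal plain-Python sketch of just that arithmetic; the helper names (normalize_index, slice_output_size) are hypothetical and are not part of the torch_tensorrt API.

    import math
    from typing import Optional

    def normalize_index(idx: int, dim_size: int) -> int:
        # Hypothetical helper: map negative indices (e.g. -1) onto [0, dim_size].
        return idx + dim_size if idx < 0 else idx

    def slice_output_size(
        dim_size: int, start: Optional[int], stop: Optional[int], step: int
    ) -> int:
        # Defaults mirror the converter: start=None -> 0, stop=None -> full dimension.
        start = 0 if start is None else normalize_index(start, dim_size)
        stop = dim_size if stop is None else normalize_index(stop, dim_size)
        # Length of the sliced dimension, as used for the output shape of the slice layer.
        return math.ceil((stop - start) / step)

    # Example: slicing a dimension of size 10 as [1:10:3] keeps indices 1, 4, 7 -> 3 elements.
    assert slice_output_size(10, 1, None, 3) == 3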
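
Similarly, the placeholder handling touched in PATCH 71 builds the engine input shape from the min/opt/max optimization-profile shapes: a dimension stays concrete only when all three profiles agree, and is otherwise marked with DYNAMIC_DIM (-1), now defined once in torch_tensorrt.dynamo.utils. A small illustrative sketch of that selection logic, decoupled from TensorRT; build_engine_input_shape is a hypothetical name used only for this example.

    from typing import List, Sequence

    DYNAMIC_DIM = -1  # sentinel for dynamic dimensions, as in torch_tensorrt.dynamo.utils

    def build_engine_input_shape(
        min_shape: Sequence[int], opt_shape: Sequence[int], max_shape: Sequence[int]
    ) -> List[int]:
        # The three profile shapes must describe the same rank.
        assert len(min_shape) == len(opt_shape) == len(max_shape)
        shape: List[int] = []
        for lo, opt, hi in zip(min_shape, opt_shape, max_shape):
            # A dimension is static only if min == opt == max; otherwise mark it dynamic.
            shape.append(lo if lo == opt == hi else DYNAMIC_DIM)
        return shape

    # Example: a batch dimension spanning (1, 8, 16) becomes dynamic; the rest stay static.
    assert build_engine_input_shape(
        (1, 3, 224, 224), (8, 3, 224, 224), (16, 3, 224, 224)
    ) == [DYNAMIC_DIM, 3, 224, 224]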