chore: enable DS support for converters #2775

Merged
107 commits merged on May 22, 2024
Changes from 103 commits
Commits (107)
d06c74a
chore: Switch to new export apis
peri044 Oct 3, 2023
47e0997
chore: rebase with main
peri044 Oct 9, 2023
fd29fe0
Merge branch 'main' into export_2.2
peri044 Oct 11, 2023
ad3b031
feat: Add support for dynamic shapes and remove constraints API
peri044 Oct 19, 2023
1582b72
chore: add dynamic shape support for certain converters
peri044 Oct 23, 2023
4d01545
chore: minor updates
peri044 Oct 25, 2023
6731a57
chore: updates
peri044 Oct 26, 2023
a8a194b
chore: rebase with main
peri044 Nov 15, 2023
0b60aae
chore: add sym int converter
peri044 Nov 15, 2023
634612f
feat: Replace the existing shape propagation with symbolic shape prop…
peri044 Nov 16, 2023
93edba4
chore: fix imports
peri044 Nov 16, 2023
7ad9272
chore: fix imports
peri044 Nov 16, 2023
f444d54
chore: updates
peri044 Nov 21, 2023
6e5c582
chore: change device calls
peri044 Nov 28, 2023
83791f8
chore: fix metadata check
peri044 Dec 5, 2023
8375996
chore: rebase with main
peri044 Dec 15, 2023
aba91fa
Merge branch 'main' into dyn_2.2
peri044 Dec 22, 2023
16394d9
chore: minor fixes
peri044 Jan 7, 2024
b9a7ccd
chore: Add sym_size converter tests
peri044 Jan 8, 2024
15cc643
chore: Update test utilities
peri044 Jan 8, 2024
5234d74
chore: add testcase for sym_size.int
peri044 Jan 8, 2024
fd2dae1
Merge branch 'main' into dyn_2.2
peri044 Jan 26, 2024
51e8bb7
chore: revert output type change
peri044 Jan 26, 2024
19c3fad
chore: add update_metadata utility
peri044 Jan 27, 2024
ed48551
chore: change debug to warning if the graph does not have metadata
peri044 Jan 27, 2024
18b7e11
feat: add lowering passes to support dynamic shapes for torch.compile
peri044 Jan 30, 2024
3a39d27
chore: add test case
peri044 Jan 30, 2024
abb2677
chore: add view test case
peri044 Feb 2, 2024
9aff04b
chore: gpt2 changes + linting
peri044 Feb 7, 2024
440fcd5
chore: gpt2 changes + linting
peri044 Feb 7, 2024
a2d38f3
chore: rebase with main
peri044 Feb 7, 2024
002db3c
chore: add fallback option if val is missing in metadata
peri044 Feb 7, 2024
00cd17b
chore: tmp changes
peri044 Feb 13, 2024
6ac70cd
chore: tmp changes
peri044 Feb 13, 2024
b827070
Merge branch 'main' into dyn_2.2
peri044 Feb 16, 2024
8f9bca0
Merge branch 'main' into dyn_2.2
peri044 Feb 21, 2024
4399d57
Merge branch 'dyn_2.2' into dyn_2.2_tc
peri044 Feb 21, 2024
39615a2
chore: fixes
peri044 Feb 26, 2024
cd86660
feat: Add save API for torch-trt compiled models
peri044 Mar 14, 2024
3ece71b
chore: resolve merge conflicts
peri044 Mar 15, 2024
1fa1771
Merge branch 'main' into dyn_2.2
peri044 Mar 15, 2024
febf05b
Merge branch 'save' into dyn_2.2
peri044 Mar 15, 2024
eab0dba
chore: Fix save failures
peri044 Mar 18, 2024
b191d62
chore: update to 2.3 rc build
peri044 Mar 18, 2024
380477b
Merge branch 'dyn_2.2' into dyn_2.2_tc
peri044 Mar 19, 2024
5f34d4f
chore: minor fixes
peri044 Mar 19, 2024
ce606fe
chore: rebase with release/2.3 branch
peri044 Mar 19, 2024
8674a3c
chore: minor fixes
peri044 Mar 19, 2024
f4e8fe9
chore: remove duplicate bert test case
peri044 Mar 20, 2024
4ae6ab9
chore: remove comments
peri044 Mar 20, 2024
c14f28d
Merge branch 'save' into dyn_2.2
peri044 Mar 20, 2024
3295c02
Merge branch 'dyn_2.2' into dyn_2.2_tc
peri044 Mar 20, 2024
4188173
chore: rebase with release/2.3
peri044 Apr 2, 2024
f6b758e
Merge branch 'dyn_2.2' into dyn_2.2_tc
peri044 Apr 2, 2024
78f7eb5
chore: updates
peri044 Apr 2, 2024
fe13c2a
chore: Update mypy type for sample_inputs
peri044 Apr 2, 2024
e9b649d
chore: revert changes
peri044 Apr 5, 2024
03ecc61
Merge branch 'dyn_2.2' into dyn_2.2_tc
peri044 Apr 5, 2024
978c039
Merge branch 'release/2.3' into dyn_2.2
peri044 Apr 5, 2024
ccb88c8
Merge branch 'dyn_2.2' into dyn_2.2_tc
peri044 Apr 5, 2024
5a62761
chore: patches for llamav2
peri044 Apr 5, 2024
22150ec
Merge branch 'dyn_llama' of https://github.com/pytorch/TensorRT into …
peri044 Apr 5, 2024
91d1a59
chore: rebase
peri044 Apr 23, 2024
c1574be
chore: updates
peri044 Apr 24, 2024
8c68359
chore: add consistent graph log
peri044 Apr 25, 2024
c34582e
Merge branch 'release/2.3' into dyn_llama
peri044 Apr 26, 2024
1481ad3
chore: updates
peri044 Apr 27, 2024
7a59e63
feat: Add validators for dynamic shapes in converter registration
peri044 Apr 29, 2024
f55d41a
chore: updates
peri044 Apr 30, 2024
87da1c1
chore: updates
peri044 Apr 30, 2024
6baad7f
Merge branch 'release/2.3' into dyn_llama
peri044 Apr 30, 2024
b45b8d7
Merge branch 'release/2.3' into dyn_validator
peri044 Apr 30, 2024
5aff3c1
Merge branch 'dyn_validator' into dyn_llama
peri044 Apr 30, 2024
a1604a1
Merge branch 'release/2.3' into dyn_validator
peri044 May 1, 2024
e3e7927
chore: updates
peri044 May 1, 2024
8ec68da
chore: address failures and implement flag to enable all converters
peri044 May 2, 2024
151fc40
chore: update docstring
peri044 May 2, 2024
a2ed092
chore: add testcase
peri044 May 2, 2024
c1f5d15
chore: updates
peri044 May 2, 2024
19aa788
Merge branch 'dyn_validator' into dyn_llama
peri044 May 2, 2024
649b79d
chore: rename disable_dynamic_converter_checks to assume_dynamic_shap…
peri044 May 2, 2024
6f945fa
chore: updates
peri044 May 3, 2024
b531573
chore: updates
peri044 May 7, 2024
e0415a5
chore: updates
peri044 May 13, 2024
6dd2c90
chore: updates
peri044 May 14, 2024
798aa30
chore: updates
peri044 May 14, 2024
f947509
chore: remove dyn shape support for split converter
peri044 May 14, 2024
477a49b
chore: updates
peri044 May 14, 2024
1828d5b
chore: updates
peri044 May 14, 2024
66c7b19
chore: updates
peri044 May 15, 2024
afa85fc
chore: updates
peri044 May 15, 2024
ac4feba
chore: roll back GHA changes
peri044 May 15, 2024
aa8e0b5
chore: updates
peri044 May 16, 2024
5b98915
chore: updates
peri044 May 16, 2024
40dbbff
chore: updates
peri044 May 16, 2024
80a2e9e
chore: updates
peri044 May 16, 2024
3b2245e
chore: updates
peri044 May 16, 2024
382ea09
chore: updates
peri044 May 16, 2024
89d3e8d
chore: rebase
peri044 May 16, 2024
6dc40e2
chore: rebase with 2.3
peri044 May 16, 2024
31bf8ed
chore: updates
peri044 May 17, 2024
18c0b4b
chore: fix tests
peri044 May 17, 2024
c6f7b4a
chore: updates
peri044 May 17, 2024
bb5d30d
chore: updates
peri044 May 17, 2024
da72508
chore: address review comments
peri044 May 21, 2024
e33976a
chore: remove gpt2 example
peri044 May 21, 2024
f28684c
chore: updates
peri044 May 22, 2024
23 changes: 8 additions & 15 deletions .github/workflows/build-test.yml
@@ -78,16 +78,15 @@ jobs:
script: |
export USE_HOST_DEPS=1
export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH
pushd .
cd tests/modules
# Don't use requirements.txt here as it contains tensorrt and torch which should have been installed by now.
-${CONDA_RUN} python -m pip install numpy packaging pyyaml transformers timm pybind11==2.6.2
+${CONDA_RUN} python -m pip install numpy packaging pyyaml transformers==4.40.2 timm==0.9.16 pybind11==2.6.2
${CONDA_RUN} python hub.py
popd
pushd .
cd tests/py/ts
-${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
+${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_api_test_results.xml api/
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_models_test_results.xml models/
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/ts_integrations_test_results.xml integrations/
@@ -115,10 +114,9 @@ jobs:
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
-export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH
pushd .
cd tests/py/dynamo
-${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
+${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 10 conversion/
popd

@@ -144,10 +142,9 @@ jobs:
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
-export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH
pushd .
cd tests/py/dynamo
-${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
+${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_fe_test_results.xml --ir dynamo models/test_models_export.py
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/test_dyn_models.py
popd
@@ -174,10 +171,9 @@ jobs:
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
-export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH
pushd .
cd tests/py/dynamo
-${CONDA_RUN} python -m pip install --pre pytest timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
+${CONDA_RUN} python -m pip install --pre pytest timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py
popd

@@ -203,10 +199,9 @@ jobs:
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
-export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH
pushd .
cd tests/py/dynamo
-${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
+${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
${CONDA_RUN} python -m pytest -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/
${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_comple_be_e2e_test_results.xml --ir torch_compile models/test_models.py
${CONDA_RUN} python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py
@@ -234,10 +229,9 @@ jobs:
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
-export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH
pushd .
cd tests/py/dynamo
-${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
+${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
@@ -264,9 +258,8 @@ jobs:
pre-script: ${{ matrix.pre-script }}
script: |
export USE_HOST_DEPS=1
-export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.1.6/lib:$LD_LIBRARY_PATH
pushd .
cd tests/py/core
-${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
+${CONDA_RUN} python -m pip install --pre pytest-xdist timm==0.9.16 transformers==4.40.2 parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
${CONDA_RUN} python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
popd
26 changes: 23 additions & 3 deletions core/runtime/execute_engine.cpp
@@ -124,6 +124,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}
}

+// this is a buffer to store shape tensor input addresses throughout the runtime scope
+std::list<std::vector<int32_t>> inputShapeTensorValues;
{
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
@@ -142,12 +144,30 @@
auto dims = core::util::toDims(inputs[i].sizes());
auto shape = core::util::toVec(dims);
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
-compiled_engine->exec_ctx->setInputShape(name.c_str(), dims);
-compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].view(shape).contiguous().data_ptr());
+if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
+  // Shape tensor inputs are cast to int32 explicitly.
+  // Refer to
+  // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
+  auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt32);
+  std::vector<int32_t> inputs_cpu_vec(
+      input_cpu.data_ptr<int32_t>(), input_cpu.data_ptr<int32_t>() + input_cpu.numel());
+  inputShapeTensorValues.emplace_back(inputs_cpu_vec);
+  compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data());
+} else {
+  compiled_engine->exec_ctx->setInputShape(name.c_str(), dims);
+  compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputs[i].view(shape).contiguous().data_ptr());
+}
}

+// Check if input shapes can be inferred.
+int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
+std::vector<char const*> names(io_size);
+int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data());
TORCHTRT_CHECK(
-    compiled_engine->exec_ctx->allInputShapesSpecified(), "Not enough inputs provided (runtime.RunCudaEngine)");
+    nbNames == 0,
+    "The shapes of the inputs: "
+        << names
+        << " cannot be inferred. This could happen if the input tensor addresses/shapes haven't been configured correctly");
}

std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
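The runtime change above routes shape tensor inputs, whose host-side values drive shape inference, differently from ordinary execution tensors, and keeps their int32 copies alive until execution finishes. A rough Python rendering of the same dispatch, offered as a sketch that assumes the TensorRT Python bindings mirror the C++ calls used here (is_shape_inference_io, set_input_shape, set_tensor_address, infer_shapes); bind_inputs and its arguments are hypothetical:

import tensorrt as trt  # assumption: TensorRT 10.x Python bindings
import torch

def bind_inputs(engine, context, input_names, inputs):
    # Host-side buffer that must outlive execution, mirroring the
    # inputShapeTensorValues list in the C++ runtime above.
    shape_tensor_values = []
    for name, tensor in zip(input_names, inputs):
        if engine.is_shape_inference_io(name):
            # Shape tensors are consumed by value on the host as int32.
            values = tensor.cpu().to(torch.int32).contiguous()
            shape_tensor_values.append(values)
            context.set_tensor_address(name, values.data_ptr())
        else:
            context.set_input_shape(name, tuple(tensor.shape))
            context.set_tensor_address(name, tensor.contiguous().data_ptr())
    # infer_shapes returns the names of tensors it could not resolve.
    unresolved = context.infer_shapes()
    assert len(unresolved) == 0, f"Cannot infer shapes for: {unresolved}"
    return shape_tensor_values  # caller keeps this alive until execution ends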
68 changes: 68 additions & 0 deletions examples/gpt2_tc.py
@@ -0,0 +1,68 @@
import torch
import torch_tensorrt
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList
from transformers.generation.stopping_criteria import (
EosTokenCriteria,
MaxLengthCriteria,
)

# Define tokenizer and model
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = (
AutoModelForCausalLM.from_pretrained(
"gpt2", pad_token_id=tokenizer.eos_token_id, use_cache=False
)
.eval()
.to(torch_device)
)

# Input prompt
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(
torch_device
)
input_ids = model_inputs["input_ids"]
max_tokens = 40

# Pyt model outputs
greedy_output = model.generate(**model_inputs, max_new_tokens=max_tokens)
print(
"Pytorch model generated text: ",
tokenizer.decode(greedy_output[0], skip_special_tokens=True),
)

# Compile Torch-TRT model
torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023)
model.forward = torch.compile(
model.forward,
backend="tensorrt",
dynamic=None,
options={
"debug": False,
"enabled_precisions": {torch.float},
"torch_executed_ops": {"torch.ops.aten.slice.Tensor"},
"use_python_runtime": True,
},
)

# Auto-regressive generation loop for greedy search
stopping_criteria = StoppingCriteriaList(
[
MaxLengthCriteria(max_length=max_tokens),
EosTokenCriteria(eos_token_id=tokenizer.eos_token_id),
]
)
while True:
trt_outputs = model(input_ids)
logits = trt_outputs.logits
next_token_logits = logits[:, -1, :]
next_tokens = torch.argmax(next_token_logits, dim=-1)
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
if stopping_criteria(input_ids, logits).item():
break

# Decode the sentence
print(
"TensorRT model generated text: ",
tokenizer.decode(input_ids[0], skip_special_tokens=True),
)
10 changes: 9 additions & 1 deletion py/torch_tensorrt/_Input.py
@@ -47,6 +47,7 @@ class _ShapeMode(Enum):
high_tensor_domain_excl: float = low_tensor_domain_incl + DOMAIN_OFFSET
torch_tensor: torch.Tensor = None
name: str = ""
+is_shape_tensor: bool = False

def __init__(self, *args: Any, **kwargs: Any) -> None:
"""__init__ Method for torch_tensorrt.Input
@@ -161,6 +162,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
else:
self._explicit_set_dtype = False

if "is_shape_tensor" in kwargs:
self.is_shape_tensor = kwargs["is_shape_tensor"]

if "format" in kwargs:
self.format = memory_format._from(kwargs["format"])

@@ -174,7 +178,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
if "torch_tensor" in kwargs:
self.torch_tensor = kwargs["torch_tensor"]
else:
-if self.shape_mode == Input._ShapeMode.DYNAMIC:
+if self.is_shape_tensor:
+    self.torch_tensor = torch.tensor(
+        kwargs["opt_shape"], dtype=kwargs["dtype"]
+    )
+elif self.shape_mode == Input._ShapeMode.DYNAMIC:
self.torch_tensor = self.example_tensor("opt_shape")
else:
self.torch_tensor = self.example_tensor()
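With the new is_shape_tensor flag, a shape tensor input can be declared next to a regular dynamic-shape input. A minimal sketch, assuming the usual min/opt/max keyword form of torch_tensorrt.Input; the concrete values are illustrative only:

import torch
import torch_tensorrt

# Regular dynamic input: min/opt/max describe tensor *dimensions*.
data_input = torch_tensorrt.Input(
    min_shape=(1, 3, 224, 224),
    opt_shape=(8, 3, 224, 224),
    max_shape=(16, 3, 224, 224),
    dtype=torch.float32,
)

# Shape tensor input: min/opt/max describe the *values* the tensor holds at
# runtime; per the diff, torch_tensor defaults to torch.tensor(opt_shape, dtype).
size_input = torch_tensorrt.Input(
    min_shape=(1, 224, 224),
    opt_shape=(8, 224, 224),
    max_shape=(16, 224, 224),
    dtype=torch.int32,
    is_shape_tensor=True,
)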
10 changes: 3 additions & 7 deletions py/torch_tensorrt/dynamo/_tracer.py
@@ -58,13 +58,9 @@ def trace(

device = to_torch_device(kwargs.get("device", default_device()))
torch_inputs = get_torch_inputs(inputs, device)
-dynamic_shapes = {}
+dynamic_shapes = []
for input in inputs:
if isinstance(input, Input) and input.shape_mode == Input._ShapeMode.DYNAMIC:
-if not input.name:
-    raise AssertionError(
-        f"Expected a name for a dynamic input with shape {input.shape} but found none"
-    )
min_shape = input.shape["min_shape"]
opt_shape = input.shape["opt_shape"]
max_shape = input.shape["max_shape"]
@@ -80,8 +76,8 @@
max=max_shape[dim],
)

-dynamic_shapes[input.name] = dynamic_dims
+dynamic_shapes.append(dynamic_dims)

-exp_program = export(mod, tuple(torch_inputs), dynamic_shapes=dynamic_shapes)
+exp_program = export(mod, tuple(torch_inputs), dynamic_shapes=tuple(dynamic_shapes))

return exp_program
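The tracer now collects per-input dynamic dimension specs positionally into a tuple, matching the order of the inputs, instead of keying them by input name (so unnamed inputs no longer raise). Roughly what torch.export receives after this change, as a sketch; the module and the Dim name are illustrative:

import torch
from torch.export import Dim, export

class MatMul(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ x.transpose(-1, -2)

# One {dim_index: Dim} mapping per positional input, gathered into a tuple.
batch = Dim("batch", min=2, max=16)
dynamic_shapes = ({0: batch},)  # dim 0 of the first (only) input is dynamic

exp_program = export(MatMul(), (torch.randn(4, 8, 8),), dynamic_shapes=dynamic_shapes)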
2 changes: 2 additions & 0 deletions py/torch_tensorrt/dynamo/backend/backends.py
@@ -96,6 +96,8 @@ def _pretraced_backend(

gm = apply_lowering_passes(gm, torch_inputs)

logger.debug("Lowered Input graph:\n " + str(gm.graph))

torchtrt_inputs = prepare_inputs(
torch_inputs, disable_memory_format_check=True
)
36 changes: 24 additions & 12 deletions py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -4,6 +4,7 @@
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set

import numpy as np
+import tensorrt as trt
import torch
import torch.fx
from torch.fx.node import _get_qualified_name
@@ -25,7 +26,6 @@
from torch_tensorrt.fx.observer import Observer
from torch_tensorrt.logging import TRT_LOGGER

-import tensorrt as trt
from packaging import version

_LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -365,18 +365,29 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor:
max_shape = current_input.shape["max_shape"]
# TODO: Does not support disjoint optimization profiles?
assert self.optimization_profiles is not None
-self.optimization_profiles[0].set_shape(
-    target, min_shape, opt_shape, max_shape
-)
+if current_input.is_shape_tensor:
+    # For shape_tensors, min/opt/max_shapes correspond to actual values
+    # of the shapes provided during runtime
+    self.optimization_profiles[0].set_shape_input(
+        target, min_shape, opt_shape, max_shape
+    )
+    shape.append(1)
+else:
+    self.optimization_profiles[0].set_shape(
+        target, min_shape, opt_shape, max_shape
+    )

-assert len(min_shape) == len(opt_shape) == len(max_shape)
-for i in range(len(min_shape)):
-    if min_shape[i] == opt_shape[i] == max_shape[i]:
-        shape.append(min_shape[i])
-    else:
-        # -1 to represent the dynamic dimension
-        shape.append(-1)
-elif current_input.shape_mode == Input._ShapeMode.STATIC:
+    assert len(min_shape) == len(opt_shape) == len(max_shape)
+    for i in range(len(min_shape)):
+        if min_shape[i] == opt_shape[i] == max_shape[i]:
+            shape.append(min_shape[i])
+        else:
+            # -1 to represent the dynamic dimension
+            shape.append(-1)
+elif (
+    not current_input.is_shape_tensor
+    and current_input.shape_mode == Input._ShapeMode.STATIC
+):
assert isinstance(current_input.shape, tuple)
shape = list(current_input.shape)
else:
@@ -388,6 +399,7 @@ def placeholder(self, target: str, args: Any, kwargs: Any) -> trt.ITensor:
_LOGGER.debug(
f"Adding input to in-progress INetwork: {target} [shape={shape}, dtype={trt_input_dtype}]"
)

return self.ctx.net.add_input(
name=target,
shape=tuple(shape),
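The set_shape versus set_shape_input split mirrors TensorRT's optimization-profile semantics: for execution tensors the min/opt/max triple bounds the input dimensions, while for shape tensors it bounds the runtime values. A hedged sketch using the TensorRT Python API (network construction elided; tensor names are illustrative):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
profile = builder.create_optimization_profile()

# Execution tensor: min/opt/max are input *dimensions*.
profile.set_shape("input_ids", (1, 2), (1, 512), (1, 1023))

# Shape tensor: min/opt/max are the *values* the tensor may carry at runtime.
profile.set_shape_input("sizes", (1, 224, 224), (8, 224, 224), (16, 224, 224))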
16 changes: 8 additions & 8 deletions py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -4,7 +4,9 @@
import logging
from typing import List, Sequence

+import tensorrt as trt
import torch
+from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode
from torch_tensorrt._Device import Device
from torch_tensorrt._enums import dtype
from torch_tensorrt._features import ENABLED_FEATURES
@@ -17,8 +19,6 @@
from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
from torch_tensorrt.dynamo.utils import get_torch_inputs

-import tensorrt as trt

logger = logging.getLogger(__name__)


@@ -28,12 +28,12 @@ def infer_module_output_dtypes(
device: Device,
truncate_double: bool = False,
) -> List[dtype]:
-torch_inputs = get_torch_inputs(inputs, device)
-module = module.to(device.to(torch.device))
-module_outputs = module(*torch_inputs)
-
-if not isinstance(module_outputs, (list, tuple)):
-    module_outputs = [module_outputs]
+with maybe_disable_fake_tensor_mode():
+    torch_inputs = get_torch_inputs(inputs, device)
+    module = module.to(device.to(torch.device))
+    module_outputs = module(*torch_inputs)
+    if not isinstance(module_outputs, (list, tuple)):
+        module_outputs = [module_outputs]

# Int64 outputs can sometimes be generated from within other operators
# such as aten.sum - such outputs can be truncated
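infer_module_output_dtypes runs the module eagerly just to observe the output dtypes; wrapping that run in maybe_disable_fake_tensor_mode lets it execute on real tensors even when the caller is inside an active FakeTensorMode. A minimal sketch of the pattern; the helper name is illustrative:

import torch
from typing import List
from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode

def output_dtypes(module: torch.nn.Module, *inputs: torch.Tensor) -> List[torch.dtype]:
    # Temporarily exit any active FakeTensorMode so the module runs on
    # real tensors and yields concrete outputs.
    with maybe_disable_fake_tensor_mode():
        outputs = module(*inputs)
        if not isinstance(outputs, (list, tuple)):
            outputs = [outputs]
        return [out.dtype for out in outputs]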