Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build-test-linux-x86_64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ jobs:
cd tests/py
cd dynamo
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/
popd

tests-py-dynamo-serde:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-test-linux-x86_64_rtx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ jobs:
cd tests/py
cd dynamo
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/
popd

tests-py-dynamo-serde:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-test-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ jobs:
cd tests/py
cd dynamo
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/
popd

tests-py-dynamo-serde:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-test-windows_rtx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ jobs:
cd tests/py
cd dynamo
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_llm.xml llm/
popd

tests-py-dynamo-serde:
Expand Down
29 changes: 22 additions & 7 deletions .github/workflows/build_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,15 @@ jobs:
BUILD_PARAMS: ${{ inputs.wheel-build-params }}
run: |
source "${BUILD_ENV_FILE}"
          # commented out because the build still failed with the following error:
# C:\actions-runner\_work\_temp\conda_environment_18042354682\lib\site-packages\torch\include\torch/csrc/utils/python_arg_parser.h(42): fatal error C1083: Cannot open include file: 'fmt/format.h': No such file or directory
# workaround: download fmt and copy to torch include path in pre_build_script_windows.sh
# conda update -n base -c defaults conda
# conda install -c conda-forge fmt -y
# conda list fmt
# echo "path: $PATH"

${CONDA_RUN} python -m pip install fmt
if [[ ${{ inputs.is-release-wheel }} == true || ${{ inputs.is-release-tarball }} == true ]]; then
# release version for upload to pypi
# BUILD_VERSION example: 2.4.0+cu121, we don't want the +cu121 part, so remove +cu121
Expand Down Expand Up @@ -344,20 +353,26 @@ jobs:
SMOKE_TEST_SCRIPT: ${{ inputs.smoke-test-script }}
run: |
source "${BUILD_ENV_FILE}"
set -x
WHEEL_NAME=$(ls "${{ inputs.repository }}/dist/")
echo "$WHEEL_NAME"
nvidia-smi
nvcc --version
${CONDA_RUN} pip install "${{ inputs.repository }}/dist/$WHEEL_NAME"
${CONDA_RUN} python -m pip list

if [[ $USE_TRT_RTX == true ]]; then
# TODO: lan to remove this once we have a better way to do a smoke test
echo "Smoke test for TensorRT-RTX is not skipped for now"
else
if [[ ! -f "${{ inputs.repository }}"/${SMOKE_TEST_SCRIPT} ]]; then
echo "${{ inputs.repository }}/${SMOKE_TEST_SCRIPT} not found"
${CONDA_RUN} "${{ inputs.repository }}/${ENV_SCRIPT}" python -c "import ${PACKAGE_NAME}; print('package version is ', ${PACKAGE_NAME}.__version__)"
else
echo "${{ inputs.repository }}/${SMOKE_TEST_SCRIPT} found"
${CONDA_RUN} "${{ inputs.repository }}/${ENV_SCRIPT}" python "${{ inputs.repository }}/${SMOKE_TEST_SCRIPT}"
fi
echo "Skip smoke test in windows"
# if [[ ! -f "${{ inputs.repository }}"/${SMOKE_TEST_SCRIPT} ]]; then
# echo "${{ inputs.repository }}/${SMOKE_TEST_SCRIPT} not found"
# ${CONDA_RUN} "${{ inputs.repository }}/${ENV_SCRIPT}" python -c "import ${PACKAGE_NAME}; print('package version is ', ${PACKAGE_NAME}.__version__)"
# else
# echo "${{ inputs.repository }}/${SMOKE_TEST_SCRIPT} found"
# ${CONDA_RUN} "${{ inputs.repository }}/${ENV_SCRIPT}" python "${{ inputs.repository }}/${SMOKE_TEST_SCRIPT}"
# fi
fi
- name: Smoke Test ARM64
if: inputs.architecture == 'arm64'
Expand Down
12 changes: 6 additions & 6 deletions MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ http_archive(
http_archive(
name = "tensorrt",
build_file = "@//third_party/tensorrt/archive:BUILD",
strip_prefix = "TensorRT-10.13.2.6",
strip_prefix = "TensorRT-10.13.3.9",
urls = [
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.2/tars/TensorRT-10.13.2.6.Linux.x86_64-gnu.cuda-12.9.tar.gz",
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.3/tars/TensorRT-10.13.3.9.Linux.x86_64-gnu.cuda-13.0.tar.gz",
],
)

Expand All @@ -119,9 +119,9 @@ http_archive(
http_archive(
name = "tensorrt_sbsa",
build_file = "@//third_party/tensorrt/archive:BUILD",
strip_prefix = "TensorRT-10.13.2.6",
strip_prefix = "TensorRT-10.13.3.9",
urls = [
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.2/tars/TensorRT-10.13.2.6.Linux.aarch64-gnu.cuda-13.0.tar.gz",
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.3/tars/TensorRT-10.13.3.9.Linux.aarch64-gnu.cuda-13.0.tar.gz",
],
)

Expand All @@ -137,9 +137,9 @@ http_archive(
http_archive(
name = "tensorrt_win",
build_file = "@//third_party/tensorrt/archive:BUILD",
strip_prefix = "TensorRT-10.13.2.6",
strip_prefix = "TensorRT-10.13.3.9",
urls = [
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.2/zip/TensorRT-10.13.2.6.Windows.win10.cuda-12.9.zip",
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.3/zip/TensorRT-10.13.3.9.Windows.win10.cuda-13.0.zip",
],
)

Expand Down
2 changes: 1 addition & 1 deletion dev_dep_versions.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__cuda_version__: "12.8"
__tensorrt_version__: "10.12.0"
__tensorrt_version__: "10.13.3"
__tensorrt_rtx_version__: "1.0.0"
10 changes: 5 additions & 5 deletions packaging/driver_upgrade.bat
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
set WIN_DRIVER_VN=528.89
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe"
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe
set WIN_DRIVER_VN=580.88
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe
if errorlevel 1 exit /b 1

start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot
start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot
if errorlevel 1 exit /b 1

del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL
del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL
16 changes: 7 additions & 9 deletions packaging/pre_build_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,13 @@ fi
export TORCH_BUILD_NUMBER=$(python -c "import torch, urllib.parse as ul; print(ul.quote_plus(torch.__version__))")
export TORCH_INSTALL_PATH=$(python -c "import torch, os; print(os.path.dirname(torch.__file__))")

if [[ ${TENSORRT_VERSION} != "" ]]; then
# Replace dependencies in the original pyproject.toml with the current TensorRT version. It is used for CI tests of different TensorRT versions.
# For example, if the current testing TensorRT version is 10.7.0, but the pyproject.toml tensorrt>=10.8.0,<10.9.0, then the following sed command
# will replace tensorrt>=10.8.0,<10.9.0 with tensorrt==10.7.0
sed -i -e "s/tensorrt>=.*,<.*\"/tensorrt>=${TENSORRT_VERSION},<$(echo "${TENSORRT_VERSION}" | awk -F. '{print $1"."$2+1".0"}')\"/g" \
-e "s/tensorrt-cu12>=.*,<.*\"/tensorrt-cu12>=${TENSORRT_VERSION},<$(echo "${TENSORRT_VERSION}" | awk -F. '{print $1"."$2+1".0"}')\"/g" \
-e "s/tensorrt-cu12-bindings>=.*,<.*\"/tensorrt-cu12-bindings>=${TENSORRT_VERSION},<$(echo "${TENSORRT_VERSION}" | awk -F. '{print $1"."$2+1".0"}')\"/g" \
-e "s/tensorrt-cu12-libs>=.*,<.*\"/tensorrt-cu12-libs>=${TENSORRT_VERSION},<$(echo "${TENSORRT_VERSION}" | awk -F. '{print $1"."$2+1".0"}')\"/g" \
pyproject.toml
# CU_UPPERBOUND, e.g. "13.0" or "12.9"
# tensorrt tar for linux and windows are different across cuda version
# for sbsa it is the same tar across cuda version
if [[ ${CU_VERSION:2:2} == "13" ]]; then
export CU_UPPERBOUND="13.0"
else
export CU_UPPERBOUND="12.9"
fi

cat toolchains/ci_workspaces/MODULE.bazel.tmpl | envsubst > MODULE.bazel
Expand Down
18 changes: 17 additions & 1 deletion packaging/pre_build_script_windows.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -x

pip install -U numpy packaging pyyaml setuptools wheel
pip install -U numpy packaging pyyaml setuptools wheel fmt

choco install bazelisk -y

Expand All @@ -27,6 +27,22 @@ pip install --force-reinstall --pre ${TORCH} --index-url ${INDEX_URL}
export CUDA_HOME="$(echo ${CUDA_PATH} | sed -e 's#\\#\/#g')"
export TORCH_INSTALL_PATH="$(python -c "import torch, os; print(os.path.dirname(torch.__file__))" | sed -e 's#\\#\/#g')"

# tried with `conda install -c conda-forge fmt -y`, but the build still failed on Windows with the following error:
# C:\actions-runner\_work\_temp\conda_environment_18042354682\lib\site-packages\torch\include\torch/csrc/utils/python_arg_parser.h(42): fatal error C1083: Cannot open include file: 'fmt/format.h': No such file or directory
# workaround: download fmt from github and copy to torch include path
curl -L -o fmt.zip https://github.com/fmtlib/fmt/releases/download/12.0.0/fmt-12.0.0.zip
unzip fmt.zip
cp -r fmt-12.0.0/include/fmt/ $TORCH_INSTALL_PATH/include/
ls -lart $TORCH_INSTALL_PATH/include/fmt/

# CU_UPPERBOUND, e.g. "13.0" or "12.9"
# tensorrt tar for linux and windows are different across cuda version
# for sbsa it is the same tar across cuda version
if [[ ${CU_VERSION:2:2} == "13" ]]; then
export CU_UPPERBOUND="13.0"
else
export CU_UPPERBOUND="12.9"
fi
cat toolchains/ci_workspaces/MODULE.bazel.tmpl | envsubst > MODULE.bazel

if [[ ${TENSORRT_VERSION} != "" ]]; then
Expand Down
20 changes: 14 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,14 @@ def run(self):
"dllist",
]

cuda_version = torch.version.cuda
if cuda_version.startswith("12"):
tensorrt_prefix = "tensorrt-cu12"
elif cuda_version.startswith("13"):
tensorrt_prefix = "tensorrt-cu13"
else:
raise ValueError(f"Unsupported CUDA version: {cuda_version}")


def get_requirements():
if IS_JETPACK:
Expand All @@ -750,9 +758,9 @@ def get_requirements():
else:
requirements = requirements + [
"tensorrt>=10.13.0,<10.14.0",
"tensorrt-cu12>=10.13.0,<10.14.0",
"tensorrt-cu12-bindings>=10.13.0,<10.14.0",
"tensorrt-cu12-libs>=10.13.0,<10.14.0",
f"{tensorrt_prefix}>=10.13.0,<10.14.0",
f"{tensorrt_prefix}-bindings>=10.13.0,<10.14.0",
f"{tensorrt_prefix}-libs>=10.13.0,<10.14.0",
]
return requirements

Expand All @@ -771,9 +779,9 @@ def get_sbsa_requirements():
return sbsa_requirements + [
"torch>=2.10.0.dev,<2.11.0",
"tensorrt>=10.13.0,<10.14.0",
"tensorrt-cu12>=10.13.0,<10.14.0",
"tensorrt-cu12-bindings>=10.13.0,<10.14.0",
"tensorrt-cu12-libs>=10.13.0,<10.14.0",
f"{tensorrt_prefix}>=10.13.0,<10.14.0",
f"{tensorrt_prefix}-bindings>=10.13.0,<10.14.0",
f"{tensorrt_prefix}-libs>=10.13.0,<10.14.0",
]


Expand Down
8 changes: 8 additions & 0 deletions tests/py/dynamo/conversion/test_index_aten.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import unittest

import torch
import torch.nn as nn
import torch_tensorrt
from parameterized import parameterized
from torch.testing._internal.common_utils import run_tests
from torch_tensorrt import Input
from torch_tensorrt.dynamo.utils import is_tegra_platform, is_thor

from .harness import DispatchTestCase

Expand Down Expand Up @@ -216,6 +220,10 @@ def forward(self, input):
)


@unittest.skipIf(
torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx or is_thor() or is_tegra_platform(),
"nonzero is not supported for tensorrt_rtx",
)
class TestIndexDynamicInputNonDynamicIndexConverter(DispatchTestCase):
def test_index_input_non_dynamic_index_dynamic(self):
class TestIndexWithRuntimeIndex(torch.nn.Module):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
@pytest.mark.unit
@pytest.mark.parametrize("precision", ["FP16", "BF16", "FP32"])
def test_llm_decoder_layer(precision):

if torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx and precision == "BF16":
pytest.skip("TensorRT-RTX does not support bfloat16, skipping test")
with torch.inference_mode():
args = argparse.Namespace()
args.debug = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ def test_hierarchical_adjacency_partition_with_two_backends(self):
)
from torch_tensorrt.dynamo.lowering import (
get_decompositions,
post_lowering,
pre_export_lowering,
)

Expand All @@ -199,7 +200,7 @@ def test_hierarchical_adjacency_partition_with_two_backends(self):
exported_program = pre_export_lowering(exported_program)
exported_program = exported_program.run_decompositions(get_decompositions())
gm = exported_program.module()

gm = post_lowering(gm)
partitioned_graph, _ = partitioning.hierarchical_adjacency_partition(
gm,
min_block_size=1,
Expand Down
12 changes: 6 additions & 6 deletions toolchains/ci_workspaces/MODULE.bazel.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "ht
http_archive(
name = "tensorrt",
build_file = "@//third_party/tensorrt/archive:BUILD",
strip_prefix = "TensorRT-10.13.2.6",
strip_prefix = "TensorRT-10.13.3.9",
urls = [
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.2/tars/TensorRT-10.13.2.6.Linux.x86_64-gnu.cuda-12.9.tar.gz",
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.3/tars/TensorRT-10.13.3.9.Linux.x86_64-gnu.cuda-${CU_UPPERBOUND}.tar.gz",
],
)

Expand All @@ -93,9 +93,9 @@ http_archive(
http_archive(
name = "tensorrt_sbsa",
build_file = "@//third_party/tensorrt/archive:BUILD",
strip_prefix = "TensorRT-10.13.2.6",
strip_prefix = "TensorRT-10.13.3.9",
urls = [
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.2/tars/TensorRT-10.13.2.6.Linux.aarch64-gnu.cuda-13.0.tar.gz",
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.3/tars/TensorRT-10.13.3.9.Linux.aarch64-gnu.cuda-13.0.tar.gz",
],
)

Expand All @@ -111,9 +111,9 @@ http_archive(
http_archive(
name = "tensorrt_win",
build_file = "@//third_party/tensorrt/archive:BUILD",
strip_prefix = "TensorRT-10.13.2.6",
strip_prefix = "TensorRT-10.13.3.9",
urls = [
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.2/zip/TensorRT-10.13.2.6.Windows.win10.cuda-12.9.zip",
"https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.13.3/zip/TensorRT-10.13.3.9.Windows.win10.cuda-${CU_UPPERBOUND}.zip",
],
)

Expand Down
Loading