Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 8 additions & 12 deletions .github/scripts/filter-matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
jetpack_cuda_versions: List[str] = ["cu126"]

jetpack_container_image: str = "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
sbsa_container_image: str = "quay.io/pypa/manylinux_2_34_aarch64"
sbsa_container_image: str = "quay.io/pypa/manylinux_2_39_aarch64"


def validate_matrix(matrix_dict: Dict[str, Any]) -> None:
Expand All @@ -41,19 +41,15 @@ def filter_matrix_item(
# Skipping disabled CUDA version
return False
if is_jetpack:
if limit_pr_builds:
# pr build,matrix passed from test-infra is cu128, python 3.9, change to cu126, python 3.10
item["desired_cuda"] = "cu126"
item["python_version"] = "3.10"
# PR build: the matrix passed from test-infra is cu126, cu128 and cu130 with python 3.10; filter to cu126, python 3.10
# nightly/main build: the matrix passed from test-infra is cu126, cu128 and cu130 with all python versions; filter to cu126, python 3.10
if (
item["python_version"] in jetpack_python_versions
and item["desired_cuda"] in jetpack_cuda_versions
):
item["container_image"] = jetpack_container_image
return True
else:
# nightly/main build, matrix passed from test-infra is cu128, all python versions, change to cu126, python 3.10
if item["python_version"] in jetpack_python_versions:
item["desired_cuda"] = "cu126"
item["container_image"] = jetpack_container_image
return True
return False
return False
else:
if item["gpu_arch_type"] == "cuda-aarch64":
# pytorch image:pytorch/manylinuxaarch64-builder:cuda12.8 comes with glibc2.28
Expand Down
41 changes: 30 additions & 11 deletions .github/scripts/install-cuda-aarch64.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,42 @@ install_cuda_aarch64() {
# CUDA_MAJOR_VERSION: cu128 --> 12
CUDA_MAJOR_VERSION=${CU_VERSION:2:2}
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
# nccl version must match libtorch_cuda.so was built with https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
dnf -y install cuda-compiler-${CU_VER}.aarch64 \

# nccl version must match the one libtorch_cuda.so was built with
if [[ ${CU_VERSION:0:4} == "cu12" ]]; then
# cu12: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
if [[ ${CU_VERSION} == "cu128" ]]; then
nccl_version="2.26.2-1"
elif [[ ${CU_VERSION} == "cu126" ]]; then
nccl_version="2.24.3-1"
else
# cu129 support was removed from pytorch upstream
echo "Unsupported CUDA version: ${CU_VERSION}"
exit 1
fi
elif [[ ${CU_VERSION:0:4} == "cu13" ]]; then
# cu13: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu13.txt
nccl_version="2.27.7-1"
fi

dnf --nogpgcheck -y install cuda-compiler-${CU_VER}.aarch64 \
cuda-libraries-${CU_VER}.aarch64 \
cuda-libraries-devel-${CU_VER}.aarch64 \
libnccl-2.27.3-1+cuda${CU_DOT_VER} libnccl-devel-2.27.3-1+cuda${CU_DOT_VER} libnccl-static-2.27.3-1+cuda${CU_DOT_VER}
libnccl-${nccl_version}+cuda${CU_DOT_VER} libnccl-devel-${nccl_version}+cuda${CU_DOT_VER} libnccl-static-${nccl_version}+cuda${CU_DOT_VER}
dnf clean all

nvshmem_version=3.3.9
# nvshmem version is from https://github.com/pytorch/pytorch/blob/f9fa138a3910bd1de1e7acb95265fa040672a952/.ci/docker/common/install_cuda.sh#L67
nvshmem_version=3.3.24
nvshmem_path="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${CUDA_MAJOR_VERSION}/txz/agnostic/aarch64"
nvshmem_filename="libnvshmem_cuda12-linux-sbsa-${nvshmem_version}.tar.gz"
curl -L ${nvshmem_path}/${nvshmem_filename} -o nvshmem.tar.gz
tar -xzf nvshmem.tar.gz
cp -a libnvshmem/lib/* /usr/local/cuda/lib64/
cp -a libnvshmem/include/* /usr/local/cuda/include/
rm -rf nvshmem.tar.gz nvshmem
nvshmem_prefix="libnvshmem-linux-sbsa-${nvshmem_version}_cuda${CUDA_MAJOR_VERSION}-archive"
nvshmem_tarname="${nvshmem_prefix}.tar.xz"
curl -L ${nvshmem_path}/${nvshmem_tarname} -o nvshmem.tar.xz
tar -xJf nvshmem.tar.xz
cp -a ${nvshmem_prefix}/lib/* /usr/local/cuda/lib64/
cp -a ${nvshmem_prefix}/include/* /usr/local/cuda/include/
rm -rf nvshmem.tar.xz ${nvshmem_prefix}
echo "nvshmem ${nvshmem_version} for cuda ${CUDA_MAJOR_VERSION} installed successfully"

export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/include:/usr/lib64:$LD_LIBRARY_PATH
ls -lart /usr/local/
nvcc --version
Expand Down
12 changes: 12 additions & 0 deletions .github/scripts/install-cuda-dss.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# For now cuda_dss only needs to be installed for the JetPack (Jetson) build.
install_cuda_dss_aarch64() {
    echo "install cuda_dss for ${CU_VERSION}"
    # Only the cuda-12 sbsa redist archive is published/needed here.
    arch_path='sbsa'
    # version is from https://github.com/pytorch/pytorch/blob/22c5e8c17c7551c9dd2855589ae774c1e147343a/.ci/docker/common/install_cudss.sh
    CUDSS_NAME="libcudss-linux-${arch_path}-0.3.0.9_cuda12-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudss/redist/libcudss/linux-${arch_path}/${CUDSS_NAME}.tar.xz
    tar xf ${CUDSS_NAME}.tar.xz
    cp -a ${CUDSS_NAME}/include/* /usr/local/cuda/include/
    cp -a ${CUDSS_NAME}/lib/* /usr/local/cuda/lib64/
    # Clean up the downloaded archive and extracted tree so it does not leak
    # into the build context (same pattern as install-cuda-aarch64.sh).
    rm -rf ${CUDSS_NAME}.tar.xz ${CUDSS_NAME}
}
4 changes: 2 additions & 2 deletions .github/workflows/build_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,8 @@ jobs:
source "${BUILD_ENV_FILE}"
WHEEL_NAME=$(ls "${{ inputs.repository }}/dist/")
echo "$WHEEL_NAME"
if [[ ${{ inputs.is-jetpack }} == true ]]; then
echo "Skipping smoke test for jetpack, since it is not the actual jetpack environment"
if [[ ${{ inputs.architecture }} == "aarch64" ]]; then
echo "Skipping smoke test for aarch64, since it is not an actual gpu runner"
else
${CONDA_RUN} pip install "${{ inputs.repository }}/dist/$WHEEL_NAME"
# Checking that we have a pinned version of torch in our dependency tree
Expand Down
2 changes: 1 addition & 1 deletion MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ new_local_repository(
new_local_repository(
name = "cuda_win",
build_file = "@//third_party/cuda:BUILD",
path = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/",
path = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/",
)

http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
Expand Down
6 changes: 5 additions & 1 deletion packaging/pre_build_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@ if [[ $(uname -m) == "aarch64" ]]; then
if [[ ${os_name} == "ubuntu" ]]; then
IS_JETPACK=true
apt-get update
apt-get install -y ninja-build gettext curl libopenblas-dev zip unzip
apt-get install -y ninja-build gettext curl libopenblas-dev zip unzip libfmt-dev
else
IS_SBSA=true
yum install -y ninja-build gettext zip unzip
yum install -y fmt-devel
fi
else
BAZEL_PLATFORM="amd64"
yum install -y fmt-devel
fi


Expand All @@ -43,6 +45,8 @@ pip uninstall -y torch torchvision

if [[ ${IS_JETPACK} == true ]]; then
# install torch 2.8 for jp6.2
source .github/scripts/install-cuda-dss.sh
install_cuda_dss_aarch64
pip install torch==2.8.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126/
else
TORCH=$(grep "^torch>" py/requirements.txt)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def load_dep_info():

dir_path = os.path.join(str(get_root_dir()), "py")

IS_AARCH64 = platform.uname().processor == "aarch64"
IS_AARCH64 = platform.machine() == "aarch64"
IS_JETPACK = False

PY_ONLY = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,17 +242,17 @@ def test_hierarchical_adjacency_partition_with_two_backends_with_torch_executed_
)
from torch_tensorrt.dynamo.lowering import (
get_decompositions,
post_lowering,
pre_export_lowering,
)

model = self.SimpleModel().cuda().eval()
example_input = torch.randn(1, 3, 224, 224).cuda()

exported_program = torch.export.export(model, (example_input,))
exported_program = pre_export_lowering(exported_program)
exported_program = exported_program.run_decompositions(get_decompositions())
gm = exported_program.module()

gm = post_lowering(gm)
partitioned_graph, _ = partitioning.hierarchical_adjacency_partition(
gm,
min_block_size=1,
Expand Down
2 changes: 1 addition & 1 deletion toolchains/ci_workspaces/MODULE.bazel.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ new_local_repository(
new_local_repository(
name = "cuda_l4t",
build_file = "@//third_party/cuda:BUILD",
path = "/usr/local/cuda-12.9",
path = "/usr/local/cuda-12.6",
)

new_local_repository(
Expand Down
Loading