diff --git a/.github/scripts/filter-matrix.py b/.github/scripts/filter-matrix.py
index ee3cfd491a..3633a11380 100644
--- a/.github/scripts/filter-matrix.py
+++ b/.github/scripts/filter-matrix.py
@@ -15,7 +15,7 @@
 jetpack_cuda_versions: List[str] = ["cu126"]
 
 jetpack_container_image: str = "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-sbsa_container_image: str = "quay.io/pypa/manylinux_2_34_aarch64"
+sbsa_container_image: str = "quay.io/pypa/manylinux_2_39_aarch64"
 
 
 def validate_matrix(matrix_dict: Dict[str, Any]) -> None:
@@ -41,19 +41,15 @@ def filter_matrix_item(
             # Skipping disabled CUDA version
             return False
     if is_jetpack:
-        if limit_pr_builds:
-            # pr build,matrix passed from test-infra is cu128, python 3.9, change to cu126, python 3.10
-            item["desired_cuda"] = "cu126"
-            item["python_version"] = "3.10"
+        # PR build: matrix passed from test-infra is cu126, cu128 and cu130 with python 3.10; filter to cu126, python 3.10
+        # nightly/main build: matrix passed from test-infra is cu126, cu128 and cu130 with all python versions; filter to cu126, python 3.10
+        if (
+            item["python_version"] in jetpack_python_versions
+            and item["desired_cuda"] in jetpack_cuda_versions
+        ):
             item["container_image"] = jetpack_container_image
             return True
-        else:
-            # nightly/main build, matrix passed from test-infra is cu128, all python versions, change to cu126, python 3.10
-            if item["python_version"] in jetpack_python_versions:
-                item["desired_cuda"] = "cu126"
-                item["container_image"] = jetpack_container_image
-                return True
-            return False
+        return False
     else:
         if item["gpu_arch_type"] == "cuda-aarch64":
             # pytorch image:pytorch/manylinuxaarch64-builder:cuda12.8 comes with glibc2.28
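
Review note: the rewritten jetpack branch is now a pure filter instead of a mutate-and-return. As a quick sanity check, a minimal Python sketch of the new behavior on sample matrix items (the helper name and sample data are hypothetical, not part of this PR):

    # Hypothetical standalone mirror of the new jetpack branch of filter_matrix_item.
    jetpack_python_versions = ["3.10"]
    jetpack_cuda_versions = ["cu126"]
    jetpack_container_image = "nvcr.io/nvidia/l4t-jetpack:r36.4.0"

    def keep_jetpack_item(item: dict) -> bool:
        # Keep only cu126 + python 3.10; tag kept items with the l4t container image.
        if (
            item["python_version"] in jetpack_python_versions
            and item["desired_cuda"] in jetpack_cuda_versions
        ):
            item["container_image"] = jetpack_container_image
            return True
        return False

    assert keep_jetpack_item({"python_version": "3.10", "desired_cuda": "cu126"})
    assert not keep_jetpack_item({"python_version": "3.10", "desired_cuda": "cu130"})
    assert not keep_jetpack_item({"python_version": "3.9", "desired_cuda": "cu126"})
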
diff --git a/.github/scripts/install-cuda-aarch64.sh b/.github/scripts/install-cuda-aarch64.sh
index a15eaa1332..710878a1b3 100755
--- a/.github/scripts/install-cuda-aarch64.sh
+++ b/.github/scripts/install-cuda-aarch64.sh
@@ -8,23 +8,42 @@ install_cuda_aarch64() {
     # CUDA_MAJOR_VERSION: cu128 --> 12
    CUDA_MAJOR_VERSION=${CU_VERSION:2:2}
     dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
-    # nccl version must match libtorch_cuda.so was built with https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
-    dnf -y install cuda-compiler-${CU_VER}.aarch64 \
+
+    # the nccl version must match the one libtorch_cuda.so was built with
+    if [[ ${CU_VERSION:0:4} == "cu12" ]]; then
+        # cu12: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
+        if [[ ${CU_VERSION} == "cu128" ]]; then
+            nccl_version="2.26.2-1"
+        elif [[ ${CU_VERSION} == "cu126" ]]; then
+            nccl_version="2.24.3-1"
+        else
+            # cu129 support was removed from pytorch upstream
+            echo "Unsupported CUDA version: ${CU_VERSION}"
+            exit 1
+        fi
+    elif [[ ${CU_VERSION:0:4} == "cu13" ]]; then
+        # cu13: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu13.txt
+        nccl_version="2.27.7-1"
+    fi
+
+    dnf --nogpgcheck -y install cuda-compiler-${CU_VER}.aarch64 \
         cuda-libraries-${CU_VER}.aarch64 \
         cuda-libraries-devel-${CU_VER}.aarch64 \
-        libnccl-2.27.3-1+cuda${CU_DOT_VER} libnccl-devel-2.27.3-1+cuda${CU_DOT_VER} libnccl-static-2.27.3-1+cuda${CU_DOT_VER}
+        libnccl-${nccl_version}+cuda${CU_DOT_VER} libnccl-devel-${nccl_version}+cuda${CU_DOT_VER} libnccl-static-${nccl_version}+cuda${CU_DOT_VER}
     dnf clean all
-
-    nvshmem_version=3.3.9
+    # nvshmem version is from https://github.com/pytorch/pytorch/blob/f9fa138a3910bd1de1e7acb95265fa040672a952/.ci/docker/common/install_cuda.sh#L67
+    nvshmem_version=3.3.24
     nvshmem_path="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${CUDA_MAJOR_VERSION}/txz/agnostic/aarch64"
-    nvshmem_filename="libnvshmem_cuda12-linux-sbsa-${nvshmem_version}.tar.gz"
-    curl -L ${nvshmem_path}/${nvshmem_filename} -o nvshmem.tar.gz
-    tar -xzf nvshmem.tar.gz
-    cp -a libnvshmem/lib/* /usr/local/cuda/lib64/
-    cp -a libnvshmem/include/* /usr/local/cuda/include/
-    rm -rf nvshmem.tar.gz nvshmem
+    nvshmem_prefix="libnvshmem-linux-sbsa-${nvshmem_version}_cuda${CUDA_MAJOR_VERSION}-archive"
+    nvshmem_tarname="${nvshmem_prefix}.tar.xz"
+    curl -L ${nvshmem_path}/${nvshmem_tarname} -o nvshmem.tar.xz
+    tar -xJf nvshmem.tar.xz
+    cp -a ${nvshmem_prefix}/lib/* /usr/local/cuda/lib64/
+    cp -a ${nvshmem_prefix}/include/* /usr/local/cuda/include/
+    rm -rf nvshmem.tar.xz ${nvshmem_prefix}
     echo "nvshmem ${nvshmem_version} for cuda ${CUDA_MAJOR_VERSION} installed successfully"
+
     export PATH=/usr/local/cuda/bin:$PATH
     export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/include:/usr/lib64:$LD_LIBRARY_PATH
     ls -lart /usr/local/
     nvcc --version
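
Review note: the NCCL pin now depends on CU_VERSION and must track the pytorch ci_commit_pins files referenced above. A minimal Python cross-check of the mapping as written in this diff (function name hypothetical; version strings copied from the script):

    # Hypothetical mirror of the NCCL pin selection in install-cuda-aarch64.sh.
    def nccl_pin(cu_version: str) -> str:
        if cu_version.startswith("cu13"):
            return "2.27.7-1"  # pinned by nccl-cu13.txt
        pins = {"cu126": "2.24.3-1", "cu128": "2.26.2-1"}  # pinned by nccl-cu12.txt
        if cu_version in pins:
            return pins[cu_version]
        raise ValueError(f"Unsupported CUDA version: {cu_version}")  # e.g. cu129

    assert nccl_pin("cu126") == "2.24.3-1"
    assert nccl_pin("cu128") == "2.26.2-1"
    assert nccl_pin("cu130") == "2.27.7-1"
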
diff --git a/.github/scripts/install-cuda-dss.sh b/.github/scripts/install-cuda-dss.sh
new file mode 100644
index 0000000000..9746e04142
--- /dev/null
+++ b/.github/scripts/install-cuda-dss.sh
@@ -0,0 +1,12 @@
+# for now we only need to install cuda_dss for jetpack
+install_cuda_dss_aarch64() {
+    echo "install cuda_dss for ${CU_VERSION}"
+    arch_path='sbsa'
+    # version is from https://github.com/pytorch/pytorch/blob/22c5e8c17c7551c9dd2855589ae774c1e147343a/.ci/docker/common/install_cudss.sh
+    CUDSS_NAME="libcudss-linux-${arch_path}-0.3.0.9_cuda12-archive"
+    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudss/redist/libcudss/linux-${arch_path}/${CUDSS_NAME}.tar.xz
+    # only for cuda 12
+    tar xf ${CUDSS_NAME}.tar.xz
+    cp -a ${CUDSS_NAME}/include/* /usr/local/cuda/include/
+    cp -a ${CUDSS_NAME}/lib/* /usr/local/cuda/lib64/
+}
\ No newline at end of file
diff --git a/.github/workflows/build_linux.yml b/.github/workflows/build_linux.yml
index 32dd21de82..09cd11fbbd 100644
--- a/.github/workflows/build_linux.yml
+++ b/.github/workflows/build_linux.yml
@@ -348,8 +348,8 @@ jobs:
           source "${BUILD_ENV_FILE}"
           WHEEL_NAME=$(ls "${{ inputs.repository }}/dist/")
           echo "$WHEEL_NAME"
-          if [[ ${{ inputs.is-jetpack }} == true ]]; then
-            echo "Skipping smoke test for jetpack, since it is not the actual jetpack environment"
+          if [[ ${{ inputs.architecture }} == "aarch64" ]]; then
+            echo "Skipping smoke test for aarch64, since it is not an actual gpu runner"
           else
             ${CONDA_RUN} pip install "${{ inputs.repository }}/dist/$WHEEL_NAME"
             # Checking that we have a pinned version of torch in our dependency tree
diff --git a/MODULE.bazel b/MODULE.bazel
index ee208fcc23..ceaad641b7 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -51,7 +51,7 @@ new_local_repository(
 new_local_repository(
     name = "cuda_win",
     build_file = "@//third_party/cuda:BUILD",
-    path = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/",
+    path = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/",
 )
 
 http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh
index be449b73af..32b91ff3fe 100755
--- a/packaging/pre_build_script.sh
+++ b/packaging/pre_build_script.sh
@@ -12,13 +12,15 @@ if [[ $(uname -m) == "aarch64" ]]; then
     if [[ ${os_name} == "ubuntu" ]]; then
         IS_JETPACK=true
         apt-get update
-        apt-get install -y ninja-build gettext curl libopenblas-dev zip unzip
+        apt-get install -y ninja-build gettext curl libopenblas-dev zip unzip libfmt-dev
     else
         IS_SBSA=true
         yum install -y ninja-build gettext zip unzip
+        yum install -y fmt-devel
     fi
 else
     BAZEL_PLATFORM="amd64"
+    yum install -y fmt-devel
 fi
 
 
@@ -43,6 +45,8 @@ pip uninstall -y torch torchvision
 
 if [[ ${IS_JETPACK} == true ]]; then
     # install torch 2.8 for jp6.2
+    source .github/scripts/install-cuda-dss.sh
+    install_cuda_dss_aarch64
     pip install torch==2.8.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126/
 else
     TORCH=$(grep "^torch>" py/requirements.txt)
diff --git a/setup.py b/setup.py
index 5ef742fe76..1a4f57cb88 100644
--- a/setup.py
+++ b/setup.py
@@ -85,7 +85,7 @@ def load_dep_info():
 
 dir_path = os.path.join(str(get_root_dir()), "py")
 
-IS_AARCH64 = platform.uname().processor == "aarch64"
+IS_AARCH64 = platform.machine() == "aarch64"
 IS_JETPACK = False
 
 PY_ONLY = False
diff --git a/tests/py/dynamo/partitioning/test_hierarchical_partitioning.py b/tests/py/dynamo/partitioning/test_hierarchical_partitioning.py
index 0553fb4b45..ece9796c28 100644
--- a/tests/py/dynamo/partitioning/test_hierarchical_partitioning.py
+++ b/tests/py/dynamo/partitioning/test_hierarchical_partitioning.py
@@ -242,17 +242,17 @@ def test_hierarchical_adjacency_partition_with_two_backends_with_torch_executed_
         )
         from torch_tensorrt.dynamo.lowering import (
             get_decompositions,
+            post_lowering,
             pre_export_lowering,
         )
 
         model = self.SimpleModel().cuda().eval()
         example_input = torch.randn(1, 3, 224, 224).cuda()
-
         exported_program = torch.export.export(model, (example_input,))
         exported_program = pre_export_lowering(exported_program)
         exported_program = exported_program.run_decompositions(get_decompositions())
         gm = exported_program.module()
-
+        gm = post_lowering(gm)
         partitioned_graph, _ = partitioning.hierarchical_adjacency_partition(
             gm,
             min_block_size=1,
diff --git a/toolchains/ci_workspaces/MODULE.bazel.tmpl b/toolchains/ci_workspaces/MODULE.bazel.tmpl
index 97b8707e4a..491d5f4ac3 100644
--- a/toolchains/ci_workspaces/MODULE.bazel.tmpl
+++ b/toolchains/ci_workspaces/MODULE.bazel.tmpl
@@ -45,7 +45,7 @@ new_local_repository(
 new_local_repository(
     name = "cuda_l4t",
     build_file = "@//third_party/cuda:BUILD",
-    path = "/usr/local/cuda-12.9",
+    path = "/usr/local/cuda-12.6",
 )
 
 new_local_repository(
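
Review note on the setup.py change: platform.uname().processor is derived from `uname -p` and, per the Python docs, may be an empty string when the value cannot be determined (common in minimal containers), whereas platform.machine() maps to `uname -m` and is reliably populated. A minimal illustration (printed values vary by system):

    import platform

    # machine() -> e.g. "aarch64" on SBSA/Jetson, "x86_64" on typical CI runners.
    print(platform.machine())
    # uname().processor -> may be "" on some Linux systems, making the old check unreliable.
    print(platform.uname().processor)
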