From a22bf56371bb224d76c59f71cb3b173d2d1f5c10 Mon Sep 17 00:00:00 2001 From: sandeepgupta12 Date: Fri, 14 Feb 2025 13:06:27 +0530 Subject: [PATCH 1/6] Add ppc64le wheel build support --- .ci/docker/manywheel/Dockerfile_ppc64le | 44 ++++++++ .github/scripts/ppc64le-build.sh | 34 ++++++ .github/scripts/ppc64le-ci/README.md | 50 +++++++++ .../actions-runner.Dockerfile | 102 ++++++++++++++++++ .../actions-runner@.service | 32 ++++++ .../fs/usr/bin/actions-runner | 75 +++++++++++++ .../self-hosted-builder/fs/usr/bin/entrypoint | 30 ++++++ .../self-hosted-builder/helpers/app_token.sh | 40 +++++++ .../helpers/gh_cat_token.sh | 16 +++ .../helpers/gh_token_generator.sh | 10 ++ .github/workflows/_linux-build.yml | 58 +++++++--- .github/workflows/ppc64le.yml | 39 +++++++ 12 files changed, 517 insertions(+), 13 deletions(-) create mode 100755 .ci/docker/manywheel/Dockerfile_ppc64le create mode 100755 .github/scripts/ppc64le-build.sh create mode 100755 .github/scripts/ppc64le-ci/README.md create mode 100755 .github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile create mode 100755 .github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service create mode 100755 .github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner create mode 100755 .github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/entrypoint create mode 100755 .github/scripts/ppc64le-ci/self-hosted-builder/helpers/app_token.sh create mode 100755 .github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_cat_token.sh create mode 100755 .github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_token_generator.sh create mode 100755 .github/workflows/ppc64le.yml diff --git a/.ci/docker/manywheel/Dockerfile_ppc64le b/.ci/docker/manywheel/Dockerfile_ppc64le new file mode 100755 index 000000000000..e60f6428a7c6 --- /dev/null +++ b/.ci/docker/manywheel/Dockerfile_ppc64le @@ -0,0 +1,44 @@ +# Use UBI 9.3 as base image +FROM registry.access.redhat.com/ubi9/ubi:9.3 + +# Install necessary 
dependencies +RUN dnf install -y \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ + dnf install -y git cmake ninja-build gcc-toolset-13 rust cargo zip \ + python3 python3-devel && \ + dnf clean all + +ENV PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" +ENV MANPATH="/opt/rh/gcc-toolset-13/root/usr/share/man" +ENV INFOPATH="/opt/rh/gcc-toolset-13/root/usr/share/info" +ENV PCP_DIR="/opt/rh/gcc-toolset-13/root" +ENV LD_LIBRARY_PATH="/opt/rh/gcc-toolset-13/root/usr/lib64:/opt/rh/gcc-toolset-13/root/usr/lib" + +# Set Python and pip aliases to use Python 3.9 +RUN ln -sf /usr/bin/python3 /usr/bin/python && \ + ln -sf /usr/bin/pip3 /usr/bin/pip + +COPY requirements.txt . +# Install Python packages via pip +RUN pip install wheel setuptools pyyaml typing_extensions expecttest + +#RUN source /opt/rh/gcc-toolset-13/enable && pip install -r requirements.txt +RUN pip install -r requirements.txt + +# Copy the PyTorch source code into the container +COPY . /workspace/pytorch + +WORKDIR /workspace/pytorch + +# Ensure submodules are initialized and updated +RUN git submodule update --init --recursive + +# Copy the build script and make it executable +COPY .github/scripts/ppc64le-build.sh /ppc64le-build.sh +RUN chmod +x /ppc64le-build.sh + +# Verify permissions and ensure Unix line endings +RUN dos2unix /ppc64le-build.sh || sed -i 's/\r$//' /ppc64le-build.sh +RUN chmod +x /ppc64le-build.sh + + diff --git a/.github/scripts/ppc64le-build.sh b/.github/scripts/ppc64le-build.sh new file mode 100755 index 000000000000..a338fdfa7f18 --- /dev/null +++ b/.github/scripts/ppc64le-build.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Environment variables +PACKAGE_NAME=pytorch +PACKAGE_VERSION=${PACKAGE_VERSION:-v2.4.0} + +cd /workspace/$PACKAGE_NAME + +# Clean up old artifacts +rm -rf build/ dist/ torch.egg-info/ + +# Build and install PyTorch wheel +if ! 
(MAX_JOBS=4 python setup.py bdist_wheel && pip install dist/*.whl); then + echo "------------------$PACKAGE_NAME:install_fails-------------------------------------" + exit 1 +fi + +# register PrivateUse1HooksInterface +python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_bfloat16 +python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float16 +python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float32 +python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float64 + +cd .. +pip install pytest pytest-xdist + +if ! pytest "$PACKAGE_NAME/test/test_utils.py"; then + echo "------------------$PACKAGE_NAME:install_success_but_test_fails---------------------" + exit 2 + +else + echo "------------------$PACKAGE_NAME:install_and_test_both_success-------------------------" + exit 0 +fi \ No newline at end of file diff --git a/.github/scripts/ppc64le-ci/README.md b/.github/scripts/ppc64le-ci/README.md new file mode 100755 index 000000000000..5be1405613d2 --- /dev/null +++ b/.github/scripts/ppc64le-ci/README.md @@ -0,0 +1,50 @@ +# Configuring the builder. + +## Install prerequisites. + +``` +Install Docker +``` +## Clone pytorch repository + +## Add services. + +``` +$ sudo cp self-hosted-builder/*.service /etc/systemd/system/ +$ sudo systemctl daemon-reload +``` +Next step is to build `actions-runner` image using: + +``` +## clone gaplib repo (https://github.com/anup-kodlekere/gaplib.git) and copy runner-sdk-8.ppc64le patch from gaplib/build-files into pytorch/.github\scripts\ppc64le-ci\self-hosted-builder + +$ cd self-hosted-builder +$ sudo docker build \ + --pull \ + -f actions-runner.Dockerfile \ + --build-arg RUNNERPATCH="runner-sdk-8.ppc64le.patch" \ + -t iiilinuxibmcom/actions-runner. \ + . 
+``` + +Now prepare all necessary files for runner registration: + +``` +$ sudo mkdir -p /etc/actions-runner/ +$ sudo chmod 755 /etc/actions-runner/ +$ sudo /bin/cp /etc/actions-runner//key_private.pem +$ sudo echo | sudo tee /etc/actions-runner//appid.env +$ sudo echo | sudo tee /etc/actions-runner//installid.env +$ sudo echo NAME= | sudo tee /etc/actions-runner//env +$ sudo echo OWNER= | sudo tee -a /etc/actions-runner//env +$ sudo echo REPO=pytorch | sudo tee -a /etc/actions-runner//env +$ cd self-hosted-builder +$ sudo /bin/cp helpers/*.sh /usr/local/bin/ +$ sudo chmod 755 /usr/local/bin/app_token.sh /usr/local/bin/gh_token_generator.sh +``` + +## Autostart the runner. + +``` +$ sudo systemctl enable --now actions-runner@$NAME +``` diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile new file mode 100755 index 000000000000..c52a3c718c4a --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile @@ -0,0 +1,102 @@ +# Self-Hosted IBM Power Github Actions Runner. 
+FROM ubuntu:22.04 + +# Set non-interactive mode for apt +ENV DEBIAN_FRONTEND=noninteractive + +# Fix sources to point to ports.ubuntu.com for ppc64le +RUN echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy main restricted universe multiverse" > /etc/apt/sources.list && \ + echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ + echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ + echo "deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports jammy-security main restricted universe multiverse" >> /etc/apt/sources.list + +# Fix sources for ppc64le and update system +RUN apt-get update -o Acquire::Retries=5 -o Acquire::http::Timeout="10" && \ + apt-get -y install --no-install-recommends \ + build-essential \ + curl \ + sudo \ + jq \ + gnupg-agent \ + iptables \ + ca-certificates \ + software-properties-common \ + vim \ + zip \ + python3 \ + python3-pip && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Switch to iptables-legacy +RUN update-alternatives --set iptables /usr/sbin/iptables-legacy && \ + update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy + + +# Add Docker GPG key and repository +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg && \ + echo "deb [arch=ppc64el signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list && \ + apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install dotnet SDK and other dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + git \ + dotnet-sdk-8.0 \ + cmake \ + make \ + automake \ + autoconf \ + m4 \ + libtool && \ + 
apt-get clean && rm -rf /var/lib/apt/lists/* + + +# Setup user and permissions +RUN useradd -c "Action Runner" -m runner && \ + usermod -L runner && \ + echo "runner ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/runner && \ + groupadd docker || true && \ + usermod -aG docker runner && \ + (test -S /var/run/docker.sock && chmod 660 /var/run/docker.sock && chgrp docker /var/run/docker.sock || true) + + +# Add and configure GitHub Actions runner +ARG RUNNERREPO="https://github.com/actions/runner" +ARG RUNNERPATCH + +ADD ${RUNNERPATCH} /tmp/runner.patch + +RUN git clone -q ${RUNNERREPO} /tmp/runner && \ + cd /tmp/runner && \ + git checkout main -b build && \ + git apply /tmp/runner.patch && \ + sed -i'' -e /version/s/8......\"$/${SDK}.0.100\"/ src/global.json + +RUN cd /tmp/runner/src && \ + ./dev.sh layout && \ + ./dev.sh package && \ + ./dev.sh test && \ + rm -rf /root/.dotnet /root/.nuget + +RUN mkdir -p /opt/runner && \ + tar -xf /tmp/runner/_package/*.tar.gz -C /opt/runner && \ + chown -R runner:runner /opt/runner && \ + su - runner -c "/opt/runner/config.sh --version" + +RUN rm -rf /tmp/runner /tmp/runner.patch + +# Copy custom scripts and set permissions +COPY fs/ / +RUN chmod 777 /usr/bin/actions-runner /usr/bin/entrypoint + +# Switch to the runner user +USER runner + +# Set working directory +WORKDIR /opt/runner + +# Define entry point and command +ENTRYPOINT ["/usr/bin/entrypoint"] +CMD ["/usr/bin/actions-runner"] + diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service new file mode 100755 index 000000000000..abbc0ffc97e5 --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service @@ -0,0 +1,32 @@ +[Unit] +Description=Self-Hosted IBM power Github Actions Runner +StartLimitIntervalSec=0 + +[Service] +Type=simple +Restart=always + +# Cleanup stale containers +ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i 
+ExecStartPre=-/usr/local/bin/gh_token_generator.sh /etc/actions-runner/%i/appid.env /etc/actions-runner/%i/installid.env /etc/actions-runner/%i/key_private.pem /etc/actions-runner/%i/ghtoken.env +ExecStartPre=-/usr/local/bin/gh_cat_token.sh /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt + +ExecStart=/usr/bin/docker run \ + --env-file=/etc/actions-runner/%i/env \ + --volume /etc/actions-runner/%i/ghtoken.txt:/run/runner_secret \ + --volume /var/run/docker.sock:/var/run/docker.sock \ + --init \ + --interactive \ + --name=actions-runner.%i \ + --rm \ + --privileged \ + --log-driver=journald \ + iiilinuxibmcom/actions-runner.%i +ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1" +ExecStop=/bin/sh -c "docker wait actions-runner.%i" +ExecStop=/bin/sh -c "docker rm actions-runner.%i" + +ExecStop=/usr/bin/env rm -f /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.txt + +[Install] +WantedBy=multi-user.target diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner new file mode 100755 index 000000000000..e8c9bde29074 --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +set -e -u + +trap cleanup EXIT + +token_file=registration-token.json + +# Function to clean up and unregister the runner +cleanup() { + echo "Cleaning up temporary files..." + [ -f "$token_file" ] && rm -f "$token_file" + [ -f "runner-id.json" ] && rm -f "runner-id.json" + + echo "Unregistering the runner from GitHub..." 
+ ACCESS_TOKEN="$(cat /run/runner_secret)" + runner_id=$(curl -s \ + -H "Accept: application/vnd.github.v3+json" \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners" | \ + jq --raw-output '.runners[] | select(.name=="'"${NAME}"'") | .id') + + if [ -n "$runner_id" ]; then + curl -s \ + -X DELETE \ + -H "Accept: application/vnd.github.v3+json" \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners/$runner_id" + echo "Runner unregistered successfully." + else + echo "Warning: Runner ID for ${NAME} not found. It may already be removed." + fi + + unset ACCESS_TOKEN runner_id +} + +# Fetch GitHub access token +if [ ! -f /run/runner_secret ]; then + echo "Error: Access token file not found at /run/runner_secret." + exit 1 +fi + + +ACCESS_TOKEN="$(cat /run/runner_secret)" + +# Generate registration token +curl \ + -X POST \ + -H "Accept: application/vnd.github.v3+json" \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners/registration-token" \ + -o "$token_file" + +unset ACCESS_TOKEN + +# register runner as ephemeral runner +# it does one job, stops and unregisters +registration_token=$(jq --raw-output .token "$token_file") + +./config.sh \ + --unattended \ + --ephemeral \ + --url "https://github.com/${OWNER}/${REPO}" \ + --token "${registration_token}" \ + --name "${NAME}" \ + --no-default-labels \ + --labels self-hosted,linux.ppc64le + +unset registration_token +rm -f "$token_file" + +# Run one job. +./run.sh + +echo "Ephemeral runner workflow completed." 
diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/entrypoint b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/entrypoint new file mode 100755 index 000000000000..14f6c84ca602 --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/entrypoint @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# +# Container entrypoint that waits for all spawned processes. +# + +set -e -u + +# Create a FIFO and start reading from its read end. +tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX") +trap 'rm -r "$tempdir"' EXIT +done="$tempdir/pipe" +mkfifo "$done" +cat "$done" & waiter=$! + +# Start the workload. Its descendants will inherit the FIFO's write end. +status=0 +if [ "$#" -eq 0 ]; then + bash 9>"$done" || status=$? +else + "$@" 9>"$done" || status=$? +fi + +# When the workload and all of its descendants exit, the FIFO's write end will +# be closed and `cat "$done"` will exit. Wait until it happens. This is needed +# in order to handle SelfUpdater, which the workload may start in background +# before exiting. +wait "$waiter" + +exit "$status" diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/app_token.sh b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/app_token.sh new file mode 100755 index 000000000000..cecde970b84b --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/app_token.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# +# Request an ACCESS_TOKEN to be used by a GitHub APP +# Environment variable that need to be set up: +# * APP_ID, the GitHub's app ID +# * INSTALL_ID, the Github's app's installation ID +# * APP_PRIVATE_KEY, the content of GitHub app's private key in PEM format. 
+# +# https://github.com/orgs/community/discussions/24743#discussioncomment-3245300 +# + +set -o pipefail + +set -e # Exit on error + +# Generate JWT +header='{"alg":"RS256","typ":"JWT"}' +payload="{\"iat\":$(date +%s),\"exp\":$(( $(date +%s) + 600 )),\"iss\":${APP_ID}}" + +header_base64=$(echo -n "$header" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') +payload_base64=$(echo -n "$payload" | openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') + +signature=$(echo -n "${header_base64}.${payload_base64}" | \ + openssl dgst -sha256 -sign "${APP_PRIVATE_KEY}" | \ + openssl base64 | tr -d '=' | tr '/+' '_-' | tr -d '\n') + +generated_jwt="${header_base64}.${payload_base64}.${signature}" + +API_VERSION=v3 +API_HEADER="Accept: application/vnd.github+json" + +auth_header="Authorization: Bearer ${generated_jwt}" + +app_installations_response=$(curl -sX POST \ + -H "${auth_header}" \ + -H "${API_HEADER}" \ + --url "https://api.github.com/app/installations/${INSTALL_ID}/access_tokens" \ + ) + +echo "$app_installations_response" | jq --raw-output '.token' diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_cat_token.sh b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_cat_token.sh new file mode 100755 index 000000000000..2274e5a13c74 --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_cat_token.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +TOKEN_FILE=$1 +OUTPUT_FILE=$2 + +echo "Starting gh_cat_token.sh with TOKEN_FILE=${TOKEN_FILE}, OUTPUT_FILE=${OUTPUT_FILE}" + +# Validate inputs +if [[ ! -r "${TOKEN_FILE}" ]]; then + echo "Error: Token file '${TOKEN_FILE}' does not exist or is not readable." 
+ exit 1 +fi + +# Write the token to the output file +cat "${TOKEN_FILE}" > "${OUTPUT_FILE}" +echo "Token written to ${OUTPUT_FILE}" diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_token_generator.sh b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_token_generator.sh new file mode 100755 index 000000000000..1feee26eb2c1 --- /dev/null +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/helpers/gh_token_generator.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +SCRIPT_DIR=$(dirname "$0") +APP_ID=$1 +INSTALL_ID=$2 +APP_PRIVATE_KEY=$3 +DST_FILE="$4" + +ACCESS_TOKEN="$(APP_ID="$(<"${APP_ID}")" INSTALL_ID="$(<"${INSTALL_ID}")" APP_PRIVATE_KEY="${APP_PRIVATE_KEY}" "${SCRIPT_DIR}/app_token.sh")" +echo "${ACCESS_TOKEN}" > "${DST_FILE}" diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index b8045f5e20fb..f4c75a5e5cfc 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -109,7 +109,7 @@ jobs: steps: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -119,16 +119,17 @@ jobs: # checkout. In other cases you should prefer a local checkout. 
- name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + if: inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: no-sudo: true - name: Setup Linux uses: ./.github/actions/setup-linux - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' - name: configure aws credentials uses: aws-actions/configure-aws-credentials@v3 - if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} + if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' }} with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-build @@ -137,13 +138,13 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: docker-image-name: ${{ inputs.docker-image-name }} - name: Use following to pull public copy of the image id: print-ghcr-mirror - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' env: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash @@ -153,24 +154,26 @@ jobs: - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: 
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Parse ref id: parse-ref + if: inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' run: .github/scripts/parse_ref.py - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id - if: always() + if: always() && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: github-token: ${{ secrets.GITHUB_TOKEN }} # Apply the filter logic to the build step too if the test-config label is already there - name: Select all requested test configurations (if the test matrix is available) id: filter + if: inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' uses: ./.github/actions/filter-test-configs with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -181,14 +184,14 @@ jobs: - name: Download pytest cache uses: ./.github/actions/pytest-cache-download continue-on-error: true - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} s3_bucket: ${{ inputs.s3-bucket }} - name: Build - if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' + if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && (inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9') id: build env: BUILD_ENVIRONMENT: ${{ inputs.build-environment }} @@ -275,6 +278,16 @@ jobs: END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" + - name: Execute Build and Tests inside ppc64le Docker Container + if: inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' + run: | + CONTAINER_NAME="temp_builder_${RUN_ID}" + docker run -d --name "$CONTAINER_NAME" pytorch-ppc64le:ubi9.3 /ppc64le-build.sh + docker wait "$CONTAINER_NAME" + 
docker logs "$CONTAINER_NAME" + docker cp "$CONTAINER_NAME":/workspace/pytorch/dist/. dist/ + docker rm "$CONTAINER_NAME" + - name: Archive artifacts into zip if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' run: | @@ -282,7 +295,7 @@ jobs: - name: Store PyTorch Build Artifacts on S3 uses: seemethere/upload-artifact-s3@v5 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -292,7 +305,7 @@ jobs: - name: Store PyTorch Build Artifacts on S3 for split build uses: seemethere/upload-artifact-s3@v5 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: name: ${{ inputs.build-environment }}-experimental-split-build retention-days: 14 @@ -318,8 +331,27 @@ jobs: if-no-files-found: error path: artifacts.zip + - name: Archive ppc64le artifacts into zip + if: inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' + run: | + zip -1 -r artifacts.zip dist/ + + + - name: Store PyTorch Build Artifacts for ppc64le + uses: actions/upload-artifact@v4 + if: inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' + with: + name: ${{ inputs.build-environment }}-ubi9 + retention-days: 14 + if-no-files-found: error + path: artifacts.zip + + - name: Cleanup dangling Docker images for ppc64le + if: 
always() && inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' + run: docker image prune -f + - name: Upload sccache stats - if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' uses: ./.github/actions/upload-sccache-stats with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -327,7 +359,7 @@ jobs: - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: always() && (inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9') - name: Cleanup docker if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml new file mode 100755 index 000000000000..59c07bd3ceae --- /dev/null +++ b/.github/workflows/ppc64le.yml @@ -0,0 +1,39 @@ +name: ppc64le + +on: + push: + branches: + - main + tags: + - ciflow/ppc64le/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + linux-ppc64le-docker-image-build: + name: Build docker image for ppc64le + runs-on: linux.ppc64le + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + submodules: true + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Build Docker image for ppc64le + run: | + docker build -f .ci/docker/manywheel/Dockerfile_ppc64le -t pytorch-ppc64le:ubi9.3 . 
+ + ppc64le-UBI-9-3-Build-and-Test: + name: ppc64le-UBI-9-3-Build-and-Test + uses: ./.github/workflows/_linux-build.yml + needs: linux-ppc64le-docker-image-build + with: + build-environment: linux-ppc64le-binary-manywheel-ubi9 + docker-image-name: pytorch-ppc64le:ubi9.3 + runner: linux.ppc64le + \ No newline at end of file From b6dabf80f915c0a6ff8f312155f0805df7302e36 Mon Sep 17 00:00:00 2001 From: sandeepgupta12 Date: Mon, 17 Feb 2025 17:53:50 +0530 Subject: [PATCH 2/6] Optimize build performance and upgrade base image to UBI 9.5 --- .ci/docker/manywheel/Dockerfile_ppc64le | 2 +- .github/scripts/ppc64le-build.sh | 3 +-- .github/workflows/ppc64le.yml | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.ci/docker/manywheel/Dockerfile_ppc64le b/.ci/docker/manywheel/Dockerfile_ppc64le index e60f6428a7c6..441ac7d61049 100755 --- a/.ci/docker/manywheel/Dockerfile_ppc64le +++ b/.ci/docker/manywheel/Dockerfile_ppc64le @@ -1,5 +1,5 @@ # Use UBI 9.3 as base image -FROM registry.access.redhat.com/ubi9/ubi:9.3 +FROM registry.access.redhat.com/ubi9/ubi:9.5 # Install necessary dependencies RUN dnf install -y \ diff --git a/.github/scripts/ppc64le-build.sh b/.github/scripts/ppc64le-build.sh index a338fdfa7f18..751174aead6c 100755 --- a/.github/scripts/ppc64le-build.sh +++ b/.github/scripts/ppc64le-build.sh @@ -2,7 +2,6 @@ # Environment variables PACKAGE_NAME=pytorch -PACKAGE_VERSION=${PACKAGE_VERSION:-v2.4.0} cd /workspace/$PACKAGE_NAME @@ -10,7 +9,7 @@ cd /workspace/$PACKAGE_NAME rm -rf build/ dist/ torch.egg-info/ # Build and install PyTorch wheel -if ! (MAX_JOBS=4 python setup.py bdist_wheel && pip install dist/*.whl); then +if ! 
(MAX_JOBS=$(nproc) python setup.py bdist_wheel && pip install dist/*.whl); then echo "------------------$PACKAGE_NAME:install_fails-------------------------------------" exit 1 fi diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml index 59c07bd3ceae..fb26c1bfc5a3 100755 --- a/.github/workflows/ppc64le.yml +++ b/.github/workflows/ppc64le.yml @@ -4,8 +4,6 @@ on: push: branches: - main - tags: - - ciflow/ppc64le/* workflow_dispatch: concurrency: From 5871a5c045071d317cd05b60479d4cee0b89c1f4 Mon Sep 17 00:00:00 2001 From: sandeepgupta12 Date: Thu, 20 Feb 2025 11:52:29 +0530 Subject: [PATCH 3/6] removed permision to file --- .ci/docker/manywheel/Dockerfile_ppc64le | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.ci/docker/manywheel/Dockerfile_ppc64le b/.ci/docker/manywheel/Dockerfile_ppc64le index 441ac7d61049..7e2d01fbc860 100755 --- a/.ci/docker/manywheel/Dockerfile_ppc64le +++ b/.ci/docker/manywheel/Dockerfile_ppc64le @@ -35,10 +35,5 @@ RUN git submodule update --init --recursive # Copy the build script and make it executable COPY .github/scripts/ppc64le-build.sh /ppc64le-build.sh -RUN chmod +x /ppc64le-build.sh - -# Verify permissions and ensure Unix line endings -RUN dos2unix /ppc64le-build.sh || sed -i 's/\r$//' /ppc64le-build.sh -RUN chmod +x /ppc64le-build.sh From 2bbaf3b0d0915b5ec48be26fc94e3ae1f8c134e2 Mon Sep 17 00:00:00 2001 From: sandeepgupta12 Date: Tue, 25 Feb 2025 16:01:17 +0530 Subject: [PATCH 4/6] Address review comments:Fix issue with Docker image load as per review Update runner registration flow based on feedback Enhance security by unmounting and removing runner token file To prevent any potential token leakage, unmount and remove /run/runner_secret immediately after generating the token. This ensures that the token is inaccessible beyond its intended use, even within the job execution. 
--- .ci/docker/manywheel/Dockerfile_ppc64le | 20 ++----- .ci/docker/manywheel/build.sh | 9 ++- .github/scripts/ppc64le-ci/README.md | 23 ++++++-- .../actions-runner.Dockerfile | 16 ++--- .../fs/usr/bin/actions-runner | 42 ++++---------- .github/workflows/_linux-build.yml | 58 +++++-------------- .github/workflows/ppc64le.yml | 50 ++++++++++------ 7 files changed, 96 insertions(+), 122 deletions(-) diff --git a/.ci/docker/manywheel/Dockerfile_ppc64le b/.ci/docker/manywheel/Dockerfile_ppc64le index 7e2d01fbc860..479e9370ef86 100755 --- a/.ci/docker/manywheel/Dockerfile_ppc64le +++ b/.ci/docker/manywheel/Dockerfile_ppc64le @@ -1,5 +1,5 @@ -# Use UBI 9.3 as base image -FROM registry.access.redhat.com/ubi9/ubi:9.5 +# Use UBI 9 as base image +FROM registry.access.redhat.com/ubi9/ubi:9.5 AS base # Install necessary dependencies RUN dnf install -y \ @@ -20,20 +20,12 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python && \ COPY requirements.txt . # Install Python packages via pip -RUN pip install wheel setuptools pyyaml typing_extensions expecttest - -#RUN source /opt/rh/gcc-toolset-13/enable && pip install -r requirements.txt +RUN pip install wheel RUN pip install -r requirements.txt -# Copy the PyTorch source code into the container -COPY . 
/workspace/pytorch - -WORKDIR /workspace/pytorch - -# Ensure submodules are initialized and updated -RUN git submodule update --init --recursive +RUN mkdir -p /workspace/pytorch -# Copy the build script and make it executable -COPY .github/scripts/ppc64le-build.sh /ppc64le-build.sh +ENTRYPOINT [] +CMD ["/bin/bash"] diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 4c2e490fc27d..815a32a6cdfe 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -65,6 +65,13 @@ case ${GPU_ARCH_TYPE} in DOCKER_GPU_BUILD_ARG="" MANY_LINUX_VERSION="s390x" ;; + cpu-ppc64le) + TARGET=base + DOCKER_TAG=ppc64le + GPU_IMAGE=redhat/ubi9 + DOCKER_GPU_BUILD_ARG="" + MANY_LINUX_VERSION="ppc64le" + ;; cuda) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} @@ -121,7 +128,7 @@ fi ( set -x - if [ "$(uname -m)" != "s390x" ]; then + if [ "$(uname -m)" != "s390x" && "$(uname -m)" != "ppc64le" ]; then # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service diff --git a/.github/scripts/ppc64le-ci/README.md b/.github/scripts/ppc64le-ci/README.md index 5be1405613d2..f3f0e83c0acd 100755 --- a/.github/scripts/ppc64le-ci/README.md +++ b/.github/scripts/ppc64le-ci/README.md @@ -3,16 +3,30 @@ ## Install prerequisites. ``` -Install Docker +$ sudo dnf install podman podman-docker jq ``` -## Clone pytorch repository - ## Add services. 
``` $ sudo cp self-hosted-builder/*.service /etc/systemd/system/ $ sudo systemctl daemon-reload ``` + +## Rebuild the image + +First build ppc64le builder image `docker.io/pytorch/ubippc64le-builder`, +using following commands: + +``` +$ cd ~ +$ git clone https://github.com/pytorch/pytorch +$ cd pytorch +$ git submodule update --init --recursive +$ GPU_ARCH_TYPE=cpu-ppc64le "$(pwd)/.ci/docker/manywheel/build.sh" ubippc64le-builder +$ docker image tag localhost/pytorch/ubippc64le-builder docker.io/pytorch/ubippc64le-builder:cpu-ppc64le +$ docker image save -o ~/ubi-ppc64le.tar docker.io/pytorch/ubippc64le-builder:cpu-ppc64le +``` + Next step is to build `actions-runner` image using: ``` @@ -36,8 +50,7 @@ $ sudo /bin/cp /etc/actions-runner//key_priv $ sudo echo | sudo tee /etc/actions-runner//appid.env $ sudo echo | sudo tee /etc/actions-runner//installid.env $ sudo echo NAME= | sudo tee /etc/actions-runner//env -$ sudo echo OWNER= | sudo tee -a /etc/actions-runner//env -$ sudo echo REPO=pytorch | sudo tee -a /etc/actions-runner//env +$ sudo echo ORG= | sudo tee -a /etc/actions-runner//env $ cd self-hosted-builder $ sudo /bin/cp helpers/*.sh /usr/local/bin/ $ sudo chmod 755 /usr/local/bin/app_token.sh /usr/local/bin/gh_token_generator.sh diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile index c52a3c718c4a..b9fc516acad0 100755 --- a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile @@ -32,10 +32,8 @@ RUN update-alternatives --set iptables /usr/sbin/iptables-legacy && \ update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy -# Add Docker GPG key and repository -RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg && \ - echo "deb [arch=ppc64el 
signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" > /etc/apt/sources.list.d/docker.list && \ - apt-get update && apt-get install -y docker-ce docker-ce-cli containerd.io && \ +# Install Podman and podman-docker (Docker compatibility) +RUN apt-get update && apt-get install -y podman podman-docker && \ apt-get clean && rm -rf /var/lib/apt/lists/* # Install dotnet SDK and other dependencies @@ -56,10 +54,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN useradd -c "Action Runner" -m runner && \ usermod -L runner && \ echo "runner ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/runner && \ - groupadd docker || true && \ - usermod -aG docker runner && \ - (test -S /var/run/docker.sock && chmod 660 /var/run/docker.sock && chgrp docker /var/run/docker.sock || true) + groupadd podman || true && \ + usermod -aG podman runner +# Configure Podman cgroup manager +RUN mkdir -p /etc/containers && \ + printf '[engine]\ncgroup_manager = "cgroupfs"\n' > /etc/containers/containers.conf # Add and configure GitHub Actions runner ARG RUNNERREPO="https://github.com/actions/runner" @@ -96,6 +96,8 @@ USER runner # Set working directory WORKDIR /opt/runner +COPY --chown=runner:runner pytorch-ubi-ppc64le.tar /opt/runner/pytorch-ubi-ppc64le.tar + # Define entry point and command ENTRYPOINT ["/usr/bin/entrypoint"] CMD ["/usr/bin/actions-runner"] diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner index e8c9bde29074..07d2b7704382 100755 --- a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner @@ -2,38 +2,15 @@ set -e -u -trap cleanup EXIT +# first import docker image +if [ -f ./pytorch-ubi-ppc64le.tar ] ; then + docker image load --input pytorch-ubi-ppc64le.tar + docker image tag
docker.io/pytorch/ubippc64le-builder:cpu-ppc64le docker.io/pytorch/ubippc64le-builder:cpu-ppc64le-main + rm -f ubi-ppc64le.tar +fi token_file=registration-token.json -# Function to clean up and unregister the runner -cleanup() { - echo "Cleaning up temporary files..." - [ -f "$token_file" ] && rm -f "$token_file" - [ -f "runner-id.json" ] && rm -f "runner-id.json" - - echo "Unregistering the runner from GitHub..." - ACCESS_TOKEN="$(cat /run/runner_secret)" - runner_id=$(curl -s \ - -H "Accept: application/vnd.github.v3+json" \ - -H "Authorization: Bearer ${ACCESS_TOKEN}" \ - "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners" | \ - jq --raw-output '.runners[] | select(.name=="'"${NAME}"'") | .id') - - if [ -n "$runner_id" ]; then - curl -s \ - -X DELETE \ - -H "Accept: application/vnd.github.v3+json" \ - -H "Authorization: Bearer ${ACCESS_TOKEN}" \ - "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners/$runner_id" - echo "Runner unregistered successfully." - else - echo "Warning: Runner ID for ${NAME} not found. It may already be removed." - fi - - unset ACCESS_TOKEN runner_id -} - # Fetch GitHub access token if [ ! -f /run/runner_secret ]; then echo "Error: Access token file not found at /run/runner_secret." 
@@ -48,11 +25,14 @@ curl \ -X POST \ -H "Accept: application/vnd.github.v3+json" \ -H "Authorization: Bearer ${ACCESS_TOKEN}" \ - "https://api.github.com/repos/${OWNER}/${REPO}/actions/runners/registration-token" \ + "https://api.github.com/orgs/${ORG}/actions/runners/registration-token" \ -o "$token_file" unset ACCESS_TOKEN +sudo umount /run/runner_secret +sudo rm -f /run/runner_secret + # register runner as ephemeral runner # it does one job, stops and unregisters registration_token=$(jq --raw-output .token "$token_file") @@ -60,7 +40,7 @@ registration_token=$(jq --raw-output .token "$token_file") ./config.sh \ --unattended \ --ephemeral \ - --url "https://github.com/${OWNER}/${REPO}" \ + --url "https://github.com/${ORG}" \ --token "${registration_token}" \ --name "${NAME}" \ --no-default-labels \ diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index f4c75a5e5cfc..b8045f5e20fb 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -109,7 +109,7 @@ jobs: steps: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -119,17 +119,16 @@ jobs: # checkout. In other cases you should prefer a local checkout. 
- name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main - if: inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' with: no-sudo: true - name: Setup Linux uses: ./.github/actions/setup-linux - if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' - name: configure aws credentials uses: aws-actions/configure-aws-credentials@v3 - if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' }} + if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-build @@ -138,13 +137,13 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} - name: Use following to pull public copy of the image id: print-ghcr-mirror - if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' env: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash @@ -154,26 +153,24 @@ jobs: - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: 
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Parse ref id: parse-ref - if: inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' run: .github/scripts/parse_ref.py - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id - if: always() && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: always() with: github-token: ${{ secrets.GITHUB_TOKEN }} # Apply the filter logic to the build step too if the test-config label is already there - name: Select all requested test configurations (if the test matrix is available) id: filter - if: inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' uses: ./.github/actions/filter-test-configs with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -184,14 +181,14 @@ jobs: - name: Download pytest cache uses: ./.github/actions/pytest-cache-download continue-on-error: true - if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} s3_bucket: ${{ inputs.s3-bucket }} - name: Build - if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && (inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9') + if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' id: build env: BUILD_ENVIRONMENT: ${{ inputs.build-environment }} @@ -278,16 +275,6 @@ jobs: END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" - - name: Execute Build and Tests inside ppc64le Docker Container - if: inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' - run: | - CONTAINER_NAME="temp_builder_${RUN_ID}" - docker run -d --name "$CONTAINER_NAME" pytorch-ppc64le:ubi9.3 /ppc64le-build.sh - docker wait "$CONTAINER_NAME" - 
docker logs "$CONTAINER_NAME" - docker cp "$CONTAINER_NAME":/workspace/pytorch/dist/. dist/ - docker rm "$CONTAINER_NAME" - - name: Archive artifacts into zip if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' run: | @@ -295,7 +282,7 @@ jobs: - name: Store PyTorch Build Artifacts on S3 uses: seemethere/upload-artifact-s3@v5 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -305,7 +292,7 @@ jobs: - name: Store PyTorch Build Artifacts on S3 for split build uses: seemethere/upload-artifact-s3@v5 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' with: name: ${{ inputs.build-environment }}-experimental-split-build retention-days: 14 @@ -331,27 +318,8 @@ jobs: if-no-files-found: error path: artifacts.zip - - name: Archive ppc64le artifacts into zip - if: inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' - run: | - zip -1 -r artifacts.zip dist/ - - - - name: Store PyTorch Build Artifacts for ppc64le - uses: actions/upload-artifact@v4 - if: inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' - with: - name: ${{ inputs.build-environment }}-ubi9 - retention-days: 14 - if-no-files-found: error - path: artifacts.zip - - - name: Cleanup dangling Docker images for ppc64le - if: 
always() && inputs.build-environment == 'linux-ppc64le-binary-manywheel-ubi9' - run: docker image prune -f - - name: Upload sccache stats - if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9' + if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' uses: ./.github/actions/upload-sccache-stats with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -359,7 +327,7 @@ jobs: - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() && (inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel-ubi9') + if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml index fb26c1bfc5a3..cb143a9163db 100755 --- a/.github/workflows/ppc64le.yml +++ b/.github/workflows/ppc64le.yml @@ -11,27 +11,39 @@ concurrency: cancel-in-progress: true jobs: - linux-ppc64le-docker-image-build: - name: Build docker image for ppc64le - runs-on: linux.ppc64le + ppc64le-UBI-9-Build-and-Test: + name: ppc64le-UBI-9-Build-and-Test + runs-on: linux.ppc64le + env: + IMAGE_NAME: docker.io/pytorch/ubippc64le-builder:cpu-ppc64le-main # Define the image variable + steps: - - name: Checkout repository - uses: actions/checkout@v3 + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: - fetch-depth: 0 - submodules: true - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - name: Build Docker image for ppc64le + no-sudo: true + + - name: Execute Build and Tests inside ppc64le Docker Container + run: | + docker run --rm -v ${GITHUB_WORKSPACE}:/workspace/pytorch $IMAGE_NAME 
/workspace/pytorch/.github/scripts/ppc64le-build.sh + + - name: Archive ppc64le artifacts into zip run: | - docker build -f .ci/docker/manywheel/Dockerfile_ppc64le -t pytorch-ppc64le:ubi9.3 . + zip -1 -r artifacts.zip dist/ + + - name: Store PyTorch Build Artifacts for ppc64le + uses: actions/upload-artifact@v4 + with: + name: linux-ppc64le-binary-ubi9 + retention-days: 14 + if-no-files-found: error + path: artifacts.zip - ppc64le-UBI-9-3-Build-and-Test: - name: ppc64le-UBI-9-3-Build-and-Test - uses: ./.github/workflows/_linux-build.yml - needs: linux-ppc64le-docker-image-build - with: - build-environment: linux-ppc64le-binary-manywheel-ubi9 - docker-image-name: pytorch-ppc64le:ubi9.3 - runner: linux.ppc64le + - name: Cleanup docker + if: always() + shell: bash + run: | + # Stop and remove all containers + docker ps -q | xargs -r docker stop + docker ps -aq | xargs -r docker rm \ No newline at end of file From f93aae3c25197c47c8a8287d8f93d3d1797fcd45 Mon Sep 17 00:00:00 2001 From: sandeepgupta12 Date: Tue, 25 Feb 2025 23:26:46 +0530 Subject: [PATCH 5/6] Address review comments: Permission changes and removed mount docker socket --- .github/scripts/ppc64le-ci/README.md | 2 +- .../ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile | 4 ++-- .../ppc64le-ci/self-hosted-builder/actions-runner@.service | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/scripts/ppc64le-ci/README.md b/.github/scripts/ppc64le-ci/README.md index f3f0e83c0acd..4a3433b7fb50 100755 --- a/.github/scripts/ppc64le-ci/README.md +++ b/.github/scripts/ppc64le-ci/README.md @@ -24,7 +24,7 @@ $ cd pytorch $ git submodule update --init --recursive $ GPU_ARCH_TYPE=cpu-ppc64le "$(pwd)/.ci/docker/manywheel/build.sh" ubippc64le-builder $ docker image tag localhost/pytorch/ubippc64le-builder docker.io/pytorch/ubippc64le-builder:cpu-ppc64le -$ docker image save -o ~/ubi-ppc64le.tar docker.io/pytorch/ubippc64le-builder:cpu-ppc64le +$ docker image save -o ~/pytorch-ubi-ppc64le.tar 
docker.io/pytorch/ubippc64le-builder:cpu-ppc64le ``` Next step is to build `actions-runner` image using: diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile index b9fc516acad0..d45b1be2fe5a 100755 --- a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile @@ -84,11 +84,11 @@ RUN mkdir -p /opt/runner && \ chown -R runner:runner /opt/runner && \ su - runner -c "/opt/runner/config.sh --version" -RUN rm -rf /tmp/runner /tmp/runner.patch +RUN rm -rf /tmp/runner /tmp/runner.patch # Copy custom scripts and set permissions COPY fs/ / -RUN chmod 777 /usr/bin/actions-runner /usr/bin/entrypoint +RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint # Switch to the runner user USER runner diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service index abbc0ffc97e5..bd1a636cef3c 100755 --- a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner@.service @@ -14,7 +14,6 @@ ExecStartPre=-/usr/local/bin/gh_cat_token.sh /etc/actions-runner/%i/ghtoken.env ExecStart=/usr/bin/docker run \ --env-file=/etc/actions-runner/%i/env \ --volume /etc/actions-runner/%i/ghtoken.txt:/run/runner_secret \ - --volume /var/run/docker.sock:/var/run/docker.sock \ --init \ --interactive \ --name=actions-runner.%i \ From 5929e1f7cefcd2c5b2e81b0864b0388b7213c6d2 Mon Sep 17 00:00:00 2001 From: sandeepgupta12 Date: Tue, 4 Mar 2025 16:41:33 +0530 Subject: [PATCH 6/6] Reuse _linux-build.yml for ppc64le build and updated other files accordingly --- .ci/docker/manywheel/Dockerfile_ppc64le | 107 ++++++++++++++---- .ci/docker/manywheel/build_scripts/build.sh | 2 +- .ci/pytorch/build.sh | 8 +- .github/scripts/ppc64le-build.sh | 
33 ------ .github/scripts/ppc64le-ci/README.md | 12 +- .../actions-runner.Dockerfile | 2 +- .../fs/usr/bin/actions-runner | 8 +- .github/workflows/_linux-build.yml | 62 ++++++---- .github/workflows/ppc64le.yml | 45 ++------ 9 files changed, 150 insertions(+), 129 deletions(-) delete mode 100755 .github/scripts/ppc64le-build.sh diff --git a/.ci/docker/manywheel/Dockerfile_ppc64le b/.ci/docker/manywheel/Dockerfile_ppc64le index 479e9370ef86..936d5037d74c 100755 --- a/.ci/docker/manywheel/Dockerfile_ppc64le +++ b/.ci/docker/manywheel/Dockerfile_ppc64le @@ -1,31 +1,92 @@ -# Use UBI 9 as base image -FROM registry.access.redhat.com/ubi9/ubi:9.5 AS base +# Use the manylinux_2_28 base image for ppc64le +FROM quay.io/pypa/manylinux_2_28_ppc64le as base -# Install necessary dependencies -RUN dnf install -y \ - https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ - dnf install -y git cmake ninja-build gcc-toolset-13 rust cargo zip \ - python3 python3-devel && \ - dnf clean all +# Language variables +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ENV LANGUAGE=C.UTF-8 -ENV PATH="/opt/rh/gcc-toolset-13/root/usr/bin:$PATH" -ENV MANPATH="/opt/rh/gcc-toolset-13/root/usr/share/man" -ENV INFOPATH="/opt/rh/gcc-toolset-13/root/usr/share/info" -ENV PCP_DIR="/opt/rh/gcc-toolset-13/root" -ENV LD_LIBRARY_PATH="/opt/rh/gcc-toolset-13/root/usr/lib64:/opt/rh/gcc-toolset-13/root/usr/lib" +ARG DEVTOOLSET_VERSION=13 -# Set Python and pip aliases to use Python 3.9 -RUN ln -sf /usr/bin/python3 /usr/bin/python && \ - ln -sf /usr/bin/pip3 /usr/bin/pip +# Create symbolic links for Python 3.12 +RUN ln -sf /opt/python/cp312-cp312/bin/python3.12 /usr/bin/python3 && \ + ln -sf /opt/python/cp312-cp312/bin/python3.12 /usr/bin/python -COPY requirements.txt . 
-# Install Python packages via pip -RUN pip install wheel -RUN pip install -r requirements.txt +# Install required system dependencies +RUN yum -y install epel-release && \ + yum -y update && \ + yum install -y \ + sudo \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + yasm \ + less \ + zstd \ + libgomp \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ + gcc-toolset-${DEVTOOLSET_VERSION}-binutils \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ + cmake \ + ninja-build \ + rust \ + cargo \ + llvm-devel \ + libzstd-devel \ + python3.12-devel \ + python3.12-setuptools \ + python3.12-pip \ + python3-virtualenv \ + python3.12-pyyaml \ + python3.12-numpy \ + python3.12-wheel \ + python3.12-cryptography \ + blas-devel \ + openblas-devel \ + lapack-devel \ + atlas-devel \ + libjpeg-devel \ + libxslt-devel \ + libxml2-devel \ + openssl-devel \ + valgrind + -RUN mkdir -p /workspace/pytorch +# Ensure the correct Python version is used +ENV PATH=/opt/python/cp312-cp312/bin:$PATH +# Add gcc-toolset to the path +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH -ENTRYPOINT [] -CMD ["/bin/bash"] +# Configure git to avoid safe directory issues +RUN git config --global --add safe.directory "*" +# Install required Python packages +RUN pip install --upgrade pip +RUN pip install typing_extensions pyyaml setuptools + +# Install test dependencies +RUN dnf install -y \ + protobuf-devel \ + protobuf-c-devel \ + protobuf-lite-devel \ + wget \ + patch +# Set default entrypoint +ENTRYPOINT [] +CMD ["/bin/bash"] \ No newline at end of file diff --git a/.ci/docker/manywheel/build_scripts/build.sh b/.ci/docker/manywheel/build_scripts/build.sh index 
e2cb1c7f27cd..34ea62cc2099 100644 --- a/.ci/docker/manywheel/build_scripts/build.sh +++ b/.ci/docker/manywheel/build_scripts/build.sh @@ -20,7 +20,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 # the final image after compiling Python PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel" -if [ "$(uname -m)" != "s390x" ] ; then +if [ "$(uname -m)" != "s390x" ] && [ "$(uname -m)" != "ppc64le" ] ; then PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} db4-devel" else PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} libdb-devel" diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index 24575e358002..f3d065499f3e 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -230,7 +230,7 @@ fi # Do not change workspace permissions for ROCm and s390x CI jobs # as it can leave workspace with bad permissions for cancelled jobs -if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then +if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *ppc64le* && -d /var/lib/jenkins/workspace ]]; then # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") cleanup_workspace() { @@ -274,8 +274,10 @@ else # XLA test build fails when WERROR=1 # set only when building other architectures # or building non-XLA tests.
+ # ppc64le builds fail when WERROR=1 if [[ "$BUILD_ENVIRONMENT" != *rocm* && - "$BUILD_ENVIRONMENT" != *xla* ]]; then + "$BUILD_ENVIRONMENT" != *xla* && + "$BUILD_ENVIRONMENT" != *ppc64le* ]]; then if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then # Install numpy-2.0.2 for builds which are backward compatible with 1.X python -mpip install numpy==2.0.2 @@ -396,6 +398,6 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; python tools/stats/export_test_times.py fi # don't do this for bazel or s390x as they don't use sccache -if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then +if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *ppc64le* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then print_sccache_stats fi diff --git a/.github/scripts/ppc64le-build.sh b/.github/scripts/ppc64le-build.sh deleted file mode 100755 index 751174aead6c..000000000000 --- a/.github/scripts/ppc64le-build.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -# Environment variables -PACKAGE_NAME=pytorch - -cd /workspace/$PACKAGE_NAME - -# Clean up old artifacts -rm -rf build/ dist/ torch.egg-info/ - -# Build and install PyTorch wheel -if ! (MAX_JOBS=$(nproc) python setup.py bdist_wheel && pip install dist/*.whl); then - echo "------------------$PACKAGE_NAME:install_fails-------------------------------------" - exit 1 -fi - -# register PrivateUse1HooksInterface -python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_bfloat16 -python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float16 -python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float32 -python test/test_utils.py TestDeviceUtilsCPU.test_device_mode_ops_sparse_mm_reduce_cpu_float64 - -cd .. -pip install pytest pytest-xdist - -if ! 
pytest "$PACKAGE_NAME/test/test_utils.py"; then - echo "------------------$PACKAGE_NAME:install_success_but_test_fails---------------------" - exit 2 - -else - echo "------------------$PACKAGE_NAME:install_and_test_both_success-------------------------" - exit 0 -fi \ No newline at end of file diff --git a/.github/scripts/ppc64le-ci/README.md b/.github/scripts/ppc64le-ci/README.md index 4a3433b7fb50..8676799de341 100755 --- a/.github/scripts/ppc64le-ci/README.md +++ b/.github/scripts/ppc64le-ci/README.md @@ -3,7 +3,7 @@ ## Install prerequisites. ``` -$ sudo dnf install podman podman-docker jq +$ sudo apt install podman podman-docker jq ``` ## Add services. @@ -14,7 +14,7 @@ $ sudo systemctl daemon-reload ## Rebuild the image -First build ppc64le builder image `docker.io/pytorch/ubippc64le-builder`, +First build ppc64le builder image `docker.io/pytorch/manylinuxppc64le-builder`, using following commands: ``` @@ -22,9 +22,9 @@ $ cd ~ $ git clone https://github.com/pytorch/pytorch $ cd pytorch $ git submodule update --init --recursive -$ GPU_ARCH_TYPE=cpu-ppc64le "$(pwd)/.ci/docker/manywheel/build.sh" ubippc64le-builder -$ docker image tag localhost/pytorch/ubippc64le-builder docker.io/pytorch/ubippc64le-builder:cpu-ppc64le -$ docker image save -o ~/pytorch-ubi-ppc64le.tar docker.io/pytorch/ubippc64le-builder:cpu-ppc64le +$ GPU_ARCH_TYPE=cpu-ppc64le "$(pwd)/.ci/docker/manywheel/build.sh" manylinuxppc64le-builder +$ docker image tag localhost/pytorch/manylinuxppc64le-builder docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le +$ docker image save -o ~/manywheel-ppc64le.tar docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le ``` Next step is to build `actions-runner` image using: @@ -60,4 +60,4 @@ $ sudo chmod 755 /usr/local/bin/app_token.sh /usr/local/bin/gh_token_generator.s ``` $ sudo systemctl enable --now actions-runner@$NAME -``` +``` \ No newline at end of file diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile 
b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile index d45b1be2fe5a..f1589d7edf9a 100755 --- a/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/actions-runner.Dockerfile @@ -96,7 +96,7 @@ USER runner # Set working directory WORKDIR /opt/runner -COPY --chown=runner:runner pytorch-ubi-ppc64le.tar /opt/runner/pytorch-ubi-ppc64le.tar +COPY --chown=runner:runner manywheel-ppc64le.tar /opt/runner/manywheel-ppc64le.tar # Define entry point and command ENTRYPOINT ["/usr/bin/entrypoint"] diff --git a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner index 07d2b7704382..e60c9e7314d7 100755 --- a/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner +++ b/.github/scripts/ppc64le-ci/self-hosted-builder/fs/usr/bin/actions-runner @@ -3,10 +3,10 @@ set -e -u # first import docker image -if [ -f ./pytorch-ubi-ppc64le.tar ] ; then - docker image load --input pytorch-ubi-ppc64le.tar - docker image tag docker.io/pytorch/ubippc64le-builder:cpu-ppc64le docker.io/pytorch/ubippc64le-builder:cpu-ppc64le-main - rm -f ubi-ppc64le.tar +if [ -f ./manywheel-ppc64le.tar ] ; then + docker image load --input manywheel-ppc64le.tar + docker image tag docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le docker.io/pytorch/manylinuxppc64le-builder:cpu-ppc64le-main + rm -f manywheel-ppc64le.tar fi token_file=registration-token.json diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index b8045f5e20fb..1aeaa5d8b73c 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -109,7 +109,7 @@ jobs: steps: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 
'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -124,11 +124,11 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' - name: configure aws credentials uses: aws-actions/configure-aws-credentials@v3 - if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} + if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' }} with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-build @@ -137,13 +137,13 @@ jobs: - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} - name: Use following to pull public copy of the image id: print-ghcr-mirror - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' env: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash @@ -153,7 +153,7 @@ jobs: - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: docker-image: ${{ 
steps.calculate-docker-image.outputs.docker-image }} @@ -181,7 +181,7 @@ jobs: - name: Download pytest cache uses: ./.github/actions/pytest-cache-download continue-on-error: true - if: inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} @@ -205,6 +205,7 @@ jobs: TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }} + DOCKER_IMAGE_PPC64LE: ${{ inputs.docker-image-name }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} @@ -224,17 +225,35 @@ jobs: # since some steps are skipped on s390x, if they are necessary, run them here env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + + elif [[ ${BUILD_ENVIRONMENT} == *"ppc64le"* ]]; then + JENKINS_USER="" + USED_IMAGE="${DOCKER_IMAGE_PPC64LE}" + # ensure that docker container cleanly exits in 12 hours + # if for some reason cleanup action doesn't stop container + # when job is cancelled + DOCKER_SHELL_CMD="sleep 12h" + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + else JENKINS_USER="--user jenkins" USED_IMAGE="${DOCKER_IMAGE}" DOCKER_SHELL_CMD= fi - # Leaving 1GB for the runner and other things - TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) - # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap - # comes from https://github.com/pytorch/test-infra/pull/6058 - 
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + # Compute memory limits, but skip setting them for ppc64le + if [[ ${BUILD_ENVIRONMENT} != *"ppc64le"* ]]; then + + # Leaving 1GB for the runner and other things + TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) + # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap + # comes from https://github.com/pytorch/test-infra/pull/6058 + TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + MEMORY_FLAGS="--memory=${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g --memory-swap=${TOTAL_MEMORY_WITH_SWAP}g" + else + MEMORY_FLAGS="" + fi # detached container should get cleaned up by teardown_ec2_linux # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty @@ -257,13 +276,12 @@ jobs: -e HUGGING_FACE_HUB_TOKEN \ -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ -e USE_SPLIT_BUILD \ - --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ - --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --tty \ --detach \ + ${MEMORY_FLAGS} \ ${JENKINS_USER} \ -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ -w /var/lib/jenkins/workspace \ @@ -282,7 +300,7 @@ jobs: - name: Store PyTorch Build Artifacts on S3 uses: seemethere/upload-artifact-s3@v5 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -292,7 +310,7 @@ jobs: - name: Store PyTorch Build Artifacts on S3 for split build uses: seemethere/upload-artifact-s3@v5 - if: 
inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' with: name: ${{ inputs.build-environment }}-experimental-split-build retention-days: 14 @@ -300,9 +318,9 @@ jobs: path: artifacts.zip s3-bucket: ${{ inputs.s3-bucket }} - - name: Store PyTorch Build Artifacts for s390x + - name: Store PyTorch Build Artifacts for s390x and ppc64le uses: actions/upload-artifact@v4 - if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel' + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && (inputs.build-environment == 'linux-s390x-binary-manywheel' || inputs.build-environment == 'linux-ppc64le-binary-manywheel') with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -319,7 +337,7 @@ jobs: path: artifacts.zip - name: Upload sccache stats - if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' uses: ./.github/actions/upload-sccache-stats with: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -327,12 +345,12 @@ jobs: - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' + if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' && inputs.build-environment != 'linux-ppc64le-binary-manywheel' - name: Cleanup docker - if: always() && inputs.build-environment == 
'linux-s390x-binary-manywheel' + if: always() && (inputs.build-environment == 'linux-s390x-binary-manywheel' || inputs.build-environment == 'linux-ppc64le-binary-manywheel') shell: bash run: | - # on s390x stop the container for clean worker stop + # on s390x and ppc64le stop the container for clean worker stop docker stop -a || true docker kill -a || true diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml index cb143a9163db..e5875982c187 100755 --- a/.github/workflows/ppc64le.yml +++ b/.github/workflows/ppc64le.yml @@ -11,39 +11,12 @@ concurrency: cancel-in-progress: true jobs: - ppc64le-UBI-9-Build-and-Test: - name: ppc64le-UBI-9-Build-and-Test - runs-on: linux.ppc64le - env: - IMAGE_NAME: docker.io/pytorch/ubippc64le-builder:cpu-ppc64le-main # Define the image variable - - steps: - - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main - with: - no-sudo: true - - - name: Execute Build and Tests inside ppc64le Docker Container - run: | - docker run --rm -v ${GITHUB_WORKSPACE}:/workspace/pytorch $IMAGE_NAME /workspace/pytorch/.github/scripts/ppc64le-build.sh - - - name: Archive ppc64le artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ - - - name: Store PyTorch Build Artifacts for ppc64le - uses: actions/upload-artifact@v4 - with: - name: linux-ppc64le-binary-ubi9 - retention-days: 14 - if-no-files-found: error - path: artifacts.zip - - - name: Cleanup docker - if: always() - shell: bash - run: | - # Stop and remove all containers - docker ps -q | xargs -r docker stop - docker ps -aq | xargs -r docker rm - \ No newline at end of file + linux-manylinux-2_28-py3-cpu-ppc64le-build: + if: github.repository_owner == 'pytorch' + name: linux-manylinux-2_28-py3-cpu-ppc64le-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-ppc64le-binary-manywheel + docker-image-name: pytorch/manylinuxppc64le-builder:cpu-ppc64le-main + runner: linux.ppc64le + secrets: inherit \ No newline at end of file