From bd58332e514873796d83cb677108b9c648eb6fa6 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 7 Nov 2025 17:58:30 -0500 Subject: [PATCH 1/2] Metal backend: Add Whisper to CI workflow --- .ci/scripts/export_model_cuda_artifact.sh | 63 ++++++++--- .ci/scripts/test_model_cuda_e2e.sh | 47 +++++--- .github/workflows/cuda.yml | 4 +- .github/workflows/metal.yml | 128 ++++++---------------- 4 files changed, 114 insertions(+), 128 deletions(-) diff --git a/.ci/scripts/export_model_cuda_artifact.sh b/.ci/scripts/export_model_cuda_artifact.sh index 3ff27fc2bd0..3c173b0ea2a 100755 --- a/.ci/scripts/export_model_cuda_artifact.sh +++ b/.ci/scripts/export_model_cuda_artifact.sh @@ -5,15 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Export model to CUDA format with optional quantization +# Export model to CUDA/Metal format with optional quantization show_help() { cat << EOF -Usage: export_model_cuda_artifact.sh [quant_name] [output_dir] +Usage: export_model_artifact.sh [quant_name] [output_dir] -Export a HuggingFace model to CUDA format with optional quantization. +Export a HuggingFace model to CUDA/Metal format with optional quantization. Arguments: + device cuda or metal (required) + hf_model HuggingFace model ID (required) Supported models: - mistralai/Voxtral-Mini-3B-2507 @@ -29,9 +31,9 @@ Arguments: output_dir Output directory for artifacts (optional, default: current directory) Examples: - export_model_cuda_artifact.sh "openai/whisper-small" - export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" - export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output" + export_model_artifact.sh metal "openai/whisper-small" + export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" + export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output" EOF } @@ -48,9 +50,22 @@ fi set -eux -HF_MODEL="$1" -QUANT_NAME="${2:-non-quantized}" -OUTPUT_DIR="${3:-.}" +DEVICE="$1" +HF_MODEL="$2" +QUANT_NAME="${3:-non-quantized}" +OUTPUT_DIR="${4:-.}" + +case "$DEVICE" in + cuda) + ;; + metal) + ;; + *) + echo "Error: Unsupported device '$DEVICE'" + echo "Supported devices: cuda, metal" + exit 1 + ;; +esac # Determine model configuration based on HF model ID case "$HF_MODEL" in @@ -75,6 +90,10 @@ case "$HF_MODEL" in fi ;; google/gemma-3-4b-it) + if [ "$DEVICE" = "metal" ]; then + echo "Error: Export for device 'metal' is not yet tested for model '$HF_MODEL'" + exit 1 + fi MODEL_NAME="gemma3" TASK="multimodal-text-to-text" MAX_SEQ_LEN="64" @@ -95,9 +114,17 @@ case "$QUANT_NAME" in EXTRA_ARGS="" ;; quantized-int4-tile-packed) + if [ "$DEVICE" = "metal" ]; then + echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'" + exit 1 + fi EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d" ;; quantized-int4-weight-only) + if [ "$DEVICE" = "metal" ]; then + echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'" + exit 1 + fi EXTRA_ARGS="--qlinear_encoder 4w" ;; *) @@ -118,12 +145,18 @@ MAX_SEQ_LEN_ARG="" if [ -n "$MAX_SEQ_LEN" ]; then MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN" fi + +DEVICE_ARG="" +if [ "$DEVICE" = "cuda" ]; then + DEVICE_ARG="--device cuda" +fi + optimum-cli export executorch \ --model "$HF_MODEL" \ --task "$TASK" \ - --recipe "cuda" \ + --recipe "$DEVICE" \ --dtype bfloat16 \ - 
--device cuda \ + ${DEVICE_ARG} \ ${MAX_SEQ_LEN_ARG} \ ${EXTRA_ARGS} \ --output_dir ./ @@ -137,7 +170,7 @@ if [ -n "$PREPROCESSOR_OUTPUT" ]; then fi test -f model.pte -test -f aoti_cuda_blob.ptd +test -f aoti_${DEVICE}_blob.ptd if [ -n "$PREPROCESSOR_OUTPUT" ]; then test -f $PREPROCESSOR_OUTPUT fi @@ -145,10 +178,10 @@ echo "::endgroup::" echo "::group::Store $MODEL_NAME Artifacts" mkdir -p "${OUTPUT_DIR}" -cp model.pte "${OUTPUT_DIR}/" -cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/" +mv model.pte "${OUTPUT_DIR}/" +mv aoti_${DEVICE}_blob.ptd "${OUTPUT_DIR}/" if [ -n "$PREPROCESSOR_OUTPUT" ]; then - cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/" + mv $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/" fi ls -al "${OUTPUT_DIR}" echo "::endgroup::" diff --git a/.ci/scripts/test_model_cuda_e2e.sh b/.ci/scripts/test_model_cuda_e2e.sh index dc577dfc753..13ebeff34e5 100755 --- a/.ci/scripts/test_model_cuda_e2e.sh +++ b/.ci/scripts/test_model_cuda_e2e.sh @@ -5,15 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test CUDA model end-to-end, need to run .ci/scripts/export_model_cuda_artifact.sh first +# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first show_help() { cat << EOF -Usage: test_model_cuda_e2e.sh [model_dir] +Usage: test_model_e2e.sh [model_dir] -Build and run end-to-end tests for CUDA models. +Build and run end-to-end tests for CUDA/Metal models. Arguments: + device cuda or metal (required) + hf_model HuggingFace model ID (required) Supported models: - mistralai/Voxtral-Mini-3B-2507 @@ -27,12 +29,12 @@ Arguments: - quantized-int4-weight-only model_dir Directory containing model artifacts (optional, default: current directory) - Expected files: model.pte, aoti_cuda_blob.ptd + Expected files: model.pte, aoti_cuda_blob.ptd/aoti_metal_blob.ptd Tokenizers and test files will be downloaded to this directory Examples: - test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized" - test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output" + test_model_e2e.sh metal "openai/whisper-small" "non-quantized" + test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output" EOF } @@ -55,20 +57,21 @@ fi set -eux -HF_MODEL="$1" -QUANT_NAME="$2" +DEVICE="$1" +HF_MODEL="$2" +QUANT_NAME="$3" # Download tokenizers, audio, and image files to this directory -MODEL_DIR="${3:-.}" +MODEL_DIR="${4:-.}" echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)" -# Make sure model.pte and aoti_cuda_blob.ptd exist +# Make sure model.pte and aoti_${DEVICE}_blob.ptd exist if [ ! -f "$MODEL_DIR/model.pte" ]; then echo "Error: model.pte not found in $MODEL_DIR" exit 1 fi -if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then - echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR" +if [ ! -f "$MODEL_DIR/aoti_${DEVICE}_blob.ptd" ]; then + echo "Error: aoti_${DEVICE}_blob.ptd not found in $MODEL_DIR" exit 1 fi # Locate EXECUTORCH_ROOT from the directory of this script @@ -152,14 +155,24 @@ ls -al echo "::endgroup::" echo "::group::Build $MODEL_NAME Runner" + +if [ "$DEVICE" = "cuda" ]; then + BUILD_BACKEND="EXECUTORCH_BUILD_CUDA" +elif [ "$DEVICE" = "metal" ]; then + BUILD_BACKEND="EXECUTORCH_BUILD_METAL" +else + echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'." 
+ exit 1 +fi + cmake --preset llm \ - -DEXECUTORCH_BUILD_CUDA=ON \ + -D${BUILD_BACKEND}=ON \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ -Bcmake-out -S. cmake --build cmake-out -j$(nproc) --target install --config Release -cmake -DEXECUTORCH_BUILD_CUDA=ON \ +cmake -D${BUILD_BACKEND}=ON \ -DCMAKE_BUILD_TYPE=Release \ -Sexamples/models/$RUNNER_PATH \ -Bcmake-out/examples/models/$RUNNER_PATH/ @@ -168,11 +181,13 @@ echo "::endgroup::" echo "::group::Run $MODEL_NAME Runner" set +e -export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH +if [ "$DEVICE" = "cuda" ]; then + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH +fi # Build runner command with common arguments RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET" -RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0" +RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_${DEVICE}_blob.ptd --temperature 0" # Add model-specific arguments case "$MODEL_NAME" in diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 80d5484ff15..7cc937fe6ca 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -142,7 +142,7 @@ jobs: pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} echo "::endgroup::" - source .ci/scripts/export_model_cuda_artifact.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" benchmark-model-cuda: name: benchmark-model-cuda @@ -249,4 +249,4 @@ jobs: download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | - source .ci/scripts/test_model_cuda_e2e.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml index 5a47f07ff0b..92351883e8f 100644 --- a/.github/workflows/metal.yml +++ b/.github/workflows/metal.yml @@ -28,12 +28,24 @@ jobs: PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh echo "::endgroup::" - export-voxtral-metal-artifact: - name: export-voxtral-metal-artifact + export-model-metal-artifact: + name: export-model-metal-artifact # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit + strategy: + fail-fast: false + matrix: + model: + - repo: "mistralai" + name: "Voxtral-Mini-3B-2507" + - repo: "openai" + name: "whisper-small" + - repo: "openai" + name: "whisper-large-v3-turbo" + quant: + - "non-quantized" with: runner: macos-m2-stable python-version: '3.11' @@ -41,7 +53,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN - upload-artifact: voxtral-metal-export + upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ 
matrix.quant }} script: | set -eux @@ -54,7 +66,6 @@ jobs: OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}" ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} - ${CONDA_RUN} pip install mistral-common librosa echo "::endgroup::" echo "::group::Setup ExecuTorch" @@ -65,44 +76,31 @@ jobs: ${CONDA_RUN} pip list echo "::endgroup::" - echo "::group::Export Voxtral" - ${CONDA_RUN} optimum-cli export executorch \ - --model "mistralai/Voxtral-Mini-3B-2507" \ - --task "multimodal-text-to-text" \ - --recipe "metal" \ - --dtype bfloat16 \ - --max_seq_len 1024 \ - --output_dir ./ - ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \ - --feature_size 128 \ - --stack_output \ - --max_audio_len 300 \ - --output_file voxtral_preprocessor.pte + ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" - test -f model.pte - test -f aoti_metal_blob.ptd - test -f voxtral_preprocessor.pte - echo "::endgroup::" - - echo "::group::Store Voxtral Artifacts" - mkdir -p "${RUNNER_ARTIFACT_DIR}" - cp model.pte "${RUNNER_ARTIFACT_DIR}/" - cp aoti_metal_blob.ptd "${RUNNER_ARTIFACT_DIR}/" - cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" - ls -al "${RUNNER_ARTIFACT_DIR}" - echo "::endgroup::" - - test-voxtral-metal-e2e: - name: test-voxtral-metal-e2e - needs: export-voxtral-metal-artifact + test-model-metal-e2e: + name: test-model-metal-e2e + needs: export-model-metal-artifact uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + fail-fast: false + matrix: + model: + - repo: "mistralai" + name: "Voxtral-Mini-3B-2507" + - repo: "openai" + name: "whisper-small" + - repo: "openai" + name: "whisper-large-v3-turbo" + quant: + - "non-quantized" with: runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 - download-artifact: voxtral-metal-export + download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }} script: | set -eux @@ -130,64 +128,4 @@ jobs: fi echo "::endgroup::" - echo "::group::Setup ExecuTorch Requirements" - CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_requirements.sh - echo "::endgroup::" - - echo "::group::Pip List" - ${CONDA_RUN} pip list - echo "::endgroup::" - - echo "::group::Prepare Voxtral Artifacts" - cp "${RUNNER_ARTIFACT_DIR}/model.pte" . - cp "${RUNNER_ARTIFACT_DIR}/aoti_metal_blob.ptd" . - cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . - TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" - curl -L $TOKENIZER_URL -o tekken.json - ls -al model.pte aoti_metal_blob.ptd voxtral_preprocessor.pte tekken.json - echo "::endgroup::" - - echo "::group::Create Test Audio File" - say -o call_samantha_hall.aiff "Call Samantha Hall" - afconvert -f WAVE -d LEI16 call_samantha_hall.aiff call_samantha_hall.wav - echo "::endgroup::" - - echo "::group::Build Voxtral Runner" - ${CONDA_RUN} cmake --preset llm \ - -DEXECUTORCH_BUILD_METAL=ON \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -Bcmake-out -S. 
- ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 )) --target install --config Release - - ${CONDA_RUN} cmake -DEXECUTORCH_BUILD_METAL=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -Sexamples/models/voxtral \ - -Bcmake-out/examples/models/voxtral/ - ${CONDA_RUN} cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release - echo "::endgroup::" - - echo "::group::Run Voxtral Runner" - set +e - OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ - --model_path model.pte \ - --data_path aoti_metal_blob.ptd \ - --tokenizer_path tekken.json \ - --audio_path call_samantha_hall.wav \ - --processor_path voxtral_preprocessor.pte \ - --temperature 0 2>&1) - EXIT_CODE=$? - set -e - - echo "$OUTPUT" - - if ! echo "$OUTPUT" | grep -iq "Samantha"; then - echo "Expected output 'Samantha' not found in output" - exit 1 - fi - - if [ $EXIT_CODE -ne 0 ]; then - echo "Unexpected exit code: $EXIT_CODE" - exit $EXIT_CODE - fi - echo "::endgroup::" + ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" From 6d84c8014b7131eccaa825450036225188eec213 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 7 Nov 2025 18:03:44 -0500 Subject: [PATCH 2/2] rename scripts --- .../{export_model_cuda_artifact.sh => export_model_artifact.sh} | 0 .ci/scripts/{test_model_cuda_e2e.sh => test_model_e2e.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename .ci/scripts/{export_model_cuda_artifact.sh => export_model_artifact.sh} (100%) rename .ci/scripts/{test_model_cuda_e2e.sh => test_model_e2e.sh} (100%) diff --git a/.ci/scripts/export_model_cuda_artifact.sh b/.ci/scripts/export_model_artifact.sh similarity index 100% rename from .ci/scripts/export_model_cuda_artifact.sh rename to .ci/scripts/export_model_artifact.sh diff --git a/.ci/scripts/test_model_cuda_e2e.sh b/.ci/scripts/test_model_e2e.sh similarity index 100% rename from .ci/scripts/test_model_cuda_e2e.sh rename to .ci/scripts/test_model_e2e.sh
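
For reference, a minimal sketch of how the renamed scripts would be invoked locally for the Metal path, mirroring the CI steps above. This assumes an ExecuTorch checkout root with optimum-executorch already installed; the output directory name (./whisper-metal-out) is arbitrary and only "non-quantized" is accepted for Metal per the quantization checks added in this patch.

    # Export Whisper for the Metal backend; produces model.pte and aoti_metal_blob.ptd in the output dir
    bash .ci/scripts/export_model_artifact.sh metal "openai/whisper-small" "non-quantized" ./whisper-metal-out

    # Run the end-to-end test against the exported artifacts (downloads tokenizer/test files into the same dir)
    bash .ci/scripts/test_model_e2e.sh metal "openai/whisper-small" "non-quantized" ./whisper-metal-out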