From bd58332e514873796d83cb677108b9c648eb6fa6 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 7 Nov 2025 17:58:30 -0500 Subject: [PATCH 1/2] Metal backend: Add Whisper to CI workflow --- .ci/scripts/export_model_cuda_artifact.sh | 63 ++++++++--- .ci/scripts/test_model_cuda_e2e.sh | 47 +++++--- .github/workflows/cuda.yml | 4 +- .github/workflows/metal.yml | 128 ++++++---------------- 4 files changed, 114 insertions(+), 128 deletions(-) diff --git a/.ci/scripts/export_model_cuda_artifact.sh b/.ci/scripts/export_model_cuda_artifact.sh index 3ff27fc2bd0..3c173b0ea2a 100755 --- a/.ci/scripts/export_model_cuda_artifact.sh +++ b/.ci/scripts/export_model_cuda_artifact.sh @@ -5,15 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Export model to CUDA format with optional quantization +# Export model to CUDA/Metal format with optional quantization show_help() { cat << EOF -Usage: export_model_cuda_artifact.sh [quant_name] [output_dir] +Usage: export_model_artifact.sh [quant_name] [output_dir] -Export a HuggingFace model to CUDA format with optional quantization. +Export a HuggingFace model to CUDA/Metal format with optional quantization. Arguments: + device cuda or metal (required) + hf_model HuggingFace model ID (required) Supported models: - mistralai/Voxtral-Mini-3B-2507 @@ -29,9 +31,9 @@ Arguments: output_dir Output directory for artifacts (optional, default: current directory) Examples: - export_model_cuda_artifact.sh "openai/whisper-small" - export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" - export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output" + export_model_artifact.sh metal "openai/whisper-small" + export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" + export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output" EOF } @@ -48,9 +50,22 @@ fi set -eux -HF_MODEL="$1" -QUANT_NAME="${2:-non-quantized}" -OUTPUT_DIR="${3:-.}" +DEVICE="$1" +HF_MODEL="$2" +QUANT_NAME="${3:-non-quantized}" +OUTPUT_DIR="${4:-.}" + +case "$DEVICE" in + cuda) + ;; + metal) + ;; + *) + echo "Error: Unsupported device '$DEVICE'" + echo "Supported devices: cuda, metal" + exit 1 + ;; +esac # Determine model configuration based on HF model ID case "$HF_MODEL" in @@ -75,6 +90,10 @@ case "$HF_MODEL" in fi ;; google/gemma-3-4b-it) + if [ "$DEVICE" = "metal" ]; then + echo "Error: Export for device 'metal' is not yet tested for model '$HF_MODEL'" + exit 1 + fi MODEL_NAME="gemma3" TASK="multimodal-text-to-text" MAX_SEQ_LEN="64" @@ -95,9 +114,17 @@ case "$QUANT_NAME" in EXTRA_ARGS="" ;; quantized-int4-tile-packed) + if [ "$DEVICE" = "metal" ]; then + echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'" + exit 1 + fi EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d" ;; quantized-int4-weight-only) + if [ "$DEVICE" = "metal" ]; then + echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'" + exit 1 + fi EXTRA_ARGS="--qlinear_encoder 4w" ;; *) @@ -118,12 +145,18 @@ MAX_SEQ_LEN_ARG="" if [ -n "$MAX_SEQ_LEN" ]; then MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN" fi + +DEVICE_ARG="" +if [ "$DEVICE" = "cuda" ]; then + DEVICE_ARG="--device cuda" +fi + optimum-cli export executorch \ --model "$HF_MODEL" \ --task "$TASK" \ - --recipe "cuda" \ + --recipe "$DEVICE" \ --dtype bfloat16 \ - 
--device cuda \ + ${DEVICE_ARG} \ ${MAX_SEQ_LEN_ARG} \ ${EXTRA_ARGS} \ --output_dir ./ @@ -137,7 +170,7 @@ if [ -n "$PREPROCESSOR_OUTPUT" ]; then fi test -f model.pte -test -f aoti_cuda_blob.ptd +test -f aoti_${DEVICE}_blob.ptd if [ -n "$PREPROCESSOR_OUTPUT" ]; then test -f $PREPROCESSOR_OUTPUT fi @@ -145,10 +178,10 @@ echo "::endgroup::" echo "::group::Store $MODEL_NAME Artifacts" mkdir -p "${OUTPUT_DIR}" -cp model.pte "${OUTPUT_DIR}/" -cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/" +mv model.pte "${OUTPUT_DIR}/" +mv aoti_${DEVICE}_blob.ptd "${OUTPUT_DIR}/" if [ -n "$PREPROCESSOR_OUTPUT" ]; then - cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/" + mv $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/" fi ls -al "${OUTPUT_DIR}" echo "::endgroup::" diff --git a/.ci/scripts/test_model_cuda_e2e.sh b/.ci/scripts/test_model_cuda_e2e.sh index dc577dfc753..13ebeff34e5 100755 --- a/.ci/scripts/test_model_cuda_e2e.sh +++ b/.ci/scripts/test_model_cuda_e2e.sh @@ -5,15 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test CUDA model end-to-end, need to run .ci/scripts/export_model_cuda_artifact.sh first +# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first show_help() { cat << EOF -Usage: test_model_cuda_e2e.sh [model_dir] +Usage: test_model_e2e.sh [model_dir] -Build and run end-to-end tests for CUDA models. +Build and run end-to-end tests for CUDA/Metal models. Arguments: + device cuda or metal (required) + hf_model HuggingFace model ID (required) Supported models: - mistralai/Voxtral-Mini-3B-2507 @@ -27,12 +29,12 @@ Arguments: - quantized-int4-weight-only model_dir Directory containing model artifacts (optional, default: current directory) - Expected files: model.pte, aoti_cuda_blob.ptd + Expected files: model.pte, aoti_cuda_blob.ptd/aoti_metal_blob.ptd Tokenizers and test files will be downloaded to this directory Examples: - test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized" - test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output" + test_model_e2e.sh metal "openai/whisper-small" "non-quantized" + test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output" EOF } @@ -55,20 +57,21 @@ fi set -eux -HF_MODEL="$1" -QUANT_NAME="$2" +DEVICE="$1" +HF_MODEL="$2" +QUANT_NAME="$3" # Download tokenizers, audio, and image files to this directory -MODEL_DIR="${3:-.}" +MODEL_DIR="${4:-.}" echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)" -# Make sure model.pte and aoti_cuda_blob.ptd exist +# Make sure model.pte and aoti_${DEVICE}_blob.ptd exist if [ ! -f "$MODEL_DIR/model.pte" ]; then echo "Error: model.pte not found in $MODEL_DIR" exit 1 fi -if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then - echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR" +if [ ! -f "$MODEL_DIR/aoti_${DEVICE}_blob.ptd" ]; then + echo "Error: aoti_${DEVICE}_blob.ptd not found in $MODEL_DIR" exit 1 fi # Locate EXECUTORCH_ROOT from the directory of this script @@ -152,14 +155,24 @@ ls -al echo "::endgroup::" echo "::group::Build $MODEL_NAME Runner" + +if [ "$DEVICE" = "cuda" ]; then + BUILD_BACKEND="EXECUTORCH_BUILD_CUDA" +elif [ "$DEVICE" = "metal" ]; then + BUILD_BACKEND="EXECUTORCH_BUILD_METAL" +else + echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'." 
+ exit 1 +fi + cmake --preset llm \ - -DEXECUTORCH_BUILD_CUDA=ON \ + -D${BUILD_BACKEND}=ON \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ -Bcmake-out -S. cmake --build cmake-out -j$(nproc) --target install --config Release -cmake -DEXECUTORCH_BUILD_CUDA=ON \ +cmake -D${BUILD_BACKEND}=ON \ -DCMAKE_BUILD_TYPE=Release \ -Sexamples/models/$RUNNER_PATH \ -Bcmake-out/examples/models/$RUNNER_PATH/ @@ -168,11 +181,13 @@ echo "::endgroup::" echo "::group::Run $MODEL_NAME Runner" set +e -export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH +if [ "$DEVICE" = "cuda" ]; then + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH +fi # Build runner command with common arguments RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET" -RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0" +RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_${DEVICE}_blob.ptd --temperature 0" # Add model-specific arguments case "$MODEL_NAME" in diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 80d5484ff15..7cc937fe6ca 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -142,7 +142,7 @@ jobs: pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} echo "::endgroup::" - source .ci/scripts/export_model_cuda_artifact.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" benchmark-model-cuda: name: benchmark-model-cuda @@ -249,4 +249,4 @@ jobs: download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | - source .ci/scripts/test_model_cuda_e2e.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml index 5a47f07ff0b..92351883e8f 100644 --- a/.github/workflows/metal.yml +++ b/.github/workflows/metal.yml @@ -28,12 +28,24 @@ jobs: PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh echo "::endgroup::" - export-voxtral-metal-artifact: - name: export-voxtral-metal-artifact + export-model-metal-artifact: + name: export-model-metal-artifact # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit + strategy: + fail-fast: false + matrix: + model: + - repo: "mistralai" + name: "Voxtral-Mini-3B-2507" + - repo: "openai" + name: "whisper-small" + - repo: "openai" + name: "whisper-large-v3-turbo" + quant: + - "non-quantized" with: runner: macos-m2-stable python-version: '3.11' @@ -41,7 +53,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN - upload-artifact: voxtral-metal-export + upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ 
matrix.quant }} script: | set -eux @@ -54,7 +66,6 @@ jobs: OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}" ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} - ${CONDA_RUN} pip install mistral-common librosa echo "::endgroup::" echo "::group::Setup ExecuTorch" @@ -65,44 +76,31 @@ jobs: ${CONDA_RUN} pip list echo "::endgroup::" - echo "::group::Export Voxtral" - ${CONDA_RUN} optimum-cli export executorch \ - --model "mistralai/Voxtral-Mini-3B-2507" \ - --task "multimodal-text-to-text" \ - --recipe "metal" \ - --dtype bfloat16 \ - --max_seq_len 1024 \ - --output_dir ./ - ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \ - --feature_size 128 \ - --stack_output \ - --max_audio_len 300 \ - --output_file voxtral_preprocessor.pte + ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" - test -f model.pte - test -f aoti_metal_blob.ptd - test -f voxtral_preprocessor.pte - echo "::endgroup::" - - echo "::group::Store Voxtral Artifacts" - mkdir -p "${RUNNER_ARTIFACT_DIR}" - cp model.pte "${RUNNER_ARTIFACT_DIR}/" - cp aoti_metal_blob.ptd "${RUNNER_ARTIFACT_DIR}/" - cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" - ls -al "${RUNNER_ARTIFACT_DIR}" - echo "::endgroup::" - - test-voxtral-metal-e2e: - name: test-voxtral-metal-e2e - needs: export-voxtral-metal-artifact + test-model-metal-e2e: + name: test-model-metal-e2e + needs: export-model-metal-artifact uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + fail-fast: false + matrix: + model: + - repo: "mistralai" + name: "Voxtral-Mini-3B-2507" + - repo: "openai" + name: "whisper-small" + - repo: "openai" + name: "whisper-large-v3-turbo" + quant: + - "non-quantized" with: runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 - download-artifact: voxtral-metal-export + download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }} script: | set -eux @@ -130,64 +128,4 @@ jobs: fi echo "::endgroup::" - echo "::group::Setup ExecuTorch Requirements" - CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_requirements.sh - echo "::endgroup::" - - echo "::group::Pip List" - ${CONDA_RUN} pip list - echo "::endgroup::" - - echo "::group::Prepare Voxtral Artifacts" - cp "${RUNNER_ARTIFACT_DIR}/model.pte" . - cp "${RUNNER_ARTIFACT_DIR}/aoti_metal_blob.ptd" . - cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . - TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" - curl -L $TOKENIZER_URL -o tekken.json - ls -al model.pte aoti_metal_blob.ptd voxtral_preprocessor.pte tekken.json - echo "::endgroup::" - - echo "::group::Create Test Audio File" - say -o call_samantha_hall.aiff "Call Samantha Hall" - afconvert -f WAVE -d LEI16 call_samantha_hall.aiff call_samantha_hall.wav - echo "::endgroup::" - - echo "::group::Build Voxtral Runner" - ${CONDA_RUN} cmake --preset llm \ - -DEXECUTORCH_BUILD_METAL=ON \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -Bcmake-out -S. 
- ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 )) --target install --config Release - - ${CONDA_RUN} cmake -DEXECUTORCH_BUILD_METAL=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -Sexamples/models/voxtral \ - -Bcmake-out/examples/models/voxtral/ - ${CONDA_RUN} cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release - echo "::endgroup::" - - echo "::group::Run Voxtral Runner" - set +e - OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ - --model_path model.pte \ - --data_path aoti_metal_blob.ptd \ - --tokenizer_path tekken.json \ - --audio_path call_samantha_hall.wav \ - --processor_path voxtral_preprocessor.pte \ - --temperature 0 2>&1) - EXIT_CODE=$? - set -e - - echo "$OUTPUT" - - if ! echo "$OUTPUT" | grep -iq "Samantha"; then - echo "Expected output 'Samantha' not found in output" - exit 1 - fi - - if [ $EXIT_CODE -ne 0 ]; then - echo "Unexpected exit code: $EXIT_CODE" - exit $EXIT_CODE - fi - echo "::endgroup::" + ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" From 6d84c8014b7131eccaa825450036225188eec213 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 7 Nov 2025 18:03:44 -0500 Subject: [PATCH 2/2] rename scripts --- .../{export_model_cuda_artifact.sh => export_model_artifact.sh} | 0 .ci/scripts/{test_model_cuda_e2e.sh => test_model_e2e.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename .ci/scripts/{export_model_cuda_artifact.sh => export_model_artifact.sh} (100%) rename .ci/scripts/{test_model_cuda_e2e.sh => test_model_e2e.sh} (100%) diff --git a/.ci/scripts/export_model_cuda_artifact.sh b/.ci/scripts/export_model_artifact.sh similarity index 100% rename from .ci/scripts/export_model_cuda_artifact.sh rename to .ci/scripts/export_model_artifact.sh diff --git a/.ci/scripts/test_model_cuda_e2e.sh b/.ci/scripts/test_model_e2e.sh similarity index 100% rename from .ci/scripts/test_model_cuda_e2e.sh rename to .ci/scripts/test_model_e2e.sh
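
For reference, a minimal sketch of how the renamed scripts would be invoked locally for the Metal path, mirroring the CI steps above. This assumes an ExecuTorch checkout root with optimum-executorch already installed; the output directory name (./whisper-metal-out) is arbitrary and only "non-quantized" is accepted for Metal per the quantization checks added in this patch.

    # Export Whisper for the Metal backend; produces model.pte and aoti_metal_blob.ptd in the output dir
    bash .ci/scripts/export_model_artifact.sh metal "openai/whisper-small" "non-quantized" ./whisper-metal-out

    # Run the end-to-end test against the exported artifacts (downloads tokenizer/test files into the same dir)
    bash .ci/scripts/test_model_e2e.sh metal "openai/whisper-small" "non-quantized" ./whisper-metal-out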