From d753f03dc0fef76d086f924909c7b6809a8c9792 Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Wed, 15 Oct 2025 17:40:38 -0700 Subject: [PATCH 1/7] Add int4mm test to the CUDA CI flow --- .github/workflows/cuda.yml | 148 +++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 8a9dc101ff3..15885fb155f 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -275,6 +275,154 @@ jobs: exit 1 fi + if [ $EXIT_CODE -ne 0 ]; then + echo "Unexpected exit code: $EXIT_CODE" + exit $EXIT_CODE + fi + echo "::endgroup::" + + export-voxtral-cuda-quantized: + name: export-voxtral-cuda-quantized + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + with: + timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + upload-artifact: voxtral-cuda-quantized-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch" + ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install mistral-common librosa + pip list + echo "::endgroup::" + + echo "::group::Export Voxtral with Quantization" + optimum-cli export executorch \ + --model "mistralai/Voxtral-Mini-3B-2507" \ + --task "multimodal-text-to-text" \ + --recipe "cuda" \ + --dtype bfloat16 \ + --device cuda \ + --max_seq_len 1024 \ + --qlinear 4w \ + --qlinear_encoder 4w \ + --qlinear_packing_format tile_packed_to_4d \ + --qlinear_encoder_packing_format tile_packed_to_4d \ + --output_dir ./ + python -m executorch.extension.audio.mel_spectrogram \ + --feature_size 128 \ + --stack_output \ + --max_audio_len 300 \ + --output_file voxtral_preprocessor.pte + + test -f model.pte + test -f aoti_cuda_blob.ptd + test -f voxtral_preprocessor.pte + echo "::endgroup::" + + echo "::group::Store Voxtral Quantized Artifacts" + mkdir -p "${RUNNER_ARTIFACT_DIR}" + cp model.pte "${RUNNER_ARTIFACT_DIR}/" + cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" + cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" + ls -al "${RUNNER_ARTIFACT_DIR}" + echo "::endgroup::" + + test-voxtral-cuda-quantized-e2e: + name: test-voxtral-cuda-quantized-e2e + needs: export-voxtral-cuda-quantized + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: voxtral-cuda-quantized-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Voxtral Quantized Artifacts" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . 
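+          # ${RUNNER_ARTIFACT_DIR} is populated by this job's download-artifact input
+          # (voxtral-cuda-quantized-export), i.e. the files uploaded by the export job above.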
+ cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . + cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . + TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" + curl -L $TOKENIZER_URL -o tekken.json + ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json + echo "::endgroup::" + + echo "::group::Download Test Audio File" + AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + curl -L $AUDIO_URL -o poem.wav + echo "::endgroup::" + + echo "::group::Build Voxtral Runner" + cmake --preset llm \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out -S. + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release + + cmake -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -Sexamples/models/voxtral \ + -Bcmake-out/examples/models/voxtral/ + cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release + echo "::endgroup::" + + echo "::group::Run Voxtral Runner with Quantized Model" + set +e + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ + --model_path model.pte \ + --data_path aoti_cuda_blob.ptd \ + --tokenizer_path tekken.json \ + --audio_path poem.wav \ + --processor_path voxtral_preprocessor.pte \ + --temperature 0 2>&1) + EXIT_CODE=$? + set -e + + echo "$OUTPUT" + + if ! echo "$OUTPUT" | grep -iq "poem"; then + echo "Expected output 'poem' not found in output" + exit 1 + fi + if [ $EXIT_CODE -ne 0 ]; then echo "Unexpected exit code: $EXIT_CODE" exit $EXIT_CODE From 8456310f457d790eb0ec167f7bfca52a662bc9bc Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Wed, 15 Oct 2025 18:40:43 -0700 Subject: [PATCH 2/7] Update optimum-executorch pin --- .ci/docker/ci_commit_pins/optimum-executorch.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt index 49b079047a3..3c085a7ef3a 100644 --- a/.ci/docker/ci_commit_pins/optimum-executorch.txt +++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt @@ -1 +1 @@ -44d8d54e38c0258357d4e92e1fefe21e845947a3 +09fdbd0a0639b128f712a4f5202ed42ca4c60957 From fcbefe312eb20a481e53730b83829778def3da43 Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Thu, 16 Oct 2025 13:45:27 -0700 Subject: [PATCH 3/7] 1. Test more quantization format 2. 
Update models/voxtral/READEME.md with CUDA quantization --- .github/workflows/cuda.yml | 114 +++++++++++++++++++++++------- examples/models/voxtral/README.md | 18 ++++- 2 files changed, 105 insertions(+), 27 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 15885fb155f..6d39c808458 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -281,8 +281,8 @@ jobs: fi echo "::endgroup::" - export-voxtral-cuda-quantized: - name: export-voxtral-cuda-quantized + test-voxtral-cuda-quantized-4w-tile-packed: + name: test-voxtral-cuda-quantized-4w-tile-packed uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -298,7 +298,6 @@ jobs: gpu-arch-version: 12.6 use-custom-docker-registry: false submodules: recursive - upload-artifact: voxtral-cuda-quantized-export ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux @@ -316,7 +315,7 @@ jobs: pip list echo "::endgroup::" - echo "::group::Export Voxtral with Quantization" + echo "::group::Export Voxtral with Quantization (4w-tile-packed)" optimum-cli export executorch \ --model "mistralai/Voxtral-Mini-3B-2507" \ --task "multimodal-text-to-text" \ @@ -328,6 +327,7 @@ jobs: --qlinear_encoder 4w \ --qlinear_packing_format tile_packed_to_4d \ --qlinear_encoder_packing_format tile_packed_to_4d \ + --qembedding 4w \ --output_dir ./ python -m executorch.extension.audio.mel_spectrogram \ --feature_size 128 \ @@ -340,50 +340,114 @@ jobs: test -f voxtral_preprocessor.pte echo "::endgroup::" - echo "::group::Store Voxtral Quantized Artifacts" - mkdir -p "${RUNNER_ARTIFACT_DIR}" - cp model.pte "${RUNNER_ARTIFACT_DIR}/" - cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" - cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" - ls -al "${RUNNER_ARTIFACT_DIR}" + echo "::group::Download Tokenizer and Test Audio" + TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" + curl -L $TOKENIZER_URL -o tekken.json + AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + curl -L $AUDIO_URL -o poem.wav + echo "::endgroup::" + + echo "::group::Build Voxtral Runner" + cmake --preset llm \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out -S. + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release + + cmake -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -Sexamples/models/voxtral \ + -Bcmake-out/examples/models/voxtral/ + cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release echo "::endgroup::" - test-voxtral-cuda-quantized-e2e: - name: test-voxtral-cuda-quantized-e2e - needs: export-voxtral-cuda-quantized + echo "::group::Run Voxtral Runner with Quantized Model (4w-tile-packed)" + set +e + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ + --model_path model.pte \ + --data_path aoti_cuda_blob.ptd \ + --tokenizer_path tekken.json \ + --audio_path poem.wav \ + --processor_path voxtral_preprocessor.pte \ + --temperature 0 2>&1) + EXIT_CODE=$? + set -e + + echo "$OUTPUT" + + if ! 
echo "$OUTPUT" | grep -iq "poem"; then + echo "Expected output 'poem' not found in output" + exit 1 + fi + + if [ $EXIT_CODE -ne 0 ]; then + echo "Unexpected exit code: $EXIT_CODE" + exit $EXIT_CODE + fi + echo "::endgroup::" + + test-voxtral-cuda-quantized-4w-weight-only: + name: test-voxtral-cuda-quantized-4w-weight-only uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read + secrets: inherit strategy: fail-fast: false with: timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda gpu-arch-version: 12.6 use-custom-docker-registry: false submodules: recursive - download-artifact: voxtral-cuda-quantized-export ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux - echo "::group::Setup ExecuTorch Requirements" - CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + echo "::group::Setup ExecuTorch" + ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install mistral-common librosa pip list echo "::endgroup::" - echo "::group::Prepare Voxtral Quantized Artifacts" - cp "${RUNNER_ARTIFACT_DIR}/model.pte" . - cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . - cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . - TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" - curl -L $TOKENIZER_URL -o tekken.json - ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json + echo "::group::Export Voxtral with Quantization (4w-weight-only)" + optimum-cli export executorch \ + --model "mistralai/Voxtral-Mini-3B-2507" \ + --task "multimodal-text-to-text" \ + --recipe "cuda" \ + --dtype bfloat16 \ + --device cuda \ + --max_seq_len 1024 \ + --qlinear 4w \ + --qlinear_encoder 4w \ + --qembedding 4w \ + --output_dir ./ + python -m executorch.extension.audio.mel_spectrogram \ + --feature_size 128 \ + --stack_output \ + --max_audio_len 300 \ + --output_file voxtral_preprocessor.pte + + test -f model.pte + test -f aoti_cuda_blob.ptd + test -f voxtral_preprocessor.pte echo "::endgroup::" - echo "::group::Download Test Audio File" + echo "::group::Download Tokenizer and Test Audio" + TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" + curl -L $TOKENIZER_URL -o tekken.json AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" curl -L $AUDIO_URL -o poem.wav echo "::endgroup::" @@ -403,7 +467,7 @@ jobs: cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release echo "::endgroup::" - echo "::group::Run Voxtral Runner with Quantized Model" + echo "::group::Run Voxtral Runner with Quantized Model (4w-weight-only)" set +e export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ diff --git a/examples/models/voxtral/README.md b/examples/models/voxtral/README.md index 861043fe2a7..1c211f2dcb3 100644 --- a/examples/models/voxtral/README.md +++ b/examples/models/voxtral/README.md @@ -39,8 +39,6 @@ This exports Voxtral with XNNPack backend acceleration and 4-bit weight/8-bit ac ## 
CUDA Support If your environment has CUDA support, you can enable the runner to run on CUDA for improved performance. Follow the export and runtime commands below: -**Note:** We are currently working on quantization support for CUDA. Currently, only bfloat16 dtype is supported for CUDA execution. - ### Exporting with CUDA ``` optimum-cli export executorch \ @@ -57,6 +55,22 @@ This will generate: - `model.pte` - The exported model - `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime +Furthermore, we support several quantization formats on CUDA. +To export Voxtral with int4 weights-only quantization, use +``` +optimum-cli export executorch \ + --model "mistralai/Voxtral-Mini-3B-2507" \ + --task "multimodal-text-to-text" \ + --recipe "cuda" \ + --dtype bfloat16 \ + --device cuda \ + --max_seq_len 1024 \ + --qlinear 4w \ + --qlinear_encoder 4w \ + --qembedding 4w \ + --output_dir="voxtral" +``` + See the "Building the multimodal runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions. # Running the model From 95e2e5193b802d5f2acd915b1938a1e11aa8ec2e Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Thu, 16 Oct 2025 14:11:26 -0700 Subject: [PATCH 4/7] Consolidate e2e workflows --- .github/workflows/cuda.yml | 203 ++++++++++-------------------- examples/models/voxtral/README.md | 2 +- 2 files changed, 69 insertions(+), 136 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 6d39c808458..0ca0a42c2e5 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -200,89 +200,8 @@ jobs: echo "::endgroup::" - test-voxtral-cuda-e2e: - name: test-voxtral-cuda-e2e - needs: export-voxtral-cuda-artifact - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - permissions: - id-token: write - contents: read - strategy: - fail-fast: false - with: - timeout: 90 - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: 12.6 - use-custom-docker-registry: false - submodules: recursive - download-artifact: voxtral-cuda-export - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - script: | - set -eux - - echo "::group::Setup ExecuTorch Requirements" - CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh - pip list - echo "::endgroup::" - - echo "::group::Prepare Voxtral Artifacts" - cp "${RUNNER_ARTIFACT_DIR}/model.pte" . - cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . - cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . - TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" - curl -L $TOKENIZER_URL -o tekken.json - ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json - echo "::endgroup::" - - echo "::group::Download Test Audio File" - AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" - curl -L $AUDIO_URL -o poem.wav - echo "::endgroup::" - - echo "::group::Build Voxtral Runner" - cmake --preset llm \ - -DEXECUTORCH_BUILD_CUDA=ON \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -Bcmake-out -S. 
- cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release - - cmake -DEXECUTORCH_BUILD_CUDA=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -Sexamples/models/voxtral \ - -Bcmake-out/examples/models/voxtral/ - cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release - echo "::endgroup::" - - echo "::group::Run Voxtral Runner" - set +e - export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH - OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ - --model_path model.pte \ - --data_path aoti_cuda_blob.ptd \ - --tokenizer_path tekken.json \ - --audio_path poem.wav \ - --processor_path voxtral_preprocessor.pte \ - --temperature 0 2>&1) - EXIT_CODE=$? - set -e - - echo "$OUTPUT" - - if ! echo "$OUTPUT" | grep -iq "poem"; then - echo "Expected output 'poem' not found in output" - exit 1 - fi - - if [ $EXIT_CODE -ne 0 ]; then - echo "Unexpected exit code: $EXIT_CODE" - exit $EXIT_CODE - fi - echo "::endgroup::" - - test-voxtral-cuda-quantized-4w-tile-packed: - name: test-voxtral-cuda-quantized-4w-tile-packed + export-voxtral-cuda-quantized-int4-tile-packed: + name: export-voxtral-cuda-quantized-int4-tile-packed uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -298,6 +217,7 @@ jobs: gpu-arch-version: 12.6 use-custom-docker-registry: false submodules: recursive + upload-artifact: voxtral-cuda-quantized-int4-tile-packed ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux @@ -315,7 +235,7 @@ jobs: pip list echo "::endgroup::" - echo "::group::Export Voxtral with Quantization (4w-tile-packed)" + echo "::group::Export Voxtral with Quantization (int4-tile-packed)" optimum-cli export executorch \ --model "mistralai/Voxtral-Mini-3B-2507" \ --task "multimodal-text-to-text" \ @@ -340,56 +260,16 @@ jobs: test -f voxtral_preprocessor.pte echo "::endgroup::" - echo "::group::Download Tokenizer and Test Audio" - TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" - curl -L $TOKENIZER_URL -o tekken.json - AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" - curl -L $AUDIO_URL -o poem.wav - echo "::endgroup::" - - echo "::group::Build Voxtral Runner" - cmake --preset llm \ - -DEXECUTORCH_BUILD_CUDA=ON \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -Bcmake-out -S. - cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release - - cmake -DEXECUTORCH_BUILD_CUDA=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -Sexamples/models/voxtral \ - -Bcmake-out/examples/models/voxtral/ - cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release - echo "::endgroup::" - - echo "::group::Run Voxtral Runner with Quantized Model (4w-tile-packed)" - set +e - export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH - OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ - --model_path model.pte \ - --data_path aoti_cuda_blob.ptd \ - --tokenizer_path tekken.json \ - --audio_path poem.wav \ - --processor_path voxtral_preprocessor.pte \ - --temperature 0 2>&1) - EXIT_CODE=$? - set -e - - echo "$OUTPUT" - - if ! 
echo "$OUTPUT" | grep -iq "poem"; then - echo "Expected output 'poem' not found in output" - exit 1 - fi - - if [ $EXIT_CODE -ne 0 ]; then - echo "Unexpected exit code: $EXIT_CODE" - exit $EXIT_CODE - fi + echo "::group::Store Voxtral Quantized Artifacts (int4-tile-packed)" + mkdir -p "${RUNNER_ARTIFACT_DIR}" + cp model.pte "${RUNNER_ARTIFACT_DIR}/" + cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" + cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" + ls -al "${RUNNER_ARTIFACT_DIR}" echo "::endgroup::" - test-voxtral-cuda-quantized-4w-weight-only: - name: test-voxtral-cuda-quantized-4w-weight-only + export-voxtral-cuda-quantized-int4-weight-only: + name: export-voxtral-cuda-quantized-int4-weight-only uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -405,6 +285,7 @@ jobs: gpu-arch-version: 12.6 use-custom-docker-registry: false submodules: recursive + upload-artifact: voxtral-cuda-quantized-int4-weight-only ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux @@ -422,7 +303,7 @@ jobs: pip list echo "::endgroup::" - echo "::group::Export Voxtral with Quantization (4w-weight-only)" + echo "::group::Export Voxtral with Quantization (int4-weight-only)" optimum-cli export executorch \ --model "mistralai/Voxtral-Mini-3B-2507" \ --task "multimodal-text-to-text" \ @@ -445,9 +326,61 @@ jobs: test -f voxtral_preprocessor.pte echo "::endgroup::" - echo "::group::Download Tokenizer and Test Audio" + echo "::group::Store Voxtral Quantized Artifacts (int4-weight-only)" + mkdir -p "${RUNNER_ARTIFACT_DIR}" + cp model.pte "${RUNNER_ARTIFACT_DIR}/" + cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" + cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" + ls -al "${RUNNER_ARTIFACT_DIR}" + echo "::endgroup::" + + test-voxtral-cuda-e2e: + name: test-voxtral-cuda-e2e-${{ matrix.format.name }} + needs: + - export-voxtral-cuda-artifact + - export-voxtral-cuda-quantized-int4-tile-packed + - export-voxtral-cuda-quantized-int4-weight-only + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + format: + - name: "non-quantized" + artifact: "voxtral-cuda-export" + - name: "quantized-int4-tile-packed" + artifact: "voxtral-cuda-quantized-int4-tile-packed" + - name: "quantized-int4-weight-only" + artifact: "voxtral-cuda-quantized-int4-weight-only" + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: ${{ matrix.format.artifact }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . + cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . 
TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" curl -L $TOKENIZER_URL -o tekken.json + ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json + echo "::endgroup::" + + echo "::group::Download Test Audio File" AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" curl -L $AUDIO_URL -o poem.wav echo "::endgroup::" @@ -467,7 +400,7 @@ jobs: cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release echo "::endgroup::" - echo "::group::Run Voxtral Runner with Quantized Model (4w-weight-only)" + echo "::group::Run Voxtral Runner (${{ matrix.format.name }})" set +e export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ diff --git a/examples/models/voxtral/README.md b/examples/models/voxtral/README.md index 1c211f2dcb3..51a68e4f8f6 100644 --- a/examples/models/voxtral/README.md +++ b/examples/models/voxtral/README.md @@ -56,7 +56,7 @@ This will generate: - `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime Furthermore, we support several quantization formats on CUDA. -To export Voxtral with int4 weights-only quantization, use +To export Voxtral with int4 weight-only quantization, use ``` optimum-cli export executorch \ --model "mistralai/Voxtral-Mini-3B-2507" \ From d27e10aa4edb8b7ab1b77f2689490b95acad2652 Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Thu, 16 Oct 2025 18:58:46 -0700 Subject: [PATCH 5/7] Remove --qembedding --- .github/workflows/cuda.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 0ca0a42c2e5..15e2e0ef960 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -247,7 +247,6 @@ jobs: --qlinear_encoder 4w \ --qlinear_packing_format tile_packed_to_4d \ --qlinear_encoder_packing_format tile_packed_to_4d \ - --qembedding 4w \ --output_dir ./ python -m executorch.extension.audio.mel_spectrogram \ --feature_size 128 \ @@ -313,7 +312,6 @@ jobs: --max_seq_len 1024 \ --qlinear 4w \ --qlinear_encoder 4w \ - --qembedding 4w \ --output_dir ./ python -m executorch.extension.audio.mel_spectrogram \ --feature_size 128 \ From 8cd4c6a6578fbdac9f3a228dd0e84e62284d50bb Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Fri, 17 Oct 2025 15:49:49 -0700 Subject: [PATCH 6/7] Remove --qlinear as the produced result is invalid --- .github/workflows/cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 15e2e0ef960..1e4849a8843 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -303,6 +303,7 @@ jobs: echo "::endgroup::" echo "::group::Export Voxtral with Quantization (int4-weight-only)" + # TODO: --qlinear 4w will produce wrong result. Need more investigation. 
optimum-cli export executorch \ --model "mistralai/Voxtral-Mini-3B-2507" \ --task "multimodal-text-to-text" \ @@ -310,7 +311,6 @@ jobs: --dtype bfloat16 \ --device cuda \ --max_seq_len 1024 \ - --qlinear 4w \ --qlinear_encoder 4w \ --output_dir ./ python -m executorch.extension.audio.mel_spectrogram \ From 45cbb8457519de506d3610a220466b6c7fd61078 Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Fri, 17 Oct 2025 16:22:42 -0700 Subject: [PATCH 7/7] Consolidate export workflow --- .github/workflows/cuda.yml | 159 ++++-------------------------- examples/models/voxtral/README.md | 5 +- 2 files changed, 22 insertions(+), 142 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 1e4849a8843..9ee72a34ef0 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -88,7 +88,7 @@ jobs: PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda export-voxtral-cuda-artifact: - name: export-voxtral-cuda-artifact + name: export-voxtral-cuda-${{ matrix.quant.name }} uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -96,6 +96,18 @@ jobs: secrets: inherit strategy: fail-fast: false + matrix: + quant: + - name: "non-quantized" + artifact: "voxtral-cuda-export" + extra_args: "" + - name: "quantized-int4-tile-packed" + artifact: "voxtral-cuda-quantized-int4-tile-packed" + extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d" + - name: "quantized-int4-weight-only" + artifact: "voxtral-cuda-quantized-int4-weight-only" + # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation. + extra_args: "--qlinear_encoder 4w" with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN @@ -104,7 +116,7 @@ jobs: gpu-arch-version: 12.6 use-custom-docker-registry: false submodules: recursive - upload-artifact: voxtral-cuda-export + upload-artifact: ${{ matrix.quant.artifact }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux @@ -122,7 +134,8 @@ jobs: pip list echo "::endgroup::" - echo "::group::Export Voxtral" + echo "::group::Export Voxtral (${{ matrix.quant.name }})" + EXTRA_ARGS="${{ matrix.quant.extra_args }}" optimum-cli export executorch \ --model "mistralai/Voxtral-Mini-3B-2507" \ --task "multimodal-text-to-text" \ @@ -130,6 +143,7 @@ jobs: --dtype bfloat16 \ --device cuda \ --max_seq_len 1024 \ + ${EXTRA_ARGS} \ --output_dir ./ python -m executorch.extension.audio.mel_spectrogram \ --feature_size 128 \ @@ -142,7 +156,7 @@ jobs: test -f voxtral_preprocessor.pte echo "::endgroup::" - echo "::group::Store Voxtral Artifacts" + echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})" mkdir -p "${RUNNER_ARTIFACT_DIR}" cp model.pte "${RUNNER_ARTIFACT_DIR}/" cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" @@ -200,144 +214,9 @@ jobs: echo "::endgroup::" - export-voxtral-cuda-quantized-int4-tile-packed: - name: export-voxtral-cuda-quantized-int4-tile-packed - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - permissions: - id-token: write - contents: read - secrets: inherit - strategy: - fail-fast: false - with: - timeout: 90 - secrets-env: EXECUTORCH_HF_TOKEN - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: 12.6 - use-custom-docker-registry: false - submodules: recursive - upload-artifact: voxtral-cuda-quantized-int4-tile-packed - ref: ${{ github.event_name == 'pull_request' 
&& github.event.pull_request.head.sha || github.sha }} - script: | - set -eux - - echo "::group::Setup ExecuTorch" - ./install_executorch.sh - echo "::endgroup::" - - echo "::group::Setup Huggingface" - pip install -U "huggingface_hub[cli]" accelerate - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} - pip install mistral-common librosa - pip list - echo "::endgroup::" - - echo "::group::Export Voxtral with Quantization (int4-tile-packed)" - optimum-cli export executorch \ - --model "mistralai/Voxtral-Mini-3B-2507" \ - --task "multimodal-text-to-text" \ - --recipe "cuda" \ - --dtype bfloat16 \ - --device cuda \ - --max_seq_len 1024 \ - --qlinear 4w \ - --qlinear_encoder 4w \ - --qlinear_packing_format tile_packed_to_4d \ - --qlinear_encoder_packing_format tile_packed_to_4d \ - --output_dir ./ - python -m executorch.extension.audio.mel_spectrogram \ - --feature_size 128 \ - --stack_output \ - --max_audio_len 300 \ - --output_file voxtral_preprocessor.pte - - test -f model.pte - test -f aoti_cuda_blob.ptd - test -f voxtral_preprocessor.pte - echo "::endgroup::" - - echo "::group::Store Voxtral Quantized Artifacts (int4-tile-packed)" - mkdir -p "${RUNNER_ARTIFACT_DIR}" - cp model.pte "${RUNNER_ARTIFACT_DIR}/" - cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" - cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" - ls -al "${RUNNER_ARTIFACT_DIR}" - echo "::endgroup::" - - export-voxtral-cuda-quantized-int4-weight-only: - name: export-voxtral-cuda-quantized-int4-weight-only - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - permissions: - id-token: write - contents: read - secrets: inherit - strategy: - fail-fast: false - with: - timeout: 90 - secrets-env: EXECUTORCH_HF_TOKEN - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: 12.6 - use-custom-docker-registry: false - submodules: recursive - upload-artifact: voxtral-cuda-quantized-int4-weight-only - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - script: | - set -eux - - echo "::group::Setup ExecuTorch" - ./install_executorch.sh - echo "::endgroup::" - - echo "::group::Setup Huggingface" - pip install -U "huggingface_hub[cli]" accelerate - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} - pip install mistral-common librosa - pip list - echo "::endgroup::" - - echo "::group::Export Voxtral with Quantization (int4-weight-only)" - # TODO: --qlinear 4w will produce wrong result. Need more investigation. 
- optimum-cli export executorch \ - --model "mistralai/Voxtral-Mini-3B-2507" \ - --task "multimodal-text-to-text" \ - --recipe "cuda" \ - --dtype bfloat16 \ - --device cuda \ - --max_seq_len 1024 \ - --qlinear_encoder 4w \ - --output_dir ./ - python -m executorch.extension.audio.mel_spectrogram \ - --feature_size 128 \ - --stack_output \ - --max_audio_len 300 \ - --output_file voxtral_preprocessor.pte - - test -f model.pte - test -f aoti_cuda_blob.ptd - test -f voxtral_preprocessor.pte - echo "::endgroup::" - - echo "::group::Store Voxtral Quantized Artifacts (int4-weight-only)" - mkdir -p "${RUNNER_ARTIFACT_DIR}" - cp model.pte "${RUNNER_ARTIFACT_DIR}/" - cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" - cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" - ls -al "${RUNNER_ARTIFACT_DIR}" - echo "::endgroup::" - test-voxtral-cuda-e2e: name: test-voxtral-cuda-e2e-${{ matrix.format.name }} - needs: - - export-voxtral-cuda-artifact - - export-voxtral-cuda-quantized-int4-tile-packed - - export-voxtral-cuda-quantized-int4-weight-only + needs: export-voxtral-cuda-artifact uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write diff --git a/examples/models/voxtral/README.md b/examples/models/voxtral/README.md index 51a68e4f8f6..65085e30c1d 100644 --- a/examples/models/voxtral/README.md +++ b/examples/models/voxtral/README.md @@ -56,7 +56,7 @@ This will generate: - `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime Furthermore, we support several quantization formats on CUDA. -To export Voxtral with int4 weight-only quantization, use +For example, to export Voxtral with int4 weight and int4mm for linear layers, you can use the following command, ``` optimum-cli export executorch \ --model "mistralai/Voxtral-Mini-3B-2507" \ @@ -67,7 +67,8 @@ optimum-cli export executorch \ --max_seq_len 1024 \ --qlinear 4w \ --qlinear_encoder 4w \ - --qembedding 4w \ + --qlinear_packing_format tile_packed_to_4d \ + --qlinear_encoder_packing_format tile_packed_to_4d \ --output_dir="voxtral" ```
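
A quantized export produced by the README command above can be exercised with the same runner invocation the CI jobs in `cuda.yml` use. A minimal sketch, assuming the runner was built per the "Building the multimodal runner" section, the preprocessor was exported as `voxtral_preprocessor.pte`, and `tekken.json`/`poem.wav` were fetched as in the workflow:

```
# Run the CUDA runner against the quantized export; paths assume --output_dir="voxtral".
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
cmake-out/examples/models/voxtral/voxtral_runner \
  --model_path voxtral/model.pte \
  --data_path voxtral/aoti_cuda_blob.ptd \
  --tokenizer_path tekken.json \
  --audio_path poem.wav \
  --processor_path voxtral_preprocessor.pte \
  --temperature 0
```

`--temperature 0` makes the transcript deterministic enough for the workflows' `grep -iq "poem"` check on the output.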