From 0da3f0d6adcfd2c854c00b77a6a94224c5259c40 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Fri, 5 Sep 2025 17:20:07 -0400 Subject: [PATCH] Fix voxtral instructions --- examples/models/voxtral/CMakeLists.txt | 4 ++ examples/models/voxtral/README.md | 54 +++++++++++++++----------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 682bb10e786..85c6a13e0ff 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -20,6 +20,10 @@ else() set(CMAKE_TOOLCHAIN_IOS OFF) endif() +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + # Let files say "include " set(_common_include_directories ${EXECUTORCH_ROOT}/..) diff --git a/examples/models/voxtral/README.md b/examples/models/voxtral/README.md index 5a06df9803d..a111abdd630 100644 --- a/examples/models/voxtral/README.md +++ b/examples/models/voxtral/README.md @@ -26,20 +26,30 @@ pip install git+https://github.com/huggingface/transformers@6121e9e46c4fc4e5c91d ## Using the export CLI We export Voxtral using the Optimum CLI, which will export `model.pte` to the `voxtral` output directory: ``` -optimum-cli export executorch - --model "mistralai/Voxtral-Mini-3B-2507" - --task "multimodal-text-to-text" - --recipe "xnnpack" - --use_custom_sdpa - --use_custom_kv_cache - --qlinear 8da4w - --qembedding 4w - --output_dir="voxtral +optimum-cli export executorch \ + --model "mistralai/Voxtral-Mini-3B-2507" \ + --task "multimodal-text-to-text" \ + --recipe "xnnpack" \ + --use_custom_sdpa \ + --use_custom_kv_cache \ + --qlinear 8da4w \ + --qembedding 4w \ + --output_dir="voxtral" ``` This exports Voxtral with XNNPack backend acceleration and 4-bit weight/8-bit activation linear quantization. -# [Optional] Exporting the audio preprocessor +# Running the model +To run the model, we will use the Voxtral runner, which utilizes ExecuTorch's MultiModal runner API. +The Voxtral runner will do the following things: + +- Audio Input: + - Option A: Pass the raw audio tensor into exported preprocessor to produce a mel spectrogram tensor. + - Option B: If starting directly with an already processed audio input tensor, format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.). +- Feed the formatted inputs to the multimodal modal runner. + + +# [Option A] Exporting the audio preprocessor The exported model takes in a mel spectrogram input tensor as its audio inputs. We provide a simple way to transform raw audio data into a mel spectrogram by exporting a version of Voxtral's audio preprocessor used directly by Transformers. @@ -47,13 +57,6 @@ We provide a simple way to transform raw audio data into a mel spectrogram by ex python -m executorch.extension.audio.mel_spectrogram --feature_size 128 --output_file voxtral_preprocessor.pte ``` -# Running the model -To run the model, we will use the Voxtral runner, which utilizes ExecuTorch's MultiModal runner API. -The Voxtral runner will do the following things: -1. [Optional] Pass the raw audio tensor into exported preprocessor to produce a mel spectrogram tensor. -2. [If starting directly with an already processed audio input tensor] Format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.). -3. Feed the formatted inputs to the multimodal modal runner. - ## Building the multimodal runner ``` # Build and install ExecuTorch @@ -66,11 +69,12 @@ cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Re ## Running the model You can download the `tekken.json` tokenizer from [Voxtral's HuggingFace repo](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507). ``` -./cmake-out/examples/models/voxtral/voxtral_runner - --model_path voxtral/model.pte - --tokenizer_path path/to/tekken.json - --prompt "What can you tell me about this audio?" - --audio_path ~/models/voxtral/audio_input.bin +./cmake-out/examples/models/voxtral/voxtral_runner \ + --model_path path/to/model.pte \ + --tokenizer_path path/to/tekken.json \ + --prompt "What can you tell me about this audio?" \ + --audio_path path/to/audio_input.bin \ + --processor_path path/to/voxtral_preprocessor.pte # If you're passing raw audio file in audio_path ``` Example output: @@ -93,3 +97,9 @@ You can easily produce an `.bin` for the audio input in Python like this: with open("tensor.bin", "wb") as f: f.write(t.numpy().tobytes()) ``` + +You can also produce raw audio file as follows (for Option A): + +``` +ffmpeg -i audio.mp3 -f f32le -acodec pcm_f32le audio_input.bin +``` \ No newline at end of file