From af94fa5456afa1206176cb32c960eb182aa1c16a Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Fri, 19 Sep 2025 14:51:52 -0700
Subject: [PATCH] [multimodal] Let Audio take float data blob (#14427)

If the processed audio has gone through a Mel transform, the spectrogram
values are floats. The `Audio` class should be able to take this, since the
multimodal runner pybind API will have to accept preprocessed input. Once we
have the pybind API, we can do something like:

```python
model_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(model_id)

audio_url = "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav"

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "url": audio_url},
            {
                "type": "text",
                "text": "What can you tell me about this audio?",
            },
        ],
    },
]

inputs = processor.apply_chat_template(conversation, tokenize=True, return_dict=True, return_tensors="pt")

inputs_combined = [
    make_text_input("[INST][BEGIN_AUDIO]"),
    make_audio_input(inputs["input_features"]),
    make_text_input("\nWhat can you tell me about this audio?[/INST]"),
]

runner = MultimodalRunner("voxtral.pte", "tekken.json", None)
config = GenerationConfig()
config.max_new_tokens = 100
runner.generate(inputs_combined, config)
```
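On the C++ side, the new constructor mirrors what `examples/models/voxtral/multimodal.cpp` does below. A minimal sketch of the intended usage (the `multimodal_input.h` include path and the feature dimensions are illustrative assumptions, not part of this patch):

```cpp
// Sketch: wrapping Mel-spectrogram floats in the float-capable Audio class.
// The constructor CHECK-fails unless data.size() == batch_size * n_bins * n_frames.
#include <utility>
#include <vector>

#include <executorch/extension/llm/runner/audio.h>
#include <executorch/extension/llm/runner/multimodal_input.h> // assumed path

using ::executorch::extension::llm::Audio;
using ::executorch::extension::llm::make_audio_input;

int main() {
  // Hypothetical Voxtral-style features: 1 batch, 128 Mel bins, 3000 frames.
  std::vector<float> features(1 * 128 * 3000, 0.0f);
  Audio audio(
      std::move(features), /*batch_size=*/1, /*n_bins=*/128, /*n_frames=*/3000);

  // The prefiller consumes this as a MultimodalInput; Audio::toTensor() will
  // produce a Float tensor of shape (1, 128, 3000) for the audio encoder.
  auto input = make_audio_input(std::move(audio));
  return 0;
}
```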
(cherry picked from commit 8b114180ef143abb06b0441c0788edec5461e5ad)
---
 examples/models/voxtral/multimodal.cpp        |  47 +++----
 extension/llm/runner/audio.h                  | 129 +++++++++++++++++-
 extension/llm/runner/multimodal_prefiller.cpp |  29 ++--
 3 files changed, 158 insertions(+), 47 deletions(-)

diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 17013df96e1..081df27cd67 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -103,15 +103,13 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 
   ET_LOG(Info, "audio_data len = %zu", n_floats);
 
-  // Create Audio multimodal input
-  auto audio = std::make_unique<::executorch::extension::llm::Audio>();
-  audio->batch_size = batch_size;
-  audio->n_bins = n_bins;
-  audio->n_frames = n_frames;
-  audio->data.resize(n_floats * sizeof(float));
-  f.read(reinterpret_cast<char*>(audio->data.data()), n_floats * sizeof(float));
+  std::vector<float> audio_data(n_floats);
+  f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
   f.close();
-  return ::executorch::extension::llm::make_audio_input(std::move(*audio));
+
+  auto audio = ::executorch::extension::llm::Audio(
+      std::move(audio_data), batch_size, n_bins, n_frames);
+  return ::executorch::extension::llm::make_audio_input(std::move(audio));
 }
 
 /**
@@ -206,32 +204,21 @@ MultimodalInput processRawAudioFile(
       static_cast<int32_t>(sizes[2]));
 
   // Create Audio multimodal input from processed features
-  auto processed_audio =
-      std::make_unique<::executorch::extension::llm::Audio>();
-  processed_audio->batch_size =
-      static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
-                                      // yet, so this will just be = 1.
-  processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
-  processed_audio->n_frames =
-      static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
-
-  size_t total_elements = processed_audio->batch_size *
-      processed_audio->n_bins * processed_audio->n_frames;
-  processed_audio->data.resize(total_elements * sizeof(float));
-  std::memcpy(
-      processed_audio->data.data(),
-      processed_data,
-      total_elements * sizeof(float));
-
+  int32_t batch_size = static_cast<int32_t>(sizes[0]);
+  int32_t n_bins = static_cast<int32_t>(sizes[1]);
+  int32_t n_frames = static_cast<int32_t>(sizes[2]);
+  size_t total_elements = batch_size * n_bins * n_frames;
+  std::vector<float> audio_vec(processed_data, processed_data + total_elements);
+  auto processed_audio = ::executorch::extension::llm::Audio(
+      std::move(audio_vec), batch_size, n_bins, n_frames);
   ET_LOG(
       Info,
       "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
-      processed_audio->batch_size,
-      processed_audio->n_bins,
-      processed_audio->n_frames);
-
+      batch_size,
+      n_bins,
+      n_frames);
   return ::executorch::extension::llm::make_audio_input(
-      std::move(*processed_audio));
+      std::move(processed_audio));
 }
 
 /**
diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
index 868765950af..ce71513ed17 100644
--- a/extension/llm/runner/audio.h
+++ b/extension/llm/runner/audio.h
@@ -11,8 +11,11 @@
 #pragma once
 #include <cstdint>
 #include <vector>
+#include <variant>
 
 #include <executorch/runtime/platform/compiler.h>
+#include <executorch/extension/tensor/tensor.h>
+
 namespace executorch {
 namespace extension {
 namespace llm {
@@ -29,14 +32,126 @@ struct ET_EXPERIMENTAL RawAudio {
 };
 
 /**
- * Pre-processed audio inputs, ready to feed directly into an audio
- * encoder.
+ * Pre-processed audio inputs, ready to feed directly into an audio encoder.
+ *
+ * The data can be either uint8_t or float.
+ * If the audio has gone through a Mel transform, we expect the data type to
+ * be float (i.e., std::vector<float>), as Mel spectrograms are typically
+ * represented as floating point values. For raw or quantized audio, uint8_t
+ * may be used instead.
  */
-struct ET_EXPERIMENTAL Audio {
-  std::vector<uint8_t> data;
-  int32_t batch_size;
-  int32_t n_bins;
-  int32_t n_frames;
+class ET_EXPERIMENTAL Audio final {
+ public:
+  // Default constructor
+  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}
+
+  // Constructor for uint8_t data
+  Audio(
+      std::vector<uint8_t>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 0 &&
+            std::get<std::vector<uint8_t>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<uint8_t>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Constructor for float data
+  Audio(
+      std::vector<float>&& data,
+      int32_t batch_size,
+      int32_t n_bins,
+      int32_t n_frames)
+      : data_(std::move(data)),
+        batch_size_(batch_size),
+        n_bins_(n_bins),
+        n_frames_(n_frames) {
+    ET_CHECK_MSG(
+        data_.index() == 1 &&
+            std::get<std::vector<float>>(data_).size() ==
+                static_cast<size_t>(batch_size * n_bins * n_frames),
+        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
+        std::get<std::vector<float>>(data_).size(),
+        batch_size * n_bins * n_frames);
+  }
+
+  // Type checkers
+  bool is_uint8() const {
+    return std::holds_alternative<std::vector<uint8_t>>(data_);
+  }
+
+  bool is_float() const {
+    return std::holds_alternative<std::vector<float>>(data_);
+  }
+
+  // Data access
+  const std::vector<uint8_t>& get_uint8_data() const& {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  std::vector<uint8_t>& get_uint8_data() & {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  const std::vector<float>& get_float_data() const& {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  std::vector<float>& get_float_data() & {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  int32_t get_batch_size() const {
+    return batch_size_;
+  }
+  int32_t get_n_bins() const {
+    return n_bins_;
+  }
+  int32_t get_n_frames() const {
+    return n_frames_;
+  }
+  /**
+   * Convert the audio data to a TensorPtr, with optional batch dimension.
+   * The tensor will have shape (batch_size, n_bins, n_frames) or (1,
+   * batch_size, n_bins, n_frames) if with_batch is true.
+   */
+  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+      bool with_batch = false) const {
+    std::vector<executorch::aten::SizesType> sizes = {
+        get_batch_size(), get_n_bins(), get_n_frames()};
+    if (with_batch) {
+      sizes.insert(sizes.begin(), 1);
+    }
+    if (is_float()) {
+      return executorch::extension::from_blob(
+          const_cast<float*>(get_float_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Float);
+    } else if (is_uint8()) {
+      return executorch::extension::from_blob(
+          const_cast<uint8_t*>(get_uint8_data().data()),
+          sizes,
+          ::executorch::aten::ScalarType::Byte);
+    }
+    ET_LOG(
+        Error,
+        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
+    return ::executorch::runtime::Error::NotSupported;
+  }
+
+ private:
+  // Members
+  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
+  int32_t batch_size_;
+  int32_t n_bins_;
+  int32_t n_frames_;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index f9645667f24..824fdf943a9 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -47,8 +47,9 @@ Result<uint64_t> MultimodalPrefiller::prefill(
         "Failed to get method_meta for %s",
         kVisionEncoderMethod);
 
-    ET_CHECK_MSG(
+    ET_CHECK_OR_RETURN_ERROR(
         method_meta.num_inputs() > 0,
+        InvalidArgument,
         "Image encoder should have at least 1 input");
     auto input_meta = ET_UNWRAP(
         method_meta.input_tensor_meta(0),
@@ -56,12 +57,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto expected_dtype = input_meta.scalar_type();
 
     if (expected_dtype == ::executorch::aten::ScalarType::Float) {
-      ET_CHECK_MSG(
+      ET_CHECK_OR_RETURN_ERROR(
           image.is_float(),
+          InvalidArgument,
           "Model expects float image data, but image has uint8_t data.");
     } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
-      ET_CHECK_MSG(
+      ET_CHECK_OR_RETURN_ERROR(
          image.is_uint8(),
+          InvalidArgument,
           "Model expects uint8_t image data, but image has float data.");
     } else {
       ET_LOG(
@@ -77,7 +80,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto image_tensor = ET_UNWRAP(
         image.toTensor(/*with_batch*/ expected_dims.size() == 4),
         "Failed to convert image to tensor");
-
+    ET_LOG(
+        Info,
+        "Image tensor dim: %zu, dtype: %s",
+        image_tensor->dim(),
+        ::executorch::runtime::toString(image_tensor->scalar_type()));
     // Run image encoder
     auto image_encoder_outputs =
         ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
@@ -86,12 +93,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   } else if (input.is_audio()) {
     Audio audio = input.get_audio();
 
-    // Use the original tensor shape as intended
-    auto audio_tensor = executorch::extension::from_blob(
-        audio.data.data(),
-        {audio.batch_size, audio.n_bins, audio.n_frames},
-        ::executorch::aten::ScalarType::Float);
-
+    // Use Audio::toTensor() for tensor creation
+    auto audio_tensor =
+        ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
+    ET_LOG(
+        Info,
+        "Audio tensor dim: %zu, dtype: %s",
+        audio_tensor->dim(),
+        ::executorch::runtime::toString(audio_tensor->scalar_type()));
     // Run audio encoder
     auto audio_encoder_result =
         module_->execute(kAudioEncoderMethod, audio_tensor);
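For reviewers, a standalone sketch of the `toTensor()` contract introduced above (dimensions arbitrary and hypothetical; it relies only on the constructors and accessors added in `audio.h`):

```cpp
// Sketch: shape/dtype behavior of the new Audio::toTensor().
#include <cstdint>
#include <utility>
#include <vector>

#include <executorch/extension/llm/runner/audio.h>

using ::executorch::extension::llm::Audio;

int main() {
  // Float data (e.g. a Mel spectrogram): 2 x 4 x 8 elements.
  Audio mel(std::vector<float>(2 * 4 * 8, 0.5f), 2, 4, 8);
  auto t = mel.toTensor();                      // Float tensor, shape (2, 4, 8)
  auto tb = mel.toTensor(/*with_batch=*/true);  // shape (1, 2, 4, 8)

  // uint8_t data takes the Byte path instead.
  Audio raw(std::vector<uint8_t>(1 * 2 * 3, 0), 1, 2, 3);
  auto tu = raw.toTensor();                     // Byte tensor, shape (1, 2, 3)

  return (t.ok() && tb.ok() && tu.ok()) ? 0 : 1;
}
```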