Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 17 additions & 30 deletions examples/models/voxtral/multimodal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,13 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {

ET_LOG(Info, "audio_data len = %zu", n_floats);

// Create Audio multimodal input
auto audio = std::make_unique<::executorch::extension::llm::Audio>();
audio->batch_size = batch_size;
audio->n_bins = n_bins;
audio->n_frames = n_frames;
audio->data.resize(n_floats * sizeof(float));
f.read(reinterpret_cast<char*>(audio->data.data()), n_floats * sizeof(float));
std::vector<float> audio_data(n_floats);
f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
f.close();
return ::executorch::extension::llm::make_audio_input(std::move(*audio));

auto audio = ::executorch::extension::llm::Audio(
std::move(audio_data), batch_size, n_bins, n_frames);
return ::executorch::extension::llm::make_audio_input(std::move(audio));
}

/**
Expand Down Expand Up @@ -206,32 +204,21 @@ MultimodalInput processRawAudioFile(
static_cast<int>(sizes[2]));

// Create Audio multimodal input from processed features
auto processed_audio =
std::make_unique<::executorch::extension::llm::Audio>();
processed_audio->batch_size =
static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
// yet, so this will just be = 1.
processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
processed_audio->n_frames =
static_cast<int32_t>(sizes[2]); // And this will just be = 3000.

size_t total_elements = processed_audio->batch_size *
processed_audio->n_bins * processed_audio->n_frames;
processed_audio->data.resize(total_elements * sizeof(float));
std::memcpy(
processed_audio->data.data(),
processed_data,
total_elements * sizeof(float));

int32_t batch_size = static_cast<int32_t>(sizes[0]);
int32_t n_bins = static_cast<int32_t>(sizes[1]);
int32_t n_frames = static_cast<int32_t>(sizes[2]);
size_t total_elements = batch_size * n_bins * n_frames;
std::vector<float> audio_vec(processed_data, processed_data + total_elements);
auto processed_audio = ::executorch::extension::llm::Audio(
std::move(audio_vec), batch_size, n_bins, n_frames);
ET_LOG(
Info,
"Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
processed_audio->batch_size,
processed_audio->n_bins,
processed_audio->n_frames);

batch_size,
n_bins,
n_frames);
return ::executorch::extension::llm::make_audio_input(
std::move(*processed_audio));
std::move(processed_audio));
}

/**
Expand Down
129 changes: 122 additions & 7 deletions extension/llm/runner/audio.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@
#pragma once
#include <executorch/runtime/platform/compiler.h>
#include <cstdint>
#include <variant>
#include <vector>

#include <executorch/extension/tensor/tensor.h>

namespace executorch {
namespace extension {
namespace llm {
Expand All @@ -29,14 +32,126 @@ struct ET_EXPERIMENTAL RawAudio {
};

/**
* Pre-processed audio inputs, ready to feed directly into an audio
* encoder.
* Pre-processed audio inputs, ready to feed directly into an audio encoder.
*
* The data can be either uint8_t or float. If the audio has gone through a Mel
* transform, we expect the data type to be float (i.e., std::vector<float>), as
* Mel spectrograms are typically represented as floating point values. For raw
* or quantized audio, uint8_t may be used instead.
*/
struct ET_EXPERIMENTAL Audio {
std::vector<uint8_t> data;
int32_t batch_size;
int32_t n_bins;
int32_t n_frames;
class ET_EXPERIMENTAL Audio final {
 public:
  // Default constructor: empty uint8_t payload, zero-sized dimensions.
  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}

  /**
   * Constructor for uint8_t data (raw or quantized audio).
   * Aborts via ET_CHECK_MSG if data.size() != batch_size * n_bins * n_frames.
   */
  Audio(
      std::vector<uint8_t>&& data,
      int32_t batch_size,
      int32_t n_bins,
      int32_t n_frames)
      : data_(std::move(data)),
        batch_size_(batch_size),
        n_bins_(n_bins),
        n_frames_(n_frames) {
    // Note: no need to check the variant alternative here — it was just
    // constructed from a std::vector<uint8_t>.
    check_data_size(std::get<std::vector<uint8_t>>(data_).size());
  }

  /**
   * Constructor for float data (e.g. Mel spectrogram features).
   * Aborts via ET_CHECK_MSG if data.size() != batch_size * n_bins * n_frames.
   */
  Audio(
      std::vector<float>&& data,
      int32_t batch_size,
      int32_t n_bins,
      int32_t n_frames)
      : data_(std::move(data)),
        batch_size_(batch_size),
        n_bins_(n_bins),
        n_frames_(n_frames) {
    check_data_size(std::get<std::vector<float>>(data_).size());
  }

  // Type checkers
  bool is_uint8() const {
    return std::holds_alternative<std::vector<uint8_t>>(data_);
  }

  bool is_float() const {
    return std::holds_alternative<std::vector<float>>(data_);
  }

  // Data access. These throw std::bad_variant_access if the stored
  // alternative does not match; callers should consult is_uint8()/is_float()
  // first.
  const std::vector<uint8_t>& get_uint8_data() const& {
    return std::get<std::vector<uint8_t>>(data_);
  }

  std::vector<uint8_t>& get_uint8_data() & {
    return std::get<std::vector<uint8_t>>(data_);
  }

  const std::vector<float>& get_float_data() const& {
    return std::get<std::vector<float>>(data_);
  }

  std::vector<float>& get_float_data() & {
    return std::get<std::vector<float>>(data_);
  }

  int32_t get_batch_size() const {
    return batch_size_;
  }
  int32_t get_n_bins() const {
    return n_bins_;
  }
  int32_t get_n_frames() const {
    return n_frames_;
  }

  /**
   * Convert the audio data to a TensorPtr, with optional batch dimension.
   * The tensor will have shape (batch_size, n_bins, n_frames) or
   * (1, batch_size, n_bins, n_frames) if with_batch is true.
   *
   * The returned tensor is a non-owning view (from_blob) into this Audio's
   * internal buffer: this Audio object must outlive the tensor, and the
   * tensor must not be written through.
   *
   * @param with_batch Prepend a leading dimension of size 1 when true.
   * @return TensorPtr viewing the data, or Error::NotSupported if the
   *         variant holds neither alternative (not reachable via the
   *         public constructors).
   */
  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
      bool with_batch = false) const {
    std::vector<executorch::aten::SizesType> sizes = {
        get_batch_size(), get_n_bins(), get_n_frames()};
    if (with_batch) {
      sizes.insert(sizes.begin(), 1);
    }
    if (is_float()) {
      return executorch::extension::from_blob(
          const_cast<float*>(get_float_data().data()),
          sizes,
          ::executorch::aten::ScalarType::Float);
    } else if (is_uint8()) {
      return executorch::extension::from_blob(
          const_cast<uint8_t*>(get_uint8_data().data()),
          sizes,
          ::executorch::aten::ScalarType::Byte);
    }
    ET_LOG(
        Error,
        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
    return ::executorch::runtime::Error::NotSupported;
  }

 private:
  // Shared constructor validation. Computes the expected element count in
  // size_t arithmetic (casting each factor individually) so the product
  // cannot overflow int32_t before the comparison, which the previous
  // inline `batch_size * n_bins * n_frames` expression could.
  void check_data_size(size_t actual_size) const {
    const size_t expected_size = static_cast<size_t>(batch_size_) *
        static_cast<size_t>(n_bins_) * static_cast<size_t>(n_frames_);
    ET_CHECK_MSG(
        actual_size == expected_size,
        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%zu)",
        actual_size,
        expected_size);
  }

  // Members
  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
  int32_t batch_size_;
  int32_t n_bins_;
  int32_t n_frames_;
};

} // namespace llm
Expand Down
29 changes: 19 additions & 10 deletions extension/llm/runner/multimodal_prefiller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,21 +47,24 @@ Result<uint64_t> MultimodalPrefiller::prefill(
"Failed to get method_meta for %s",
kVisionEncoderMethod);

ET_CHECK_MSG(
ET_CHECK_OR_RETURN_ERROR(
method_meta.num_inputs() > 0,
InvalidArgument,
"Image encoder should have at least 1 input");
auto input_meta = ET_UNWRAP(
method_meta.input_tensor_meta(0),
"Cannot get input tensor meta at index 0");
auto expected_dtype = input_meta.scalar_type();

if (expected_dtype == ::executorch::aten::ScalarType::Float) {
ET_CHECK_MSG(
ET_CHECK_OR_RETURN_ERROR(
image.is_float(),
InvalidArgument,
"Model expects float image data, but image has uint8_t data.");
} else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
ET_CHECK_MSG(
ET_CHECK_OR_RETURN_ERROR(
image.is_uint8(),
InvalidArgument,
"Model expects uint8_t image data, but image has float data.");
} else {
ET_LOG(
Expand All @@ -77,7 +80,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
auto image_tensor = ET_UNWRAP(
image.toTensor(/*with_batch*/ expected_dims.size() == 4),
"Failed to convert image to tensor");

ET_LOG(
Info,
"Image tensor dim: %zu, dtype: %s",
image_tensor->dim(),
::executorch::runtime::toString(image_tensor->scalar_type()));
// Run image encoder
auto image_encoder_outputs =
ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
Expand All @@ -86,12 +93,14 @@ Result<uint64_t> MultimodalPrefiller::prefill(
} else if (input.is_audio()) {
Audio audio = input.get_audio();

// Use the original tensor shape as intended
auto audio_tensor = executorch::extension::from_blob(
audio.data.data(),
{audio.batch_size, audio.n_bins, audio.n_frames},
::executorch::aten::ScalarType::Float);

// Use Audio::toTensor() for tensor creation
auto audio_tensor =
ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
ET_LOG(
Info,
"Audio tensor dim: %zu, dtype: %s",
audio_tensor->dim(),
::executorch::runtime::toString(audio_tensor->scalar_type()));
// Run audio encoder
auto audio_encoder_result =
module_->execute(kAudioEncoderMethod, audio_tensor);
Expand Down
Loading