From e9283159e84172d1fe1d3bcf9ff20b1a62890aee Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 27 Aug 2025 20:21:35 -0700
Subject: [PATCH 1/3] Include audio preprocessing for raw audio tensor

[ghstack-poisoned]
---
 examples/models/voxtral/multimodal.cpp | 266 ++++++++++++++++++++++---
 1 file changed, 239 insertions(+), 27 deletions(-)
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index d7183f3c662..252fb3865fd 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -12,6 +12,10 @@
 
 #include <gflags/gflags.h>
 
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
+#include <executorch/runtime/core/evalue.h>
+
 #include <executorch/extension/llm/runner/audio.h>
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
@@ -36,6 +40,11 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
 
 DEFINE_string(audio_path, "", "Path to input audio file.");
 
+DEFINE_string(
+    processor_path,
+    "",
+    "Path to processor .pte file for raw audio processing.");
+
 DEFINE_double(
     temperature,
     0.8f,
@@ -50,16 +59,48 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
 
 namespace {
 
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
 using ::executorch::extension::llm::Image;
 using ::executorch::extension::llm::make_image_input;
 using ::executorch::extension::llm::make_text_input;
 using ::executorch::extension::llm::MultimodalInput;
+using ::executorch::runtime::EValue;
 
 bool ends_with(const std::string& str, const std::string& suffix) {
   return str.size() >= suffix.size() &&
       str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 }
 
+/**
+ * @brief Loads float data from a binary file
+ *
+ * @param audio_path Path to the binary audio file (.bin)
+ * @return Vector of float data loaded from the file
+ * @throws std::runtime_error if file loading fails
+ */
+std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
+  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+  if (!f.is_open()) {
+    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
+    throw std::runtime_error("Failed to open audio file");
+  }
+
+  std::size_t n_floats =
+      f.tellg() / sizeof(float); // Number of floats in the audio file
+  f.seekg(0, std::ios::beg);
+
+  std::vector<float> audio_data(n_floats);
+  f.read(
+      reinterpret_cast<char*>(audio_data.data()),
+      audio_data.size() * sizeof(float));
+  f.close();
+
+  ET_LOG(
+      Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
+  return audio_data;
+}
+
 /**
  * @brief Loads preprocessed audio data from a binary file
  *
@@ -70,22 +111,83 @@ bool ends_with(const std::string& str, const std::string& suffix) {
  *       f.write(t.numpy().tobytes())
  *
  * @param audio_path Path to the binary audio file (.bin)
+ * @param processor_path Path to the processor .pte file to get metadata
  * @return MultimodalInput containing the loaded audio data
  */
-MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
-  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
-  int32_t n_bins = 128;
-  int32_t n_frames = 3000;
-  std::size_t n_floats =
-      f.tellg() / sizeof(float); // Number of floats in the audio file.
-  f.seekg(0, std::ios::beg);
+MultimodalInput loadPreprocessedAudio(
+    const std::string& audio_path,
+    const std::string& processor_path = "") {
+  std::vector<float> audio_data = loadBinaryFloatData(audio_path);
+
+  int32_t n_bins, n_frames;
+
+  if (!processor_path.empty()) {
+    // Load processor module to get metadata
+    std::unique_ptr<Module> processor_module;
+    try {
+      processor_module =
+          std::make_unique<Module>(processor_path, Module::LoadMode::File);
+      auto load_error = processor_module->load();
+      if (load_error != ::executorch::runtime::Error::Ok) {
+        ET_LOG(
+            Error,
+            "Failed to load processor module from: %s",
+            processor_path.c_str());
+        throw std::runtime_error("Failed to load processor module");
+      }
+    } catch (const std::exception& e) {
+      ET_LOG(Error, "Exception while loading processor module: %s", e.what());
+      throw std::runtime_error("Exception while loading processor module");
+    }
+
+    // Get n_bins by running "feature_size" method
+    auto feature_size_result = processor_module->execute("feature_size");
+    if (!feature_size_result.ok()) {
+      ET_LOG(
+          Error, "Failed to execute 'feature_size' method on processor module");
+      throw std::runtime_error(
+          "Failed to execute 'feature_size' method on processor module");
+    }
+    auto feature_size_outputs = feature_size_result.get();
+    if (feature_size_outputs.empty()) {
+      ET_LOG(Error, "'feature_size' method returned no outputs");
+      throw std::runtime_error("'feature_size' method returned no outputs");
+    }
+    n_bins = static_cast<int32_t>(feature_size_outputs[0].toInt());
+
+    // Get n_frames by running "nb_max_frames" method
+    auto nb_max_frames_result = processor_module->execute("nb_max_frames");
+    if (!nb_max_frames_result.ok()) {
+      ET_LOG(
+          Error,
+          "Failed to execute 'nb_max_frames' method on processor module");
+      throw std::runtime_error(
+          "Failed to execute 'nb_max_frames' method on processor module");
+    }
+    auto nb_max_frames_outputs = nb_max_frames_result.get();
+    if (nb_max_frames_outputs.empty()) {
+      ET_LOG(Error, "'nb_max_frames' method returned no outputs");
+      throw std::runtime_error("'nb_max_frames' method returned no outputs");
+    }
+    n_frames = static_cast<int32_t>(nb_max_frames_outputs[0].toInt());
+
+    ET_LOG(
+        Info,
+        "Got values from processor methods: n_bins=%d, n_frames=%d",
+        n_bins,
+        n_frames);
+  } else {
+    ET_LOG(
+        Error,
+        "Processor path is required to get feature_size and nb_max_frames");
+    throw std::runtime_error(
+        "Processor path is required to get feature_size and nb_max_frames");
+  }
+
+  std::size_t n_floats = audio_data.size();
   int32_t batch_size = ceil(
       n_floats /
       (n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
-  std::vector<float> audio_data(batch_size * n_bins * n_frames);
-  f.read(
-      reinterpret_cast<char*>(audio_data.data()),
-      audio_data.size() * sizeof(float));
 
   ET_LOG(Info, "audio_data len = %d", audio_data.size());
 
@@ -100,29 +202,138 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 }
 
 /**
- * @brief Processes audio files for multimodal input
+ * @brief Loads a .bin file into a tensor and processes it using a .pte
+ * processor
  *
- * Dispatches audio file processing based on file extension:
- * - .bin files: Loads preprocessed mel spectrogram features directly
- * - .wav/.mp3 files: Currently unsupported, throws runtime_error
+ * This function loads raw audio data from a .bin file (similar to
+ * loadPreprocessedAudio), creates a tensor from it, and then passes it through
+ * a processor module loaded from a .pte file to generate processed audio
+ * features.
+ *
+ * @param audio_path Path to the .bin audio file
+ * @param processor_path Path to the .pte processor file
+ * @return MultimodalInput containing the processed audio data
+ * @throws std::runtime_error if file loading or processing fails
+ */
+MultimodalInput processRawAudioFile(
+    const std::string& audio_path,
+    const std::string& processor_path) {
+  if (processor_path.empty()) {
+    ET_LOG(Error, "Processor path is required for raw audio processing");
+    throw std::runtime_error(
+        "Processor path is required for raw audio processing");
+  }
+
+  // Load the audio processor .pte.
+  std::unique_ptr<Module> processor_module;
+  try {
+    processor_module =
+        std::make_unique<Module>(processor_path, Module::LoadMode::File);
+    auto load_error = processor_module->load();
+    if (load_error != ::executorch::runtime::Error::Ok) {
+      ET_LOG(
+          Error,
+          "Failed to load processor module from: %s",
+          processor_path.c_str());
+      throw std::runtime_error("Failed to load processor module");
+    }
+  } catch (const std::exception& e) {
+    ET_LOG(Error, "Exception while loading processor module: %s", e.what());
+    throw std::runtime_error("Exception while loading processor module");
+  }
+
+  // Load the audio data from file.
+  std::vector<float> audio_data = loadBinaryFloatData(audio_path);
+
+  // Execute the processor
+  std::vector<executorch::aten::SizesType> tensor_shape = {
+      static_cast<executorch::aten::SizesType>(audio_data.size())};
+  auto input_tensor = from_blob(
+      audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);
+
+  ET_LOG(Info, "Processing audio through processor module...");
+  auto result = processor_module->execute("forward", input_tensor);
+  if (!result.ok()) {
+    ET_LOG(Error, "Failed to execute processor's forward method");
+    throw std::runtime_error("Failed to execute processor forward method");
+  }
+
+  auto outputs = result.get();
+  if (outputs.empty()) {
+    ET_LOG(Error, "Processor returned no outputs");
+    throw std::runtime_error("Processor returned no outputs");
+  }
+
+  // Extract processed audio features
+  const auto& processed_tensor = outputs[0].toTensor();
+  const float* processed_data = processed_tensor.const_data_ptr<float>();
+  const auto& sizes = processed_tensor.sizes();
+
+  ET_LOG(
+      Info,
+      "Processed audio tensor shape: [%d, %d, %d]",
+      static_cast<int>(sizes[0]),
+      static_cast<int>(sizes[1]),
+      static_cast<int>(sizes[2]));
+
+  // Create Audio multimodal input from processed features
+  auto processed_audio =
+      std::make_unique<::executorch::extension::llm::Audio>();
+  processed_audio->batch_size = static_cast<int32_t>(
+      sizes[0]); // Note: batching for s > 30 doesn't work yet.
+  processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
+  processed_audio->n_frames = static_cast<int32_t>(sizes[2]);
+
+  size_t total_elements = processed_audio->batch_size *
+      processed_audio->n_bins * processed_audio->n_frames;
+  processed_audio->data.resize(total_elements * sizeof(float));
+  std::memcpy(
+      processed_audio->data.data(),
+      processed_data,
+      total_elements * sizeof(float));
+
+  ET_LOG(
+      Info,
+      "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
+      processed_audio->batch_size,
+      processed_audio->n_bins,
+      processed_audio->n_frames);
+
+  return ::executorch::extension::llm::make_audio_input(
+      std::move(*processed_audio));
+}
+
+/**
+ * @brief Processes audio files for multimodal input
  *
- * This function provides a interface for different audio input formats
- * and can be extended to support raw audio processing in the future.
+ * Dispatches audio file processing based on file extension and processor
+ * availability:
+ * - .bin files with processor: Loads raw audio from .bin and processes through
+ * processor
+ * - .bin files without processor: Loads preprocessed mel spectrogram features
+ * directly
  *
- * @param audio_path Path to the audio file
+ * @param audio_path Path to the audio file (.bin)
+ * @param processor_path Path to the processor .pte file (optional)
  * @return MultimodalInput containing the processed audio data
  * @throws std::runtime_error if file format is unsupported or processing fails
  */
-MultimodalInput processAudioFile(const std::string& audio_path) {
+MultimodalInput processAudioFile(
+    const std::string& audio_path,
+    const std::string& processor_path = "") {
   if (ends_with(audio_path, ".bin")) {
-    // Current behavior - load preprocessed audio stored as a binary file.
-    return loadPreprocessedAudio(audio_path);
-  } else if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
-    // New: Process raw audio files - unsupported for now
-    ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
-    throw std::runtime_error("Raw audio file processing not supported");
+    if (!processor_path.empty()) {
+      // Process raw audio from .bin file through the processor
+      return processRawAudioFile(audio_path, processor_path);
+    } else {
+      // Load preprocessed audio stored as a binary file (existing behavior)
+      return loadPreprocessedAudio(audio_path, processor_path);
+    }
   } else {
-    ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
+    ET_LOG(
+        Error,
+        "Unsupported audio file format: %s (only .bin files are supported)",
+        audio_path.c_str());
     throw std::runtime_error("Unsupported audio file format");
   }
 }
@@ -137,6 +348,7 @@ int32_t main(int32_t argc, char** argv) {
   const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
   const char* prompt = FLAGS_prompt.c_str();
   const char* audio_path = FLAGS_audio_path.c_str();
+  const char* processor_path = FLAGS_processor_path.c_str();
   float temperature = FLAGS_temperature;
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
@@ -184,7 +396,7 @@ int32_t main(int32_t argc, char** argv) {
   inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));
 
   // 2. Add audio input
-  inputs.emplace_back(processAudioFile(audio_path));
+  inputs.emplace_back(processAudioFile(audio_path, processor_path));
 
   // 3. Add text input (the actual user-submitted prompt)
   inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));

From 9861ba5e50a61b4a605a4f753c21e1dbf7793d2f Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 27 Aug 2025 20:29:41 -0700
Subject: [PATCH 2/3] Update on "Include audio preprocessing for raw audio
 tensor"

## Summary

Runs audio preprocessing (mel spectrogram conversion) on a raw audio tensor, using a `.pte` exported from https://github.com/pytorch/executorch/blob/main/extension/audio/mel_spectrogram.py

Current limitation: the mel spectrogram has no batching support yet, so only audio shorter than 30 seconds is supported.

```
The speaker in this audio seems to be talking about their concerns about a device called the model or maybe they're just talking about the model in general. They mention that the model was trained with the speaker for inference, which suggests that
 the model was trained based on the speaker's data or instructions. They also mention that the volume is quite small, which could imply that the speaker is trying to control the volume of the model's output, likely because they are concerned about how loud the model's responses might
PyTorchObserver {"prompt_tokens":388,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756351346381,"inference_end_ms":1756351362602,"prompt_eval_end_ms":1756351351435,"first_token_ms":1756351351435,"aggregate_sampling_time_ms":99,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
I 00:00:24.036773 executorch:stats.h:104]       Prompt Tokens: 388    Generated Tokens: 99
I 00:00:24.036800 executorch:stats.h:110]       Model Load Time:                0.000000 (seconds)
I 00:00:24.036805 executorch:stats.h:117]       Total inference time:           16.221000 (seconds)              Rate:  6.103200 (tokens/second)
I 00:00:24.036815 executorch:stats.h:127]               Prompt evaluation:      5.054000 (seconds)               Rate:  76.770875 (tokens/second)
I 00:00:24.036819 executorch:stats.h:136]               Generated 99 tokens:    11.167000 (seconds)              Rate:  8.865407 (tokens/second)
I 00:00:24.036822 executorch:stats.h:147]       Time to first generated token:  5.054000 (seconds)
I 00:00:24.036828 executorch:stats.h:153]       Sampling time over 487 tokens:  0.099000 (seconds)
```




[ghstack-poisoned]
---
 examples/models/voxtral/multimodal.cpp | 114 +++++++++++++------------
 1 file changed, 58 insertions(+), 56 deletions(-)

diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 252fb3865fd..c2215848422 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -121,68 +121,67 @@ MultimodalInput loadPreprocessedAudio(
 
   int32_t n_bins, n_frames;
 
-  if (!processor_path.empty()) {
-    // Load processor module to get metadata
-    std::unique_ptr<Module> processor_module;
-    try {
-      processor_module =
-          std::make_unique<Module>(processor_path, Module::LoadMode::File);
-      auto load_error = processor_module->load();
-      if (load_error != ::executorch::runtime::Error::Ok) {
-        ET_LOG(
-            Error,
-            "Failed to load processor module from: %s",
-            processor_path.c_str());
-        throw std::runtime_error("Failed to load processor module");
-      }
-    } catch (const std::exception& e) {
-      ET_LOG(Error, "Exception while loading processor module: %s", e.what());
-      throw std::runtime_error("Exception while loading processor module");
-    }
-
-    // Get n_bins by running "feature_size" method
-    auto feature_size_result = processor_module->execute("feature_size");
-    if (!feature_size_result.ok()) {
-      ET_LOG(
-          Error, "Failed to execute 'feature_size' method on processor module");
-      throw std::runtime_error(
-          "Failed to execute 'feature_size' method on processor module");
-    }
-    auto feature_size_outputs = feature_size_result.get();
-    if (feature_size_outputs.empty()) {
-      ET_LOG(Error, "'feature_size' method returned no outputs");
-      throw std::runtime_error("'feature_size' method returned no outputs");
-    }
-    n_bins = static_cast<int32_t>(feature_size_outputs[0].toInt());
+  if (processor_path.empty()) {
+    ET_LOG(
+        Error,
+        "Processor path is required to get feature_size and nb_max_frames");
+    throw std::runtime_error(
+        "Processor path is required to get feature_size and nb_max_frames");
+  }
 
-    // Get n_frames by running "nb_max_frames" method
-    auto nb_max_frames_result = processor_module->execute("nb_max_frames");
-    if (!nb_max_frames_result.ok()) {
+  // Load processor module to get metadata
+  std::unique_ptr<Module> processor_module;
+  try {
+    processor_module =
+        std::make_unique<Module>(processor_path, Module::LoadMode::File);
+    auto load_error = processor_module->load();
+    if (load_error != ::executorch::runtime::Error::Ok) {
       ET_LOG(
           Error,
-          "Failed to execute 'nb_max_frames' method on processor module");
-      throw std::runtime_error(
-          "Failed to execute 'nb_max_frames' method on processor module");
-    }
-    auto nb_max_frames_outputs = nb_max_frames_result.get();
-    if (nb_max_frames_outputs.empty()) {
-      ET_LOG(Error, "'nb_max_frames' method returned no outputs");
-      throw std::runtime_error("'nb_max_frames' method returned no outputs");
+          "Failed to load processor module from: %s",
+          processor_path.c_str());
+      throw std::runtime_error("Failed to load processor module");
     }
-    n_frames = static_cast<int32_t>(nb_max_frames_outputs[0].toInt());
+  } catch (const std::exception& e) {
+    ET_LOG(Error, "Exception while loading processor module: %s", e.what());
+    throw std::runtime_error("Exception while loading processor module");
+  }
 
+  // Get n_bins by running "feature_size" method
+  auto feature_size_result = processor_module->execute("feature_size");
+  if (!feature_size_result.ok()) {
     ET_LOG(
-        Info,
-        "Got values from processor methods: n_bins=%d, n_frames=%d",
-        n_bins,
-        n_frames);
-  } else {
+        Error, "Failed to execute 'feature_size' method on processor module");
+    throw std::runtime_error(
+        "Failed to execute 'feature_size' method on processor module");
+  }
+  auto feature_size_outputs = feature_size_result.get();
+  if (feature_size_outputs.empty()) {
+    ET_LOG(Error, "'feature_size' method returned no outputs");
+    throw std::runtime_error("'feature_size' method returned no outputs");
+  }
+  n_bins = static_cast<int32_t>(feature_size_outputs[0].toInt());
+
+  // Get n_frames by running "nb_max_frames" method
+  auto nb_max_frames_result = processor_module->execute("nb_max_frames");
+  if (!nb_max_frames_result.ok()) {
     ET_LOG(
-        Error,
-        "Processor path is required to get feature_size and nb_max_frames");
+        Error, "Failed to execute 'nb_max_frames' method on processor module");
     throw std::runtime_error(
-        "Processor path is required to get feature_size and nb_max_frames");
+        "Failed to execute 'nb_max_frames' method on processor module");
   }
+  auto nb_max_frames_outputs = nb_max_frames_result.get();
+  if (nb_max_frames_outputs.empty()) {
+    ET_LOG(Error, "'nb_max_frames' method returned no outputs");
+    throw std::runtime_error("'nb_max_frames' method returned no outputs");
+  }
+  n_frames = static_cast<int32_t>(nb_max_frames_outputs[0].toInt());
+
+  ET_LOG(
+      Info,
+      "Got values from processor methods: n_bins=%d, n_frames=%d",
+      n_bins,
+      n_frames);
 
   std::size_t n_floats = audio_data.size();
   int32_t batch_size = ceil(
@@ -191,6 +190,7 @@ MultimodalInput loadPreprocessedAudio(
 
   ET_LOG(Info, "audio_data len = %d", audio_data.size());
 
+  // Create Audio multimodal input
   auto audio = std::make_unique<::executorch::extension::llm::Audio>();
   audio->batch_size = batch_size;
   audio->n_bins = n_bins;
@@ -279,10 +279,12 @@ MultimodalInput processRawAudioFile(
   // Create Audio multimodal input from processed features
   auto processed_audio =
       std::make_unique<::executorch::extension::llm::Audio>();
-  processed_audio->batch_size = static_cast<int32_t>(
-      sizes[0]); // Note: batching for s > 30 doesn't work yet.
+  processed_audio->batch_size =
+      static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
+                                      // yet, so this will just be = 1.
   processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
-  processed_audio->n_frames = static_cast<int32_t>(sizes[2]);
+  processed_audio->n_frames =
+      static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
 
   size_t total_elements = processed_audio->batch_size *
       processed_audio->n_bins * processed_audio->n_frames;

From efc12d85923a60633b7d2cc4a8b0281dca7cd3f9 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Wed, 27 Aug 2025 20:52:20 -0700
Subject: [PATCH 3/3] Update on "Include audio preprocessing for raw audio
 tensor"

## Summary

Runs audio preprocessing (mel spectrogram conversion) on a raw audio tensor, using a `.pte` exported from https://github.com/pytorch/executorch/blob/main/extension/audio/mel_spectrogram.py

Current limitation: the mel spectrogram has no batching support yet, so only audio shorter than 30 seconds is supported.

```
The speaker in this audio seems to be talking about their concerns about a device called the model or maybe they're just talking about the model in general. They mention that the model was trained with the speaker for inference, which suggests that
 the model was trained based on the speaker's data or instructions. They also mention that the volume is quite small, which could imply that the speaker is trying to control the volume of the model's output, likely because they are concerned about how loud the model's responses might
PyTorchObserver {"prompt_tokens":388,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756351346381,"inference_end_ms":1756351362602,"prompt_eval_end_ms":1756351351435,"first_token_ms":1756351351435,"aggregate_sampling_time_ms":99,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
I 00:00:24.036773 executorch:stats.h:104]       Prompt Tokens: 388    Generated Tokens: 99
I 00:00:24.036800 executorch:stats.h:110]       Model Load Time:                0.000000 (seconds)
I 00:00:24.036805 executorch:stats.h:117]       Total inference time:           16.221000 (seconds)              Rate:  6.103200 (tokens/second)
I 00:00:24.036815 executorch:stats.h:127]               Prompt evaluation:      5.054000 (seconds)               Rate:  76.770875 (tokens/second)
I 00:00:24.036819 executorch:stats.h:136]               Generated 99 tokens:    11.167000 (seconds)              Rate:  8.865407 (tokens/second)
I 00:00:24.036822 executorch:stats.h:147]       Time to first generated token:  5.054000 (seconds)
I 00:00:24.036828 executorch:stats.h:153]       Sampling time over 487 tokens:  0.099000 (seconds)
```




[ghstack-poisoned]
---
 examples/models/voxtral/multimodal.cpp | 72 ++------------------------
 1 file changed, 4 insertions(+), 68 deletions(-)

diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index c2215848422..b086a04363c 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -111,77 +111,13 @@ std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
  *       f.write(t.numpy().tobytes())
  *
  * @param audio_path Path to the binary audio file (.bin)
- * @param processor_path Path to the processor .pte file to get metadata
  * @return MultimodalInput containing the loaded audio data
  */
-MultimodalInput loadPreprocessedAudio(
-    const std::string& audio_path,
-    const std::string& processor_path = "") {
+MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
   std::vector<float> audio_data = loadBinaryFloatData(audio_path);
 
-  int32_t n_bins, n_frames;
-
-  if (processor_path.empty()) {
-    ET_LOG(
-        Error,
-        "Processor path is required to get feature_size and nb_max_frames");
-    throw std::runtime_error(
-        "Processor path is required to get feature_size and nb_max_frames");
-  }
-
-  // Load processor module to get metadata
-  std::unique_ptr<Module> processor_module;
-  try {
-    processor_module =
-        std::make_unique<Module>(processor_path, Module::LoadMode::File);
-    auto load_error = processor_module->load();
-    if (load_error != ::executorch::runtime::Error::Ok) {
-      ET_LOG(
-          Error,
-          "Failed to load processor module from: %s",
-          processor_path.c_str());
-      throw std::runtime_error("Failed to load processor module");
-    }
-  } catch (const std::exception& e) {
-    ET_LOG(Error, "Exception while loading processor module: %s", e.what());
-    throw std::runtime_error("Exception while loading processor module");
-  }
-
-  // Get n_bins by running "feature_size" method
-  auto feature_size_result = processor_module->execute("feature_size");
-  if (!feature_size_result.ok()) {
-    ET_LOG(
-        Error, "Failed to execute 'feature_size' method on processor module");
-    throw std::runtime_error(
-        "Failed to execute 'feature_size' method on processor module");
-  }
-  auto feature_size_outputs = feature_size_result.get();
-  if (feature_size_outputs.empty()) {
-    ET_LOG(Error, "'feature_size' method returned no outputs");
-    throw std::runtime_error("'feature_size' method returned no outputs");
-  }
-  n_bins = static_cast<int32_t>(feature_size_outputs[0].toInt());
-
-  // Get n_frames by running "nb_max_frames" method
-  auto nb_max_frames_result = processor_module->execute("nb_max_frames");
-  if (!nb_max_frames_result.ok()) {
-    ET_LOG(
-        Error, "Failed to execute 'nb_max_frames' method on processor module");
-    throw std::runtime_error(
-        "Failed to execute 'nb_max_frames' method on processor module");
-  }
-  auto nb_max_frames_outputs = nb_max_frames_result.get();
-  if (nb_max_frames_outputs.empty()) {
-    ET_LOG(Error, "'nb_max_frames' method returned no outputs");
-    throw std::runtime_error("'nb_max_frames' method returned no outputs");
-  }
-  n_frames = static_cast<int32_t>(nb_max_frames_outputs[0].toInt());
-
-  ET_LOG(
-      Info,
-      "Got values from processor methods: n_bins=%d, n_frames=%d",
-      n_bins,
-      n_frames);
+  int32_t n_bins = 128;
+  int32_t n_frames = 3000;
 
   std::size_t n_floats = audio_data.size();
   int32_t batch_size = ceil(
@@ -329,7 +265,7 @@ MultimodalInput processAudioFile(
       return processRawAudioFile(audio_path, processor_path);
     } else {
       // Load preprocessed audio stored as a binary file (existing behavior)
-      return loadPreprocessedAudio(audio_path, processor_path);
+      return loadPreprocessedAudio(audio_path);
     }
   } else {
     ET_LOG(