97 changes: 30 additions & 67 deletions extension/llm/runner/audio.h
@@ -11,7 +11,6 @@
#pragma once
#include <executorch/runtime/platform/compiler.h>
#include <cstdint>
#include <variant>
#include <vector>

#include <executorch/extension/tensor/tensor.h>
@@ -41,117 +40,81 @@ struct ET_EXPERIMENTAL RawAudio {
*/
class ET_EXPERIMENTAL Audio final {
public:
// Default constructor
Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}

// Constructor for uint8_t data
Audio(
std::vector<uint8_t>&& data,
int32_t batch_size,
int32_t n_bins,
int32_t n_frames)
: data_(std::move(data)),
batch_size_(batch_size),
n_bins_(n_bins),
n_frames_(n_frames) {
ET_CHECK_MSG(
data_.index() == 0 &&
std::get<std::vector<uint8_t>>(data_).size() ==
static_cast<size_t>(batch_size * n_bins * n_frames),
"data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
std::get<std::vector<uint8_t>>(data_).size(),
batch_size * n_bins * n_frames);
}
: Audio(make_tensor_ptr(
{batch_size, n_bins, n_frames},
std::move(data),
executorch::aten::ScalarType::Byte)) {}

// Constructor for float data
Audio(
std::vector<float>&& data,
int32_t batch_size,
int32_t n_bins,
int32_t n_frames)
: data_(std::move(data)),
batch_size_(batch_size),
n_bins_(n_bins),
n_frames_(n_frames) {
ET_CHECK_MSG(
data_.index() == 1 &&
std::get<std::vector<float>>(data_).size() ==
static_cast<size_t>(batch_size * n_bins * n_frames),
"data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
std::get<std::vector<float>>(data_).size(),
batch_size * n_bins * n_frames);
: Audio(make_tensor_ptr({batch_size, n_bins, n_frames}, std::move(data))) {}

explicit Audio(
executorch::extension::TensorPtr tensor) : tensor_(std::move(tensor)) {
ET_CHECK_MSG(tensor_, "Null tensor");
ET_CHECK_MSG(tensor_->dim() == 3, "Invalid tensor rank");
}

// Type checkers
bool is_uint8() const {
return std::holds_alternative<std::vector<uint8_t>>(data_);
return tensor_->scalar_type() == ::executorch::aten::ScalarType::Byte;
}

bool is_float() const {
return std::holds_alternative<std::vector<float>>(data_);
return tensor_->scalar_type() == ::executorch::aten::ScalarType::Float;
}

// Data access
const std::vector<uint8_t>& get_uint8_data() const& {
return std::get<std::vector<uint8_t>>(data_);
}

std::vector<uint8_t>& get_uint8_data() & {
return std::get<std::vector<uint8_t>>(data_);
}

const std::vector<float>& get_float_data() const& {
return std::get<std::vector<float>>(data_);
const uint8_t* uint8_data() const {
ET_DCHECK_MSG(is_uint8(), "Dtype is not uint8");
return tensor_->const_data_ptr<uint8_t>();
}

std::vector<float>& get_float_data() & {
return std::get<std::vector<float>>(data_);
const float* float_data() const {
ET_DCHECK_MSG(is_float(), "Dtype is not float");
return tensor_->const_data_ptr<float>();
}

int32_t get_batch_size() const {
return batch_size_;
return tensor_->size(0);
}
int32_t get_n_bins() const {
return n_bins_;
return tensor_->size(1);
}
int32_t get_n_frames() const {
return n_frames_;
return tensor_->size(2);
}
/**
* Convert the audio data to a TensorPtr, with optional batch dimension.
* The tensor will have shape (batch_size, n_bins, n_frames) or (1,
* batch_size, n_bins, n_frames) if with_batch is true.
*/
executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
executorch::extension::TensorPtr tensor(
bool with_batch = false) const {
std::vector<executorch::aten::SizesType> sizes = {
get_batch_size(), get_n_bins(), get_n_frames()};
if (with_batch) {
sizes.insert(sizes.begin(), 1);
}
if (is_float()) {
return executorch::extension::from_blob(
const_cast<float*>(get_float_data().data()),
sizes,
::executorch::aten::ScalarType::Float);
} else if (is_uint8()) {
return executorch::extension::from_blob(
const_cast<uint8_t*>(get_uint8_data().data()),
sizes,
::executorch::aten::ScalarType::Byte);
return make_tensor_ptr(
*tensor_,
{1,
static_cast<executorch::aten::SizesType>(tensor_->size(0)),
static_cast<executorch::aten::SizesType>(tensor_->size(1)),
static_cast<executorch::aten::SizesType>(tensor_->size(2))});
}
ET_LOG(
Error,
"Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
return ::executorch::runtime::Error::NotSupported;
return tensor_;
}

private:
// Members
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
int32_t batch_size_;
int32_t n_bins_;
int32_t n_frames_;
executorch::extension::TensorPtr tensor_;
};

} // namespace llm
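For context, here is a minimal usage sketch of the reworked Audio API, assuming the include path matches the file location above; the buffer sizes and variable names are illustrative, everything else comes from the diff. The main behavioral change visible here is that the data is validated once at construction, so obtaining a tensor can no longer fail.

// Sketch: build an Audio from a float feature buffer, then read it back as a
// batched tensor. The data is now held in a single owning TensorPtr rather
// than a std::variant of vectors.
#include <executorch/extension/llm/runner/audio.h>

#include <utility>
#include <vector>

using executorch::extension::llm::Audio;

int main() {
  // Hypothetical mel-spectrogram: 1 batch, 80 bins, 3000 frames.
  std::vector<float> features(1 * 80 * 3000, 0.0f);
  Audio audio(
      std::move(features), /*batch_size=*/1, /*n_bins=*/80, /*n_frames=*/3000);

  // Dtype checks and raw access go through the stored tensor.
  const float* data = audio.is_float() ? audio.float_data() : nullptr;
  (void)data;

  // tensor(/*with_batch=*/true) prepends a leading dimension of 1, giving
  // shape {1, 1, 80, 3000}; tensor() returns the stored 3-D tensor as-is.
  auto t = audio.tensor(/*with_batch=*/true);
  return t->dim() == 4 ? 0 : 1;
}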
92 changes: 35 additions & 57 deletions extension/llm/runner/image.h
@@ -10,9 +10,7 @@

#pragma once
#include <executorch/runtime/platform/compiler.h>
#include <cstddef>
#include <cstdint>
#include <variant>
#include <vector>

#include <executorch/extension/tensor/tensor.h>
@@ -22,100 +20,80 @@ namespace executorch {
namespace extension {
namespace llm {

// Assuming NCHW format
class ET_EXPERIMENTAL Image {
public:
// Default constructor
Image() : width_(0), height_(0), channels_(0) {}

// Constructor for uint8_t data
Image(
std::vector<uint8_t>&& data,
int32_t width,
int32_t height,
int32_t channels)
: data_(std::move(data)),
width_(width),
height_(height),
channels_(channels) {}
: Image(make_tensor_ptr(
{channels, height, width},
std::move(data),
executorch::aten::ScalarType::Byte)) {}

// Constructor for float data
Image(
std::vector<float>&& data,
int32_t width,
int32_t height,
int32_t channels)
: data_(std::move(data)),
width_(width),
height_(height),
channels_(channels) {}
: Image(make_tensor_ptr({channels, height, width}, std::move(data))) {}

explicit Image(executorch::extension::TensorPtr tensor) : tensor_(std::move(tensor)) {
ET_CHECK_MSG(tensor_, "Null tensor");
ET_CHECK_MSG(tensor_->dim() == 3, "Invalid tensor rank");
}

// Getters
int32_t width() const {
return width_;
int32_t channels() const {
return tensor_->size(0);
}

int32_t height() const {
return height_;
return tensor_->size(1);
}
int32_t channels() const {
return channels_;

int32_t width() const {
return tensor_->size(2);
}

// Data access
bool is_uint8() const {
return std::holds_alternative<std::vector<uint8_t>>(data_);
return tensor_->scalar_type() == ::executorch::aten::ScalarType::Byte;
}

bool is_float() const {
return std::holds_alternative<std::vector<float>>(data_);
}

const std::vector<uint8_t>& get_uint8_data() const& {
return std::get<std::vector<uint8_t>>(data_);
return tensor_->scalar_type() == ::executorch::aten::ScalarType::Float;
}

std::vector<uint8_t>& get_uint8_data() & {
return std::get<std::vector<uint8_t>>(data_);
const uint8_t* uint8_data() const {
ET_DCHECK_MSG(is_uint8(), "Dtype is not uint8");
return tensor_->const_data_ptr<uint8_t>();
}

const std::vector<float>& get_float_data() const& {
return std::get<std::vector<float>>(data_);
const float* float_data() const {
ET_DCHECK_MSG(is_float(), "Dtype is not float");
return tensor_->const_data_ptr<float>();
}

std::vector<float>& get_float_data() & {
return std::get<std::vector<float>>(data_);
}

executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
executorch::extension::TensorPtr tensor(
bool with_batch = false) const {
// Note: This creates a 3D tensor (CHW). The model might expect a 4D
// tensor (NCHW). The caller should handle reshaping if needed.
std::vector<executorch::aten::SizesType> sizes = {
channels(), height(), width()};
if (with_batch) {
sizes.insert(sizes.begin(), 1);
}
if (is_float()) {
return executorch::extension::from_blob(
const_cast<float*>(get_float_data().data()),
sizes,
::executorch::aten::ScalarType::Float);
} else if (is_uint8()) {
return executorch::extension::from_blob(
const_cast<uint8_t*>(get_uint8_data().data()),
sizes,
::executorch::aten::ScalarType::Byte);
return make_tensor_ptr(
*tensor_,
{1,
executorch::aten::SizesType(tensor_->size(0)),
executorch::aten::SizesType(tensor_->size(1)),
executorch::aten::SizesType(tensor_->size(2))});
}
ET_LOG(
Error, "Image data is not initialized with uint8_t or float vector.");
return ::executorch::runtime::Error::NotSupported;
return tensor_;
}

private:
// Assuming NCHW format
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
int32_t width_;
int32_t height_;
int32_t channels_;
executorch::extension::TensorPtr tensor_;
};

} // namespace llm
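A similar sketch for Image, showing both the vector constructor and the new explicit TensorPtr constructor; the 3x224x224 size and local names are assumptions, the API calls are taken from the diff.

// Sketch: two ways to wrap CHW pixel data in an Image after this change.
#include <executorch/extension/llm/runner/image.h>
#include <executorch/extension/tensor/tensor.h>

#include <utility>
#include <vector>

using executorch::extension::make_tensor_ptr;
using executorch::extension::llm::Image;

int main() {
  // Hypothetical 3x224x224 float image in CHW order.
  std::vector<float> pixels(3 * 224 * 224, 0.5f);

  // Option A: the vector constructor. Arguments are (width, height, channels)
  // even though the underlying tensor is laid out as {channels, height, width}.
  Image a(std::vector<float>(pixels), /*width=*/224, /*height=*/224, /*channels=*/3);

  // Option B: hand over a ready-made TensorPtr; the constructor checks that
  // the tensor is non-null and rank 3.
  Image b(make_tensor_ptr({3, 224, 224}, std::move(pixels)));

  // Getters now read straight from the tensor's sizes.
  bool ok = a.channels() == 3 && b.height() == 224 && b.width() == 224 && a.is_float();
  return ok ? 0 : 1;
}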
7 changes: 2 additions & 5 deletions extension/llm/runner/multimodal_prefiller.cpp
@@ -77,9 +77,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
// The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
// tensor (CHW). Add a batch dimension of 1 if needed.
auto expected_dims = input_meta.sizes();
auto image_tensor = ET_UNWRAP(
image.toTensor(/*with_batch*/ expected_dims.size() == 4),
"Failed to convert image to tensor");
auto image_tensor = image.tensor(/*with_batch*/ expected_dims.size() == 4);
ET_LOG(
Info,
"Image tensor dim: %zu, dtype: %s",
@@ -108,8 +106,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
auto expected_dtype = input_meta.scalar_type();

// Create tensor with original dtype
auto audio_tensor =
ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
auto audio_tensor = audio.tensor();

// Convert to expected dtype if needed
if (audio_tensor->scalar_type() != expected_dtype) {
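For completeness, a sketch of the rank check this hunk relies on; since tensor() can no longer fail, the ET_UNWRAP is gone and the only remaining decision is whether to prepend a batch dimension. The free function and parameter name below are illustrative, not part of the PR.

// Sketch: choose between a CHW and an NCHW input based on the method's
// expected input rank, mirroring image.tensor(expected_dims.size() == 4).
#include <executorch/extension/llm/runner/image.h>

executorch::extension::TensorPtr make_image_input(
    const executorch::extension::llm::Image& image,
    size_t expected_rank) {
  // A 4-D expected input (NCHW) gets a leading batch dimension of 1;
  // a 3-D expected input (CHW) uses the stored tensor directly.
  return image.tensor(/*with_batch=*/expected_rank == 4);
}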