diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
index ce71513ed17..cc7e6b1714a 100644
--- a/extension/llm/runner/audio.h
+++ b/extension/llm/runner/audio.h
@@ -11,7 +11,6 @@
 #pragma once
 #include <executorch/runtime/platform/compiler.h>
 #include <cstdint>
-#include <variant>
 #include <vector>
 
 #include <executorch/extension/tensor/tensor.h>
@@ -41,27 +40,16 @@ struct ET_EXPERIMENTAL RawAudio {
  */
 class ET_EXPERIMENTAL Audio final {
  public:
-  // Default constructor
-  Audio() : batch_size_(0), n_bins_(0), n_frames_(0) {}
-
   // Constructor for uint8_t data
   Audio(
       std::vector<uint8_t>&& data,
       int32_t batch_size,
       int32_t n_bins,
       int32_t n_frames)
-      : data_(std::move(data)),
-        batch_size_(batch_size),
-        n_bins_(n_bins),
-        n_frames_(n_frames) {
-    ET_CHECK_MSG(
-        data_.index() == 0 &&
-            std::get<std::vector<uint8_t>>(data_).size() ==
-                static_cast<size_t>(batch_size * n_bins * n_frames),
-        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
-        std::get<std::vector<uint8_t>>(data_).size(),
-        batch_size * n_bins * n_frames);
-  }
+      : Audio(make_tensor_ptr(
+            {batch_size, n_bins, n_frames},
+            std::move(data),
+            executorch::aten::ScalarType::Byte)) {}
 
   // Constructor for float data
   Audio(
@@ -69,89 +57,64 @@ class ET_EXPERIMENTAL Audio final {
       int32_t batch_size,
       int32_t n_bins,
       int32_t n_frames)
-      : data_(std::move(data)),
-        batch_size_(batch_size),
-        n_bins_(n_bins),
-        n_frames_(n_frames) {
-    ET_CHECK_MSG(
-        data_.index() == 1 &&
-            std::get<std::vector<float>>(data_).size() ==
-                static_cast<size_t>(batch_size * n_bins * n_frames),
-        "data.size() (%zu) does not match batch_size * n_bins * n_frames (%d)",
-        std::get<std::vector<float>>(data_).size(),
-        batch_size * n_bins * n_frames);
+      : Audio(make_tensor_ptr({batch_size, n_bins, n_frames}, std::move(data))) {}
+
+  explicit Audio(
+      executorch::extension::TensorPtr tensor) : tensor_(std::move(tensor)) {
+    ET_CHECK_MSG(tensor_, "Null tensor");
+    ET_CHECK_MSG(tensor_->dim() == 3, "Invalid tensor rank");
   }
 
   // Type checkers
   bool is_uint8() const {
-    return std::holds_alternative<std::vector<uint8_t>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Byte;
   }
 
   bool is_float() const {
-    return std::holds_alternative<std::vector<float>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Float;
   }
 
   // Data access
-  const std::vector<uint8_t>& get_uint8_data() const& {
-    return std::get<std::vector<uint8_t>>(data_);
-  }
-
-  std::vector<uint8_t>& get_uint8_data() & {
-    return std::get<std::vector<uint8_t>>(data_);
-  }
-
-  const std::vector<float>& get_float_data() const& {
-    return std::get<std::vector<float>>(data_);
+  const uint8_t* uint8_data() const {
+    ET_DCHECK_MSG(is_uint8(), "Dtype is not uint8");
+    return tensor_->const_data_ptr<uint8_t>();
   }
 
-  std::vector<float>& get_float_data() & {
-    return std::get<std::vector<float>>(data_);
+  const float* float_data() const {
+    ET_DCHECK_MSG(is_float(), "Dtype is not float");
+    return tensor_->const_data_ptr<float>();
   }
 
   int32_t get_batch_size() const {
-    return batch_size_;
+    return tensor_->size(0);
   }
   int32_t get_n_bins() const {
-    return n_bins_;
+    return tensor_->size(1);
   }
   int32_t get_n_frames() const {
-    return n_frames_;
+    return tensor_->size(2);
   }
   /**
    * Convert the audio data to a TensorPtr, with optional batch dimension.
    * The tensor will have shape (batch_size, n_bins, n_frames) or (1,
    * batch_size, n_bins, n_frames) if with_batch is true.
    */
-  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+  executorch::extension::TensorPtr tensor(
       bool with_batch = false) const {
-    std::vector<executorch::aten::SizesType> sizes = {
-        get_batch_size(), get_n_bins(), get_n_frames()};
     if (with_batch) {
-      sizes.insert(sizes.begin(), 1);
-    }
-    if (is_float()) {
-      return executorch::extension::from_blob(
-          const_cast<float*>(get_float_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Float);
-    } else if (is_uint8()) {
-      return executorch::extension::from_blob(
-          const_cast<uint8_t*>(get_uint8_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Byte);
+      return make_tensor_ptr(
+          *tensor_,
+          {1,
+           static_cast<executorch::aten::SizesType>(tensor_->size(0)),
+           static_cast<executorch::aten::SizesType>(tensor_->size(1)),
+           static_cast<executorch::aten::SizesType>(tensor_->size(2))});
     }
-    ET_LOG(
-        Error,
-        "Shouldn't reach here, audio data is not initialized with uint8_t or float vector.");
-    return ::executorch::runtime::Error::NotSupported;
+    return tensor_;
   }
 
  private:
   // Members
-  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
-  int32_t batch_size_;
-  int32_t n_bins_;
-  int32_t n_frames_;
+  executorch::extension::TensorPtr tensor_;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h
index dbdba273536..9c7746fff2a 100644
--- a/extension/llm/runner/image.h
+++ b/extension/llm/runner/image.h
@@ -10,9 +10,7 @@
 
 #pragma once
 #include <executorch/runtime/platform/compiler.h>
-#include <cstddef>
 #include <cstdint>
-#include <variant>
 #include <vector>
 
 #include <executorch/extension/tensor/tensor.h>
@@ -22,21 +20,19 @@ namespace executorch {
 namespace extension {
 namespace llm {
 
+// Assuming NCHW format
 class ET_EXPERIMENTAL Image {
  public:
-  // Default constructor
-  Image() : width_(0), height_(0), channels_(0) {}
-
   // Constructor for uint8_t data
   Image(
       std::vector<uint8_t>&& data,
       int32_t width,
       int32_t height,
       int32_t channels)
-      : data_(std::move(data)),
-        width_(width),
-        height_(height),
-        channels_(channels) {}
+      : Image(make_tensor_ptr(
+            {channels, height, width},
+            std::move(data),
+            executorch::aten::ScalarType::Byte)) {}
 
   // Constructor for float data
   Image(
@@ -44,78 +40,60 @@ class ET_EXPERIMENTAL Image {
       int32_t width,
       int32_t height,
       int32_t channels)
-      : data_(std::move(data)),
-        width_(width),
-        height_(height),
-        channels_(channels) {}
+      : Image(make_tensor_ptr({channels, height, width}, std::move(data))) {}
+
+  explicit Image(executorch::extension::TensorPtr tensor) : tensor_(std::move(tensor)) {
+    ET_CHECK_MSG(tensor_, "Null tensor");
+    ET_CHECK_MSG(tensor_->dim() == 3, "Invalid tensor rank");
+  }
 
   // Getters
-  int32_t width() const {
-    return width_;
+  int32_t channels() const {
+    return tensor_->size(0);
   }
+
   int32_t height() const {
-    return height_;
+    return tensor_->size(1);
   }
-  int32_t channels() const {
-    return channels_;
+
+  int32_t width() const {
+    return tensor_->size(2);
   }
 
   // Data access
   bool is_uint8() const {
-    return std::holds_alternative<std::vector<uint8_t>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Byte;
   }
 
   bool is_float() const {
-    return std::holds_alternative<std::vector<float>>(data_);
-  }
-
-  const std::vector<uint8_t>& get_uint8_data() const& {
-    return std::get<std::vector<uint8_t>>(data_);
+    return tensor_->scalar_type() == ::executorch::aten::ScalarType::Float;
   }
 
-  std::vector<uint8_t>& get_uint8_data() & {
-    return std::get<std::vector<uint8_t>>(data_);
+  const uint8_t* uint8_data() const {
+    ET_DCHECK_MSG(is_uint8(), "Dtype is not uint8");
+    return tensor_->const_data_ptr<uint8_t>();
   }
 
-  const std::vector<float>& get_float_data() const& {
-    return std::get<std::vector<float>>(data_);
+  const float* float_data() const {
+    ET_DCHECK_MSG(is_float(), "Dtype is not float");
+    return tensor_->const_data_ptr<float>();
   }
 
-  std::vector<float>& get_float_data() & {
-    return std::get<std::vector<float>>(data_);
-  }
-
-  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+  executorch::extension::TensorPtr tensor(
       bool with_batch = false) const {
-    // Note: This creates a 3D tensor (CHW). The model might expect a 4D
-    // tensor (NCHW). The caller should handle reshaping if needed.
-    std::vector<executorch::aten::SizesType> sizes = {
-        channels(), height(), width()};
     if (with_batch) {
-      sizes.insert(sizes.begin(), 1);
-    }
-    if (is_float()) {
-      return executorch::extension::from_blob(
-          const_cast<float*>(get_float_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Float);
-    } else if (is_uint8()) {
-      return executorch::extension::from_blob(
-          const_cast<uint8_t*>(get_uint8_data().data()),
-          sizes,
-          ::executorch::aten::ScalarType::Byte);
+      return make_tensor_ptr(
+          *tensor_,
+          {1,
+           executorch::aten::SizesType(tensor_->size(0)),
+           executorch::aten::SizesType(tensor_->size(1)),
+           executorch::aten::SizesType(tensor_->size(2))});
     }
-    ET_LOG(
-        Error, "Image data is not initialized with uint8_t or float vector.");
-    return ::executorch::runtime::Error::NotSupported;
+    return tensor_;
   }
 
  private:
-  // Assuming NCHW format
-  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
-  int32_t width_;
-  int32_t height_;
-  int32_t channels_;
+  executorch::extension::TensorPtr tensor_;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 7f5a8356979..97d52268fd8 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -77,9 +77,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
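-    // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
-    // tensor (CHW). Add a batch dimension of 1 if needed.
+    // The model might expect a 4D tensor (NCHW), but the Image holds a 3D
+    // tensor (CHW). Add a batch dimension of 1 if needed.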
     auto expected_dims = input_meta.sizes();
-    auto image_tensor = ET_UNWRAP(
-        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
-        "Failed to convert image to tensor");
+    auto image_tensor = image.tensor(/*with_batch*/ expected_dims.size() == 4);
     ET_LOG(
         Info,
         "Image tensor dim: %zu, dtype: %s",
@@ -108,8 +106,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     auto expected_dtype = input_meta.scalar_type();
 
     // Create tensor with original dtype
-    auto audio_tensor =
-        ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
+    auto audio_tensor = audio.tensor();
 
     // Convert to expected dtype if needed
     if (audio_tensor->scalar_type() != expected_dtype) {
diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp
index bcc6aba0f8e..993358d46fb 100644
--- a/extension/llm/runner/pybindings.cpp
+++ b/extension/llm/runner/pybindings.cpp
@@ -42,6 +42,23 @@ using namespace executorch::runtime;
     }                                                             \
   })
 
+// Wraps a CPU torch::Tensor as a TensorPtr. Non-contiguous input is first
+// materialized by contiguous(); the no-op deleter captures that tensor so the
+// buffer handed to from_blob stays referenced. Throws for non-CPU tensors.
+static TensorPtr tensor_to_tensor_ptr(const torch::Tensor& tensor) {
+  if (!tensor.device().is_cpu()) {
+    throw std::runtime_error("Tensor must be on CPU");
+  }
+  auto contiguous_tensor = tensor.contiguous();
+  void* data_ptr = contiguous_tensor.data_ptr();
+  const auto dtype = contiguous_tensor.options().dtype();
+  const auto torch_sizes = contiguous_tensor.sizes();
+  std::vector<SizesType> sizes(torch_sizes.begin(), torch_sizes.end());
+  return executorch::extension::from_blob(
+      data_ptr, std::move(sizes), torch_to_executorch_scalar_type(dtype),
+      [tensor = std::move(contiguous_tensor)](void*) {});
+}
+
 // Python wrapper class for MultimodalRunner
 class PyMultimodalRunner {
  public:
@@ -132,7 +149,7 @@ class PyMultimodalRunner {
     }
   }
 
-  void prefill(std::vector<MultimodalInput> inputs) {
+  void prefill(const std::vector<MultimodalInput>& inputs) {
     if (!runner_) {
       throw std::runtime_error("Runner not initialized");
     }
@@ -274,14 +291,29 @@ PYBIND11_MODULE(_llm_runner, m) {
       .def_property_readonly("width", &Image::width)
       .def_property_readonly("height", &Image::height)
       .def_property_readonly("channels", &Image::channels)
-      .def_property_readonly(
-          "uint8_data",
-          static_cast<const std::vector<uint8_t>& (Image::*)() const&>(
-              &Image::get_uint8_data))
-      .def_property_readonly(
-          "float_data",
-          static_cast<const std::vector<float>& (Image::*)() const&>(
-              &Image::get_float_data))
+      .def(
+          "tensor",
+          [](const Image& image, bool with_batch) {
+            return tensor_to_torch_tensor(*image.tensor(with_batch));
+          },
+          py::arg("with_batch") = false)
+      .def_buffer([](Image& image) -> py::buffer_info {
+        const auto tensor = image.tensor();
+        const auto scalar_type = tensor->scalar_type();
+        const auto element_size = static_cast<py::ssize_t>(elementSize(scalar_type));
+        // format() returns std::string by value, so hold it by value here.
+        const std::string format = scalar_type == aten::ScalarType::Byte
+            ? py::format_descriptor<uint8_t>::format()
+            : py::format_descriptor<float>::format();
+        const auto sizes = tensor->sizes();
+        std::vector<py::ssize_t> shape(sizes.begin(), sizes.end());
+        // buffer_info takes strides in bytes; the tensor reports them in elements.
+        const auto strides = tensor->strides();
+        std::vector<py::ssize_t> byte_strides(strides.begin(), strides.end());
+        for (auto& stride : byte_strides) stride *= element_size;
+        return py::buffer_info(tensor->mutable_data_ptr(), element_size, format,
+            tensor->dim(), std::move(shape), std::move(byte_strides), /*readonly=*/true);
+      })
       .def("__repr__", [](const Image& img) {
         std::string dtype = "unknown";
         if (img.is_uint8()) {
@@ -297,7 +329,6 @@ PYBIND11_MODULE(_llm_runner, m) {
 
   // Bind Audio class
   py::class_<Audio>(m, "Audio")
-      .def(py::init<>())
       .def(
           py::init<std::vector<uint8_t>&&, int32_t, int32_t, int32_t>(),
           py::arg("data"),
@@ -314,18 +345,32 @@ PYBIND11_MODULE(_llm_runner, m) {
           "Create preprocessed audio data (float32)")
       .def("is_uint8", &Audio::is_uint8)
       .def("is_float", &Audio::is_float)
-      .def_property_readonly(
-          "uint8_data",
-          static_cast<const std::vector<uint8_t>& (Audio::*)() const&>(
-              &Audio::get_uint8_data))
-      .def_property_readonly(
-          "float_data",
-          static_cast<const std::vector<float>& (Audio::*)() const&>(
-              &Audio::get_float_data))
       .def_property_readonly("batch_size", &Audio::get_batch_size)
       .def_property_readonly("n_bins", &Audio::get_n_bins)
       .def_property_readonly("n_frames", &Audio::get_n_frames)
-      .def("toTensor", &Audio::toTensor)
+      .def(
+          "tensor",
+          [](const Audio& audio, bool with_batch) {
+            return tensor_to_torch_tensor(*audio.tensor(with_batch));
+          },
+          py::arg("with_batch") = false)
+      .def_buffer([](Audio& audio) -> py::buffer_info {
+        const auto tensor = audio.tensor();
+        const auto scalar_type = tensor->scalar_type();
+        const auto element_size = static_cast<py::ssize_t>(elementSize(scalar_type));
+        // format() returns std::string by value, so hold it by value here.
+        const std::string format = scalar_type == aten::ScalarType::Byte
+            ? py::format_descriptor<uint8_t>::format()
+            : py::format_descriptor<float>::format();
+        const auto sizes = tensor->sizes();
+        std::vector<py::ssize_t> shape(sizes.begin(), sizes.end());
+        // buffer_info takes strides in bytes; the tensor reports them in elements.
+        const auto strides = tensor->strides();
+        std::vector<py::ssize_t> byte_strides(strides.begin(), strides.end());
+        for (auto& stride : byte_strides) stride *= element_size;
+        return py::buffer_info(tensor->mutable_data_ptr(), element_size, format,
+            tensor->dim(), std::move(shape), std::move(byte_strides), /*readonly=*/true);
+      })
       .def("__repr__", [](const Audio& audio) {
         std::string dtype = "unknown";
         if (audio.is_uint8()) {
@@ -369,10 +414,6 @@ PYBIND11_MODULE(_llm_runner, m) {
           py::init<const std::vector<uint64_t>&>(),
           py::arg("tokens"),
           "Create a MultimodalInput with pre-tokenized tokens (List[int])")
-      .def(
-          py::init<const std::vector<uint64_t>&>(),
-          py::arg("tokens"),
-          "Create a MultimodalInput with pre-tokenized tokens (List[int])")
       .def(
           py::init<const Image&>(),
           py::arg("image"),
@@ -473,6 +514,14 @@ PYBIND11_MODULE(_llm_runner, m) {
   m.def(
       "make_image_input",
       [](torch::Tensor image_tensor) -> MultimodalInput {
+        if (!image_tensor.device().is_cpu()) {
+          throw std::runtime_error("Image tensor must be on CPU");
+        }
+        if (image_tensor.scalar_type() != torch::kUInt8 &&
+            image_tensor.scalar_type() != torch::kFloat) {
+          throw std::runtime_error(
+              "Unsupported image tensor dtype. Only uint8 and float32 are supported.");
+        }
         if (image_tensor.dim() == 4) {
           if (image_tensor.size(0) != 1) {
             throw std::runtime_error(
@@ -480,56 +529,18 @@ PYBIND11_MODULE(_llm_runner, m) {
           }
           image_tensor = image_tensor.squeeze(0);
         }
-
         if (image_tensor.dim() != 3) {
           throw std::runtime_error(
-              "Image tensor must be 3-dimensional (H, W, C) or 4-dimensional (1, H, W, C)");
+              "Image tensor must be 3D (H,W,C) or (C,H,W)");
         }
-
-        int64_t height, width, channels;
-        // Check for memory format and permute to CHW if necessary
-        if (image_tensor.is_contiguous(at::MemoryFormat::ChannelsLast)) {
-          // Input is HWC, permute to CHW
-          height = image_tensor.size(0);
-          width = image_tensor.size(1);
-          channels = image_tensor.size(2);
+        if (image_tensor.size(2) == 3 || image_tensor.size(2) == 4) {
           image_tensor = image_tensor.permute({2, 0, 1});
-        } else if (image_tensor.is_contiguous(at::MemoryFormat::Contiguous)) {
-          // Input is CHW
-          channels = image_tensor.size(0);
-          height = image_tensor.size(1);
-          width = image_tensor.size(2);
-        } else {
-          throw std::runtime_error(
-              "Image tensor must be contiguous in either channels last (H, W, C) or contiguous (C, H, W) format.");
         }
-
-        if (channels != 3 && channels != 4) {
+        if (!(image_tensor.size(0) == 3 || image_tensor.size(0) == 4)) {
           throw std::runtime_error(
               "Image must have 3 (RGB) or 4 (RGBA) channels");
         }
-
-        image_tensor = image_tensor.contiguous();
-        if (image_tensor.scalar_type() == torch::kUInt8) {
-          uint8_t* data = image_tensor.data_ptr<uint8_t>();
-          std::vector<uint8_t> image_data(data, data + image_tensor.numel());
-          return MultimodalInput(Image(
-              std::move(image_data),
-              static_cast<int32_t>(width),
-              static_cast<int32_t>(height),
-              static_cast<int32_t>(channels)));
-        } else if (image_tensor.scalar_type() == torch::kFloat) {
-          float* data = image_tensor.data_ptr<float>();
-          std::vector<float> image_data(data, data + image_tensor.numel());
-          return MultimodalInput(Image(
-              std::move(image_data),
-              static_cast<int32_t>(width),
-              static_cast<int32_t>(height),
-              static_cast<int32_t>(channels)));
-        } else {
-          throw std::runtime_error(
-              "Unsupported image tensor dtype. Only uint8 and float32 are supported.");
-        }
+        return MultimodalInput(Image(tensor_to_tensor_ptr(image_tensor)));
       },
       "Create an image input from a torch tensor (H, W, C), (1, H, W, C), (C, H, W), or (1, C, H, W)",
       py::arg("image_tensor"));
@@ -537,36 +548,15 @@ PYBIND11_MODULE(_llm_runner, m) {
   m.def(
       "make_audio_input",
       [](torch::Tensor audio_tensor) -> MultimodalInput {
-        if (audio_tensor.dim() != 3) {
+        if (audio_tensor.scalar_type() != torch::kUInt8 && audio_tensor.scalar_type() != torch::kFloat) {
           throw std::runtime_error(
-              "Audio tensor must be 3-dimensional (batch_size, n_bins, n_frames)");
+              "Unsupported audio tensor dtype. Only uint8 and float32 are supported.");
         }
-
-        int64_t batch_size = audio_tensor.size(0);
-        int64_t n_bins = audio_tensor.size(1);
-        int64_t n_frames = audio_tensor.size(2);
-
-        audio_tensor = audio_tensor.contiguous();
-        if (audio_tensor.scalar_type() == torch::kUInt8) {
-          uint8_t* data = audio_tensor.data_ptr<uint8_t>();
-          std::vector<uint8_t> audio_data(data, data + audio_tensor.numel());
-          return MultimodalInput(Audio(
-              std::move(audio_data),
-              static_cast<int32_t>(batch_size),
-              static_cast<int32_t>(n_bins),
-              static_cast<int32_t>(n_frames)));
-        } else if (audio_tensor.scalar_type() == torch::kFloat) {
-          float* data = audio_tensor.data_ptr<float>();
-          std::vector<float> audio_data(data, data + audio_tensor.numel());
-          return MultimodalInput(Audio(
-              std::move(audio_data),
-              static_cast<int32_t>(batch_size),
-              static_cast<int32_t>(n_bins),
-              static_cast<int32_t>(n_frames)));
-        } else {
+        if (audio_tensor.dim() != 3) {
           throw std::runtime_error(
-              "Unsupported audio tensor dtype. Only uint8 and float32 are supported for preprocessed audio.");
+              "Audio tensor must be 3-dimensional (batch_size, n_bins, n_frames)");
         }
+        return MultimodalInput(Audio(tensor_to_tensor_ptr(audio_tensor)));
       },
       "Create a preprocessed audio input from a torch tensor (batch_size, n_bins, n_frames)",
       py::arg("audio_tensor"));
@@ -644,4 +634,4 @@ PYBIND11_MODULE(_llm_runner, m) {
       .def("__repr__", [](const PyMultimodalRunner& runner) {
         return "<MultimodalRunner>";
       });
-}
\ No newline at end of file
+}
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index e001e8fc154..18f8ccb4fd5 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -42,8 +42,12 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "text_decoder_runner" + aten_suffix,
-            exported_headers = ["text_decoder_runner.h"],
-            srcs = ["text_decoder_runner.cpp"],
+            exported_headers = [
+                "text_decoder_runner.h",
+            ],
+            srcs = [
+                "text_decoder_runner.cpp",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -59,8 +63,12 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "text_prefiller" + aten_suffix,
-            exported_headers = ["text_prefiller.h"],
-            srcs = ["text_prefiller.cpp"],
+            exported_headers = [
+                "text_prefiller.h",
+            ],
+            srcs = [
+                "text_prefiller.cpp",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -74,7 +82,9 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "text_token_generator" + aten_suffix,
-            exported_headers = ["text_token_generator.h"],
+            exported_headers = [
+                "text_token_generator.h",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -88,7 +98,10 @@ def define_common_targets():
 
         runtime.cxx_library(
             name = "image_prefiller" + aten_suffix,
-            exported_headers = ["image_prefiller.h", "image.h"],
+            exported_headers = [
+                "image.h",
+                "image_prefiller.h",
+            ],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -104,20 +117,18 @@ def define_common_targets():
             name = "multimodal_runner_lib" + aten_suffix,
             exported_headers = [
                 "audio.h",
-                "image.h",
-                "wav_loader.h",
                 "multimodal_input.h",
                 "multimodal_runner.h",
                 "multimodal_prefiller.h",
                 "multimodal_decoder_runner.h",
+                "wav_loader.h",
             ],
             srcs = [
                 "multimodal_prefiller.cpp",
             ],
             exported_deps = [
-                ":text_decoder_runner" + aten_suffix,
-                ":text_prefiller" + aten_suffix,
                 ":image_prefiller" + aten_suffix,
+                ":text_prefiller" + aten_suffix,
                 ":text_token_generator" + aten_suffix,
             ],
         )
diff --git a/extension/llm/runner/test/test_multimodal_input.cpp b/extension/llm/runner/test/test_multimodal_input.cpp
index 85d45d69173..04714349716 100644
--- a/extension/llm/runner/test/test_multimodal_input.cpp
+++ b/extension/llm/runner/test/test_multimodal_input.cpp
@@ -71,7 +71,7 @@ TEST_F(MultimodalInputTest, ImageConstructorFromImage) {
   EXPECT_EQ(input.get_image().width(), 224);
   EXPECT_EQ(input.get_image().height(), 224);
   EXPECT_EQ(input.get_image().channels(), 3);
-  EXPECT_EQ(input.get_image().get_uint8_data().size(), 224 * 224 * 3);
+  EXPECT_EQ(input.get_image().tensor()->numel(), 224 * 224 * 3);
 }
 
 TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) {
@@ -79,7 +79,7 @@ TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) {
   int width = img.width();
   int height = img.height();
   int channels = img.channels();
-  size_t data_size = img.get_uint8_data().size();
+  size_t data_size = img.tensor()->numel();
 
   MultimodalInput input(std::move(img));
 
@@ -89,7 +89,7 @@ TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) {
   EXPECT_EQ(input.get_image().width(), width);
   EXPECT_EQ(input.get_image().height(), height);
   EXPECT_EQ(input.get_image().channels(), channels);
-  EXPECT_EQ(input.get_image().get_uint8_data().size(), data_size);
+  EXPECT_EQ(input.get_image().tensor()->numel(), data_size);
 }
 
 // Test copy constructor and assignment
@@ -356,7 +356,7 @@ TEST_F(MultimodalInputTest, DifferentImageSizes) {
   EXPECT_EQ(input.get_image().width(), 32);
   EXPECT_EQ(input.get_image().height(), 32);
   EXPECT_EQ(input.get_image().channels(), 1);
-  EXPECT_EQ(input.get_image().get_uint8_data().size(), 32 * 32);
+  EXPECT_EQ(input.get_image().tensor()->numel(), 32 * 32);
 }
 
 // Test with empty text
diff --git a/extension/llm/runner/test/test_runner_pybindings.py b/extension/llm/runner/test/test_runner_pybindings.py
index f30226bf3e2..c242d5374cc 100644
--- a/extension/llm/runner/test/test_runner_pybindings.py
+++ b/extension/llm/runner/test/test_runner_pybindings.py
@@ -122,7 +122,7 @@ def test_creation(self):
         image = Image([1, 2, 3, 4], 2, 2, 1)
 
         # Properties are read-only
-        self.assertEqual(image.uint8_data, [1, 2, 3, 4])
+        self.assertEqual(memoryview(image).tobytes(), bytes([1, 2, 3, 4]))
         self.assertEqual(image.width, 2)
         self.assertEqual(image.height, 2)
         self.assertEqual(image.channels, 1)