From 05ff26eed50820f7404a6710ac9d048aded84207 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 20 Oct 2025 14:27:10 -0700
Subject: [PATCH] gemma3 e2e runner on cuda

This diff introduces an e2e runner for the gemma3 model, delegated to CUDA
through the AOTI library and guarded by CI. It also contains the other
infrastructure updates necessary for building and running the
`gemma3 e2e runner` on CUDA devices.

Differential Revision: [D85087532](https://our.internmc.facebook.com/intern/diff/D85087532/)

[ghstack-poisoned]
---
 .github/workflows/cuda.yml                    | 104 +++++++-
 examples/models/gemma3/CMakeLists.txt         | 122 +++++++++
 examples/models/gemma3/e2e_runner.cpp         | 242 ++++++++++++++++++
 extension/llm/runner/multimodal_prefiller.cpp |  13 +
 extension/tensor/tensor_ptr.cpp               |  12 +-
 5 files changed, 489 insertions(+), 4 deletions(-)
 create mode 100644 examples/models/gemma3/CMakeLists.txt
 create mode 100644 examples/models/gemma3/e2e_runner.cpp

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 67f2a681d81..a97a8edd17a 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -151,7 +151,7 @@ jobs:
       echo "::endgroup::"

   export-gemma3-cuda-artifact:
-    name: export-gemma3-cuda-artifact
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -184,7 +197,8 @@ jobs:
         pip list
         echo "::endgroup::"

-        echo "::group::Export Gemma3"
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "google/gemma-3-4b-it" \
           --task "multimodal-text-to-text" \
@@ -198,7 +212,7 @@ jobs:
         test -f aoti_cuda_blob.ptd
         echo "::endgroup::"

-        echo "::group::Store Gemma3 Artifacts"
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}/"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
@@ -385,3 +399,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
diff --git a/examples/models/gemma3/CMakeLists.txt b/examples/models/gemma3/CMakeLists.txt
new file mode 100644
index 00000000000..920db18abda
--- /dev/null
+++ b/examples/models/gemma3/CMakeLists.txt
@@ -0,0 +1,122 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Simple CMake build system for gemma3 e2e runner.
+#
+cmake_minimum_required(VERSION 3.24)
+project(gemma3)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
+  set(CMAKE_TOOLCHAIN_IOS ON)
+else()
+  set(CMAKE_TOOLCHAIN_IOS OFF)
+endif()
+
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# Need this for gflags for some reason
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# Find `executorch` libraries, same as for gflags
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
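+# Note: appending the top-level ExecuTorch build/install tree to
+# CMAKE_FIND_ROOT_PATH lets the find_package() call below locate the installed
+# `executorch` CMake config even when a cross-compilation toolchain restricts
+# the search roots; FIND_ROOT_PATH_BOTH additionally allows the host paths to
+# be searched.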
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(link_libraries executorch gflags)
+set(_srcs e2e_runner.cpp)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+  )
+endif()
+
+# Needed for cpuinfo where it uses android specific log lib
+if(ANDROID)
+  list(APPEND link_libraries log)
+endif()
+
+# stb_image: a lightweight library to load images
+include(FetchContent)
+FetchContent_Declare(
+  stb
+  GIT_REPOSITORY https://github.com/nothings/stb.git
+  GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5
+)
+FetchContent_MakeAvailable(stb)
+# Add deprecated/ to use stb_image_resize.h for internal compatibility
+list(APPEND _common_include_directories ${stb_SOURCE_DIR}
+     ${stb_SOURCE_DIR}/deprecated
+)
+
+# Add the required ExecuTorch extensions for multimodal LLM runner
+list(
+  APPEND
+  link_libraries
+  extension_llm_runner
+  extension_module
+  extension_data_loader
+  extension_tensor
+  extension_flat_tensor
+)
+
+# Link CUDA backend
+if(EXECUTORCH_BUILD_CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  list(APPEND link_libraries aoti_cuda)
+  executorch_target_link_options_shared_lib(aoti_cuda)
+endif()
+
+# Add tokenizers
+list(APPEND link_libraries tokenizers::tokenizers)
+
+add_executable(gemma3_e2e_runner ${_srcs})
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(gemma3_e2e_runner)
+  if(NOT APPLE)
+    target_link_options(gemma3_e2e_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(gemma3_e2e_runner PUBLIC ${_common_include_directories})
+target_link_libraries(gemma3_e2e_runner PUBLIC ${link_libraries})
+target_compile_options(gemma3_e2e_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/gemma3/e2e_runner.cpp b/examples/models/gemma3/e2e_runner.cpp
new file mode 100644
index 00000000000..68f19e8296d
--- /dev/null
+++ b/examples/models/gemma3/e2e_runner.cpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define STB_IMAGE_IMPLEMENTATION +#include +#define STB_IMAGE_RESIZE_IMPLEMENTATION +#include + +#if defined(ET_USE_THREADPOOL) +#include +#include +#endif + +DEFINE_string( + model_path, + "multimodal.pte", + "Model serialized in flatbuffer format."); + +DEFINE_string(data_path, "", "Path to data file."); +DEFINE_string(tokenizer_path, "tokenizer.json", "Tokenizer stuff."); + +DEFINE_string(prompt, "What is in this image?", "Text prompt."); + +DEFINE_string(image_path, "", "Path to input image file."); + +DEFINE_double( + temperature, + 0.0f, + "Temperature; Default is 0. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); + +DEFINE_int32( + cpu_threads, + -1, + "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); + +DEFINE_bool(warmup, false, "Whether to run a warmup run."); + +namespace { + +using ::executorch::extension::from_blob; +using ::executorch::extension::Module; +using ::executorch::extension::llm::Image; +using ::executorch::extension::llm::make_image_input; +using ::executorch::extension::llm::make_text_input; +using ::executorch::extension::llm::MultimodalInput; +using ::executorch::runtime::EValue; + +bool ends_with(const std::string& str, const std::string& suffix) { + return str.size() >= suffix.size() && + str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0; +} + +/** + * @brief Loads an image from a file and resizes it to 896x896 + * + * This function loads an image using stb_image and resizes it to the expected + * input size for Gemma3 (896x896). The image is converted to CHW (Channel, + * Height, Width) format which is expected by the model. + * + * @param image_path Path to the image file (.jpg, .png, etc.) 
+ * @return MultimodalInput containing the loaded and processed image data + * @throws std::runtime_error if image loading fails + */ +MultimodalInput loadImage(const std::string& image_path) { + if (!ends_with(image_path, ".jpg") && !ends_with(image_path, ".jpeg") && + !ends_with(image_path, ".png") && !ends_with(image_path, ".bmp")) { + ET_LOG( + Error, + "Unsupported image file format: %s (only .jpg, .jpeg, .png, .bmp are supported)", + image_path.c_str()); + throw std::runtime_error("Unsupported image file format"); + } + + int width, height, channels; + unsigned char* data = + stbi_load(image_path.c_str(), &width, &height, &channels, 0); + if (!data) { + ET_LOG(Error, "Failed to load image: %s", image_path.c_str()); + throw std::runtime_error("Failed to load image"); + } + + ET_LOG( + Info, + "Loaded image: %s, original size: %dx%d, channels: %d", + image_path.c_str(), + width, + height, + channels); + + // Resize to 896x896 (Gemma3 vision encoder input size) + const int target_size = 896; + std::vector resized_data(target_size * target_size * channels); + + int resize_result = stbir_resize_uint8( + data, + width, + height, + 0, + resized_data.data(), + target_size, + target_size, + 0, + channels); + + if (!resize_result) { + stbi_image_free(data); + ET_LOG(Error, "Failed to resize image"); + throw std::runtime_error("Failed to resize image"); + } + + // Convert from HWC (Height, Width, Channel) to CHW (Channel, Height, Width) + // and normalize uint8 [0, 255] to float32 [0.0, 1.0] + std::vector chw_data(channels * target_size * target_size); + for (int h = 0; h < target_size; ++h) { + for (int w = 0; w < target_size; ++w) { + for (int c = 0; c < channels; ++c) { + uint8_t pixel_value = + resized_data[h * target_size * channels + w * channels + c]; + chw_data[c * target_size * target_size + h * target_size + w] = + static_cast(pixel_value) / 255.0f; + } + } + } + + ET_LOG( + Info, + "Resized and converted image to CHW format (float32): %dx%d, channels: %d", + target_size, + target_size, + channels); + + Image image(std::move(chw_data), target_size, target_size, channels); + stbi_image_free(data); + + return make_image_input(std::move(image)); +} + +} // namespace + +int32_t main(int32_t argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + const char* model_path = FLAGS_model_path.c_str(); + const char* tokenizer_path = FLAGS_tokenizer_path.c_str(); + const char* prompt = FLAGS_prompt.c_str(); + const char* image_path = FLAGS_image_path.c_str(); + const char* data_path = FLAGS_data_path.c_str(); + float temperature = FLAGS_temperature; + int32_t cpu_threads = FLAGS_cpu_threads; + bool warmup = FLAGS_warmup; + +#if defined(ET_USE_THREADPOOL) + uint32_t num_performant_cores = cpu_threads == -1 + ? 
::executorch::extension::cpuinfo::get_num_performant_cores() + : static_cast(cpu_threads); + ET_LOG( + Info, "Resetting threadpool with num threads = %d", num_performant_cores); + if (num_performant_cores > 0) { + ::executorch::extension::threadpool::get_threadpool() + ->_unsafe_reset_threadpool(num_performant_cores); + } +#endif + + std::unique_ptr<::tokenizers::Tokenizer> tokenizer = + ::executorch::extension::llm::load_tokenizer(tokenizer_path); + if (tokenizer == nullptr) { + ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path); + return 1; + } + + // Create multimodal runner + std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner = + ::executorch::extension::llm::create_multimodal_runner( + model_path, std::move(tokenizer), data_path); + + if (runner == nullptr) { + ET_LOG(Error, "Failed to create multimodal runner"); + return 1; + } + + // Load runner + auto load_error = runner->load(); + if (load_error != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to load multimodal runner"); + return 1; + } + + // Prepare inputs + std::vector inputs = { + make_text_input("user\n"), + loadImage(image_path), + make_text_input( + std::string(prompt) + "\nmodel\n"), + }; + + ::executorch::extension::llm::GenerationConfig config; + config.max_new_tokens = 100; + config.temperature = temperature; + + // Run warmup if requested + if (warmup) { + ET_LOG(Info, "Running warmup..."); + auto warmup_error = runner->generate(inputs, config); + if (warmup_error != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to run warmup"); + return 1; + } + runner->reset(); + } + + auto error = runner->generate(inputs, config); + + if (error != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to generate with multimodal runner\n"); + return 1; + } + ET_LOG(Info, "Generated successfully"); + + return 0; +} diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp index 8578187128f..817fb3b29c4 100644 --- a/extension/llm/runner/multimodal_prefiller.cpp +++ b/extension/llm/runner/multimodal_prefiller.cpp @@ -69,6 +69,11 @@ Result MultimodalPrefiller::prefill( image.is_uint8(), InvalidArgument, "Model expects uint8_t image data, but image has float data."); + } else if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) { + ET_CHECK_OR_RETURN_ERROR( + image.is_float(), + InvalidArgument, + "Model expects BFloat16 data, we need to take image in float32 type and convert afterwards. But now image has uint8_t data."); } else { ET_CHECK_OR_RETURN_ERROR( false, @@ -85,6 +90,14 @@ Result MultimodalPrefiller::prefill( ET_CHECK_OK_OR_RETURN_ERROR( image_tensor_result.error(), "Failed to convert image to tensor"); auto image_tensor = image_tensor_result.get(); + + if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) { + // Convert to bfloat16 for model input + auto image_tensor_return = convert_to_bfloat16(image_tensor); + ET_CHECK_OK_OR_RETURN_ERROR(image_tensor_return.error(), "Failed to convert image tensor to bfloat16"); + image_tensor = image_tensor_return.get(); + } + ET_LOG( Info, "Image tensor dim: %zu, dtype: %s", diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index ae1823fe0db..9705009307a 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -83,7 +83,17 @@ TensorPtr make_tensor_ptr( // Skip stride calculation and incontiguous tensor check for CUDA backend since // AOTI-CUDA handles both contiguous and incontiguous tensors. 
This will be removed after SlimTensor migration.
-#ifndef USE_CUDA_BACKEND
+#ifdef USE_CUDA_BACKEND
+  if (strides.empty()) {
+    std::vector<executorch::aten::StridesType> computed_strides(dim);
+
+    auto error = runtime::dim_order_to_stride(
+        sizes.data(), dim_order.data(), dim, computed_strides.data());
+    ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
+
+    strides = std::move(computed_strides);
+  }
+#else
   std::vector<executorch::aten::StridesType> computed_strides(dim);

   auto error = runtime::dim_order_to_stride(
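For reference, the strides that `runtime::dim_order_to_stride` fills in above are the ordinary contiguous strides implied by a dimension order. A minimal standalone sketch of that computation follows (illustrative only; the helper name `contiguous_strides_for_dim_order` and the plain `int64_t` types are not part of the ExecuTorch API):

#include <cstdint>
#include <vector>

// Walk dim_order from the innermost to the outermost dimension; each
// dimension's stride is the product of the sizes of all dimensions that
// come after it in the order.
std::vector<int64_t> contiguous_strides_for_dim_order(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& dim_order) {
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (auto it = dim_order.rbegin(); it != dim_order.rend(); ++it) {
    strides[*it] = running;
    running *= sizes[*it];
  }
  return strides;
}

// Example: sizes {2, 3, 4} with the default dim order {0, 1, 2} yields
// strides {12, 4, 1}; a channels-last-style order {0, 2, 1} yields
// strides {12, 1, 3}.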