From 2d3546ff8c9b13a15b1194ddab4d91bba18784c0 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 21 Oct 2025 10:23:37 -0700
Subject: [PATCH 1/2] make aoti_torch_empty_strided support creating
 incontiguous tensors

Pull Request resolved: https://github.com/pytorch/executorch/pull/15228

This diff modifies the `aoti_torch_empty_strided` function to support the creation of incontiguous tensors. To achieve this, the diff:
1. updates the memory-size calculation to account for both the tensor sizes and the strides, and
2. bypasses the ETensor stride/contiguity check for the CUDA backend by creating tensors through a new CUDA-specific tensor maker that accepts arbitrary strides.

We will bring the ETensor check back for every backend after migrating to SlimTensor. (Illustrative sketches of both changes follow the CMakeLists.txt portion of this diff.)

ghstack-source-id: 317688814
@exported-using-ghexport

Differential Revision: [D84938258](https://our.internmc.facebook.com/intern/diff/D84938258/)
---
 backends/cuda/CMakeLists.txt                  |  38 ++++-
 backends/cuda/runtime/TARGETS                 |  21 ++-
 backends/cuda/runtime/shims/memory.cpp        |  50 +++++--
 .../cuda/runtime/shims/tensor_attribute.h     |   2 +-
 .../test_aoti_torch__reinterpret_tensor.cpp   |   1 +
 .../tests/test_aoti_torch_empty_strided.cpp   | 135 ++++++++++++++----
 backends/cuda/runtime/tensor/tensor_maker.cpp | 126 ++++++++++++++++
 backends/cuda/runtime/tensor/tensor_maker.h   |  56 ++++++++
 8 files changed, 385 insertions(+), 44 deletions(-)
 create mode 100644 backends/cuda/runtime/tensor/tensor_maker.cpp
 create mode 100644 backends/cuda/runtime/tensor/tensor_maker.h

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 8b94351d469..1db8792e0c0 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -34,6 +34,39 @@ find_package(CUDAToolkit REQUIRED)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
+# CUDA tensor maker for backends that support incontiguous tensors
+set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
+add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
+target_include_directories(
+  cuda_tensor_maker
+  PUBLIC $ $
+         $
+)
+target_compile_options(
+  cuda_tensor_maker
+  PUBLIC $<$:/EHsc /GR>
+         $<$>:-fexceptions -frtti -fPIC>
+)
+# Ensure symbols are exported properly
+if(APPLE)
+  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(
+    cuda_tensor_maker PUBLIC
+    $<$>:-Wl,--export-dynamic>
+  )
+endif()
+
+# Link against ExecuTorch core libraries
+target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
+executorch_target_link_options_shared_lib(cuda_tensor_maker)
+
+install(
+  TARGETS cuda_tensor_maker
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
+
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp
@@ -62,9 +95,10 @@ target_link_options(
   aoti_cuda PUBLIC $<$>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch
+# CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
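Below is a minimal, self-contained sketch of the storage-size rule this change adopts (it mirrors PyTorch's `computeStorageNbytes`). `storage_numel` is an illustrative name, not code from this patch; the loop body matches the one added to `aoti_torch_empty_strided` in `memory.cpp` below.

```cpp
// Sketch only: smallest element count a buffer needs so that every index
// (i_0, ..., i_{n-1}) with i_k < sizes[k] stays in bounds when addressed as
// offset = sum_k strides[k] * i_k.
#include <cstdint>
#include <vector>

int64_t storage_numel(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides) {
  int64_t n = 1; // storage offset (0) + 1
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 0) {
      return 0; // any zero-sized dimension means an empty storage
    }
    n += strides[i] * (sizes[i] - 1); // max reachable offset in dimension i
  }
  return n;
}

// Examples matching the new unit tests:
//   sizes {3, 4}, strides {1, 3}     -> 1 + 1*2 + 3*3     = 12 elements
//   sizes {2, 3, 4}, strides {0, 4, 1} -> 1 + 0 + 4*2 + 1*3 = 12 elements,
//   even though numel() is 24 (the stride-0 dimension shares storage).
```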
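A companion sketch of how the new CUDA tensor maker can wrap an existing allocation with arbitrary strides, in the spirit of the updated `aoti_torch_empty_strided`. `wrap_column_major_3x4` and its `device_ptr` argument are illustrative; only `make_tensor` and its argument order come from this patch, and the include path simply follows the repo layout.

```cpp
// Sketch only: wrap an existing buffer in an ETensor with incontiguous
// (column-major) strides via the new CUDA tensor maker. device_ptr is assumed
// to point at at least 12 * sizeof(float) bytes (see storage_numel above).
#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>

using executorch::backends::cuda::make_tensor;
using executorch::backends::cuda::TensorPtr;

TensorPtr wrap_column_major_3x4(void* device_ptr) {
  return make_tensor(
      {3, 4},     // sizes
      device_ptr, // existing memory; not copied
      {},         // dim_order: empty, derived from the strides
      {1, 3},     // column-major (incontiguous) strides
      executorch::aten::ScalarType::Float);
}
```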
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index d18c0118542..a85f3a7e6a3 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -27,6 +27,25 @@ runtime.cxx_library( ], ) +runtime.cxx_library( + name = "tensor_maker", + srcs = [ + "tensor/tensor_maker.cpp", + ], + headers = [ + "tensor/tensor_maker.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], +) + runtime.cxx_library( name = "runtime_shims", srcs = [ @@ -52,8 +71,8 @@ runtime.cxx_library( compiler_flags = ["-Wno-global-constructors"], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ + ":tensor_maker", "//executorch/backends/aoti:common_shims", - "//executorch/extension/tensor:tensor", "//executorch/runtime/core:core", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/platform:platform", diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp index 5d30d3124d9..46b8d448a3a 100644 --- a/backends/cuda/runtime/shims/memory.cpp +++ b/backends/cuda/runtime/shims/memory.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -163,9 +164,11 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( // Create ExecutorTorch tensor that wraps the existing memory // Note: We're NOT copying the data, just wrapping it - auto tensor = executorch::extension::from_blob( - data, // existing memory (don't copy!) + // Using CUDA-specific tensor maker that supports incontiguous tensors + auto tensor = make_tensor( sizes, // tensor dimensions + data, // existing memory (don't copy!) + {}, // dim_order (empty, will be auto-generated) strides, // tensor strides (allows different strides) dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType ); @@ -210,10 +213,6 @@ AOTITorchError aoti_torch_empty_strided( // This requires us to reserve CUDA memory and put it into a ETensor void* ptr; - int64_t numel = 1; - for (int64_t i = 0; i < ndim; i++) { - numel *= sizes_ptr[i]; - } ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); @@ -223,7 +222,28 @@ AOTITorchError aoti_torch_empty_strided( InvalidArgument, "Invalid element size for dtype: %d", dtype); - int64_t nbytes = numel * element_size; + + // Calculate storage size based on strides, matching PyTorch's behavior + // This is critical when sizes and strides don't match the expected contiguous + // layout Reference: PyTorch's computeStorageNbytes in EmptyTensor.cpp + int64_t storage_size = 1; // storage offset (0) + 1 + for (int64_t i = 0; i < ndim; i++) { + if (sizes_ptr[i] == 0) { + storage_size = 0; + break; + } + // For each dimension, add stride[i] * (size[i] - 1) + // This gives us the maximum offset in that dimension + int64_t stride_i = (strides_ptr != nullptr) ? 
strides_ptr[i] : 1; + if (strides_ptr == nullptr) { + // Calculate contiguous stride if not provided + for (int64_t j = i + 1; j < ndim; j++) { + stride_i *= sizes_ptr[j]; + } + } + storage_size += stride_i * (sizes_ptr[i] - 1); + } + int64_t nbytes = storage_size * element_size; if (device_type == static_cast(SupportedDevices::CUDA)) { ET_CUDA_CHECK_OR_RETURN_ERROR( @@ -250,8 +270,13 @@ AOTITorchError aoti_torch_empty_strided( auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); // ETensor creation with dynamic shape support for edge cases - auto tensor = executorch::extension::from_blob( - ptr, sizes, strides, dtype_to_scalar_type(dtype)); + // Using CUDA-specific tensor maker that supports incontiguous tensors + auto tensor = make_tensor( + sizes, + ptr, + {}, // dim_order (empty, will be auto-generated) + strides, + dtype_to_scalar_type(dtype)); // Store the tensor so it doesn't get destroyed tensors.insert(tensor); @@ -259,7 +284,6 @@ AOTITorchError aoti_torch_empty_strided( // This tensor owns the memory it allocated, set reference count to 1 memory_to_n_tensor[ptr] = 1; - return Error::Ok; } @@ -630,9 +654,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( // Create new tensor view that reinterprets the same memory with different // shape/strides This creates a view, not a copy - the data pointer is shared - std::shared_ptr tensor = executorch::extension::from_blob( - data_ptr, // Reuse the same memory from source tensor + // Using CUDA-specific tensor maker that supports incontiguous tensors + std::shared_ptr tensor = make_tensor( sizes, // New sizes with explicit SizesType + data_ptr, // Reuse the same memory from source tensor + {}, // dim_order (empty, will be auto-generated) strides, // New strides with explicit StridesType dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting ); diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 15a4e397d24..6b61b5bd3b8 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,8 +8,8 @@ #pragma once -#include #include +#include #include namespace executorch::backends::cuda { diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp index 1cefca99c2a..d3044810b15 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp index da65129f18a..799a8d1221b 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp @@ -278,30 +278,6 @@ TEST_F(AOTITorchEmptyStridedTest, LargeTensor) { EXPECT_EQ(tensor->size(2), 50); } -// Test error handling with memory allocation failures -TEST_F(AOTITorchEmptyStridedTest, MemoryAllocationStress) { - // Try to create a very large tensor that might cause allocation failure - // (This test may pass or fail depending on available memory) - std::vector huge_sizes = {10000, 10000, 100}; // ~38GB for float32 - Tensor* tensor; - - AOTITorchError error = aoti_torch_empty_strided( - huge_sizes.size(), - huge_sizes.data(), - nullptr, 
- 6, // float32 - 1, // CUDA device - 0, // device index - &tensor); - - // Either succeed or fail with memory allocation error - if (error == Error::Ok) { - EXPECT_NE(tensor, nullptr); - } else { - EXPECT_EQ(error, Error::MemoryAllocationFailed); - } -} - // Test aoti_torch_empty_strided with bfloat16 dtype TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) { // Test creating bfloat16 tensor on CUDA @@ -509,11 +485,11 @@ TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) { EXPECT_EQ(sizes_ptr[2], 3); } -// Test different data types (only float32 is currently supported) +// Test different data types (currently we support bf16, fp32 and int32) TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { std::vector sizes = {2, 3}; - // Test float32 (dtype 6) - currently the only supported type + // Test float32 (dtype 6) - one of the supported types Tensor* tensor_float32; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), @@ -527,7 +503,7 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { EXPECT_EQ(error, Error::Ok); EXPECT_NE(tensor_float32, nullptr); - // Test unsupported data types should return error + // Test int32 (dtype 3) - one of the supported types Tensor* tensor_int32; error = aoti_torch_empty_strided( sizes.size(), @@ -538,7 +514,8 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { 0, // device index &tensor_int32); - EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_int32, nullptr); // Test another unsupported data type Tensor* tensor_float64; @@ -586,3 +563,105 @@ TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) { EXPECT_EQ(tensor_5d->size(3), 4); EXPECT_EQ(tensor_5d->size(4), 5); } + +// Test incontiguous tensor creation - transpose-like layout +TEST_F(AOTITorchEmptyStridedTest, IncontiguousTransposeLayout) { + // Create a tensor with transpose-like strides (column-major) + // For a 3x4 tensor in column-major order, strides should be [1, 3] + // This means each row step is 1, and each column step is 3 + std::vector sizes = {3, 4}; + std::vector strides = {1, 3}; // Column-major (incontiguous) + + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Verify the strides are what we specified + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + EXPECT_EQ(strides_ptr[0], 1); // Column-major stride for dimension 0 + EXPECT_EQ(strides_ptr[1], 3); // Column-major stride for dimension 1 + + // Verify that memory was allocated correctly for incontiguous layout + // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - + // 1) + 1 = 1 * (3 - 1) + 3 * (4 - 1) + 1 = 1 * 2 + 3 * 3 + 1 = 2 + 9 + 1 = 12 + // elements Total bytes = 12 * 4 = 48 bytes (for float32) + EXPECT_EQ(tensor->numel(), 12); // numel is still 3*4=12 for logical shape + + // The tensor should be accessible and writable + void* data_ptr = tensor->mutable_data_ptr(); + EXPECT_NE(data_ptr, nullptr); + + // Verify we can use CUDA to write to the memory + std::vector test_data(12, 1.0f); + cudaError_t cuda_err = cudaMemcpy( + data_ptr, test_data.data(), 12 * sizeof(float), 
cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); +} + +// Test incontiguous tensor creation - expanded/broadcasted stride pattern +TEST_F(AOTITorchEmptyStridedTest, IncontiguousExpandedStrides) { + // Create a tensor with expanded strides (simulating broadcasting) + // A 2x3x4 tensor where the first dimension has stride 0 (expanded) + // This creates a tensor where the first dimension is "broadcasted" + std::vector sizes = {2, 3, 4}; + std::vector strides = {0, 4, 1}; // First dimension has stride 0 + + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->size(2), 4); + + // Verify the strides are what we specified + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + EXPECT_EQ(strides_ptr[0], 0); // Expanded dimension stride + EXPECT_EQ(strides_ptr[1], 4); + EXPECT_EQ(strides_ptr[2], 1); + + // Verify that memory was allocated correctly for this incontiguous layout + // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - + // 1) + stride[2] * (size[2] - 1) + 1 = 0 * (2 - 1) + 4 * (3 - 1) + 1 * (4 - + // 1) + 1 = 0 + 8 + 3 + 1 = 12 elements Note: numel() returns logical number + // of elements (2*3*4=24), not storage size + EXPECT_EQ(tensor->numel(), 24); // Logical numel is 2*3*4=24 + + // The tensor should be accessible and writable + void* data_ptr = tensor->mutable_data_ptr(); + EXPECT_NE(data_ptr, nullptr); + + // Verify we can use CUDA to write to the allocated memory + // We only need to allocate 12 elements (storage size), not 24 + std::vector test_data(12, 2.0f); + cudaError_t cuda_err = cudaMemcpy( + data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); +} diff --git a/backends/cuda/runtime/tensor/tensor_maker.cpp b/backends/cuda/runtime/tensor/tensor_maker.cpp new file mode 100644 index 00000000000..01252082bfc --- /dev/null +++ b/backends/cuda/runtime/tensor/tensor_maker.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace executorch::backends::cuda { + +namespace { +#ifndef USE_ATEN_LIB +/** + * A structure that consolidates the metadata (sizes, dim_order, strides) and + * the data buffer associated with a Tensor. Since Tensor does not own + * the memory for these metadata arrays or the data itself, this structure + * ensures that they are managed together and have the same lifetime as the + * Tensor. When the Tensor is destroyed, the Storage structure ensures + * proper cleanup of the associated metadata and data if needed. 
+ */ +struct Storage final { + executorch::aten::TensorImpl tensor_impl; + executorch::aten::Tensor tensor; + std::vector sizes; + std::vector dim_order; + std::vector strides; + std::function deleter; + + Storage( + executorch::aten::TensorImpl&& tensor_impl, + std::vector&& sizes, + std::vector&& dim_order, + std::vector&& strides, + std::function&& deleter) + : tensor_impl(std::move(tensor_impl)), + tensor(&this->tensor_impl), + sizes(std::move(sizes)), + dim_order(std::move(dim_order)), + strides(std::move(strides)), + deleter(std::move(deleter)) {} + + ~Storage() { + if (deleter) { + deleter(tensor_impl.mutable_data()); + } + } +}; +#endif // USE_ATEN_LIB +} // namespace + +TensorPtr make_tensor( + std::vector sizes, + void* data, + std::vector dim_order, + std::vector strides, + executorch::aten::ScalarType type, + executorch::aten::TensorShapeDynamism dynamism, + std::function deleter) { + const auto dim = sizes.size(); + ET_CHECK_MSG( + dim_order.empty() || dim_order.size() == dim, + "dim_order size must match sizes or be empty."); + ET_CHECK_MSG( + strides.empty() || strides.size() == dim, + "strides size must match sizes or be empty."); + + if (dim_order.empty()) { + dim_order.resize(dim); + std::iota(dim_order.begin(), dim_order.end(), 0); + if (!strides.empty()) { + std::sort(dim_order.begin(), dim_order.end(), [&](size_t a, size_t b) { + return strides[a] > strides[b]; + }); + } + } + + // AOTI backends (like AOTI-CUDA) handle both contiguous and incontiguous + // tensors, so we skip stride calculation and incontiguous tensor checks. + // Strides are passed through as-is without validation. + +#ifndef USE_ATEN_LIB + executorch::aten::TensorImpl tensor_impl( + type, + dim, + sizes.data(), + data, + dim_order.data(), + strides.data(), + dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC); + auto storage = std::make_shared( + std::move(tensor_impl), + std::move(sizes), + std::move(dim_order), + std::move(strides), + std::move(deleter)); + const auto tensor_ptr = &storage->tensor; + return std::shared_ptr( + std::move(storage), tensor_ptr); +#else + auto options = c10::TensorOptions() + .dtype(c10::scalarTypeToTypeMeta(type)) + .device(c10::kCPU); + auto storage = c10::Storage( + c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + sizes, strides, options.dtype().itemsize()), + c10::InefficientStdFunctionContext::makeDataPtr( + data, std::move(deleter), options.device()), + nullptr, + false); + auto tensor_impl = c10::make_intrusive( + std::move(storage), + c10::DispatchKeySet(c10::DispatchKey::CPU), + options.dtype()); + tensor_impl->set_sizes_and_strides(sizes, strides); + return std::make_shared(std::move(tensor_impl)); +#endif // USE_ATEN_LIB +} + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/tensor/tensor_maker.h b/backends/cuda/runtime/tensor/tensor_maker.h new file mode 100644 index 00000000000..92cdec60bb4 --- /dev/null +++ b/backends/cuda/runtime/tensor/tensor_maker.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace executorch::backends::cuda { + +/** + * A smart pointer type for managing the lifecycle of a Tensor. + * This is compatible with executorch::extension::TensorPtr. 
+ */
+using TensorPtr = std::shared_ptr;
+
+/**
+ * Creates a TensorPtr for AOTI backends that skips stride calculation and
+ * incontiguous tensor checks. This is specifically designed for AOTI-CUDA
+ * which handles both contiguous and incontiguous tensors.
+ *
+ * This function is similar to executorch::extension::make_tensor_ptr but
+ * bypasses the stride validation that assumes contiguous tensors, making it
+ * suitable for AOTI backends that support arbitrary strides.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A pointer to the data buffer.
+ * @param dim_order A vector specifying the order of dimensions.
+ * @param strides A vector specifying the strides of the tensor.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param deleter A custom deleter function for managing the lifetime of the
+ * data buffer. If provided, this deleter will be called when the managed Tensor
+ * object is destroyed.
+ * @return A TensorPtr that manages the newly created Tensor.
+ */
+TensorPtr make_tensor(
+    std::vector sizes,
+    void* data,
+    std::vector dim_order,
+    std::vector strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    std::function deleter = nullptr);
+
+} // namespace executorch::backends::cuda

From 5a4c94e53dcbf7d26ae41a83ac3d8dc5b8d1ee2d Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 21 Oct 2025 10:23:39 -0700
Subject: [PATCH 2/2] add module level benchmark for gemma3 model

Pull Request resolved: https://github.com/pytorch/executorch/pull/15241

This diff adds a module-level benchmark for the Gemma3 model. It also introduces multimodal_benchmark.cpp, which replaces the original voxtral_runner.cpp and benchmarks both the Gemma3 and Voxtral models at the module level.
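A condensed sketch of the per-method measurement pattern the new benchmark uses (method load and execution are timed separately with std::chrono). `time_method` and its bare-bones error handling are illustrative simplifications, not the benchmark's actual structure; see multimodal_benchmark.cpp in the diff below for the real input construction and reporting.

```cpp
// Sketch only: time load_method() and execute() for one method of a Module.
#include <chrono>
#include <iostream>
#include <string>
#include <vector>

#include <executorch/extension/module/module.h>
#include <executorch/runtime/core/evalue.h>

using Clock = std::chrono::steady_clock;
using DurationMs = std::chrono::duration<double, std::milli>;

void time_method(
    executorch::extension::module::Module& module,
    const std::string& name,
    const std::vector<executorch::runtime::EValue>& inputs) {
  const auto load_start = Clock::now();
  const auto load_err = module.load_method(name);
  const double load_ms = DurationMs(Clock::now() - load_start).count();
  (void)load_err; // the real benchmark bails out on any non-Ok error

  const auto run_start = Clock::now();
  auto result = module.execute(name, inputs);
  const double run_ms = DurationMs(Clock::now() - run_start).count();
  (void)result; // the real benchmark prints and optionally dumps outputs

  std::cout << name << ": load " << load_ms << " ms, run " << run_ms << " ms"
            << std::endl;
}
```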
ghstack-source-id: 317688813 Differential Revision: [D84958564](https://our.internmc.facebook.com/intern/diff/D84958564/) --- .github/workflows/cuda.yml | 109 ++++- backends/cuda/CMakeLists.txt | 7 +- backends/cuda/cuda_backend.py | 3 + backends/cuda/tests/multimodal_benchmark.cpp | 465 +++++++++++++++++++ backends/cuda/tests/voxtral_runner.cpp | 264 ----------- 5 files changed, 578 insertions(+), 270 deletions(-) create mode 100644 backends/cuda/tests/multimodal_benchmark.cpp delete mode 100644 backends/cuda/tests/voxtral_runner.cpp diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9ee72a34ef0..f59a29420d7 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -164,6 +164,61 @@ jobs: ls -al "${RUNNER_ARTIFACT_DIR}" echo "::endgroup::" + export-gemma3-cuda-artifact: + name: export-gemma3-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + with: + timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + upload-artifact: gemma3-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch" + ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip list + echo "::endgroup::" + + echo "::group::Export Gemma3" + optimum-cli export executorch \ + --model "google/gemma-3-4b-it" \ + --task "multimodal-text-to-text" \ + --recipe "cuda" \ + --dtype bfloat16 \ + --device cuda \ + --max_seq_len 64 \ + --output_dir ./ + + test -f model.pte + test -f aoti_cuda_blob.ptd + echo "::endgroup::" + + echo "::group::Store Gemma3 Artifacts" + mkdir -p "${RUNNER_ARTIFACT_DIR}/" + cp model.pte "${RUNNER_ARTIFACT_DIR}/" + cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" + ls -al "${RUNNER_ARTIFACT_DIR}/" + echo "::endgroup::" + benchmark-voxtral-cuda: name: benchmark-voxtral-cuda needs: export-voxtral-cuda-artifact @@ -204,13 +259,63 @@ jobs: -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_TESTS=ON \ -Bcmake-out . 
- cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark echo "::endgroup::" echo "::group::Run Voxtral Benchmark" export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH - cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd + cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd + + echo "::endgroup::" + + benchmark-gemma3-cuda: + name: benchmark-gemma3-cuda + needs: export-gemma3-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: gemma3-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Gemma3 Artifacts" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . + ls -al model.pte aoti_cuda_blob.ptd + echo "::endgroup::" + + echo "::group::Build Gemma3 Benchmark" + cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_TESTS=ON \ + -Bcmake-out . + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark + echo "::endgroup::" + + echo "::group::Run Gemma3 Benchmark" + + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd echo "::endgroup::" diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 1db8792e0c0..af36c89585e 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -105,11 +105,10 @@ target_link_libraries( executorch_target_link_options_shared_lib(aoti_cuda) if(BUILD_TESTING) - # Add runtime - add_executable(voxtral_runner tests/voxtral_runner.cpp) + add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) target_link_libraries( - voxtral_runner PUBLIC aoti_cuda extension_module_static - extension_flat_tensor portable_ops_lib + multimodal_benchmark PUBLIC aoti_cuda extension_module_static + extension_flat_tensor portable_ops_lib ) endif() diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py index ba6da92b991..f8482835ea5 100644 --- a/backends/cuda/cuda_backend.py +++ b/backends/cuda/cuda_backend.py @@ -140,6 +140,9 @@ def preprocess( user_input_placeholders.append(node.meta["val"]) options: dict[str, typing.Any] = { + # Disable this to support sdpa decomposition + # TODO(gasoonjia): remove it after pin bump to latest pytorch + "loop_ordering_after_fusion": False, # Better model precision "emulate_precision_casts": True, # Embed CUDA kernel binaries directly into the compiled shared object diff --git a/backends/cuda/tests/multimodal_benchmark.cpp b/backends/cuda/tests/multimodal_benchmark.cpp new file mode 100644 index 00000000000..679db889b71 --- /dev/null +++ b/backends/cuda/tests/multimodal_benchmark.cpp @@ -0,0 +1,465 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using executorch::extension::module::Module; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; +using Clock = std::chrono::steady_clock; +using executorch::aten::TensorShapeDynamism; +using DurationMs = std::chrono::duration; + +enum class ModelType { GEMMA3, VOXTRAL, UNKNOWN }; + +struct ModelConfig { + std::string name; + size_t token_seq_len; + size_t text_embed_dim; + std::vector expected_methods; +}; + +const std::map model_configs = { + {ModelType::GEMMA3, + {"gemma3", + 128, + 2304, + {"vision_encoder", "token_embedding", "text_decoder"}}}, + {ModelType::VOXTRAL, + {"voxtral", + 1138, + 3072, + {"audio_encoder", "token_embedding", "text_decoder"}}}}; + +ModelType parse_model_type(const std::string& model_name) { + std::string lower_name = model_name; + std::transform( + lower_name.begin(), + lower_name.end(), + lower_name.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (lower_name.find("gemma3") != std::string::npos) { + return ModelType::GEMMA3; + } else if (lower_name.find("voxtral") != std::string::npos) { + return ModelType::VOXTRAL; + } + return ModelType::UNKNOWN; +} + +std::vector to_sizes( + std::initializer_list dims) { + return std::vector(dims.begin(), dims.end()); +} + +std::string format_shape(const Tensor& tensor) { + std::ostringstream oss; + oss << "["; + const auto& sizes = tensor.sizes(); + for (size_t i = 0; i < sizes.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << sizes[i]; + } + oss << "]"; + return oss.str(); +} + +void print_tensor_summary(const std::string& label, const Tensor& tensor) { + std::cout << " " << label + << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) + << ", shape=" << format_shape(tensor) + << ", numel=" << tensor.numel() << std::endl; +} + +void dump_tensor_to_file(const std::string& filename, const Tensor& tensor) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + int32_t dtype = static_cast(tensor.scalar_type()); + file.write(reinterpret_cast(&dtype), sizeof(int32_t)); + + int32_t ndim = static_cast(tensor.sizes().size()); + file.write(reinterpret_cast(&ndim), sizeof(int32_t)); + + for (size_t i = 0; i < tensor.sizes().size(); ++i) { + int64_t dim_size = tensor.sizes()[i]; + file.write(reinterpret_cast(&dim_size), sizeof(int64_t)); + } + + const void* data_ptr = tensor.const_data_ptr(); + size_t element_size = 0; + + switch (tensor.scalar_type()) { + case ScalarType::Float: + element_size = sizeof(float); + break; + case ScalarType::BFloat16: + element_size = 2; + break; + case ScalarType::Half: + element_size = 2; + break; + case ScalarType::Long: + element_size = sizeof(int64_t); + break; + case ScalarType::Int: + element_size = sizeof(int32_t); + break; + default: + std::cerr << "Unsupported dtype for dumping: " + << executorch::runtime::toString(tensor.scalar_type()) + << std::endl; + return; + } + + size_t data_size = tensor.numel() * element_size; + file.write(reinterpret_cast(data_ptr), data_size); + file.close(); + + std::cout << "Dumped tensor to: " << filename << std::endl; +} + +TensorPtr create_vision_input() { + const auto sizes = to_sizes({1, 3, 896, 
896}); + const size_t numel = 1ull * 3ull * 896ull * 896ull; + std::vector data(numel); + for (size_t i = 0; i < numel; ++i) { + data[i] = static_cast((i % 255) / 255.0); + } + return make_tensor_ptr( + sizes, + std::move(data), + {}, + {}, + ScalarType::BFloat16, + TensorShapeDynamism::DYNAMIC_UNBOUND); +} + +TensorPtr create_audio_input() { + const auto sizes = to_sizes({3, 128, 3000}); + const size_t numel = 3ull * 128ull * 3000ull; + std::vector data(numel, 0.5f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +TensorPtr create_token_ids_input(const ModelConfig& config) { + const auto sizes = to_sizes({1, static_cast(config.token_seq_len)}); + std::vector data(config.token_seq_len); + for (size_t i = 0; i < config.token_seq_len; ++i) { + data[i] = static_cast(i + 1); + } + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_positions_input(const ModelConfig& config) { + const auto sizes = to_sizes({static_cast(config.token_seq_len)}); + std::vector data(config.token_seq_len); + for (size_t i = 0; i < config.token_seq_len; ++i) { + data[i] = static_cast(i); + } + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_fallback_text_embedding(const ModelConfig& config) { + const auto sizes = to_sizes( + {1, + static_cast(config.token_seq_len), + static_cast(config.text_embed_dim)}); + const size_t numel = 1ull * config.token_seq_len * config.text_embed_dim; + std::vector data(numel, 0.0f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +struct MethodTiming { + double load_ms{0.0}; + double run_ms{0.0}; +}; + +enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN }; + +MethodCategory categorize_method(const std::string& method_name) { + std::string lower_name = method_name; + std::transform( + lower_name.begin(), + lower_name.end(), + lower_name.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (lower_name.find("vision") != std::string::npos || + lower_name.find("audio") != std::string::npos || + lower_name.find("encoder") != std::string::npos) { + return MethodCategory::ENCODER; + } else if ( + lower_name.find("token") != std::string::npos && + lower_name.find("embedding") != std::string::npos) { + return MethodCategory::TOKEN_EMBEDDING; + } else if ( + lower_name.find("text") != std::string::npos && + lower_name.find("decoder") != std::string::npos) { + return MethodCategory::TEXT_DECODER; + } + return MethodCategory::UNKNOWN; +} + +std::vector create_inputs_for_method( + const std::string& method_name, + MethodCategory category, + ModelType model_type, + const ModelConfig& config, + const EValue* token_output, + std::vector& owned_inputs) { + std::vector inputs; + + switch (category) { + case MethodCategory::ENCODER: { + if (method_name.find("vision") != std::string::npos) { + auto input = create_vision_input(); + owned_inputs.emplace_back(input); + inputs.emplace_back(*input); + } else if (method_name.find("audio") != std::string::npos) { + auto input = create_audio_input(); + owned_inputs.emplace_back(input); + inputs.emplace_back(*input); + } + break; + } + + case MethodCategory::TOKEN_EMBEDDING: { + auto token_ids = create_token_ids_input(config); + owned_inputs.emplace_back(token_ids); + inputs.emplace_back(*token_ids); + break; + } + + case MethodCategory::TEXT_DECODER: { + if (token_output && token_output->isTensor()) { + inputs.emplace_back(*token_output); + } else { + auto fallback_embedding = 
create_fallback_text_embedding(config); + owned_inputs.emplace_back(fallback_embedding); + inputs.emplace_back(*fallback_embedding); + } + + auto positions = create_positions_input(config); + owned_inputs.emplace_back(positions); + inputs.emplace_back(*positions); + break; + } + + default: + break; + } + + return inputs; +} + +Error execute_method( + Module& module, + const std::string& method_name, + MethodCategory category, + ModelType model_type, + const ModelConfig& config, + const EValue* token_output, + MethodTiming& timing, + EValue* output_storage = nullptr) { + ET_LOG(Info, "Loading %s...", method_name.c_str()); + + const auto load_start = Clock::now(); + const Error load_err = module.load_method(method_name); + const auto load_end = Clock::now(); + if (load_err != Error::Ok) { + std::cerr << "Failed to load method " << method_name << ": error code " + << static_cast(load_err) << std::endl; + return load_err; + } + timing.load_ms = DurationMs(load_end - load_start).count(); + + std::vector owned_inputs; + std::vector inputs = create_inputs_for_method( + method_name, category, model_type, config, token_output, owned_inputs); + + const auto run_start = Clock::now(); + ET_LOG(Info, "%s running", method_name.c_str()); + Result> output_result = + module.execute(method_name, inputs); + ET_LOG(Info, "%s done", method_name.c_str()); + const auto run_end = Clock::now(); + timing.run_ms = DurationMs(run_end - run_start).count(); + + if (output_result.error() != Error::Ok) { + std::cerr << method_name << " execution failed: error code " + << static_cast(output_result.error()) << std::endl; + return output_result.error(); + } + + const auto& outputs = output_result.get(); + if (!outputs.empty() && outputs[0].isTensor()) { + print_tensor_summary(method_name + " output", outputs[0].toTensor()); + + if (category == MethodCategory::ENCODER || + category == MethodCategory::TOKEN_EMBEDDING) { + dump_tensor_to_file(method_name + "_output.bin", outputs[0].toTensor()); + } + + if (output_storage) { + *output_storage = outputs[0]; + } + } + + return Error::Ok; +} + +} // namespace + +int main(int argc, char** argv) { + if (argc != 4) { + std::cerr + << "Usage: " << argv[0] + << " " + << std::endl; + std::cerr << " model_name: gemma3 or voxtral" << std::endl; + return 1; + } + + const std::string model_name = argv[1]; + const std::string program_path = argv[2]; + const std::string data_map_path = argv[3]; + + const ModelType model_type = parse_model_type(model_name); + if (model_type == ModelType::UNKNOWN) { + std::cerr << "Unknown model type: " << model_name << std::endl; + std::cerr << "Supported models: gemma3, voxtral" << std::endl; + return 1; + } + + const ModelConfig& config = model_configs.at(model_type); + std::cout << "Running benchmark for model: " << config.name << std::endl; + + try { + Module module(program_path, data_map_path); + + const auto program_load_start = Clock::now(); + const Error program_load_error = module.load(); + const auto program_load_end = Clock::now(); + if (program_load_error != Error::Ok) { + std::cerr << "Failed to load ExecuTorch program: error code " + << static_cast(program_load_error) << std::endl; + return 1; + } + const DurationMs program_load_latency = + program_load_end - program_load_start; + + auto method_names_result = module.method_names(); + if (method_names_result.error() != Error::Ok) { + std::cerr << "Failed to get method names: error code " + << static_cast(method_names_result.error()) << std::endl; + return 1; + } + + const auto& available_methods = 
method_names_result.get(); + + std::cout << "Checking for expected methods..." << std::endl; + std::vector missing_methods; + for (const auto& expected : config.expected_methods) { + if (available_methods.find(expected) == available_methods.end()) { + missing_methods.push_back(expected); + } else { + std::cout << " ✓ " << expected << std::endl; + } + } + + if (!missing_methods.empty()) { + std::cerr << "\nError: Missing expected methods:" << std::endl; + for (const auto& missing : missing_methods) { + std::cerr << " ✗ " << missing << std::endl; + } + return 1; + } + + std::map timings; + EValue token_output; + bool token_executed = false; + + for (const auto& method_name : config.expected_methods) { + MethodCategory category = categorize_method(method_name); + MethodTiming timing; + + const EValue* input_token_ptr = + (category == MethodCategory::TEXT_DECODER && token_executed) + ? &token_output + : nullptr; + + EValue* output_storage = (category == MethodCategory::TOKEN_EMBEDDING) + ? &token_output + : nullptr; + + Error err = execute_method( + module, + method_name, + category, + model_type, + config, + input_token_ptr, + timing, + output_storage); + + if (err != Error::Ok) { + return 1; + } + + if (category == MethodCategory::TOKEN_EMBEDDING) { + token_executed = true; + } + + timings[method_name] = timing; + } + + std::cout << std::fixed << std::setprecision(3); + std::cout << "\n=== Benchmark Results ===" << std::endl; + std::cout << "Program load latency (ms): " << program_load_latency.count() + << std::endl; + + std::cout << "\nMethod load latency (ms):" << std::endl; + for (const auto& [name, timing] : timings) { + std::cout << " " << name << ": " << timing.load_ms << std::endl; + } + + std::cout << "\nRun latency (ms):" << std::endl; + for (const auto& [name, timing] : timings) { + std::cout << " " << name << ": " << timing.run_ms << std::endl; + } + + return 0; + } catch (const std::exception& ex) { + std::cerr << "Unhandled exception: " << ex.what() << std::endl; + return 1; + } +} diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp deleted file mode 100644 index feed458e1f5..00000000000 --- a/backends/cuda/tests/voxtral_runner.cpp +++ /dev/null @@ -1,264 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -using executorch::aten::ScalarType; -using executorch::aten::Tensor; -using executorch::extension::make_tensor_ptr; -using executorch::extension::TensorPtr; -using executorch::extension::module::Module; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::Result; -using Clock = std::chrono::steady_clock; -using DurationMs = std::chrono::duration; - -std::vector to_sizes( - std::initializer_list dims) { - return std::vector(dims.begin(), dims.end()); -} - -std::string format_shape(const Tensor& tensor) { - std::ostringstream oss; - oss << "["; - const auto& sizes = tensor.sizes(); - for (size_t i = 0; i < sizes.size(); ++i) { - if (i > 0) { - oss << ", "; - } - oss << sizes[i]; - } - oss << "]"; - return oss.str(); -} - -void print_tensor_summary(const std::string& label, const Tensor& tensor) { - std::cout << " " << label - << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) - << ", shape=" << format_shape(tensor) - << ", numel=" << tensor.numel() << std::endl; -} - -TensorPtr create_audio_input() { - const auto sizes = to_sizes({3, 128, 3000}); - const size_t numel = 
3ull * 128ull * 3000ull; - std::vector data(numel, 0.5f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -TensorPtr create_token_ids_input() { - const auto sizes = to_sizes({1, 1138}); - std::vector data(static_cast(1) * 1138, 0); - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_positions_input() { - const auto sizes = to_sizes({1138}); - std::vector data(static_cast(1138), 0); - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_fallback_text_embedding() { - const auto sizes = to_sizes({1, 1138, 3072}); - const size_t numel = 1ull * 1138ull * 3072ull; - std::vector data(numel, 0.0f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -struct MethodTiming { - double load_ms{0.0}; - double run_ms{0.0}; -}; - -} // namespace - -int main(int argc, char** argv) { - if (argc != 3) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; - return 1; - } - - const std::string program_path = argv[1]; - const std::string data_map_path = argv[2]; - - try { - Module module(program_path, data_map_path); - - const auto program_load_start = Clock::now(); - const Error program_load_error = module.load(); - const auto program_load_end = Clock::now(); - if (program_load_error != Error::Ok) { - std::cerr << "Failed to load ExecuTorch program: error code " - << static_cast(program_load_error) << std::endl; - return 1; - } - const DurationMs program_load_latency = - program_load_end - program_load_start; - - MethodTiming audio_timing; - MethodTiming token_timing; - MethodTiming text_timing; - - auto measure_method_load = - [&](const std::string& name) -> std::pair { - const auto start = Clock::now(); - const Error err = module.load_method(name); - const auto end = Clock::now(); - return {err, DurationMs(end - start).count()}; - }; - - // audio_encoder - { - const auto [err, load_ms] = measure_method_load("audio_encoder"); - if (err != Error::Ok) { - std::cerr << "Failed to load method audio_encoder: error code " - << static_cast(err) << std::endl; - return 1; - } - audio_timing.load_ms = load_ms; - - const TensorPtr audio_input = create_audio_input(); - std::vector inputs; - std::vector owned_inputs; - owned_inputs.emplace_back(audio_input); - inputs.emplace_back(*audio_input); - - const auto run_start = Clock::now(); - Result> output_result = - module.execute("audio_encoder", inputs); - const auto run_end = Clock::now(); - audio_timing.run_ms = DurationMs(run_end - run_start).count(); - - if (output_result.error() != Error::Ok) { - std::cerr << "audio_encoder execution failed: error code " - << static_cast(output_result.error()) << std::endl; - return 1; - } - - const auto& outputs = output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary("audio_encoder output", outputs[0].toTensor()); - } - } - - EValue token_output; - bool token_executed = false; - - // token_embedding - { - const auto [err, load_ms] = measure_method_load("token_embedding"); - if (err != Error::Ok) { - std::cerr << "Failed to load method token_embedding: error code " - << static_cast(err) << std::endl; - return 1; - } - token_timing.load_ms = load_ms; - - const TensorPtr token_ids = create_token_ids_input(); - std::vector inputs; - std::vector owned_inputs; - owned_inputs.emplace_back(token_ids); - inputs.emplace_back(*token_ids); - - const auto run_start = Clock::now(); - auto token_output_result = module.execute("token_embedding", inputs); - const auto run_end = 
Clock::now(); - token_timing.run_ms = DurationMs(run_end - run_start).count(); - - if (token_output_result.error() != Error::Ok) { - std::cerr << "token_embedding execution failed: error code " - << static_cast(token_output_result.error()) << std::endl; - return 1; - } - - token_executed = true; - const auto& outputs = token_output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary("token_embedding output", outputs[0].toTensor()); - token_output = outputs[0]; - } - } - - // text_decoder - { - const auto [err, load_ms] = measure_method_load("text_decoder"); - if (err != Error::Ok) { - std::cerr << "Failed to load method text_decoder: error code " - << static_cast(err) << std::endl; - return 1; - } - text_timing.load_ms = load_ms; - - std::vector inputs; - std::vector owned_inputs; - if (token_executed) { - if (token_output.isTensor()) { - inputs.emplace_back(token_output); - } - } - - if (inputs.empty()) { - auto fallback_embedding = create_fallback_text_embedding(); - owned_inputs.emplace_back(fallback_embedding); - inputs.emplace_back(*fallback_embedding); - } - - auto positions = create_positions_input(); - owned_inputs.emplace_back(positions); - inputs.emplace_back(*positions); - - const auto run_start = Clock::now(); - Result> output_result = - module.execute("text_decoder", inputs); - const auto run_end = Clock::now(); - text_timing.run_ms = DurationMs(run_end - run_start).count(); - - if (output_result.error() != Error::Ok) { - std::cerr << "text_decoder execution failed: error code " - << static_cast(output_result.error()) << std::endl; - return 1; - } - - const auto& outputs = output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary("text_decoder output", outputs[0].toTensor()); - } - } - - std::cout << std::fixed << std::setprecision(3); - std::cout << "Program load latency (ms): " << program_load_latency.count() - << std::endl; - - std::cout << "Method load latency (ms):" << std::endl; - std::cout << " audio_encoder: " << audio_timing.load_ms << std::endl; - std::cout << " token_embedding: " << token_timing.load_ms << std::endl; - std::cout << " text_decoder: " << text_timing.load_ms << std::endl; - - std::cout << "Run latency (ms):" << std::endl; - std::cout << " audio_encoder: " << audio_timing.run_ms << std::endl; - std::cout << " token_embedding: " << token_timing.run_ms << std::endl; - std::cout << " text_decoder: " << text_timing.run_ms << std::endl; - - return 0; - } catch (const std::exception& ex) { - std::cerr << "Unhandled exception: " << ex.what() << std::endl; - return 1; - } -}
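For completeness, a minimal reader for the .bin files that dump_tensor_to_file (in multimodal_benchmark.cpp above) produces. The layout is inferred from that writer: int32 dtype, int32 ndim, ndim int64 dims, then the raw element bytes. `read_dumped_tensor` is a hypothetical helper, not part of either patch.

```cpp
// Sketch only: parse the header written by dump_tensor_to_file and report the
// tensor's shape plus the size of the raw payload that follows it.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

bool read_dumped_tensor(const std::string& path) {
  std::ifstream file(path, std::ios::binary);
  if (!file.is_open()) {
    return false;
  }

  int32_t dtype = 0;
  int32_t ndim = 0;
  file.read(reinterpret_cast<char*>(&dtype), sizeof(dtype));
  file.read(reinterpret_cast<char*>(&ndim), sizeof(ndim));

  std::vector<int64_t> dims(static_cast<size_t>(ndim));
  int64_t numel = 1;
  for (auto& d : dims) {
    file.read(reinterpret_cast<char*>(&d), sizeof(d));
    numel *= d;
  }

  // The remaining bytes are the raw element data (numel * element_size).
  const auto header_end = file.tellg();
  file.seekg(0, std::ios::end);
  const auto payload_bytes = file.tellg() - header_end;

  std::cout << path << ": dtype=" << dtype << ", ndim=" << ndim
            << ", numel=" << numel << ", payload=" << payload_bytes << " bytes"
            << std::endl;
  return file.good();
}
```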