From 288e758d7fed7c2c58010f5e4b6418f536530819 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Wed, 21 Aug 2024 17:09:09 -0700 Subject: [PATCH 001/531] [executorch] Migrate most of extension/... to new namespace Differential Revision: D60938936 Pull Request resolved: https://github.com/pytorch/executorch/pull/4617 --- extension/aten_util/aten_bridge.cpp | 22 ++-- extension/aten_util/aten_bridge.h | 42 +++++++- .../make_aten_functor_from_et_functor.h | 61 +++++++---- extension/aten_util/test/aten_bridge_test.cpp | 2 +- ...make_aten_functor_from_et_functor_test.cpp | 10 +- extension/data_loader/buffer_data_loader.h | 28 +++-- extension/data_loader/file_data_loader.cpp | 15 +-- extension/data_loader/file_data_loader.h | 27 +++-- extension/data_loader/mmap_data_loader.cpp | 14 +-- extension/data_loader/mmap_data_loader.h | 29 +++-- .../data_loader/shared_ptr_data_loader.h | 23 ++-- .../test/buffer_data_loader_test.cpp | 12 +-- .../test/file_data_loader_test.cpp | 14 +-- .../test/mmap_data_loader_test.cpp | 14 +-- .../test/shared_ptr_data_loader_test.cpp | 12 +-- .../make_boxed_from_unboxed_functor.h | 74 ++++++++----- extension/kernel_util/meta_programming.h | 18 ++-- .../make_boxed_from_unboxed_functor_test.cpp | 33 +++--- extension/kernel_util/type_list.h | 13 ++- .../malloc_memory_allocator.h | 17 ++- .../test/malloc_memory_allocator_test.cpp | 7 +- extension/module/module.cpp | 52 ++++++--- extension/module/module.h | 102 +++++++++++------- .../parallel/test/thread_parallel_test.cpp | 5 +- extension/parallel/thread_parallel.cpp | 6 +- extension/parallel/thread_parallel.h | 16 ++- extension/pybindings/pybindings.cpp | 91 ++++++++++------ extension/pytree/aten_util/ivalue_util.cpp | 12 +-- extension/pytree/aten_util/ivalue_util.h | 24 +++-- .../aten_util/test/ivalue_util_test.cpp | 12 +-- extension/pytree/function_ref.h | 20 +++- extension/pytree/pybindings.cpp | 8 +- extension/pytree/pytree.h | 16 ++- extension/pytree/test/TARGETS | 4 +- extension/pytree/test/function_ref_test.cpp | 12 +-- extension/pytree/test/test_pytree.cpp | 20 ++-- extension/runner_util/inputs.cpp | 17 +-- extension/runner_util/inputs.h | 31 ++++-- extension/runner_util/inputs_aten.cpp | 14 +-- extension/runner_util/inputs_portable.cpp | 16 +-- extension/runner_util/managed_tensor.h | 45 ++++---- extension/runner_util/test/inputs_test.cpp | 27 +++-- .../runner_util/test/managed_tensor_test.cpp | 6 +- extension/testing_util/temp_file.h | 16 ++- .../testing_util/test/temp_file_test.cpp | 2 +- extension/training/optimizer/sgd.cpp | 17 ++- extension/training/optimizer/sgd.h | 46 ++++---- .../training/optimizer/test/sgd_test.cpp | 10 +- runtime/executor/test/executor_test.cpp | 10 +- 49 files changed, 715 insertions(+), 429 deletions(-) diff --git a/extension/aten_util/aten_bridge.cpp b/extension/aten_util/aten_bridge.cpp index 3916f7ed42c..362dc57c37d 100644 --- a/extension/aten_util/aten_bridge.cpp +++ b/extension/aten_util/aten_bridge.cpp @@ -11,8 +11,8 @@ #include #include -namespace torch { -namespace util { +namespace executorch { +namespace extension { namespace { void check_tensor_meta(const at::Tensor& a, const exec_aten::Tensor& b) { @@ -55,14 +55,15 @@ ET_CHECK_MSG( } // check dtype ET_CHECK_MSG( - b.scalar_type() == torchToExecuTorchScalarType(a.options().dtype()), + b.scalar_type() == torch_to_executorch_scalar_type(a.options().dtype()), "dtypes dont match a %hhd vs. 
b %hhd", - torchToExecuTorchScalarType(a.options().dtype()), + torch_to_executorch_scalar_type(a.options().dtype()), b.scalar_type()); } } // namespace -torch::executor::ScalarType torchToExecuTorchScalarType(caffe2::TypeMeta type) { +torch::executor::ScalarType torch_to_executorch_scalar_type( + caffe2::TypeMeta type) { switch (c10::typeMetaToScalarType(type)) { case c10::ScalarType::Byte: return torch::executor::ScalarType::Byte; @@ -91,7 +92,8 @@ torch::executor::ScalarType torchToExecuTorchScalarType(caffe2::TypeMeta type) { } } -c10::ScalarType execuTorchtoTorchScalarType(torch::executor::ScalarType type) { +c10::ScalarType executorch_to_torch_scalar_type( + torch::executor::ScalarType type) { switch (type) { case torch::executor::ScalarType::Byte: return c10::ScalarType::Byte; @@ -147,7 +149,8 @@ void alias_etensor_to_attensor( } at::Tensor alias_attensor_to_etensor(const torch::executor::Tensor& etensor) { - c10::ScalarType dtype = execuTorchtoTorchScalarType(etensor.scalar_type()); + c10::ScalarType dtype = + executorch_to_torch_scalar_type(etensor.scalar_type()); std::vector at_tensor_sizes( etensor.sizes().begin(), etensor.sizes().end()); std::vector at_tensor_strides( @@ -162,5 +165,6 @@ at::Tensor alias_attensor_to_etensor(const torch::executor::Tensor& etensor) { check_tensor_meta(t, etensor); return t; } -} // namespace util -} // namespace torch + +} // namespace extension +} // namespace executorch diff --git a/extension/aten_util/aten_bridge.h b/extension/aten_util/aten_bridge.h index a01d9bc26cd..0d6b697463c 100644 --- a/extension/aten_util/aten_bridge.h +++ b/extension/aten_util/aten_bridge.h @@ -18,12 +18,14 @@ #include #include -namespace torch { -namespace util { +namespace executorch { +namespace extension { -torch::executor::ScalarType torchToExecuTorchScalarType(caffe2::TypeMeta type); +torch::executor::ScalarType torch_to_executorch_scalar_type( + caffe2::TypeMeta type); -c10::ScalarType execuTorchtoTorchScalarType(torch::executor::ScalarType type); +c10::ScalarType executorch_to_torch_scalar_type( + torch::executor::ScalarType type); /* * @param[in] aten_tensor Input at::Tensor @@ -45,5 +47,37 @@ void alias_etensor_to_attensor(at::Tensor& at, torch::executor::Tensor& et); * cloned. */ at::Tensor alias_attensor_to_etensor(const torch::executor::Tensor& et); + +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::alias_attensor_to_etensor; +using ::executorch::extension::alias_etensor_to_attensor; +inline torch::executor::ScalarType torchToExecuTorchScalarType( + caffe2::TypeMeta type) { + return ::executorch::extension::torch_to_executorch_scalar_type(type); +} +inline c10::ScalarType execuTorchtoTorchScalarType( + torch::executor::ScalarType type) { + return ::executorch::extension::executorch_to_torch_scalar_type(type); +} +} // namespace util +} // namespace executor +} // namespace torch + +// Some users refer to these as `torch::util::`. +namespace torch { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::torch::executor::util::alias_attensor_to_etensor; +using ::torch::executor::util::alias_etensor_to_attensor; +using ::torch::executor::util::execuTorchtoTorchScalarType; +using ::torch::executor::util::torchToExecuTorchScalarType; } // namespace util } // namespace torch diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h index 1e6ab069efb..3b54254e8ed 100644 --- a/extension/aten_util/make_aten_functor_from_et_functor.h +++ b/extension/aten_util/make_aten_functor_from_et_functor.h @@ -24,8 +24,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +namespace internal { // Map types from ETen to ATen. // This is used to convert ETen arguments into ATen. @@ -105,17 +106,22 @@ struct type_convert< torch::executor::Tensor>>> final { explicit type_convert(ATensor value) : value_(value) { - auto sizes = std::make_shared>( - value_.sizes().begin(), value_.sizes().end()); + auto sizes = + std::make_shared>( + value_.sizes().begin(), value_.sizes().end()); const ssize_t dim = sizes->size(); - auto dim_order = std::make_shared>(dim); - auto strides = std::make_shared>(dim); + auto dim_order = + std::make_shared>( + dim); + auto strides = + std::make_shared>( + dim); std::iota(dim_order->begin(), dim_order->end(), 0); - dim_order_to_stride_nocheck( + ::executorch::runtime::dim_order_to_stride_nocheck( sizes->data(), dim_order->data(), dim, strides->data()); - auto tensor_impl = std::make_shared( + auto tensor_impl = std::make_shared( static_cast(value_.scalar_type()), sizes->size(), sizes->data(), @@ -123,11 +129,12 @@ struct type_convert< dim_order->data(), strides->data()); - converted_ = std::unique_ptr>( - new Tensor(tensor_impl.get()), - [sizes, dim_order, strides, tensor_impl](Tensor* pointer) { - delete pointer; - }); + converted_ = std::unique_ptr< + torch::executor::Tensor, + std::function>( + new torch::executor::Tensor(tensor_impl.get()), + [sizes, dim_order, strides, tensor_impl]( + torch::executor::Tensor* pointer) { delete pointer; }); } ETensor call() { @@ -136,7 +143,10 @@ struct type_convert< private: ATensor value_; - std::unique_ptr> converted_; + std::unique_ptr< + torch::executor::Tensor, + std::function> + converted_; }; // Tensors: ETen to ATen. @@ -258,7 +268,12 @@ struct wrapper_impl { using TupleArgsType = std::tuple::type...>; static constexpr size_t num_args = sizeof...(Args); static_assert( - (N < num_args && std::is_same_v>, R>) || + (N < num_args && + std::is_same_v< + executorch::extension::kernel_util_internal::element_t< + N, + executorch::extension::kernel_util_internal::typelist>, + R>) || N == -1, "The index of the out tensor can't be greater or equal to num_args and " "the Nth argument type has to be the same as the return type."); @@ -298,16 +313,18 @@ struct wrapper_impl { } }; -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace extension +} // namespace executorch // Wrapper macro for out variant function. N is the index of the out tensor. // We need N to know how to preserve the semantics of modifying out tensor and // return the reference without allocating a new memory buffer for out tensor. 
-#define _WRAP_2(func, N) \ - ::torch::executor::wrapper_impl::wrap +#define _WRAP_2(func, N) \ + ::executorch::extension::internal:: \ + wrapper_impl::wrap #define _WRAP_1(func) \ - ::torch::executor::wrapper_impl::wrap + ::executorch::extension::internal::wrapper_impl::wrap -#define GET_MACRO(_1, _2, NAME, ...) NAME -#define WRAP_TO_ATEN(...) GET_MACRO(__VA_ARGS__, _WRAP_2, _WRAP_1)(__VA_ARGS__) +#define _GET_MACRO(_1, _2, NAME, ...) NAME +#define WRAP_TO_ATEN(...) _GET_MACRO(__VA_ARGS__, _WRAP_2, _WRAP_1)(__VA_ARGS__) diff --git a/extension/aten_util/test/aten_bridge_test.cpp b/extension/aten_util/test/aten_bridge_test.cpp index 5f52a063095..cf6d2b85978 100644 --- a/extension/aten_util/test/aten_bridge_test.cpp +++ b/extension/aten_util/test/aten_bridge_test.cpp @@ -16,8 +16,8 @@ #include using namespace ::testing; -using namespace torch::util; using namespace torch::executor; +using namespace torch::executor::util; namespace { at::Tensor generate_at_tensor() { diff --git a/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp b/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp index db99d9b49d0..26fe845a9e1 100644 --- a/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp +++ b/extension/aten_util/test/make_aten_functor_from_et_functor_test.cpp @@ -14,10 +14,11 @@ #include #include -namespace torch { -namespace executor { - using namespace ::testing; +using ::executorch::extension::internal::type_convert; +using ::executorch::extension::internal::type_map; +using ::torch::executor::ScalarType; +using ::torch::executor::Tensor; Tensor& my_op_out(const Tensor& a, Tensor& out) { (void)a; @@ -420,6 +421,3 @@ TEST_F(MakeATenFunctorFromETFunctorTest, TestWrap_ArrayRefOptional) { EXPECT_EQ(stack.size(), 1); EXPECT_EQ(stack[0].toTensor().const_data_ptr()[0], 4); } - -} // namespace executor -} // namespace torch diff --git a/extension/data_loader/buffer_data_loader.h b/extension/data_loader/buffer_data_loader.h index 17ca36386d3..ee25d86526a 100644 --- a/extension/data_loader/buffer_data_loader.h +++ b/extension/data_loader/buffer_data_loader.h @@ -14,9 +14,8 @@ #include #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { /** * A DataLoader that wraps a pre-allocated buffer. The FreeableBuffers @@ -25,12 +24,13 @@ namespace util { * This can be used to wrap data that is directly embedded into the firmware * image, or to wrap data that was allocated elsewhere. 
*/ -class BufferDataLoader final : public DataLoader { +class BufferDataLoader final : public executorch::runtime::DataLoader { public: BufferDataLoader(const void* data, size_t size) : data_(reinterpret_cast(data)), size_(size) {} - ET_NODISCARD Result load( + ET_NODISCARD + executorch::runtime::Result load( size_t offset, size_t size, ET_UNUSED const DataLoader::SegmentInfo& segment_info) const override { @@ -41,14 +41,15 @@ class BufferDataLoader final : public DataLoader { offset, size, size_); - return FreeableBuffer(data_ + offset, size, /*free_fn=*/nullptr); + return executorch::runtime::FreeableBuffer( + data_ + offset, size, /*free_fn=*/nullptr); } - ET_NODISCARD Result size() const override { + ET_NODISCARD executorch::runtime::Result size() const override { return size_; } - ET_NODISCARD Error load_into( + ET_NODISCARD executorch::runtime::Error load_into( size_t offset, size_t size, ET_UNUSED const SegmentInfo& segment_info, @@ -63,7 +64,7 @@ class BufferDataLoader final : public DataLoader { return result.error(); } std::memcpy(buffer, result->data(), size); - return Error::Ok; + return executorch::runtime::Error::Ok; } private: @@ -71,6 +72,15 @@ class BufferDataLoader final : public DataLoader { const size_t size_; }; +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::BufferDataLoader; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp index b58edaa2dee..1d097cfd989 100644 --- a/extension/data_loader/file_data_loader.cpp +++ b/extension/data_loader/file_data_loader.cpp @@ -34,9 +34,13 @@ #define ET_HAVE_PREAD 1 #endif // !ET_HAVE_PREAD -namespace torch { -namespace executor { -namespace util { +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; + +namespace executorch { +namespace extension { + namespace { /** @@ -287,6 +291,5 @@ ET_NODISCARD Error FileDataLoader::load_into( return Error::Ok; } -} // namespace util -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/data_loader/file_data_loader.h b/extension/data_loader/file_data_loader.h index 12e0fcae49b..7cf2a92c4ad 100644 --- a/extension/data_loader/file_data_loader.h +++ b/extension/data_loader/file_data_loader.h @@ -14,9 +14,8 @@ #include #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { /** * A DataLoader that loads segments from a file, allocating the memory @@ -25,7 +24,7 @@ namespace util { * Note that this will keep the file open for the duration of its lifetime, to * avoid the overhead of opening it again for every load() call. */ -class FileDataLoader final : public DataLoader { +class FileDataLoader final : public executorch::runtime::DataLoader { public: /** * Creates a new FileDataLoader that wraps the named file. @@ -40,12 +39,12 @@ class FileDataLoader final : public DataLoader { * could not be found. * @retval Error::MemoryAllocationFailed Internal memory allocation failure. */ - static Result from( + static executorch::runtime::Result from( const char* file_name, size_t alignment = alignof(std::max_align_t)); /// DEPRECATED: Use the lowercase `from()` instead. 
- ET_DEPRECATED static Result From( + ET_DEPRECATED static executorch::runtime::Result From( const char* file_name, size_t alignment = alignof(std::max_align_t)) { return from(file_name, alignment); @@ -65,14 +64,15 @@ class FileDataLoader final : public DataLoader { ~FileDataLoader() override; - ET_NODISCARD Result load( + ET_NODISCARD + executorch::runtime::Result load( size_t offset, size_t size, const DataLoader::SegmentInfo& segment_info) const override; - ET_NODISCARD Result size() const override; + ET_NODISCARD executorch::runtime::Result size() const override; - ET_NODISCARD Error load_into( + ET_NODISCARD executorch::runtime::Error load_into( size_t offset, size_t size, ET_UNUSED const SegmentInfo& segment_info, @@ -100,6 +100,15 @@ class FileDataLoader final : public DataLoader { const int fd_; // Owned by the instance. }; +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::FileDataLoader; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp index ce523f6e9be..ebe74f95266 100644 --- a/extension/data_loader/mmap_data_loader.cpp +++ b/extension/data_loader/mmap_data_loader.cpp @@ -22,9 +22,12 @@ #include #include -namespace torch { -namespace executor { -namespace util { +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; + +namespace executorch { +namespace extension { namespace { @@ -254,6 +257,5 @@ Result MmapDataLoader::size() const { return file_size_; } -} // namespace util -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/data_loader/mmap_data_loader.h b/extension/data_loader/mmap_data_loader.h index 04a2514c77d..c55f81a490b 100644 --- a/extension/data_loader/mmap_data_loader.h +++ b/extension/data_loader/mmap_data_loader.h @@ -12,9 +12,8 @@ #include #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { /** * A DataLoader that loads segments from a file, allocating the memory @@ -23,7 +22,7 @@ namespace util { * Note that this will keep the file open for the duration of its lifetime, to * avoid the overhead of opening it again for every load() call. */ -class MmapDataLoader final : public DataLoader { +class MmapDataLoader final : public executorch::runtime::DataLoader { public: /** * Describes how and whether to lock loaded pages with `mlock()`. @@ -51,12 +50,12 @@ class MmapDataLoader final : public DataLoader { * @param[in] mlock_config How and whether to lock loaded pages with * `mlock()`. */ - static Result from( + static executorch::runtime::Result from( const char* file_name, MlockConfig mlock_config = MlockConfig::UseMlock); /// DEPRECATED: Use the lowercase `from()` instead. - ET_DEPRECATED static Result From( + ET_DEPRECATED static executorch::runtime::Result From( const char* file_name, MlockConfig mlock_config = MlockConfig::UseMlock) { return from(file_name, mlock_config); @@ -64,7 +63,9 @@ class MmapDataLoader final : public DataLoader { /// DEPRECATED: Use the version of `from()` that takes an MlockConfig. 
ET_DEPRECATED - static Result From(const char* file_name, bool use_mlock) { + static executorch::runtime::Result From( + const char* file_name, + bool use_mlock) { MlockConfig mlock_config = use_mlock ? MlockConfig::UseMlock : MlockConfig::NoMlock; return from(file_name, mlock_config); @@ -86,12 +87,13 @@ class MmapDataLoader final : public DataLoader { ~MmapDataLoader() override; - ET_NODISCARD Result load( + ET_NODISCARD + executorch::runtime::Result load( size_t offset, size_t size, const DataLoader::SegmentInfo& segment_info) const override; - ET_NODISCARD Result size() const override; + ET_NODISCARD executorch::runtime::Result size() const override; private: MmapDataLoader( @@ -118,6 +120,15 @@ class MmapDataLoader final : public DataLoader { const MlockConfig mlock_config_; }; +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::MmapDataLoader; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/data_loader/shared_ptr_data_loader.h b/extension/data_loader/shared_ptr_data_loader.h index 79a329084f3..551ab4d498c 100644 --- a/extension/data_loader/shared_ptr_data_loader.h +++ b/extension/data_loader/shared_ptr_data_loader.h @@ -14,9 +14,8 @@ #include #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { /** * A DataLoader that wraps a pre-allocated buffer and shares ownership to it. @@ -24,12 +23,13 @@ namespace util { * * This can be used to wrap data that was allocated elsewhere. */ -class SharedPtrDataLoader final : public DataLoader { +class SharedPtrDataLoader final : public executorch::runtime::DataLoader { public: SharedPtrDataLoader(std::shared_ptr data, size_t size) : data_(data), size_(size) {} - ET_NODISCARD Result load( + ET_NODISCARD + executorch::runtime::Result load( size_t offset, size_t size, ET_UNUSED const DataLoader::SegmentInfo& segment_info) const override { @@ -40,11 +40,11 @@ class SharedPtrDataLoader final : public DataLoader { offset, size, size_); - return FreeableBuffer( + return executorch::runtime::FreeableBuffer( static_cast(data_.get()) + offset, size, /*free_fn=*/nullptr); } - ET_NODISCARD Result size() const override { + ET_NODISCARD executorch::runtime::Result size() const override { return size_; } @@ -53,6 +53,15 @@ class SharedPtrDataLoader final : public DataLoader { const size_t size_; }; +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::SharedPtrDataLoader; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/data_loader/test/buffer_data_loader_test.cpp b/extension/data_loader/test/buffer_data_loader_test.cpp index e5facfc3ba0..83d053ee466 100644 --- a/extension/data_loader/test/buffer_data_loader_test.cpp +++ b/extension/data_loader/test/buffer_data_loader_test.cpp @@ -16,18 +16,18 @@ #include using namespace ::testing; -using torch::executor::DataLoader; -using torch::executor::Error; -using torch::executor::FreeableBuffer; -using torch::executor::Result; -using torch::executor::util::BufferDataLoader; +using executorch::extension::BufferDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; class BufferDataLoaderTest : public ::testing::Test { protected: void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); } }; diff --git a/extension/data_loader/test/file_data_loader_test.cpp b/extension/data_loader/test/file_data_loader_test.cpp index f7081565fd3..1d4f4c16196 100644 --- a/extension/data_loader/test/file_data_loader_test.cpp +++ b/extension/data_loader/test/file_data_loader_test.cpp @@ -18,19 +18,19 @@ #include using namespace ::testing; -using torch::executor::DataLoader; -using torch::executor::Error; -using torch::executor::FreeableBuffer; -using torch::executor::Result; -using torch::executor::testing::TempFile; -using torch::executor::util::FileDataLoader; +using executorch::extension::FileDataLoader; +using executorch::extension::testing::TempFile; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; class FileDataLoaderTest : public ::testing::TestWithParam { protected: void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); } // The alignment in bytes that tests should use. The values are set by the diff --git a/extension/data_loader/test/mmap_data_loader_test.cpp b/extension/data_loader/test/mmap_data_loader_test.cpp index b6781bc4482..a76121109a8 100644 --- a/extension/data_loader/test/mmap_data_loader_test.cpp +++ b/extension/data_loader/test/mmap_data_loader_test.cpp @@ -19,19 +19,19 @@ #include using namespace ::testing; -using torch::executor::DataLoader; -using torch::executor::Error; -using torch::executor::FreeableBuffer; -using torch::executor::Result; -using torch::executor::testing::TempFile; -using torch::executor::util::MmapDataLoader; +using executorch::extension::MmapDataLoader; +using executorch::extension::testing::TempFile; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; class MmapDataLoaderTest : public ::testing::Test { protected: void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); // Get the page size and ensure it's a power of 2. 
long page_size = sysconf(_SC_PAGESIZE); diff --git a/extension/data_loader/test/shared_ptr_data_loader_test.cpp b/extension/data_loader/test/shared_ptr_data_loader_test.cpp index b4fc153cab3..62d71ae0560 100644 --- a/extension/data_loader/test/shared_ptr_data_loader_test.cpp +++ b/extension/data_loader/test/shared_ptr_data_loader_test.cpp @@ -17,18 +17,18 @@ #include using namespace ::testing; -using torch::executor::DataLoader; -using torch::executor::Error; -using torch::executor::FreeableBuffer; -using torch::executor::Result; -using torch::executor::util::SharedPtrDataLoader; +using executorch::extension::SharedPtrDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; class SharedPtrDataLoaderTest : public ::testing::Test { protected: void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); } }; diff --git a/extension/kernel_util/make_boxed_from_unboxed_functor.h b/extension/kernel_util/make_boxed_from_unboxed_functor.h index 0202b4f51cb..2b21914f49b 100644 --- a/extension/kernel_util/make_boxed_from_unboxed_functor.h +++ b/extension/kernel_util/make_boxed_from_unboxed_functor.h @@ -54,10 +54,13 @@ class KernelRuntimeContext; // Forward declaration } // namespace runtime } // namespace executorch -namespace torch { -namespace executor { +namespace executorch { +namespace extension { + +// This extension has a lot of generic internal names like "size"; use a unique +// internal namespace to avoid conflicts with other extensions. +namespace kernel_util_internal { -// evalue_to_arg template struct decay_if_not_tensor final { using type = std::decay_t; @@ -73,45 +76,44 @@ struct decay_if_not_tensor final { template struct evalue_to_arg final { - static T call(EValue& v) { + static T call(executorch::runtime::EValue& v) { return std::move(v).to(); } }; template <> struct evalue_to_arg final { - static exec_aten::Tensor& call(EValue& v) { + static exec_aten::Tensor& call(executorch::runtime::EValue& v) { return v.toTensor(); } }; template <> struct evalue_to_arg final { - static const exec_aten::Tensor& call(EValue& v) { + static const exec_aten::Tensor& call(executorch::runtime::EValue& v) { return v.toTensor(); } }; template struct evalue_to_arg> final { - static exec_aten::optional call(EValue& v) { + static exec_aten::optional call(executorch::runtime::EValue& v) { return v.toOptional(); } }; template struct evalue_to_arg>> final { - static exec_aten::ArrayRef> call(EValue& v) { + static exec_aten::ArrayRef> call( + executorch::runtime::EValue& v) { return v.toListOptionalTensor(); } }; -// Call functor with args from stack - template -void call_functor_with_args_from_stack_( +void call_functor_with_args_from_stack( ::executorch::runtime::KernelRuntimeContext& ctx, - EValue** stack, + executorch::runtime::EValue** stack, std::index_sequence, typelist*) { (*Functor::func_ptr())( @@ -120,6 +122,8 @@ void call_functor_with_args_from_stack_( *stack[evalue_arg_indices])...); } +} // namespace kernel_util_internal + /** * WrapUnboxedIntoFunctor: Given a function pointer, wrap it into a functor that * takes EValues as input and returns void. 
The wrapped functor will unbox all @@ -128,25 +132,29 @@ void call_functor_with_args_from_stack_( template struct WrapUnboxedIntoFunctor { static_assert( - is_compile_time_function_pointer::value, + kernel_util_internal::is_compile_time_function_pointer::value, "Can't handle function other than EXECUTORCH_FN"); using TrueType = typename FuncType::FuncType; - using ReturnType = typename infer_function_traits_t::return_type; - using ArgsType = typename infer_function_traits_t::parameter_types; + using ReturnType = typename kernel_util_internal::infer_function_traits_t< + TrueType>::return_type; + using ArgsType = typename kernel_util_internal::infer_function_traits_t< + TrueType>::parameter_types; // check if the first argument is KernelRuntimeContext, if so, remove it static constexpr bool first_arg_is_context = std::is_same< ::executorch::runtime::KernelRuntimeContext, - std::remove_reference_t>>::value; + std::remove_reference_t< + kernel_util_internal::head_with_default_t>>::value; using ContextRemovedArgsType = std::conditional_t< first_arg_is_context, - drop_if_nonempty_t, + kernel_util_internal::drop_if_nonempty_t, ArgsType>; static void call( ::executorch::runtime::KernelRuntimeContext& ctx, - EValue** stack) { - constexpr size_t num_inputs = size::value; - return call_functor_with_args_from_stack_( + executorch::runtime::EValue** stack) { + constexpr size_t num_inputs = + kernel_util_internal::size::value; + return kernel_util_internal::call_functor_with_args_from_stack( ctx, stack, std::make_index_sequence(), @@ -155,14 +163,26 @@ struct WrapUnboxedIntoFunctor { }; template -static Kernel make_boxed_kernel(const char* name, FuncType) { - return Kernel(name, WrapUnboxedIntoFunctor::call); +static executorch::runtime::Kernel make_boxed_kernel( + const char* name, + FuncType) { + return executorch::runtime::Kernel( + name, WrapUnboxedIntoFunctor::call); } -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch -#define EXECUTORCH_LIBRARY(ns, op_name, func) \ - static auto res_##ns = ::torch::executor::register_kernels( \ - ::torch::executor::make_boxed_kernel( \ +#define EXECUTORCH_LIBRARY(ns, op_name, func) \ + static auto res_##ns = ::executorch::runtime::register_kernels( \ + ::executorch::extension::make_boxed_kernel( \ #ns "::" op_name, EXECUTORCH_FN(func))) + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::make_boxed_kernel; +using ::executorch::extension::WrapUnboxedIntoFunctor; +} // namespace executor +} // namespace torch diff --git a/extension/kernel_util/meta_programming.h b/extension/kernel_util/meta_programming.h index c412e907ea0..027568fe687 100644 --- a/extension/kernel_util/meta_programming.h +++ b/extension/kernel_util/meta_programming.h @@ -17,8 +17,11 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +// This extension has a lot of generic internal names like "size"; use a unique +// internal namespace to avoid conflicts with other extensions. 
+namespace kernel_util_internal { // Check if a given type is a function template @@ -48,9 +51,9 @@ template struct is_compile_time_function_pointer< CompileTimeFunctionPointer> : std::true_type {}; -#define EXECUTORCH_FN_TYPE(func) \ - ::torch::executor::CompileTimeFunctionPointer< \ - std::remove_pointer_t>, \ +#define EXECUTORCH_FN_TYPE(func) \ + ::executorch::extension::kernel_util_internal::CompileTimeFunctionPointer< \ + std::remove_pointer_t>, \ func> #define EXECUTORCH_FN(func) EXECUTORCH_FN_TYPE(func)() @@ -111,5 +114,6 @@ struct infer_function_traits { template using infer_function_traits_t = typename infer_function_traits::type; -} // namespace executor -} // namespace torch +} // namespace kernel_util_internal +} // namespace extension +} // namespace executorch diff --git a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp index b75a8f160a6..da9596def70 100644 --- a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp +++ b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp @@ -15,22 +15,31 @@ #include using namespace ::testing; -using RuntimeContext = torch::executor::KernelRuntimeContext; -using namespace torch::executor; - -Tensor& my_op_out(RuntimeContext& ctx, const Tensor& a, Tensor& out) { +using exec_aten::ArrayRef; +using exec_aten::optional; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using exec_aten::TensorImpl; +using executorch::runtime::BoxedEvalueList; +using executorch::runtime::EValue; +using executorch::runtime::getOpsFn; +using executorch::runtime::hasOpsFn; +using executorch::runtime::KernelRuntimeContext; + +Tensor& my_op_out(KernelRuntimeContext& ctx, const Tensor& a, Tensor& out) { (void)ctx; (void)a; return out; } -Tensor& set_1_out(RuntimeContext& ctx, Tensor& out) { +Tensor& set_1_out(KernelRuntimeContext& ctx, Tensor& out) { (void)ctx; out.mutable_data_ptr()[0] = 1; return out; } -Tensor& add_tensor_out(RuntimeContext& ctx, ArrayRef a, Tensor& out) { +Tensor& +add_tensor_out(KernelRuntimeContext& ctx, ArrayRef a, Tensor& out) { (void)ctx; for (int i = 0; i < out.numel(); i++) { int sum = 0; @@ -43,7 +52,7 @@ Tensor& add_tensor_out(RuntimeContext& ctx, ArrayRef a, Tensor& out) { } Tensor& add_optional_scalar_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, optional s1, optional s2, Tensor& out) { @@ -58,7 +67,7 @@ Tensor& add_optional_scalar_out( } Tensor& add_optional_tensor_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, ArrayRef> a, Tensor& out) { (void)ctx; @@ -100,7 +109,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { auto fn = getOpsFn("my_ns::set_1.out"); // run it - RuntimeContext context; + KernelRuntimeContext context; EValue values[1]; values[0] = a; EValue* stack[1]; @@ -129,7 +138,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { auto fn = getOpsFn("my_ns::add_tensor.out"); // run it. - RuntimeContext context; + KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; fn(context, stack); @@ -154,7 +163,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptional) { auto fn = getOpsFn("my_ns::add_optional_scalar.out"); // run it. 
- RuntimeContext context; + KernelRuntimeContext context; EValue values[3] = {scalar, scalar_none, out}; EValue* stack[3] = {&values[0], &values[1], &values[2]}; fn(context, stack); @@ -180,7 +189,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptionalArrayRef) { auto fn = getOpsFn("my_ns::add_optional_tensor.out"); // run it. - RuntimeContext context; + KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; fn(context, stack); diff --git a/extension/kernel_util/type_list.h b/extension/kernel_util/type_list.h index f832ab9f267..300cbfcb7cb 100644 --- a/extension/kernel_util/type_list.h +++ b/extension/kernel_util/type_list.h @@ -20,8 +20,12 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +// This extension has a lot of generic internal names like "size"; use a unique +// internal namespace to avoid conflicts with other extensions. +namespace kernel_util_internal { + /** * Type holding a list of types for compile time type computations * constexpr size_t num = size>::value; @@ -139,5 +143,6 @@ struct drop_if_nonempty final { template using drop_if_nonempty_t = typename drop_if_nonempty::type; -} // namespace executor -} // namespace torch +} // namespace kernel_util_internal +} // namespace extension +} // namespace executorch diff --git a/extension/memory_allocator/malloc_memory_allocator.h b/extension/memory_allocator/malloc_memory_allocator.h index 6625f587aad..7e1cf8b2abc 100644 --- a/extension/memory_allocator/malloc_memory_allocator.h +++ b/extension/memory_allocator/malloc_memory_allocator.h @@ -14,9 +14,8 @@ #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { /** * Dynamically allocates memory using malloc() and frees all pointers at @@ -25,7 +24,7 @@ namespace util { * For systems with malloc(), this can be easier than using a fixed-sized * MemoryAllocator. */ -class MallocMemoryAllocator : public MemoryAllocator { +class MallocMemoryAllocator : public executorch::runtime::MemoryAllocator { public: /** * Construct a new Malloc memory allocator via an optional alignment size @@ -76,6 +75,16 @@ class MallocMemoryAllocator : public MemoryAllocator { private: std::vector mem_ptrs_; }; + +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::MallocMemoryAllocator; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/memory_allocator/test/malloc_memory_allocator_test.cpp b/extension/memory_allocator/test/malloc_memory_allocator_test.cpp index 05dfafa2061..fc2db04bc84 100644 --- a/extension/memory_allocator/test/malloc_memory_allocator_test.cpp +++ b/extension/memory_allocator/test/malloc_memory_allocator_test.cpp @@ -12,17 +12,16 @@ #include using namespace ::testing; -using torch::executor::util::MallocMemoryAllocator; +using executorch::extension::MallocMemoryAllocator; -constexpr auto kDefaultAlignment = - torch::executor::util::MallocMemoryAllocator::kDefaultAlignment; +constexpr auto kDefaultAlignment = MallocMemoryAllocator::kDefaultAlignment; class MallocMemoryAllocatorTest : public ::testing::Test { protected: void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. 
- torch::executor::runtime_init(); + executorch::runtime::runtime_init(); } }; diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 6cee0185c74..e59d4b45dbc 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -33,7 +33,24 @@ std::move(*et_result__)); \ }) -namespace torch::executor { +using ::exec_aten::Tensor; +using ::executorch::extension::FileDataLoader; +using ::executorch::extension::MallocMemoryAllocator; +using ::executorch::extension::MmapDataLoader; +using ::executorch::runtime::DataLoader; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracer; +using ::executorch::runtime::HierarchicalAllocator; +using ::executorch::runtime::MemoryAllocator; +using ::executorch::runtime::MemoryManager; +using ::executorch::runtime::MethodMeta; +using ::executorch::runtime::Program; +using ::executorch::runtime::Result; +using ::executorch::runtime::Span; + +namespace executorch { +namespace extension { Module::Module( const std::string& file_path, @@ -41,10 +58,10 @@ Module::Module( std::unique_ptr event_tracer) : file_path_(file_path), load_mode_(load_mode), - memory_allocator_(std::make_unique()), - temp_allocator_(std::make_unique()), + memory_allocator_(std::make_unique()), + temp_allocator_(std::make_unique()), event_tracer_(std::move(event_tracer)) { - runtime_init(); + ::executorch::runtime::runtime_init(); } Module::Module( @@ -55,12 +72,12 @@ Module::Module( : data_loader_(std::move(data_loader)), memory_allocator_( memory_allocator ? std::move(memory_allocator) - : std::make_unique()), + : std::make_unique()), temp_allocator_( temp_allocator ? std::move(temp_allocator) - : std::make_unique()), + : std::make_unique()), event_tracer_(std::move(event_tracer)) { - runtime_init(); + ::executorch::runtime::runtime_init(); } Module::Module( @@ -71,12 +88,12 @@ Module::Module( : program_(std::move(program)), memory_allocator_( memory_allocator ? std::move(memory_allocator) - : std::make_unique()), + : std::make_unique()), temp_allocator_( temp_allocator ? 
std::move(temp_allocator) - : std::make_unique()), + : std::make_unique()), event_tracer_(std::move(event_tracer)) { - runtime_init(); + ::executorch::runtime::runtime_init(); } Error Module::load(const Program::Verification verification) { @@ -85,20 +102,20 @@ Error Module::load(const Program::Verification verification) { switch (load_mode_) { case LoadMode::File: data_loader_ = - ET_UNWRAP_UNIQUE(util::FileDataLoader::from(file_path_.c_str())); + ET_UNWRAP_UNIQUE(FileDataLoader::from(file_path_.c_str())); break; case LoadMode::Mmap: - data_loader_ = ET_UNWRAP_UNIQUE(util::MmapDataLoader::from( - file_path_.c_str(), util::MmapDataLoader::MlockConfig::NoMlock)); + data_loader_ = ET_UNWRAP_UNIQUE(MmapDataLoader::from( + file_path_.c_str(), MmapDataLoader::MlockConfig::NoMlock)); break; case LoadMode::MmapUseMlock: data_loader_ = - ET_UNWRAP_UNIQUE(util::MmapDataLoader::from(file_path_.c_str())); + ET_UNWRAP_UNIQUE(MmapDataLoader::from(file_path_.c_str())); break; case LoadMode::MmapUseMlockIgnoreErrors: - data_loader_ = ET_UNWRAP_UNIQUE(util::MmapDataLoader::from( + data_loader_ = ET_UNWRAP_UNIQUE(MmapDataLoader::from( file_path_.c_str(), - util::MmapDataLoader::MlockConfig::UseMlockIgnoreErrors)); + MmapDataLoader::MlockConfig::UseMlockIgnoreErrors)); break; } }; @@ -199,4 +216,5 @@ Error Module::set_output_data_ptr(Tensor& output_tensor, size_t output_index) { output_tensor.mutable_data_ptr(), output_tensor.nbytes(), output_index); } -} // namespace torch::executor +} // namespace extension +} // namespace executorch diff --git a/extension/module/module.h b/extension/module/module.h index da09141659c..a0b575d5bf6 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -16,7 +16,8 @@ #include -namespace torch::executor { +namespace executorch { +namespace extension { /** * A facade class for loading programs and executing methods within them. @@ -47,7 +48,8 @@ class Module final { explicit Module( const std::string& file_path, const LoadMode load_mode = LoadMode::MmapUseMlock, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = + nullptr); /** * Constructs an instance with the provided data loader and memory allocator. @@ -59,10 +61,13 @@ class Module final { * @param[in] event_tracer A EventTracer used for tracking and logging events. */ explicit Module( - std::unique_ptr data_loader, - std::unique_ptr memory_allocator = nullptr, - std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr<::executorch::runtime::DataLoader> data_loader, + std::unique_ptr<::executorch::runtime::MemoryAllocator> memory_allocator = + nullptr, + std::unique_ptr<::executorch::runtime::MemoryAllocator> temp_allocator = + nullptr, + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = + nullptr); /** * Constructs an instance using an existing shared program. @@ -75,10 +80,13 @@ class Module final { * @param[in] event_tracer A EventTracer used for tracking and logging events. 
*/ explicit Module( - std::shared_ptr program, - std::unique_ptr memory_allocator = nullptr, - std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::shared_ptr<::executorch::runtime::Program> program, + std::unique_ptr<::executorch::runtime::MemoryAllocator> memory_allocator = + nullptr, + std::unique_ptr<::executorch::runtime::MemoryAllocator> temp_allocator = + nullptr, + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = + nullptr); Module(const Module&) = delete; Module& operator=(const Module&) = delete; @@ -94,9 +102,9 @@ class Module final { * @returns An Error to indicate success or failure of the loading process. */ ET_NODISCARD - Error load( - const Program::Verification verification = - Program::Verification::Minimal); + ::executorch::runtime::Error load( + const ::executorch::runtime::Program::Verification verification = + ::executorch::runtime::Program::Verification::Minimal); /** * Checks if the program is loaded. @@ -111,7 +119,7 @@ class Module final { * * @returns Shared pointer to the program or nullptr if it's not yet loaded. */ - std::shared_ptr program() const { + std::shared_ptr<::executorch::runtime::Program> program() const { return program_; } @@ -122,7 +130,7 @@ class Module final { * @returns A set of strings containing the names of the methods, or an error * if the program or method failed to load. */ - Result> method_names(); + ::executorch::runtime::Result> method_names(); /** * Load a specific method from the program and set up memory management if @@ -133,7 +141,7 @@ class Module final { * @returns An Error to indicate success or failure. */ ET_NODISCARD - Error load_method(const std::string& method_name); + ::executorch::runtime::Error load_method(const std::string& method_name); /** * Checks if a specific method is loaded. @@ -154,7 +162,8 @@ class Module final { * @returns A method metadata, or an error if the program or method failed to * load. */ - Result method_meta(const std::string& method_name); + ::executorch::runtime::Result<::executorch::runtime::MethodMeta> method_meta( + const std::string& method_name); /** * Execute a specific method with the given input and retrieve output. @@ -167,9 +176,10 @@ class Module final { * from the method or an error to indicate failure. */ ET_NODISCARD - Result> execute( + ::executorch::runtime::Result> + execute( const std::string& method_name, - const std::vector& input); + const std::vector<::executorch::runtime::EValue>& input); /** * Execute a specific method without any input values. @@ -181,7 +191,8 @@ class Module final { * from the method or an error to indicate failure. */ ET_NODISCARD - Result> execute(const std::string& method_name) { + ::executorch::runtime::Result> + execute(const std::string& method_name) { return execute(method_name, {}); } @@ -196,12 +207,12 @@ class Module final { * method or an error to indicate failure. */ ET_NODISCARD - Result get( + ::executorch::runtime::Result<::executorch::runtime::EValue> get( const std::string& method_name, - const std::vector& input) { + const std::vector<::executorch::runtime::EValue>& input) { auto result = ET_UNWRAP(execute(method_name, input)); if (result.empty()) { - return Error::InvalidArgument; + return ::executorch::runtime::Error::InvalidArgument; } return result[0]; } @@ -216,7 +227,8 @@ class Module final { * method or an error to indicate failure. 
*/ ET_NODISCARD - Result get(const std::string& method_name) { + ::executorch::runtime::Result<::executorch::runtime::EValue> get( + const std::string& method_name) { return get(method_name, {}); } @@ -230,7 +242,8 @@ class Module final { * from the 'forward' method or an error to indicate failure. */ ET_NODISCARD - Result> forward(const std::vector& input) { + ::executorch::runtime::Result> + forward(const std::vector<::executorch::runtime::EValue>& input) { return execute("forward", input); } @@ -242,7 +255,8 @@ class Module final { * from the 'forward' method or an error to indicate failure. */ ET_NODISCARD - Result> forward() { + ::executorch::runtime::Result> + forward() { return forward({}); } @@ -254,7 +268,7 @@ class Module final { * @returns A pointer to the EventTracer instance. Returns nullptr if no * EventTracer is set. */ - EventTracer* event_tracer() const { + ::executorch::runtime::EventTracer* event_tracer() const { return event_tracer_.get(); } @@ -266,26 +280,38 @@ class Module final { * * @returns An Error to indicate success or failure of the loading process. */ - Error set_output_data_ptr(Tensor& output_tensor, size_t output_index); + ::executorch::runtime::Error set_output_data_ptr( + exec_aten::Tensor& output_tensor, + size_t output_index); private: struct MethodHolder { std::vector> planned_buffers; - std::vector> planned_spans; - std::unique_ptr planned_memory; - std::unique_ptr memory_manager; - std::unique_ptr method; + std::vector<::executorch::runtime::Span> planned_spans; + std::unique_ptr<::executorch::runtime::HierarchicalAllocator> + planned_memory; + std::unique_ptr<::executorch::runtime::MemoryManager> memory_manager; + std::unique_ptr<::executorch::runtime::Method> method; }; private: std::string file_path_; LoadMode load_mode_{LoadMode::MmapUseMlock}; - std::shared_ptr program_; - std::unique_ptr data_loader_; - std::unique_ptr memory_allocator_; - std::unique_ptr temp_allocator_; - std::unique_ptr event_tracer_; + std::shared_ptr<::executorch::runtime::Program> program_; + std::unique_ptr<::executorch::runtime::DataLoader> data_loader_; + std::unique_ptr<::executorch::runtime::MemoryAllocator> memory_allocator_; + std::unique_ptr<::executorch::runtime::MemoryAllocator> temp_allocator_; + std::unique_ptr<::executorch::runtime::EventTracer> event_tracer_; std::unordered_map methods_; }; -} // namespace torch::executor +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::Module; +} // namespace executor +} // namespace torch diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/parallel/test/thread_parallel_test.cpp index 1eea87beb01..d386429100d 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/parallel/test/thread_parallel_test.cpp @@ -15,8 +15,7 @@ #include using namespace ::testing; - -namespace torch::executor { +using ::executorch::extension::parallel_for; class ParallelTest : public ::testing::Test { protected: @@ -192,5 +191,3 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { EXPECT_EQ(data_[i], i); } } - -} // namespace torch::executor diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp index cdd1d21a83c..aac47cca2e8 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/parallel/thread_parallel.cpp @@ -13,7 +13,8 @@ #include #include -namespace torch::executor { +namespace executorch { +namespace extension { namespace { thread_local int64_t thread_num_ = 0; @@ -74,4 +75,5 @@ bool parallel_for( return true; } -} // namespace torch::executor +} // namespace extension +} // namespace executorch diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 7b58236e8ab..bbce211597d 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -12,7 +12,8 @@ // @nolint PATTERNLINT Ok to use stdlib for this optional library #include -namespace torch::executor { +namespace executorch { +namespace extension { /** * A helper to run function in parallel. @@ -39,4 +40,15 @@ int64_t get_thread_num(); void set_thread_num(int64_t thread_num); -} // namespace torch::executor +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::get_thread_num; +using ::executorch::extension::parallel_for; +using ::executorch::extension::set_thread_num; +} // namespace executor +} // namespace torch diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 9a49e3878ea..83cec280b89 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -72,8 +72,39 @@ void et_pal_emit_log_message( } namespace py = pybind11; -namespace torch { -namespace executor { +using ::executorch::extension::BufferDataLoader; +using ::executorch::extension::MallocMemoryAllocator; +using ::executorch::extension::MmapDataLoader; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::DataLoader; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerDebugLogLevel; +using ::executorch::runtime::get_kernels; +using ::executorch::runtime::HierarchicalAllocator; +using ::executorch::runtime::Kernel; +using ::executorch::runtime::MemoryAllocator; +using ::executorch::runtime::MemoryManager; +using ::executorch::runtime::Method; +using ::executorch::runtime::prof_result_t; +using ::executorch::runtime::Program; +using ::executorch::runtime::Result; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using torch::executor::etdump_result; +using torch::executor::ETDumpGen; +using torch::executor::bundled_program::LoadBundledInput; +using torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput; + +#ifndef USE_ATEN_LIB +using ::executorch::extension::alias_attensor_to_etensor; +using ::executorch::extension::alias_etensor_to_attensor; +using ::executorch::extension::torch_to_executorch_scalar_type; +#endif // !USE_ATEN_LIB + +namespace executorch { +namespace extension { +namespace pybindings { namespace { @@ -96,7 +127,7 @@ void write_data_to_file(const std::string& path, void* buf, size_t size) { } void setup_output_storage( - executor::Method& method, + Method& method, const std::vector>& output_storages) { if (output_storages.size() != method.outputs_size()) { THROW_IF_ERROR( @@ -123,10 +154,6 @@ void setup_output_storage( } } -using util::BufferDataLoader; -using util::MallocMemoryAllocator; -using util::MmapDataLoader; - class Module final { public: explicit Module( @@ -136,7 +163,7 @@ class Module final { : loader_(std::move(loader)), event_tracer_(std::move(tracer)), debug_buffer_size_(debug_buffer_size) { - runtime_init(); + ::executorch::runtime::runtime_init(); Result program = Program::load( loader_.get(), Program::Verification::InternalConsistency); THROW_IF_ERROR( @@ -346,12 +373,12 @@ class Module final { size_t debug_buffer_size_; }; -inline std::unique_ptr load_from_buffer( +inline std::unique_ptr load_module_from_buffer( const void* ptr, size_t ptr_len, bool enable_etdump, size_t debug_buffer_size) { - EXECUTORCH_SCOPE_PROF("load_from_buffer"); + EXECUTORCH_SCOPE_PROF("load_module_from_buffer"); auto loader = std::make_unique(ptr, ptr_len); return std::make_unique( std::move(loader), @@ -359,11 +386,11 @@ inline std::unique_ptr load_from_buffer( debug_buffer_size); } -inline std::unique_ptr load_from_file( +inline std::unique_ptr load_module_from_file( const std::string& path, bool enable_etdump, size_t debug_buffer_size) { - EXECUTORCH_SCOPE_PROF("load_from_file"); + EXECUTORCH_SCOPE_PROF("load_module_from_file"); Result res = MmapDataLoader::from( path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors); @@ -428,7 +455,7 @@ struct 
PyModule final { const py::bytes& buffer, bool enable_etdump, size_t debug_buffer_size = 0) - : module_(torch::executor::load_from_buffer( + : module_(load_module_from_buffer( buffer.cast().data(), py::len(buffer), enable_etdump, @@ -439,7 +466,7 @@ struct PyModule final { size_t ptr_len, bool enable_etdump, size_t debug_buffer_size = 0) - : module_(torch::executor::load_from_buffer( + : module_(load_module_from_buffer( ptr, ptr_len, enable_etdump, @@ -449,10 +476,8 @@ struct PyModule final { const std::string& path, bool enable_etdump, size_t debug_buffer_size = 0) - : module_(torch::executor::load_from_file( - path, - enable_etdump, - debug_buffer_size)) {} + : module_(load_module_from_file(path, enable_etdump, debug_buffer_size)) { + } PyModule(const PyModule&) = delete; PyModule& operator=(const PyModule&) = delete; @@ -525,8 +550,8 @@ struct PyModule final { EValue evalue(at_tensor); #else // convert at::Tensor to torch::executor::Tensor - auto type = torch::util::torchToExecuTorchScalarType( - at_tensor.options().dtype()); + auto type = + torch_to_executorch_scalar_type(at_tensor.options().dtype()); size_t dim = at_tensor.dim(); // cant directly alias at::Tensor sizes and strides due to int64 vs // int32 typing conflict @@ -551,7 +576,7 @@ struct PyModule final { torch::executor::Tensor temp = torch::executor::Tensor(&input_tensors.back()); - torch::util::alias_etensor_to_attensor(at_tensor, temp); + alias_etensor_to_attensor(at_tensor, temp); EValue evalue(temp); #endif @@ -628,10 +653,10 @@ struct PyModule final { void load_bundled_input( PyBundledModule& m, - const string method_name, + const std::string method_name, size_t testset_idx) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = bundled_program::LoadBundledInput( + Error status = LoadBundledInput( module_->get_method(method_name), bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, @@ -641,20 +666,19 @@ struct PyModule final { py::list verify_result_with_bundled_expected_output( PyBundledModule& m, - const string method_name, + const std::string method_name, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); auto& method = module_->get_method(method_name); - Error status = bundled_program::LoadBundledInput( - method, bundled_program_ptr, testset_idx); + Error status = LoadBundledInput(method, bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, "LoadBundledInput failed with status %" PRIu32, static_cast(status)); py::list outputs = plan_execute(method_name); - status = bundled_program::VerifyResultWithBundledExpectedOutput( + status = VerifyResultWithBundledExpectedOutput( method, bundled_program_ptr, testset_idx, rtol, atol); THROW_IF_ERROR( status, @@ -663,7 +687,7 @@ struct PyModule final { return outputs; } - py::list plan_execute(const string method_name) { + py::list plan_execute(const std::string method_name) { auto& method = module_->get_method(method_name); // Need to pre-allocate space for outputs just like in run_method. const auto num_outputs = method.outputs_size(); @@ -704,8 +728,7 @@ struct PyModule final { // module object list[i] = py::cast(v.toTensor().clone()); #else - list[i] = py::cast( - torch::util::alias_attensor_to_etensor(v.toTensor()).clone()); + list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone()); #endif } else { ET_ASSERT_UNREACHABLE_MSG("Invalid model output type"); @@ -720,8 +743,7 @@ struct PyModule final { // bundled programs. 
std::vector> output_storages_; - std::vector> make_output_storages( - const executor::Method& method) { + std::vector> make_output_storages(const Method& method) { const auto num_outputs = method.outputs_size(); // These output storages will not be used if the ExecuTorch program already // pre-allocated output space. That is represented by an error from @@ -845,5 +867,6 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { py::class_(m, "BundledModule"); } -} // namespace executor -} // namespace torch +} // namespace pybindings +} // namespace extension +} // namespace executorch diff --git a/extension/pytree/aten_util/ivalue_util.cpp b/extension/pytree/aten_util/ivalue_util.cpp index 6935d45e928..c4d11c13ee0 100644 --- a/extension/pytree/aten_util/ivalue_util.cpp +++ b/extension/pytree/aten_util/ivalue_util.cpp @@ -10,13 +10,12 @@ #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { using namespace c10; using namespace at; -using namespace torch::executor::pytree; +using namespace executorch::extension::pytree; ContainerHandle getContainerHandle(const IValue& data) { if (data.isList()) { @@ -214,6 +213,5 @@ bool is_same(const IValue& lhs, const IValue& rhs) { return at::all(l == r).item(); } -} // namespace util -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/pytree/aten_util/ivalue_util.h b/extension/pytree/aten_util/ivalue_util.h index 7798a3a4b35..b797f13e8fd 100644 --- a/extension/pytree/aten_util/ivalue_util.h +++ b/extension/pytree/aten_util/ivalue_util.h @@ -19,20 +19,19 @@ #include -namespace torch { -namespace executor { -namespace util { - -using Empty = torch::executor::pytree::Empty; +namespace executorch { +namespace extension { std::pair< std::vector, - std::unique_ptr>> + std::unique_ptr<::executorch::extension::pytree::TreeSpec< + ::executorch::extension::pytree::Empty>>> flatten(const c10::IValue& data); c10::IValue unflatten( const std::vector& tensors, - const std::unique_ptr>& tree_spec); + const std::unique_ptr<::executorch::extension::pytree::TreeSpec< + ::executorch::extension::pytree::Empty>>& tree_spec); bool is_same( const std::vector& a, @@ -40,6 +39,17 @@ bool is_same( bool is_same(const c10::IValue& lhs, const c10::IValue& rhs); +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::flatten; +using ::executorch::extension::is_same; +using ::executorch::extension::unflatten; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/pytree/aten_util/test/ivalue_util_test.cpp b/extension/pytree/aten_util/test/ivalue_util_test.cpp index 92954f2af85..8b35f173e31 100644 --- a/extension/pytree/aten_util/test/ivalue_util_test.cpp +++ b/extension/pytree/aten_util/test/ivalue_util_test.cpp @@ -9,9 +9,9 @@ #include #include -using namespace c10; -using namespace torch::executor::pytree; -using namespace torch::executor::util; +using executorch::extension::flatten; +using executorch::extension::is_same; +using executorch::extension::unflatten; std::vector makeExampleTensors(size_t N) { std::vector tensors; @@ -22,7 +22,7 @@ std::vector makeExampleTensors(size_t N) { } struct TestCase { - IValue ivalue; + c10::IValue ivalue; std::vector tensors; }; @@ -54,7 +54,7 @@ TestCase makeExampleDictOfTensors() { TestCase makeExampleComposite() { auto tensors = makeExampleTensors(8); - IValue list = c10::List{ + c10::IValue list = c10::List{ tensors[1], tensors[2], }; @@ -100,7 +100,7 @@ void testUnflatten(const TestCase& testcase) { auto ret = flatten(testcase.ivalue); // then we unflatten it - IValue unflattened = unflatten(ret.first, ret.second); + c10::IValue unflattened = unflatten(ret.first, ret.second); // and see if we got the same IValue back ASSERT_TRUE(is_same(unflattened, testcase.ivalue)); diff --git a/extension/pytree/function_ref.h b/extension/pytree/function_ref.h index 01d2988597a..0458610c4db 100644 --- a/extension/pytree/function_ref.h +++ b/extension/pytree/function_ref.h @@ -38,14 +38,16 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { namespace pytree { //===----------------------------------------------------------------------===// // Features from C++20 //===----------------------------------------------------------------------===// +namespace internal { + template struct remove_cvref { using type = @@ -55,6 +57,8 @@ struct remove_cvref { template using remove_cvref_t = typename remove_cvref::type; +} // namespace internal + template class FunctionRef; @@ -79,7 +83,7 @@ class FunctionRef { typename Callable, // This is not the copy-constructor. typename std::enable_if< - !std::is_same, FunctionRef>::value, + !std::is_same, FunctionRef>::value, int32_t>::type = 0, // Avoid lvalue reference to non-capturing lambda. typename std::enable_if< @@ -153,6 +157,16 @@ class FunctionRef { } }; +} // namespace pytree +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace pytree { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::pytree::FunctionRef; } // namespace pytree } // namespace executor } // namespace torch diff --git a/extension/pytree/pybindings.cpp b/extension/pytree/pybindings.cpp index 9bcf6043b77..931943e489e 100644 --- a/extension/pytree/pybindings.cpp +++ b/extension/pytree/pybindings.cpp @@ -15,8 +15,8 @@ namespace py = pybind11; -namespace torch { -namespace executor { +namespace executorch { +namespace extension { namespace pytree { namespace { @@ -395,5 +395,5 @@ PYBIND11_MODULE(pybindings, m) { } } // namespace pytree -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/pytree/pytree.h b/extension/pytree/pytree.h index 127310254d7..78e2305fe3e 100644 --- a/extension/pytree/pytree.h +++ b/extension/pytree/pytree.h @@ -19,8 +19,8 @@ // NB: This is a local, pytree FunctionRef and not from the ExecuTorch runtime. #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { namespace pytree { inline void pytree_assert(bool must_be_true) { @@ -738,6 +738,18 @@ std::pair, std::unique_ptr>> flatten( std::make_unique>(clone(tree, spec_leaves.get()))}; } +} // namespace pytree +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace pytree { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::pytree::Empty; +using ::executorch::extension::pytree::from_str; +using ::executorch::extension::pytree::TreeSpec; } // namespace pytree } // namespace executor } // namespace torch diff --git a/extension/pytree/test/TARGETS b/extension/pytree/test/TARGETS index c281994cea4..190bdb0bc67 100644 --- a/extension/pytree/test/TARGETS +++ b/extension/pytree/test/TARGETS @@ -5,7 +5,7 @@ load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") oncall("executorch") cpp_unittest( - name = "cpptest", + name = "pytree_test", srcs = ["test_pytree.cpp"], deps = ["//executorch/extension/pytree:pytree"], ) @@ -17,7 +17,7 @@ cpp_unittest( ) python_unittest( - name = "test", + name = "pybindings_test", srcs = [ "test.py", ], diff --git a/extension/pytree/test/function_ref_test.cpp b/extension/pytree/test/function_ref_test.cpp index f847c8ebd78..a3cdbd824bf 100644 --- a/extension/pytree/test/function_ref_test.cpp +++ b/extension/pytree/test/function_ref_test.cpp @@ -6,15 +6,13 @@ * LICENSE file in the root directory of this source tree. */ -#include - #include +#include + using namespace ::testing; -namespace torch { -namespace executor { -namespace pytree { +using ::executorch::extension::pytree::FunctionRef; namespace { class Item { @@ -84,7 +82,3 @@ TEST(FunctionRefTest, FunctionPointer) { Item item1(0, &one); EXPECT_EQ(item1.get(), 1); } - -} // namespace pytree -} // namespace executor -} // namespace torch diff --git a/extension/pytree/test/test_pytree.cpp b/extension/pytree/test/test_pytree.cpp index 5f8ab72acf1..0101bca3f55 100644 --- a/extension/pytree/test/test_pytree.cpp +++ b/extension/pytree/test/test_pytree.cpp @@ -6,19 +6,15 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include -#include - #include -int main(int argc, char* argv[]) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +#include +#include -namespace torch { -namespace executor { -namespace pytree { +using ::executorch::extension::pytree::ContainerHandle; +using ::executorch::extension::pytree::Key; +using ::executorch::extension::pytree::Kind; +using ::executorch::extension::pytree::unflatten; using Leaf = int32_t; @@ -187,7 +183,3 @@ TEST(pytree, FlattenNestedDict) { ASSERT_EQ(*leaves[i], items[i]); } } - -} // namespace pytree -} // namespace executor -} // namespace torch diff --git a/extension/runner_util/inputs.cpp b/extension/runner_util/inputs.cpp index 103d3461f0a..f4c77cae194 100644 --- a/extension/runner_util/inputs.cpp +++ b/extension/runner_util/inputs.cpp @@ -12,9 +12,15 @@ #include #include -namespace torch { -namespace executor { -namespace util { +using executorch::runtime::Error; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::Tag; +using executorch::runtime::TensorInfo; + +namespace executorch { +namespace extension { Result prepare_input_tensors(Method& method) { MethodMeta method_meta = method.method_meta(); @@ -53,6 +59,5 @@ Result prepare_input_tensors(Method& method) { return BufferCleanup({inputs, num_allocated}); } -} // namespace util -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/runner_util/inputs.h b/extension/runner_util/inputs.h index 316ae53cb52..b933bca8073 100644 --- a/extension/runner_util/inputs.h +++ b/extension/runner_util/inputs.h @@ -12,9 +12,8 @@ #include #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { /** * RAII helper that frees a set of buffers when destroyed. Movable. @@ -25,14 +24,15 @@ class BufferCleanup final { * Takes ownership of `buffers.data()` and the elements of `buffers`, which * each will be passed to `free()` when the object is destroyed. */ - explicit BufferCleanup(Span buffers) : buffers_(buffers) {} + explicit BufferCleanup(executorch::runtime::Span buffers) + : buffers_(buffers) {} /** * Move ctor. Takes ownership of the data previously owned by `rhs`, leaving * `rhs` with an empty list of buffers. */ BufferCleanup(BufferCleanup&& rhs) noexcept : buffers_(rhs.buffers_) { - rhs.buffers_ = Span(); + rhs.buffers_ = executorch::runtime::Span(); } ~BufferCleanup() { @@ -48,7 +48,7 @@ class BufferCleanup final { BufferCleanup& operator=(const BufferCleanup&) = delete; BufferCleanup& operator=(BufferCleanup&&) noexcept = delete; - Span buffers_; + executorch::runtime::Span buffers_; }; /** @@ -61,20 +61,31 @@ class BufferCleanup final { * remain alive when calling `method->execute()`. * @returns An error on failure. */ -Result prepare_input_tensors(Method& method); +executorch::runtime::Result prepare_input_tensors( + executorch::runtime::Method& method); namespace internal { /** * INTERNAL-ONLY: Creates a Tensor using the provided shape and buffer, * fills it with ones, and sets the input at `input_index`. 
*/ -Error fill_and_set_input( - Method& method, - TensorInfo& tensor_meta, +executorch::runtime::Error fill_and_set_input( + executorch::runtime::Method& method, + executorch::runtime::TensorInfo& tensor_meta, size_t input_index, void* data_ptr); } // namespace internal +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::BufferCleanup; +using ::executorch::extension::prepare_input_tensors; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/runner_util/inputs_aten.cpp b/extension/runner_util/inputs_aten.cpp index d9ccb63fd52..83d12dac42d 100644 --- a/extension/runner_util/inputs_aten.cpp +++ b/extension/runner_util/inputs_aten.cpp @@ -14,10 +14,12 @@ #include #include -namespace torch { -namespace executor { -namespace util { +using executorch::runtime::Error; +using executorch::runtime::Method; +using executorch::runtime::TensorInfo; +namespace executorch { +namespace extension { namespace internal { Error fill_and_set_input( @@ -38,7 +40,5 @@ Error fill_and_set_input( } } // namespace internal - -} // namespace util -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/runner_util/inputs_portable.cpp b/extension/runner_util/inputs_portable.cpp index f799a968cf0..f9db03bcd1d 100644 --- a/extension/runner_util/inputs_portable.cpp +++ b/extension/runner_util/inputs_portable.cpp @@ -16,9 +16,14 @@ #include #include -namespace torch { -namespace executor { -namespace util { +using exec_aten::Tensor; +using exec_aten::TensorImpl; +using executorch::runtime::Error; +using executorch::runtime::Method; +using executorch::runtime::TensorInfo; + +namespace executorch { +namespace extension { namespace internal { namespace { @@ -68,6 +73,5 @@ Error fill_and_set_input( } } // namespace internal -} // namespace util -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/runner_util/managed_tensor.h b/extension/runner_util/managed_tensor.h index 16a84a13df4..5c74f7550ae 100644 --- a/extension/runner_util/managed_tensor.h +++ b/extension/runner_util/managed_tensor.h @@ -6,28 +6,27 @@ * LICENSE file in the root directory of this source tree. */ +#pragma once + +#include +// @nolint PATTERNLINT Ok to use stdlib for this optional library +#include + #include #include #include #include -#include -// NOTE: required by torchchat install_et.sh script. -// @nolint PATTERNLINT Ok to use stdlib for this optional library -#include #ifdef USE_ATEN_LIB #include -#else -#include #endif -#pragma once -namespace torch { -namespace executor { +namespace executorch { +namespace extension { /** * A tensor wrapper takes ownership of all the memory of the necessary metadata - * for torch::executor::Tensor. Note that it doesn't own the data memory. + * for exec_aten::Tensor. Note that it doesn't own the data memory. */ class ManagedTensor { public: @@ -43,7 +42,7 @@ class ManagedTensor { explicit ManagedTensor( void* data, const std::vector& sizes, - ScalarType dtype) + exec_aten::ScalarType dtype) : sizes_(sizes) { #ifdef USE_ATEN_LIB tensor_ = torch::from_blob(data, sizes, dtype); @@ -58,43 +57,51 @@ class ManagedTensor { } // Allocate TensorImpl. 
- tensor_impl_ = std::make_unique( + tensor_impl_ = std::make_unique( dtype, sizes_.size(), sizes_.data(), data, /*dim_order=*/nullptr, strides_.data(), - TensorShapeDynamism::DYNAMIC_BOUND); + executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND); #endif } void resize(const std::vector& new_sizes) { - auto err = resize_tensor( + auto err = executorch::runtime::resize_tensor( this->get_aliasing_tensor(), exec_aten::ArrayRef(new_sizes.data(), new_sizes.size())); - ET_CHECK(err == Error::Ok); + ET_CHECK(err == executorch::runtime::Error::Ok); } /** * Get the underlying Tensor object. This is assuming the copying is cheap. */ - Tensor get_aliasing_tensor() { + exec_aten::Tensor get_aliasing_tensor() { #ifdef USE_ATEN_LIB return tensor_; #else - return Tensor(tensor_impl_.get()); + return exec_aten::Tensor(tensor_impl_.get()); #endif } private: - std::unique_ptr tensor_impl_; + std::unique_ptr tensor_impl_; std::vector sizes_; std::vector strides_; #ifdef USE_ATEN_LIB - Tensor tensor_; + exec_aten::Tensor tensor_; #endif }; +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::ManagedTensor; } // namespace executor } // namespace torch diff --git a/extension/runner_util/test/inputs_test.cpp b/extension/runner_util/test/inputs_test.cpp index 94e99bcc098..c916da488e5 100644 --- a/extension/runner_util/test/inputs_test.cpp +++ b/extension/runner_util/test/inputs_test.cpp @@ -20,20 +20,19 @@ using namespace ::testing; using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::Error; -using torch::executor::EValue; -using torch::executor::MemoryAllocator; -using torch::executor::MemoryManager; -using torch::executor::Method; -using torch::executor::Program; -using torch::executor::Result; -using torch::executor::Span; -using torch::executor::Tag; -using torch::executor::Tensor; -using torch::executor::testing::ManagedMemoryManager; -using torch::executor::util::BufferCleanup; -using torch::executor::util::FileDataLoader; -using torch::executor::util::prepare_input_tensors; +using executorch::extension::BufferCleanup; +using executorch::extension::FileDataLoader; +using executorch::extension::prepare_input_tensors; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; +using executorch::runtime::testing::ManagedMemoryManager; class InputsTest : public ::testing::Test { protected: diff --git a/extension/runner_util/test/managed_tensor_test.cpp b/extension/runner_util/test/managed_tensor_test.cpp index b511cdbcf17..8ac1285f2bd 100644 --- a/extension/runner_util/test/managed_tensor_test.cpp +++ b/extension/runner_util/test/managed_tensor_test.cpp @@ -17,13 +17,13 @@ using exec_aten::DimOrderType; using exec_aten::ScalarType; using exec_aten::SizesType; using exec_aten::StridesType; -using torch::executor::ArrayRef; -using torch::executor::ManagedTensor; +using executorch::extension::ManagedTensor; +using executorch::runtime::ArrayRef; class ManagedTensorTest : public ::testing::Test { protected: void SetUp() override { - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); data_ = {1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12}; sizes_ = {2, 3, 4}; diff --git a/extension/testing_util/temp_file.h b/extension/testing_util/temp_file.h index a710e130e7f..aa8f5bcc82e 100644 --- a/extension/testing_util/temp_file.h +++ b/extension/testing_util/temp_file.h @@ -18,9 +18,9 @@ #include -namespace torch { -namespace executor { -namespace testing { +namespace executorch { +namespace extension { +namespace testing { // Test-only helpers belong in a "testing" sub-namespace. /** * Creates and manages a named temporary file in the file system. Deletes the @@ -98,6 +98,16 @@ class TempFile { std::string path_; }; +} // namespace testing +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace testing { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::testing::TempFile; } // namespace testing } // namespace executor } // namespace torch diff --git a/extension/testing_util/test/temp_file_test.cpp b/extension/testing_util/test/temp_file_test.cpp index 072630897b2..2a666d24d51 100644 --- a/extension/testing_util/test/temp_file_test.cpp +++ b/extension/testing_util/test/temp_file_test.cpp @@ -18,7 +18,7 @@ #include using namespace ::testing; -using torch::executor::testing::TempFile; +using executorch::extension::testing::TempFile; TEST(TempFileTest, Smoke) { std::string path; diff --git a/extension/training/optimizer/sgd.cpp b/extension/training/optimizer/sgd.cpp index f2f63523b48..ad6130183e5 100644 --- a/extension/training/optimizer/sgd.cpp +++ b/extension/training/optimizer/sgd.cpp @@ -12,8 +12,14 @@ #include #include -namespace torch { -namespace executor { +using exec_aten::Tensor; +using exec_aten::TensorImpl; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::Span; + +namespace executorch { +namespace extension { namespace training { namespace optimizer { @@ -67,7 +73,7 @@ Error SGD::step(Span gradient_names, Span gradient_data) { InvalidState, "Gradient names and gradients must have the same length."); - RuntimeContext context; + KernelRuntimeContext context; for (auto& group : param_groups_) { auto& options = static_cast(group.options()); auto weight_decay = options.weight_decay(); @@ -170,7 +176,8 @@ SGD::~SGD() { #endif } } + } // namespace optimizer } // namespace training -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/training/optimizer/sgd.h b/extension/training/optimizer/sgd.h index 308e3471d99..fb797e4d5d6 100644 --- a/extension/training/optimizer/sgd.h +++ b/extension/training/optimizer/sgd.h @@ -23,15 +23,11 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { namespace training { namespace optimizer { -using Tensor = exec_aten::Tensor; -using TensorImpl = exec_aten::TensorImpl; -using ScalarType = exec_aten::ScalarType; - /** * SGD optimizer state. This keeps track of the state of a given parameter to * be used in later epochs. @@ -44,15 +40,15 @@ class SGDParamState { * @param[in] momentum_buffer A tensor that stores the momentum at the last * epoch. 
*/ - explicit SGDParamState(Tensor& momentum_buffer) + explicit SGDParamState(exec_aten::Tensor& momentum_buffer) : momentum_buffer_(momentum_buffer) {} - Tensor& momentum_buffer() { + exec_aten::Tensor& momentum_buffer() { return momentum_buffer_; } private: - Tensor momentum_buffer_; + exec_aten::Tensor momentum_buffer_; }; /** @@ -159,13 +155,13 @@ class SGDParamGroup { * @param[in] param_data The tensors representing the param data. */ /* implicit */ SGDParamGroup( - Span param_names, - Span param_data) + ::executorch::runtime::Span param_names, + ::executorch::runtime::Span param_data) : param_data_(std::move(param_data)), param_names_(std::move(param_names)) {} SGDParamGroup( - Span param_names, - Span param_data, + ::executorch::runtime::Span param_names, + ::executorch::runtime::Span param_data, std::unique_ptr options) : param_data_(std::move(param_data)), param_names_(std::move(param_names)), @@ -175,14 +171,14 @@ class SGDParamGroup { SGDOptions& options(); const SGDOptions& options() const; void set_options(std::unique_ptr options); - Span param_names(); - const Span param_names() const; - Span param_data(); - const Span param_data() const; + ::executorch::runtime::Span param_names(); + const ::executorch::runtime::Span param_names() const; + ::executorch::runtime::Span param_data(); + const ::executorch::runtime::Span param_data() const; private: - Span param_data_; - Span param_names_; + ::executorch::runtime::Span param_data_; + ::executorch::runtime::Span param_names_; std::unique_ptr options_; }; @@ -202,8 +198,8 @@ class SGD { } explicit SGD( - Span param_names, - Span param_data, + ::executorch::runtime::Span param_names, + ::executorch::runtime::Span param_data, SGDOptions defaults) : SGD({SGDParamGroup(std::move(param_names), std::move(param_data))}, defaults) {} @@ -225,7 +221,9 @@ class SGD { * @param[in] gradient_data The gradient tensors to be used for optimization * step. 
*/ - Error step(Span gradient_names, Span gradient_data); + ::executorch::runtime::Error step( + ::executorch::runtime::Span gradient_names, + ::executorch::runtime::Span gradient_data); private: std::vector param_groups_; @@ -235,5 +233,5 @@ class SGD { } // namespace optimizer } // namespace training -} // namespace executor -} // namespace torch +} // namespace extension +} // namespace executorch diff --git a/extension/training/optimizer/test/sgd_test.cpp b/extension/training/optimizer/test/sgd_test.cpp index 1dd1a3e55df..33a70b4fe95 100644 --- a/extension/training/optimizer/test/sgd_test.cpp +++ b/extension/training/optimizer/test/sgd_test.cpp @@ -17,12 +17,14 @@ // @lint-ignore-every CLANGTIDY facebook-hte-CArray using namespace ::testing; -using namespace torch::executor::training::optimizer; using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::Error; -using torch::executor::Span; -using torch::executor::testing::TensorFactory; +using ::executorch::extension::training::optimizer::SGD; +using ::executorch::extension::training::optimizer::SGDOptions; +using ::executorch::extension::training::optimizer::SGDParamState; +using ::executorch::runtime::Error; +using ::executorch::runtime::Span; +using ::executorch::runtime::testing::TensorFactory; class SGDOptimizerTest : public ::testing::Test { protected: diff --git a/runtime/executor/test/executor_test.cpp b/runtime/executor/test/executor_test.cpp index 74da48d7320..da0d53374f1 100644 --- a/runtime/executor/test/executor_test.cpp +++ b/runtime/executor/test/executor_test.cpp @@ -31,6 +31,8 @@ using executorch::runtime::KernelRuntimeContext; using executorch::runtime::register_kernels; using executorch::runtime::testing::TensorFactory; +namespace pytree = ::executorch::extension::pytree; + class ExecutorTest : public ::testing::Test { protected: void SetUp() override { @@ -210,7 +212,7 @@ TEST(PyTreeEValue, List) { Scalar d((double)3.0); EValue items[2] = {i, d}; - auto c = torch::executor::pytree::unflatten(spec, items); + auto c = pytree::unflatten(spec, items); ASSERT_TRUE(c.isList()); ASSERT_EQ(c.size(), 2); @@ -232,7 +234,7 @@ TEST(PyTreeEValue, List) { auto unflatten(EValue* items) { std::string spec = "D4#1#1#1#1('key0':$,1:$,23:$,123:$)"; - return torch::executor::pytree::unflatten(spec, items); + return pytree::unflatten(spec, items); } TEST(PyTreeEValue, DestructedSpec) { @@ -249,8 +251,8 @@ TEST(PyTreeEValue, DestructedSpec) { auto& key0 = c.key(0); auto& key1 = c.key(1); - ASSERT_TRUE(key0 == torch::executor::pytree::Key("key0")); - ASSERT_TRUE(key1 == torch::executor::pytree::Key(1)); + ASSERT_TRUE(key0 == pytree::Key("key0")); + ASSERT_TRUE(key1 == pytree::Key(1)); const auto& child0 = c[0]; const auto& child1 = c[1]; From c7aff77df7f845432dd3fcc4242e14a8c25c333c Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:20:48 -0700 Subject: [PATCH 002/531] Fix android-perf job Differential Revision: D61632951 Pull Request resolved: https://github.com/pytorch/executorch/pull/4825 --- .github/workflows/android-perf.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 78cd342c874..49d07516b15 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -156,7 +156,7 @@ jobs: BUILD_MODE="cmake" DTYPE="fp32" - if [[ ${{ matrix.model }} == "stories*"" ]]; then + if [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install 
requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh # Test llama2 From ea4a1870ce1027d1aefaf8e28d23b2a5d0195ffc Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Wed, 21 Aug 2024 17:44:54 -0700 Subject: [PATCH 003/531] Fix non-existing docstring parameters (#4827) --- backends/apple/mps/operators/node_visitor.py | 9 ++++----- backends/apple/mps/utils/mps_utils.py | 2 +- backends/arm/test/runner_utils.py | 14 ++++---------- backends/cadence/aot/quantizer/utils.py | 2 +- examples/models/llama2/evaluate/eager_eval.py | 2 +- examples/models/llama2/tokenizer/tiktoken.py | 6 +++--- examples/qualcomm/oss_scripts/ssd300_vgg16.py | 1 - examples/sdk/scripts/export_bundled_program.py | 2 +- extension/gguf_util/converters/llama_converter.py | 4 ++-- extension/llm/export/builder.py | 2 +- sdk/bundled_program/config.py | 10 +++++----- sdk/bundled_program/core.py | 2 +- sdk/etrecord/_etrecord.py | 4 ++-- 13 files changed, 26 insertions(+), 34 deletions(-) diff --git a/backends/apple/mps/operators/node_visitor.py b/backends/apple/mps/operators/node_visitor.py index d2f7219748a..2b443134bf8 100644 --- a/backends/apple/mps/operators/node_visitor.py +++ b/backends/apple/mps/operators/node_visitor.py @@ -77,7 +77,7 @@ def define_tensor( """Defines a tensor value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + node (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ @@ -155,7 +155,7 @@ def define_constant( """Defines a scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + constant_tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ constant_tensor = constant_tensor.contiguous() @@ -191,7 +191,6 @@ def define_scalar( """Defines a scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ assert isinstance(val, int) or isinstance(val, float) @@ -229,7 +228,7 @@ def get_serialized_buffer( index of its placement in the constant buffer Args: - tensor (torch.fx.Node): _description_ + node (torch.fx.Node): _description_ mps_graph (MPSGraph): _description_ Returns: @@ -299,7 +298,7 @@ def get_serialized_id( the existent id. Args: - tensor (Union[torch.fx.Node, float]): _description_ + node (Union[torch.fx.Node, float]): _description_ mps_graph (MPSGraph): _description_ Returns: diff --git a/backends/apple/mps/utils/mps_utils.py b/backends/apple/mps/utils/mps_utils.py index b6ba215534d..c31ebba0e46 100644 --- a/backends/apple/mps/utils/mps_utils.py +++ b/backends/apple/mps/utils/mps_utils.py @@ -73,7 +73,7 @@ def is_parameter(exp_prog: torch.export.ExportedProgram, node: torch.fx.Node) -> are supplied as inputs to the graph. 
Args: - edge_program (torch.export.ExportedProgram): _description_ + exp_prog (torch.export.ExportedProgram): _description_ node (torch.fx.Node): _description_ Returns: diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 4e3b447103c..930fc0adf10 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -275,10 +275,10 @@ def run_tosa_ref_model( inputs: Tuple[torch.Tensor], ) -> list[torch.Tensor]: """ - Run TOSA reference model using the tosa_refence_model program. + Run TOSA reference model using the tosa_reference_model program. In order to do that we need: - 1. desc.json, which points to files needed by tosa_refence_model. + 1. desc.json, which points to files needed by tosa_reference_model. 2. output.tosa, which is the TOSA buffer that describes the model we're trying to run. @@ -287,12 +287,6 @@ def run_tosa_ref_model( All these files are saved on disk in self.intermediate_path. Args: - params_input (Tuple[List[str], List[QuantizationParams]]): A tuple - containing a list of input node names and a list of their - quantization parameters (if model is quantized). - param_output (Tuple[str, QuantizationParams]): A tuple containing - the output node name and its quantization parameters (if - model is quantized). inputs (Tuple[torch.Tensor]): The input data to run the TOSA Returns: @@ -423,7 +417,7 @@ def save_npy( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. Returns: @@ -448,7 +442,7 @@ def save_bytes( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. Returns: diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py index 2afe5aba32e..0f9c9399780 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -145,7 +145,7 @@ def get_aten_node_target_partitions( """ Args: graph: The graph we want to partition - wanted_sources: List of orginal_aten ops (OpOverload) + wanted_original_aten_op: List of original_aten ops (OpOverload) Returns: Dictionary mapping aten ops that were given to a list of SourcePartitions diff --git a/examples/models/llama2/evaluate/eager_eval.py b/examples/models/llama2/evaluate/eager_eval.py index 28dbe9381ad..e8a540f95e2 100644 --- a/examples/models/llama2/evaluate/eager_eval.py +++ b/examples/models/llama2/evaluate/eager_eval.py @@ -99,7 +99,7 @@ def evaluate_model( Args: eval_wrapper (LM): A LM wrapper class compatible with lm-evaluation-harness evaluation - task (str): The name of the evaluation task to perform. + tasks: Optional[list]: The names of the evaluation tasks to perform. limit (Optional[int]): The maximum number of samples to evaluate (None for all available). Returns: diff --git a/examples/models/llama2/tokenizer/tiktoken.py b/examples/models/llama2/tokenizer/tiktoken.py index a1f0fde11af..d12b4eb33d2 100644 --- a/examples/models/llama2/tokenizer/tiktoken.py +++ b/examples/models/llama2/tokenizer/tiktoken.py @@ -116,8 +116,8 @@ def encode( s (str): The input string to be encoded. 
bos (bool): Whether to prepend the beginning-of-sequence token. eos (bool): Whether to append the end-of-sequence token. - allowed_tokens ("all"|set[str]): allowed special tokens in string - disallowed_tokens ("all"|set[str]): special tokens that raise an error when in string + allowed_special ("all"|set[str]): allowed special tokens in string + disallowed_special ("all"|set[str]): special tokens that raise an error when in string Returns: list[int]: A list of token IDs. @@ -125,7 +125,7 @@ def encode( By default, setting disallowed_special=() encodes a string by ignoring special tokens. Specifically: - Setting `disallowed_special` to () will cause all text corresponding - to special tokens to be encoded as natural text (insteading of raising + to special tokens to be encoded as natural text (instead of raising an error). - Setting `allowed_special` to "all" will treat all text corresponding to special tokens to be encoded as special tokens. diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py index bd5089441ed..45e3073baeb 100644 --- a/examples/qualcomm/oss_scripts/ssd300_vgg16.py +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -28,7 +28,6 @@ def create_data_lists(voc07_path, data_size): Create lists of images, the bounding boxes and labels of the objects in these images, and save these to file. :param voc07_path: path to the 'VOC2007' folder - :param output_folder: folder where the JSONs must be saved """ from utils import parse_annotation diff --git a/examples/sdk/scripts/export_bundled_program.py b/examples/sdk/scripts/export_bundled_program.py index 7e118a78c1d..a34a0ab4d34 100644 --- a/examples/sdk/scripts/export_bundled_program.py +++ b/examples/sdk/scripts/export_bundled_program.py @@ -37,7 +37,7 @@ def save_bundled_program( Generates a bundled program from the given ET program and saves it to the specified path. Args: - program: The ExecuTorch program to bundle. + executorch_program: The ExecuTorch program to bundle. method_test_suites: The MethodTestSuites which contains test cases to include in the bundled program. output_path: Path to save the bundled program. """ diff --git a/extension/gguf_util/converters/llama_converter.py b/extension/gguf_util/converters/llama_converter.py index dc16cd7dff3..463e5a0fcfe 100644 --- a/extension/gguf_util/converters/llama_converter.py +++ b/extension/gguf_util/converters/llama_converter.py @@ -99,8 +99,8 @@ def convert_to_pte(gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights) -> """Convert a GGUF model into an ExecuTorch program. Args: - model_args: The arguments for the GGUF model. - weights: The weights of the GGUF model. + gguf_model_args: The arguments for the GGUF model. + gguf_weights: The weights of the GGUF model. """ assert ( diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 264e1e95ad3..28afef20d04 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -227,7 +227,7 @@ def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManag Partition the model and lower to different backends. The signature is aligned with the signature of `to_backend` method of EdgeManager. Args: - partitioner (Optional[Partitioner]): One or more + partitioners (Optional[List[Partitioner]]): One or more partitioner to be sent to EdgeManager.to_backend(). 
""" if partitioners is None: diff --git a/sdk/bundled_program/config.py b/sdk/bundled_program/config.py index 3bfbe7bc69c..97563177603 100644 --- a/sdk/bundled_program/config.py +++ b/sdk/bundled_program/config.py @@ -39,7 +39,7 @@ """ All supported types for input/expected output of MethodTestCase. -Namedtuple is also supported and listed implicity since it is a subclass of tuple. +Namedtuple is also supported and listed implicitly since it is a subclass of tuple. """ # pyre-ignore @@ -59,23 +59,23 @@ def __init__( """Single test case for verifying specific method Args: - input: All inputs required by eager_model with specific inference method for one-time execution. + inputs: All inputs required by eager_model with specific inference method for one-time execution. It is worth mentioning that, although both bundled program and ET runtime apis support setting input other than `torch.tensor` type, only the input in `torch.tensor` type will be actually updated in the method, and the rest of the inputs will just do a sanity check if they match the default value in method. - expected_output: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. + expected_outputs: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. Returns: self """ # TODO(gasoonjia): Update type check logic. - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. self.inputs: List[ConfigValue] = self._flatten_and_sanity_check(inputs) self.expected_outputs: List[ConfigValue] = [] if expected_outputs is not None: - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. self.expected_outputs = self._flatten_and_sanity_check(expected_outputs) def _flatten_and_sanity_check( diff --git a/sdk/bundled_program/core.py b/sdk/bundled_program/core.py index 4fede5e5952..56fc817bbee 100644 --- a/sdk/bundled_program/core.py +++ b/sdk/bundled_program/core.py @@ -230,7 +230,7 @@ def _assert_valid_bundle( Other checks not related to correspondence are done in config.py Args: - program: The program to be bundled. + executorch_program: The program to be bundled. method_test_suites: The testcases for specific methods to be bundled. """ diff --git a/sdk/etrecord/_etrecord.py b/sdk/etrecord/_etrecord.py index 55e231f2166..1ae46f27aaa 100644 --- a/sdk/etrecord/_etrecord.py +++ b/sdk/etrecord/_etrecord.py @@ -185,10 +185,10 @@ def generate_etrecord( for SDK tooling usage. Args: - etrecord_path: Path to where the `ETRecord` file will be saved to. + et_record: Path to where the `ETRecord` file will be saved to. edge_dialect_program: `EdgeProgramManager` for this model returned by the call to to_edge() executorch_program: The ExecuTorch program for this model returned by the call to `to_executorch()` or the `BundledProgram` of this model - export_modules[Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name and the + export_modules [Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name and the value being the corresponding exported module. The exported graph modules can be either the output of `torch.export()` or `exir.to_edge()`. 
From ce4917c6b3360a3226a21c3bd6c71bdecc5f8354 Mon Sep 17 00:00:00 2001 From: lucylq Date: Wed, 21 Aug 2024 18:05:23 -0700 Subject: [PATCH 004/531] remove pad custom op (#4801) --- .../models/flamingo/export_preprocess_lib.py | 7 --- examples/models/flamingo/passes/__init__.py | 0 .../replace_custom_ops_with_aten_ops_pass.py | 31 ------------ .../models/flamingo/passes/test_passes.py | 50 ------------------- .../llm/custom_ops/preprocess_custom_ops.py | 49 ------------------ 5 files changed, 137 deletions(-) delete mode 100644 examples/models/flamingo/passes/__init__.py delete mode 100644 examples/models/flamingo/passes/replace_custom_ops_with_aten_ops_pass.py delete mode 100644 examples/models/flamingo/passes/test_passes.py diff --git a/examples/models/flamingo/export_preprocess_lib.py b/examples/models/flamingo/export_preprocess_lib.py index 736116de8b7..082c306ea38 100644 --- a/examples/models/flamingo/export_preprocess_lib.py +++ b/examples/models/flamingo/export_preprocess_lib.py @@ -15,10 +15,6 @@ from torch.export import Dim, ExportedProgram from torchtune.models.clip.inference._transforms import _CLIPImageTransform -from .passes.replace_custom_ops_with_aten_ops_pass import ( - ReplaceCustomOpsWithAtenOpsPass, -) - def get_example_inputs() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: image = torch.ones(3, 800, 600) @@ -59,7 +55,6 @@ def export_preprocess( ) # Replace non-exportable ops with custom ops. - image_transform_model.pad = torch.ops.preprocess.pad.default image_transform_model.tile_crop = torch.ops.preprocess.tile_crop.default # Export. @@ -80,8 +75,6 @@ def lower_to_executorch_preprocess( edge_program = to_edge( exported_program, compile_config=EdgeCompileConfig(_check_ir_validity=False) ) - # Replace custom ops with aten ops. - edge_program = edge_program.transform([ReplaceCustomOpsWithAtenOpsPass()]) et_program = edge_program.to_executorch(ExecutorchBackendConfig()) return et_program diff --git a/examples/models/flamingo/passes/__init__.py b/examples/models/flamingo/passes/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/examples/models/flamingo/passes/replace_custom_ops_with_aten_ops_pass.py b/examples/models/flamingo/passes/replace_custom_ops_with_aten_ops_pass.py deleted file mode 100644 index 8c31cf512ce..00000000000 --- a/examples/models/flamingo/passes/replace_custom_ops_with_aten_ops_pass.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import torch -from executorch.exir.pass_base import ExportPass -from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa - - -class ReplaceCustomOpsWithAtenOpsPass(ExportPass): - """ - Goes through all ops and replaces custom ops with aten ops. In some cases - aten ops cannot be exported due to dynamism, eg. pad in flamingo preprocess. - Use a custom op to pass export, and replace it with the aten op post-export, - which avoids re-writing the op in C++. 
- """ - - def __init__(self) -> None: - super().__init__() - - def call_operator(self, op, args, kwargs, meta): - if op._name == "preprocess::pad": - return super().call_operator( - torch.ops.aten.constant_pad_nd.default, args, kwargs, meta - ) - - return super().call_operator(op, args, kwargs, meta) diff --git a/examples/models/flamingo/passes/test_passes.py b/examples/models/flamingo/passes/test_passes.py deleted file mode 100644 index d0a90f2e347..00000000000 --- a/examples/models/flamingo/passes/test_passes.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import unittest - -from typing import List - -import torch -from executorch.exir import EdgeCompileConfig, to_edge - -from .replace_custom_ops_with_aten_ops_pass import ReplaceCustomOpsWithAtenOpsPass - - -class TestPasses(unittest.TestCase): - def test_replace_custom_ops_with_aten_ops_pass(self) -> None: - from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa - - class Pad(torch.nn.Module): - def forward(self, x: torch.Tensor, padding: List[int]) -> torch.Tensor: - return torch.ops.preprocess.pad.default(x, padding) - - pad = Pad() - - image_tensor = torch.ones([3, 4, 5]) - padding = [0, 2, 0, 1] - - edge_prog = to_edge( - torch.export.export(pad, (image_tensor, padding), strict=False), - compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) - - # Check that the custom op exists in the graph, and aten op does not. - edge_nodes = [node.name for node in edge_prog.exported_program().graph.nodes] - assert "constant_pad_nd" not in edge_nodes - assert "preprocess_pad_default" in edge_nodes - - edge_prog = edge_prog.transform([ReplaceCustomOpsWithAtenOpsPass()]) - - # After running replace_custom_ops_with_aten_ops pass, the custom op - # should be replaced with aten op. - post_transform_nodes = [ - node.name for node in edge_prog.exported_program().graph.nodes - ] - assert "constant_pad_nd" in post_transform_nodes - assert "preprocess_pad_default" not in post_transform_nodes diff --git a/extension/llm/custom_ops/preprocess_custom_ops.py b/extension/llm/custom_ops/preprocess_custom_ops.py index aea8c09b0ef..e49721ffd35 100644 --- a/extension/llm/custom_ops/preprocess_custom_ops.py +++ b/extension/llm/custom_ops/preprocess_custom_ops.py @@ -7,61 +7,12 @@ # pyre-unsafe -from typing import List - import torch from torch.library import impl, Library preprocess_op_lib = Library("preprocess", "DEF") -# Register and define pad and out variant. -# Note: pad doesn't require an explicit meta kernel because -# CompositeExplicitAutograd automatically registers the implementation to meta, -# and meta kernels do not go through functionalization. The implementation -# does not export due to issues during functionalization. 
-# See: https://github.com/pytorch/pytorch/issues/120288 -preprocess_op_lib.define("pad(Tensor image, SymInt[] padding) -> Tensor") - - -@impl(preprocess_op_lib, "pad", dispatch_key="CompositeExplicitAutograd") -def pad_impl( - image: torch.Tensor, - padding: List[int], -) -> torch.Tensor: - output = torch.empty( - [image.shape[0], image.shape[1] + padding[3], image.shape[2] + padding[1]], - dtype=image.dtype, - device=image.device, - requires_grad=False, - ) - output = torch.fill(output, 0) - output.narrow(1, 0, image.shape[1]).narrow(2, 0, image.shape[2]).copy_(image) - return output - - -preprocess_op_lib.define( - "pad.out(Tensor image, SymInt[] padding, *, Tensor(a!) out) -> Tensor(a!)" -) - - -@impl(preprocess_op_lib, "pad.out", dispatch_key="CompositeExplicitAutograd") -def pad_out_impl( - image: torch.Tensor, - padding: List[int], - out: torch.Tensor, -) -> torch.Tensor: - out = torch.empty( - [image.shape[0], image.shape[1] + padding[3], image.shape[2] + padding[1]], - dtype=image.dtype, - device=image.device, - requires_grad=False, - ) - out = torch.fill(out, 0) - out.narrow(1, 0, image.shape[1]).narrow(2, 0, image.shape[2]).copy_(image) - return out - - # Register and define tile_crop and out variant. preprocess_op_lib.define("tile_crop(Tensor input, int tile_size) -> Tensor") From 226f9751c1b66ec5a300120089cf8e4fdeb542ef Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Thu, 22 Aug 2024 10:43:57 -0700 Subject: [PATCH 005/531] Migrate extension/llm to new namespace Differential Revision: D61509127 Pull Request resolved: https://github.com/pytorch/executorch/pull/4831 --- examples/models/llava/runner/llava_runner.cpp | 2 ++ examples/models/llava/runner/llava_runner.h | 3 +- extension/llm/custom_ops/op_sdpa.cpp | 4 +-- extension/llm/custom_ops/op_sdpa_test.cpp | 15 +++++---- .../custom_ops/op_sdpa_with_kv_cache_test.cpp | 9 +++--- .../llm/custom_ops/op_tile_crop_test.cpp | 2 +- extension/llm/runner/image.h | 16 ++++++++-- extension/llm/runner/image_prefiller.h | 24 +++++++++++--- extension/llm/runner/metadata_util.h | 15 ++++++--- extension/llm/runner/multimodal_runner.h | 21 +++++++++--- extension/llm/runner/stats.h | 26 +++++++++++---- extension/llm/runner/text_decoder_runner.cpp | 23 ++++++++----- extension/llm/runner/text_decoder_runner.h | 24 ++++++++++---- extension/llm/runner/text_prefiller.cpp | 25 ++++++++++----- extension/llm/runner/text_prefiller.h | 18 +++++++++-- extension/llm/runner/text_token_generator.h | 26 +++++++++++---- extension/llm/runner/util.h | 17 ++++++++-- extension/llm/sampler/sampler.cpp | 10 +++--- extension/llm/sampler/sampler.h | 15 +++++++-- extension/llm/sampler/test/test_sampler.cpp | 17 +++------- extension/llm/tokenizer/base64.h | 17 ++++++++-- extension/llm/tokenizer/bpe_tokenizer.cpp | 13 +++++--- extension/llm/tokenizer/bpe_tokenizer.h | 25 +++++++++++---- .../llm/tokenizer/test/test_bpe_tokenizer.cpp | 11 +++---- .../llm/tokenizer/test/test_tiktoken.cpp | 11 +++---- extension/llm/tokenizer/tiktoken.cpp | 13 +++++--- extension/llm/tokenizer/tiktoken.h | 27 ++++++++++++---- extension/llm/tokenizer/tokenizer.h | 32 +++++++++++++------ 28 files changed, 327 insertions(+), 134 deletions(-) diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index c5ce03b88d7..a58fdfd5e59 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -20,6 +20,8 @@ #include #include +using ::executorch::extension::llm::Stats; + namespace torch::executor { bool 
LlavaRunner::is_loaded() { diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index d9805a0c917..13d842e30fe 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -35,7 +35,8 @@ class LlavaRunner : public MultimodalRunner { const std::string& prompt, int32_t seq_len = 1024, std::function token_callback = {}, - std::function stats_callback = {}); + std::function + stats_callback = {}); private: inline static const std::string kPresetPrompt = diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index 727c04774b9..d31cbaf3697 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -158,7 +158,7 @@ static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { template < typename scalar_t, typename std::enable_if_t< - torch::executor::is_reduced_floating_point::value, + ::executorch::runtime::is_reduced_floating_point::value, int> = 0> static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { (void)ptr; @@ -247,7 +247,7 @@ void cpu_flash_attention( "KV_split_size must be greater than q_split_size"); constexpr bool is_reduced_type = - torch::executor::is_reduced_floating_point::value; + ::executorch::runtime::is_reduced_floating_point::value; ET_CHECK_MSG( !is_reduced_type, "FlashAttention does not support reduced types."); diff --git a/extension/llm/custom_ops/op_sdpa_test.cpp b/extension/llm/custom_ops/op_sdpa_test.cpp index 116be2508d3..43f20229174 100644 --- a/extension/llm/custom_ops/op_sdpa_test.cpp +++ b/extension/llm/custom_ops/op_sdpa_test.cpp @@ -17,6 +17,7 @@ #include using namespace ::testing; +using executorch::runtime::testing::TensorFactory; exec_aten::Tensor op_scaled_dot_product_attention( const exec_aten::Tensor& query, @@ -37,7 +38,7 @@ Most tests are generated by FACTO */ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_105) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 1, 4, 4}, @@ -123,7 +124,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_105) { } TEST(OpScaledDotProductAttentionTest, CorrectnessTest_11) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 1, 1, 8}, @@ -152,7 +153,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_11) { } TEST(OpScaledDotProductAttentionTest, CorrectnessTest_13) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 8, 1, 1}, {-47.0, 21.25, 74.75, 46.375, 21.0, -29.0, 2.625, 83.125}); @@ -181,7 +182,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_13) { } TEST(OpScaledDotProductAttentionTest, CorrectnessTest_17) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {3, 2, 2, 6}, @@ -257,7 +258,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_17) { } TEST(OpScaledDotProductAttentionTest, CorrectnessTest_18) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {3, 2, 2, 6}, @@ -333,7 +334,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_18) { // Disabling this test because right now we are enforcing that // attention mask must be 2D TEST(OpScaledDotProductAttentionTest, CorrectnessTest_19) { - torch::executor::testing::TensorFactory tfFloat; 
+ TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {3, 2, 2, 6}, @@ -479,7 +480,7 @@ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_19) { */ TEST(OpScaledDotProductAttentionTest, CorrectnessTest_51) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 1, 8, 3}, diff --git a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp index 819dd702171..2a8124bc1e5 100644 --- a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp +++ b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp @@ -16,6 +16,7 @@ #include using namespace ::testing; +using executorch::runtime::testing::TensorFactory; exec_aten::Tensor op_sdpa_with_kv_cache( const exec_aten::Tensor& query, @@ -79,7 +80,7 @@ Missing tests: 5. Different dtypes, fp16, bf16, double (or expect throw) */ TEST(OpScaledDotProductAttentionTest, BasicTest) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 1, 4, 4}, @@ -360,7 +361,7 @@ TEST(OpScaledDotProductAttentionTest, BasicTest) { } TEST(OpScaledDotProductAttentionTest, LargerTest) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 1, 7, 4}, {0.8823, 0.9150, 0.3829, 0.9593, 0.3904, 0.6009, 0.2566, @@ -524,7 +525,7 @@ TEST(OpScaledDotProductAttentionTest, LargerTest) { } TEST(OpScaledDotProductAttentionTest, BasicTestWithAttnMask) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 1, 4, 4}, @@ -807,7 +808,7 @@ TEST(OpScaledDotProductAttentionTest, BasicTestWithAttnMask) { } TEST(OpScaledDotProductAttentionTest, SequenceTest) { - torch::executor::testing::TensorFactory tfFloat; + TensorFactory tfFloat; exec_aten::Tensor query = tfFloat.make( {1, 1, 8, 4}, diff --git a/extension/llm/custom_ops/op_tile_crop_test.cpp b/extension/llm/custom_ops/op_tile_crop_test.cpp index 565f510913a..36841b80f1c 100644 --- a/extension/llm/custom_ops/op_tile_crop_test.cpp +++ b/extension/llm/custom_ops/op_tile_crop_test.cpp @@ -15,7 +15,7 @@ using namespace ::testing; using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::testing::TensorFactory; +using executorch::runtime::testing::TensorFactory; class OpTileCropOutTest : public OperatorTest { protected: diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h index e18353dda9a..32a9f878187 100644 --- a/extension/llm/runner/image.h +++ b/extension/llm/runner/image.h @@ -13,7 +13,9 @@ // patternlint-disable-next-line executorch-cpp-nostdinc #include -namespace torch::executor { +namespace executorch { +namespace extension { +namespace llm { struct Image { // Assuming NCHW format @@ -23,4 +25,14 @@ struct Image { int32_t channels; }; -} // namespace torch::executor +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::llm::Image; +} // namespace executor +} // namespace torch diff --git a/extension/llm/runner/image_prefiller.h b/extension/llm/runner/image_prefiller.h index 64b623be36f..879b0a6e21a 100644 --- a/extension/llm/runner/image_prefiller.h +++ b/extension/llm/runner/image_prefiller.h @@ -13,23 +13,27 @@ #include #include -namespace torch::executor { +namespace executorch { +namespace extension { +namespace llm { // Assuming kv cache and parallel prefill are enabled. class ImagePrefiller { public: - explicit ImagePrefiller(Module* module) : module_(module) {} + explicit ImagePrefiller(::executorch::extension::Module* module) + : module_(module) {} + /** * Prefill an LLM Module with the given image input. * @param image The image input to the multimodal LLM. * @param start_pos The starting position in KV cache of the input in the LLM * @return The next token of the LLM Module after prefill. */ - virtual Result prefill( + virtual ::executorch::runtime::Result prefill( Image& image, int64_t start_pos = 0) = 0; - virtual Error load() = 0; + virtual ::executorch::runtime::Error load() = 0; virtual bool is_method_loaded() = 0; virtual ~ImagePrefiller() = default; @@ -38,4 +42,14 @@ class ImagePrefiller { Module* module_; }; -} // namespace torch::executor +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::llm::ImagePrefiller; +} // namespace executor +} // namespace torch diff --git a/extension/llm/runner/metadata_util.h b/extension/llm/runner/metadata_util.h index 4ea2d9eebd5..5f55dad538d 100644 --- a/extension/llm/runner/metadata_util.h +++ b/extension/llm/runner/metadata_util.h @@ -14,7 +14,10 @@ #include -namespace torch::executor { +namespace executorch { +namespace extension { +namespace llm { + template T get_module_metadata( Module* module, @@ -26,9 +29,10 @@ T get_module_metadata( T res = default_val; if (model_methods.count(method_name)) { - Result> outputs = module->execute(method_name); + ::executorch::runtime::Result> + outputs = module->execute(method_name); if (outputs.ok()) { - std::vector outs = outputs.get(); + std::vector<::executorch::runtime::EValue> outs = outputs.get(); if (outs.size() > 0) { res = outs[0].to(); } @@ -43,4 +47,7 @@ T get_module_metadata( ET_LOG(Info, "%s: %lld", method_name.c_str(), (long long)res); return res; } -} // namespace torch::executor + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index ac38085be44..745f086f803 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -33,8 +33,9 @@ #include #include -namespace torch::executor { -using Stats = ::executorch::llm::Stats; +namespace executorch { +namespace extension { +namespace llm { class MultimodalRunner { public: @@ -53,8 +54,8 @@ class MultimodalRunner { } virtual bool is_loaded() = 0; - virtual Error load() = 0; - virtual Error generate( + virtual ::executorch::runtime::Error load() = 0; + virtual ::executorch::runtime::Error generate( std::vector& images, const std::string& prompt, int32_t seq_len = 1024, @@ -91,4 +92,14 @@ class MultimodalRunner { Stats stats_; }; -} // namespace torch::executor +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace 
torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::llm::MultimodalRunner; +} // namespace executor +} // namespace torch diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h index f62be0940c8..902ba892966 100644 --- a/extension/llm/runner/stats.h +++ b/extension/llm/runner/stats.h @@ -14,7 +14,10 @@ #include // patternlint-disable-next-line executorch-cpp-nostdinc #include -namespace executorch::llm { + +namespace executorch { +namespace extension { +namespace llm { struct Stats { // Scaling factor for timestamps - in this case, we use ms. @@ -41,12 +44,11 @@ struct Stats { // Token count from generated (total - prompt) int64_t num_generated_tokens; inline void on_sampling_begin() { - aggregate_sampling_timer_start_timestamp = - ::torch::executor::util::time_in_ms(); + aggregate_sampling_timer_start_timestamp = time_in_ms(); } inline void on_sampling_end() { - aggregate_sampling_time_ms += ::torch::executor::util::time_in_ms() - - aggregate_sampling_timer_start_timestamp; + aggregate_sampling_time_ms += + time_in_ms() - aggregate_sampling_timer_start_timestamp; aggregate_sampling_timer_start_timestamp = 0; } @@ -132,4 +134,16 @@ inline void print_report(const Stats& stats) { stats.SCALING_FACTOR_UNITS_PER_SECOND); } -} // namespace executorch::llm +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace executorch { +namespace llm { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::llm::kTopp; +using ::executorch::extension::llm::print_report; +using ::executorch::extension::llm::Stats; +} // namespace llm +} // namespace executorch diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 3de75ceccb4..a0963769eab 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -8,11 +8,15 @@ // Given inputs, run a text decoder and return logits. -#include #include + #include -namespace torch::executor { +#include + +namespace executorch { +namespace extension { +namespace llm { // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to @@ -26,22 +30,22 @@ TextDecoderRunner::TextDecoderRunner( sampler_(std::make_unique( vocab_size, temperature, - ::executorch::llm::kTopp, + kTopp, static_cast(std::time(nullptr)))), use_kv_cache_(use_kv_cache) {} // This function is functional, meaning it shouldn't modify any state of the // input. It should be safe to call multiple times with the same inputs. The // outer loop (call site) is responsible for managing state. 
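// A minimal usage sketch (hypothetical caller; it mirrors the loop in
// TextTokenGenerator::generate() later in this patch):
//
//   while (pos < seq_len - 1) {
//     auto logits_res =
//         text_decoder_runner_->step(tokens_managed, start_pos_managed);
//     // ...sample the next token from logits_res, append it, advance pos...
//   }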
-Result TextDecoderRunner::step( +::executorch::runtime::Result TextDecoderRunner::step( ManagedTensor& managed_tokens, ManagedTensor& managed_start_pos) { auto tokens = managed_tokens.get_aliasing_tensor(); // ET_LOG(Info, "Input token %" PRIu64, input_token); if (use_kv_cache_) { auto start_pos = managed_start_pos.get_aliasing_tensor(); - Result> outputs_res = - module_->forward({tokens, start_pos}); + ::executorch::runtime::Result> + outputs_res = module_->forward({tokens, start_pos}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, @@ -55,7 +59,8 @@ Result TextDecoderRunner::step( } else { // no kv cache (void)managed_start_pos; // unused - Result> outputs_res = module_->forward({tokens}); + ::executorch::runtime::Result> + outputs_res = module_->forward({tokens}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, @@ -69,4 +74,6 @@ Result TextDecoderRunner::step( } } -} // namespace torch::executor +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 49ddea66299..6a8e3396fef 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -16,7 +16,9 @@ // patternlint-disable-next-line executorch-cpp-nostdinc #include -namespace torch::executor { +namespace executorch { +namespace extension { +namespace llm { class TextDecoderRunner { public: @@ -35,7 +37,7 @@ class TextDecoderRunner { * Module. * @return The output of the LLM Module. This will be a tensor of logits. */ - virtual Result step( + virtual ::executorch::runtime::Result step( ManagedTensor& input, ManagedTensor& start_pos); @@ -43,7 +45,7 @@ class TextDecoderRunner { * Load the Module for text decode purpose. * @return The error code. */ - virtual Error load() { + virtual ::executorch::runtime::Error load() { return module_->load_method("forward"); } @@ -70,13 +72,13 @@ class TextDecoderRunner { auto vocab_size = logits_tensor.size(2); switch (logits_tensor.scalar_type()) { - case ScalarType::Float: { + case exec_aten::ScalarType::Float: { float* logits = logits_tensor.mutable_data_ptr(); float* logits_last = logits; logits_last += (num_tokens - 1) * vocab_size; return sampler_->sample(logits_last); } - case ScalarType::Half: { + case exec_aten::ScalarType::Half: { exec_aten::Half* logits = logits_tensor.mutable_data_ptr(); exec_aten::Half* logits_last = logits; @@ -99,4 +101,14 @@ class TextDecoderRunner { bool should_stop_{false}; }; -} // namespace torch::executor +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::llm::TextDecoderRunner; +} // namespace executor +} // namespace torch diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp index beafb21434d..19fc2d59363 100644 --- a/extension/llm/runner/text_prefiller.cpp +++ b/extension/llm/runner/text_prefiller.cpp @@ -11,7 +11,9 @@ #include -namespace torch::executor { +namespace executorch { +namespace extension { +namespace llm { TextPrefiller::TextPrefiller( Tokenizer* tokenizer, @@ -23,7 +25,7 @@ TextPrefiller::TextPrefiller( use_kv_cache_(use_kv_cache), enable_parallel_prefill_(enable_parallel_prefill) {} -Result TextPrefiller::prefill( +::executorch::runtime::Result TextPrefiller::prefill( std::vector& prompt_tokens, int64_t start_pos, std::function token_callback) { @@ -40,11 +42,14 @@ Result TextPrefiller::prefill( if (enable_parallel_prefill_ || !use_kv_cache_) { // initialize tensor wrappers ManagedTensor managed_tokens( - prompt_tokens.data(), {1, num_prompt_tokens}, ScalarType::Long); + prompt_tokens.data(), + {1, num_prompt_tokens}, + exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long); + ManagedTensor managed_start_pos( + &start_pos, {1}, exec_aten::ScalarType::Long); - Result outputs_res = + ::executorch::runtime::Result outputs_res = text_decoder_runner_->step(managed_tokens, managed_start_pos); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); @@ -76,9 +81,11 @@ Result TextPrefiller::prefill( cur_token = prompt_tokens[0]; // initialize tensor wrappers - ManagedTensor managed_tokens(&cur_token, {1, 1}, ScalarType::Long); + ManagedTensor managed_tokens( + &cur_token, {1, 1}, exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos(&pos_data, {1}, ScalarType::Long); + ManagedTensor managed_start_pos( + &pos_data, {1}, exec_aten::ScalarType::Long); // run the first token and get back logits tensor. Assuming the first token // is bos so don't callback. @@ -114,4 +121,6 @@ Result TextPrefiller::prefill( return cur_token; } -} // namespace torch::executor +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h index 7293fdca2a4..bcec2b895fe 100644 --- a/extension/llm/runner/text_prefiller.h +++ b/extension/llm/runner/text_prefiller.h @@ -16,7 +16,9 @@ // patternlint-disable-next-line executorch-cpp-nostdinc #include -namespace torch::executor { +namespace executorch { +namespace extension { +namespace llm { class TextPrefiller { public: @@ -35,7 +37,7 @@ class TextPrefiller { * token in the prompt. * @return The next token of the LLM Module after prefill. */ - Result prefill( + ::executorch::runtime::Result prefill( std::vector& prompt_tokens, int64_t start_pos = 0, std::function token_callback = {}); @@ -47,4 +49,14 @@ class TextPrefiller { bool enable_parallel_prefill_; }; -} // namespace torch::executor +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::llm::TextPrefiller; +} // namespace executor +} // namespace torch diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 9b3a31f3f71..46d682a4e44 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -13,8 +13,9 @@ #include #include -namespace torch::executor { -using Stats = ::executorch::llm::Stats; +namespace executorch { +namespace extension { +namespace llm { class TextTokenGenerator { public: @@ -41,7 +42,7 @@ class TextTokenGenerator { * @param token_callback what to do after a token is generated. * @return how many tokens are generated. */ - inline Result generate( + inline ::executorch::runtime::Result generate( std::vector tokens, int64_t start_pos, int32_t seq_len, @@ -69,14 +70,14 @@ class TextTokenGenerator { // initialize tensor wrappers ManagedTensor tokens_managed( - token_data.data(), token_shape, ScalarType::Long); + token_data.data(), token_shape, exec_aten::ScalarType::Long); - ManagedTensor start_pos_managed(&pos, {1}, ScalarType::Long); + ManagedTensor start_pos_managed(&pos, {1}, exec_aten::ScalarType::Long); // Generate our tokens while (pos < seq_len - 1) { // Run the model - Result logits_res = + ::executorch::runtime::Result logits_res = text_decoder_runner_->step(tokens_managed, start_pos_managed); ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); @@ -136,4 +137,15 @@ class TextTokenGenerator { // stats Stats* stats_; }; -} // namespace torch::executor + +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::llm::TextTokenGenerator; +} // namespace executor +} // namespace torch diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index 5d4792b6414..baf6af328b4 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -11,9 +11,9 @@ #include #include -namespace torch { -namespace executor { -namespace util { +namespace executorch { +namespace extension { +namespace llm { void inline safe_printf(const char* piece) { // piece might be a raw byte token, and we only want to print printable chars @@ -44,6 +44,17 @@ long inline time_in_ms() { return time.tv_sec * 1000 + time.tv_nsec / 1000000; } +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace util { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::llm::safe_printf; +using ::executorch::extension::llm::time_in_ms; } // namespace util } // namespace executor } // namespace torch diff --git a/extension/llm/sampler/sampler.cpp b/extension/llm/sampler/sampler.cpp index 6b0f155f120..64e1307d262 100644 --- a/extension/llm/sampler/sampler.cpp +++ b/extension/llm/sampler/sampler.cpp @@ -35,8 +35,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +namespace llm { // sampler stuff template @@ -192,5 +193,6 @@ int32_t Sampler::sample(T* logits) { template int32_t Sampler::sample(float* logits); template int32_t Sampler::sample(exec_aten::Half* logits); -} // namespace executor -} // namespace torch +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/sampler/sampler.h b/extension/llm/sampler/sampler.h index 584a010bba2..9d6d742e590 100644 --- a/extension/llm/sampler/sampler.h +++ b/extension/llm/sampler/sampler.h @@ -20,8 +20,9 @@ #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +namespace llm { // A simple llama2 sampler. template @@ -57,5 +58,15 @@ class Sampler { unsigned long long rng_state_; }; +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::llm::ProbIndex; +using ::executorch::extension::llm::Sampler; } // namespace executor } // namespace torch diff --git a/extension/llm/sampler/test/test_sampler.cpp b/extension/llm/sampler/test/test_sampler.cpp index 2dac03d976a..044a39458ea 100644 --- a/extension/llm/sampler/test/test_sampler.cpp +++ b/extension/llm/sampler/test/test_sampler.cpp @@ -12,14 +12,10 @@ #include using namespace ::testing; +using ::executorch::extension::llm::Sampler; -namespace torch { -namespace executor { - -class SamplerTest : public Test {}; - -TEST_F(SamplerTest, TestArgMax) { - torch::executor::Sampler sampler{ +TEST(SamplerTest, TestArgMax) { + Sampler sampler{ /*vocab_size*/ 32000, /*temperature*/ 0.0f, /*topp*/ 0.9f, @@ -31,8 +27,8 @@ TEST_F(SamplerTest, TestArgMax) { EXPECT_EQ(sampler.sample(input.data_ptr()), 396); } -TEST_F(SamplerTest, TestArgMaxWithFP16) { - torch::executor::Sampler sampler{ +TEST(SamplerTest, TestArgMaxWithFP16) { + Sampler sampler{ /*vocab_size*/ 32000, /*temperature*/ 0.0f, /*topp*/ 0.9f, @@ -43,6 +39,3 @@ TEST_F(SamplerTest, TestArgMaxWithFP16) { input[0][0][396] = 1.0f; EXPECT_EQ(sampler.sample(input.data_ptr()), 396); } - -} // namespace executor -} // namespace torch diff --git a/extension/llm/tokenizer/base64.h b/extension/llm/tokenizer/base64.h index 9fb1b5129b3..7337ecead4e 100644 --- a/extension/llm/tokenizer/base64.h +++ b/extension/llm/tokenizer/base64.h @@ -29,8 +29,10 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +namespace llm { + namespace base64 { std::string decode(const std::string_view& input); @@ -176,5 +178,16 @@ inline std::string decode(const std::string_view& input) { } // namespace base64 +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +namespace base64 { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::llm::base64::decode; +} // namespace base64 } // namespace executor } // namespace torch diff --git a/extension/llm/tokenizer/bpe_tokenizer.cpp b/extension/llm/tokenizer/bpe_tokenizer.cpp index 07d138548d8..1548f000a5a 100644 --- a/extension/llm/tokenizer/bpe_tokenizer.cpp +++ b/extension/llm/tokenizer/bpe_tokenizer.cpp @@ -10,8 +10,12 @@ #include -namespace torch { -namespace executor { +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +namespace executorch { +namespace extension { +namespace llm { static int compare_tokens(const void* a, const void* b) { if (((TokenIndex*)a)->str == nullptr) { @@ -311,5 +315,6 @@ BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) const { return Result(tokens); } -} // namespace executor -} // namespace torch +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/tokenizer/bpe_tokenizer.h b/extension/llm/tokenizer/bpe_tokenizer.h index 7ea84025832..7fc7306c100 100644 --- a/extension/llm/tokenizer/bpe_tokenizer.h +++ b/extension/llm/tokenizer/bpe_tokenizer.h @@ -11,8 +11,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +namespace llm { struct TokenIndex { const char* str; @@ -26,13 +27,14 @@ class BPETokenizer : public Tokenizer { explicit BPETokenizer(); ~BPETokenizer() override; - Error load(const std::string& tokenizer_path) override; + ::executorch::runtime::Error load(const std::string& tokenizer_path) override; - Result> + ::executorch::runtime::Result> encode(const std::string& input, int8_t bos, int8_t eos) const override; - Result decode(uint64_t prev_token, uint64_t token) - const override; + ::executorch::runtime::Result decode( + uint64_t prev_token, + uint64_t token) const override; private: std::unique_ptr vocab_ = nullptr; @@ -41,5 +43,16 @@ class BPETokenizer : public Tokenizer { unsigned int max_token_length_ = 0; unsigned char byte_pieces_[512]; // stores all single-byte strings }; + +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using ::executorch::extension::llm::BPETokenizer; +using ::executorch::extension::llm::TokenIndex; } // namespace executor } // namespace torch diff --git a/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp b/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp index 17bb83e2f4c..c553fe59f98 100644 --- a/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp +++ b/extension/llm/tokenizer/test/test_bpe_tokenizer.cpp @@ -13,13 +13,15 @@ using namespace ::testing; -namespace torch { -namespace executor { +using ::executorch::extension::llm::BPETokenizer; +using ::executorch::extension::llm::Tokenizer; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; class TokenizerExtensionTest : public Test { public: void SetUp() override { - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); tokenizer_ = std::make_unique(); modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin"); @@ -65,6 +67,3 @@ TEST_F(TokenizerExtensionTest, SafeToDestruct) { tokenizer_ = std::make_unique(); tokenizer_.reset(); } - -} // namespace executor -} // namespace torch diff --git a/extension/llm/tokenizer/test/test_tiktoken.cpp b/extension/llm/tokenizer/test/test_tiktoken.cpp index f423183b8a8..a81b20bcf88 100644 --- a/extension/llm/tokenizer/test/test_tiktoken.cpp +++ b/extension/llm/tokenizer/test/test_tiktoken.cpp @@ -12,9 +12,10 @@ #include using namespace ::testing; - -namespace torch { -namespace executor { +using ::executorch::extension::llm::Tiktoken; +using ::executorch::extension::llm::Tokenizer; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; namespace { // Test case based on Llama 2 @@ -49,7 +50,7 @@ static inline std::unique_ptr> _get_special_tokens() { class TiktokenExtensionTest : public Test { public: void SetUp() override { - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); tokenizer_ = std::make_unique( _get_special_tokens(), kBOSTokenIndex, kEOSTokenIndex); modelPath_ = std::getenv("RESOURCES_PATH") + @@ -139,5 +140,3 @@ TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) { ""); #endif } -} // namespace executor -} // namespace torch diff --git a/extension/llm/tokenizer/tiktoken.cpp b/extension/llm/tokenizer/tiktoken.cpp index 67d1f916f2a..7b15d25f0da 100644 --- a/extension/llm/tokenizer/tiktoken.cpp +++ b/extension/llm/tokenizer/tiktoken.cpp @@ -30,8 +30,12 @@ #include #include -namespace torch { -namespace executor { +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +namespace executorch { +namespace extension { +namespace llm { // ------------------------------Util start------------------------------------ @@ -415,5 +419,6 @@ Result Tiktoken::decode(uint64_t prev, uint64_t cur) const { } // -------------------------public method end------------------------------- -} // namespace executor -} // namespace torch +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/tokenizer/tiktoken.h b/extension/llm/tokenizer/tiktoken.h index 0b1b1fa61e0..7d78f8b60da 100644 --- a/extension/llm/tokenizer/tiktoken.h +++ b/extension/llm/tokenizer/tiktoken.h @@ -14,8 +14,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +namespace llm { using Encoder = std::unordered_map; using Decoder = std::unordered_map; @@ -33,13 +34,14 @@ class Tiktoken : public Tokenizer { size_t bos_token_index, size_t eos_token_index); - Error load(const std::string& tokenizer_path) 
override; + ::executorch::runtime::Error load(const std::string& tokenizer_path) override; - Result> + ::executorch::runtime::Result> encode(const std::string& input, int8_t bos, int8_t eos) const override; - Result decode(uint64_t prev_token, uint64_t token) - const override; + ::executorch::runtime::Result decode( + uint64_t prev_token, + uint64_t token) const override; private: template @@ -74,5 +76,18 @@ class Tiktoken : public Tokenizer { Re2UPtr _regex; Re2UPtr _special_token_regex; }; + +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::llm::Decoder; +using ::executorch::extension::llm::Encoder; +using ::executorch::extension::llm::Re2UPtr; +using ::executorch::extension::llm::Tiktoken; } // namespace executor } // namespace torch diff --git a/extension/llm/tokenizer/tokenizer.h b/extension/llm/tokenizer/tokenizer.h index b49dc245eb3..3115cbdff70 100644 --- a/extension/llm/tokenizer/tokenizer.h +++ b/extension/llm/tokenizer/tokenizer.h @@ -17,8 +17,9 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { +namespace extension { +namespace llm { // A tokenizer interface. class Tokenizer { @@ -26,15 +27,16 @@ class Tokenizer { explicit Tokenizer() {} virtual ~Tokenizer() {} - virtual Error load(const std::string& tokenizer_path) = 0; + virtual ::executorch::runtime::Error load( + const std::string& tokenizer_path) = 0; - virtual Result> + virtual ::executorch::runtime::Result> encode(const std::string& input, int8_t bos, int8_t eos) const = 0; - Error decode_verify(uint64_t token) const { + ::executorch::runtime::Error decode_verify(uint64_t token) const { if (!initialized_) { ET_LOG(Error, "Tokenizer not initialized"); - return Error::NotSupported; + return ::executorch::runtime::Error::NotSupported; } if (token >= vocab_size_) { ET_LOG( @@ -42,13 +44,14 @@ class Tokenizer { "token %" PRIu64 " is out side of vacab range %d", token, vocab_size_); - return Error::NotSupported; + return ::executorch::runtime::Error::NotSupported; } - return Error::Ok; + return ::executorch::runtime::Error::Ok; } - virtual Result decode(uint64_t prev_token, uint64_t token) - const = 0; + virtual ::executorch::runtime::Result decode( + uint64_t prev_token, + uint64_t token) const = 0; // getters int32_t vocab_size() const { @@ -70,5 +73,14 @@ class Tokenizer { uint64_t eos_tok_ = 0; }; +} // namespace llm +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::llm::Tokenizer; } // namespace executor } // namespace torch From 8bb29d0b39c3880db393f964c6e8b8afefab434d Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 22 Aug 2024 11:51:14 -0700 Subject: [PATCH 006/531] [ET-VK] Implement slice as a view ## Context TSIA. Implement slice as a view operator. This is only valid under the following conditions: * All dims preceding the sliced dim in the dim order have a size of 1 * start is 0 * step is 1 The reasoning for these restrictions is so that the offset of the slice view with respect to the source buffer is 0. More details are in the comments. 
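As an illustration (taken from the new test cases rather than new behavior): a width-packed buffer tensor of sizes [1, 5, 1, 10] sliced along dim 1 with start=0, end=3, step=1 becomes a [1, 3, 1, 10] view whose first element coincides with the first element of the source buffer, so it can alias the source storage at offset 0. A slice with start=1 or step=2 would require a nonzero, resize-dependent offset and is rejected by check_slice_view_args.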
To test the operator effectively, this diff also extends the test codegen to handle multiple test suites for one operator, each with a different configuration. Differential Revision: [D61666462](https://our.internmc.facebook.com/intern/diff/D61666462/) [ghstack-poisoned] --- .../vulkan/runtime/graph/ops/impl/Slice.cpp | 147 +++++++++++++++++- backends/vulkan/test/op_tests/cases.py | 38 ++++- .../vulkan/test/op_tests/generate_op_tests.py | 8 +- .../vulkan/test/op_tests/utils/codegen.py | 3 + .../test/op_tests/utils/codegen_base.py | 5 +- 5 files changed, 195 insertions(+), 6 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 8b323bafedd..01dafa427c8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -149,6 +149,124 @@ void add_slice_tensor_out_node( } } +std::vector get_slice_sizes( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref) { + const int64_t dim = graph.extract_scalar(dim_ref); + std::optional opt_start = + graph.extract_optional_scalar(opt_start_ref); + std::optional opt_end = + graph.extract_optional_scalar(opt_end_ref); + + int64_t dim_size = graph.size_at(dim, in_ref); + int64_t start = opt_start.value_or(0); + int64_t end = opt_end.value_or(dim_size); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + std::vector new_out_sizes = graph.sizes_of(in_ref); + new_out_sizes.at(dim) = end - start; + + return new_out_sizes; +} + +void resize_slice_view_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)args; + vTensorPtr out = graph->get_tensor(extra_args[0]); + + std::vector new_out_sizes = get_slice_sizes( + *graph, + extra_args[1], // input + extra_args[2], // dim + extra_args[3], // optional start + extra_args[4]); // optional end + + out->virtual_resize(new_out_sizes); +} + +void check_slice_view_args( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + VK_CHECK_COND( + graph.val_is_view_of(out_ref, in_ref), + "output must be a view of the input"); + + const int64_t dim = graph.extract_scalar(dim_ref); + const int64_t dim_size = graph.size_at(dim, in_ref); + + int64_t start = + graph.extract_optional_scalar(opt_start_ref).value_or(0); + int64_t end = graph.extract_optional_scalar(opt_end_ref).value_or(0); + int64_t step = + graph.extract_optional_scalar(opt_step_ref).value_or(1); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + // The start idx must be 0; this is to ensure that the start of the slice view + // does not have any offset with respect to the base buffer storage. If the + // offset is nonzero, then it will potentially change upon a resize; however + // the buffer offset of the view tensor will have been "locked in" when the + // descriptor for its buffer storage is bound to a compute shader. Therefore + // there is no way to update the offset of the view once it has been bound. + VK_CHECK_COND(start == 0, "start must be 0 for slice view"); + VK_CHECK_COND(step == 1, "step must be 1 for slice view"); + + VK_CHECK_COND( + end < dim_size, "end must be less than dim size for slice view"); + + // We must also check that all earlier dims in the dim order have a size of 1. 
+ // This ensures that the slice view encompasses a contiguous memory region of + // the source tensor's memory buffer. + std::vector in_sizes = graph.sizes_of(in_ref); + std::vector in_dim_order = graph.dim_order_of(in_ref); + for (int i = 0; i < in_dim_order.size(); ++i) { + if (in_dim_order[i] == dim) { + break; + } + VK_CHECK_COND(in_sizes[in_dim_order[i]] == 1); + } +} + +void add_slice_view_node( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + check_slice_view_args( + graph, + in_ref, + dim_ref, + opt_start_ref, + opt_end_ref, + opt_step_ref, + out_ref); + + std::vector new_out_sizes = + get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref); + + graph.get_tensor(out_ref)->virtual_resize(new_out_sizes); + + graph.execute_nodes().emplace_back(new ExecuteNode( + resize_slice_view_node, + {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref})); +} + void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { return add_slice_tensor_out_node( graph, @@ -160,9 +278,36 @@ void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { args[5]); } +void slice_tensor(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + ValueRef out = args[5]; + + // Special case if out is a view of in + if (graph.is_buffer_storage(out) && graph.val_is_view_of(out, in)) { + add_slice_view_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); + return; + } + + add_slice_tensor_out_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); +} + REGISTER_OPERATORS { VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out); - VK_REGISTER_OP(aten.slice.Tensor, slice_tensor_out); + VK_REGISTER_OP(aten.slice.Tensor, slice_tensor); } } // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index c5088ffdb32..8fab1cce501 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -465,8 +465,8 @@ def get_view_inputs(): return test_suite -@register_test_suite(["aten.slice.Tensor", "aten.slice_copy.Tensor"]) -def get_slice_inputs(): +@register_test_suite("aten.slice_copy.Tensor") +def get_slice_out_inputs(): Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) Test.__new__.__defaults__ = (None, 0, None, None, 1) @@ -548,6 +548,40 @@ def get_slice_inputs(): return test_suite +def get_slice_view_inputs(): + Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) + Test.__new__.__defaults__ = (None, 0, None, None, 1) + + # Slice by channel + test_cases = [ + Test(self=[1, 5, 1, 10], dim=1, start=0, end=3), + Test(self=[1, 5, 1, 10], dim=1, start=0, end=4), + Test(self=[1, 5, 3, 7], dim=1, start=0, end=2), + Test(self=[1, 5, 8, 7], dim=1, start=0, end=3), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + + test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer"] + test_suite.layouts = ["utils::kWidthPacked"] + test_suite.data_gen = "make_seq_tensor" + test_suite.is_view_op = True + + return test_suite + + +@register_test_suite(["aten.slice.Tensor"]) +def get_slice_inputs(): + texture_test_suite = get_slice_out_inputs() + texture_test_suite.test_name_suffix = "no_view" + + view_test_suite = get_slice_view_inputs() + 
view_test_suite.test_name_suffix = "view" + + return [view_test_suite, texture_test_suite] + + @register_test_suite(["aten.transpose.int"]) def get_transpose_inputs(): Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"]) diff --git a/backends/vulkan/test/op_tests/generate_op_tests.py b/backends/vulkan/test/op_tests/generate_op_tests.py index 71047ac6f49..08b3da8407f 100644 --- a/backends/vulkan/test/op_tests/generate_op_tests.py +++ b/backends/vulkan/test/op_tests/generate_op_tests.py @@ -41,9 +41,13 @@ def process_test_suites( f_map: Dict[str, NativeFunction], test_suites: Dict[str, TestSuite], ) -> None: - for registry_name, op_test_suite in test_suites.items(): + for registry_name, op_test_suites in test_suites.items(): f = f_map[registry_name] - cpp_generator.add_suite(registry_name, f, op_test_suite) + if isinstance(op_test_suites, list): + for suite in op_test_suites: + cpp_generator.add_suite(registry_name, f, suite) + else: + cpp_generator.add_suite(registry_name, f, op_test_suites) @local.parametrize( diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py index b39801e7660..9a2a2745fe9 100644 --- a/backends/vulkan/test/op_tests/utils/codegen.py +++ b/backends/vulkan/test/op_tests/utils/codegen.py @@ -606,6 +606,9 @@ def gen_conditional_skips(self) -> str: def gen_op_check_fn(self) -> str: op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n" if self.should_prepack: op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/codegen_base.py index 1ebebe699a0..d8d01fc5f88 100644 --- a/backends/vulkan/test/op_tests/utils/codegen_base.py +++ b/backends/vulkan/test/op_tests/utils/codegen_base.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import re -from typing import Any, List +from typing import Any, List, Optional from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup @@ -58,6 +58,7 @@ def __init__(self, input_cases: List[Any]): self.rtol: str = "1e-5" self.is_view_op: bool = False + self.test_name_suffix: Optional[str] = None def supports_prepack(self): return len(self.prepacked_args) > 0 @@ -112,6 +113,8 @@ def __init__(self, f: NativeFunction, test_suite: TestSuite): self.f = f self.suite_def = test_suite self.op_name = f.func.name.unambiguous_name() + if test_suite.test_name_suffix is not None: + self.op_name += f"_{test_suite.test_name_suffix}" self.f_sig = CppSignatureGroup.from_native_function( self.f, method=False, fallback_binding=self.f.manual_cpp_binding From 4442a91fe1b93bb9896d6a090ffe51885b852411 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 22 Aug 2024 12:16:28 -0700 Subject: [PATCH 007/531] Cross attention mask C++ Differential Revision: D61511514 Pull Request resolved: https://github.com/pytorch/executorch/pull/4815 --- .../cross_attention/cross_attention_mask.cpp | 169 ++++++++++++++++++ .../cross_attention/cross_attention_mask.h | 71 ++++++++ .../cross_attention_mask_test.cpp | 71 ++++++++ .../flamingo/cross_attention/targets.bzl | 25 +++ 4 files changed, 336 insertions(+) create mode 100644 examples/models/flamingo/cross_attention/cross_attention_mask.cpp create mode 100644 examples/models/flamingo/cross_attention/cross_attention_mask.h create mode 100644 examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp create mode 100644 examples/models/flamingo/cross_attention/targets.bzl diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp new file mode 100644 index 00000000000..b2a2a6a8063 --- /dev/null +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include + +namespace torch::executor { + +// Fowrward declaration needed for ARM compilers. +int32_t safe_size_t_to_sizes_type(size_t value); +std::vector> _get_image_attention_intervals( + const std::vector& tokens, + int image_token_id); + +int32_t safe_size_t_to_sizes_type(size_t value) { + if (value > + static_cast(std::numeric_limits::max())) { + throw std::overflow_error( + "size_t value too large for TensorImpl::SizesType"); + } + return static_cast(value); +} + +/** + * Returns a list of lists of the form [start, end) where start is the index + * of the current image token and end is the index of the next image token, + * exclusive. + * + * Example: + * >>> text = "These are two dogs. This is a cat." + * >>> size_t image_token_id = 1; + * >>> std::vector tokens = {1, 1, 9673, 527, 1403, 12875, 13, 1, 1115, + * 374, 264, 8415]}; + * >>> transform = VisionCrossAttentionMask(tile_size=400, patch_size=40, + * image_token_id=1) + * >>> intervals = _get_image_attention_intervals(tokens, image_token_id) + * [[0, 7], [1, 7], [7, 12]] + * + * @param tokens List of token IDs in the text sequence. + * @param image_token_id The value of the image token. + * + * @returns Vector of vectors of the form [start, end) indicating the range of + * positions in the text sequence that should attend to the image. 
+ */ +std::vector> _get_image_attention_intervals( + const std::vector& tokens, + int image_token_id) { + std::vector> vision_masks; + int end = tokens.size(); + std::vector vision_token_locations; + + // Find all vision token locations. + for (int i = 0; i < tokens.size(); ++i) { + if (tokens[i] == image_token_id) { + vision_token_locations.push_back(i); + } + } + + // Return empty vector if there are no images. + if (vision_token_locations.empty()) { + return vision_masks; + } + + // If there is only one image, it will attend to subsequent text until end. + if (vision_token_locations.size() == 1) { + vision_masks.push_back({vision_token_locations[0], end}); + return vision_masks; + } + + // Construct intervals from previous image token to next image token. + for (int i = 0; i < vision_token_locations.size() - 1; ++i) { + vision_masks.push_back( + {vision_token_locations[i], vision_token_locations[i + 1]}); + } + + // Last image will attend to subsequent text until end. + vision_masks.push_back({vision_token_locations.back(), end}); + + // If there are consecutive vision tokens, they should all attend to the + // same subsequent text. + int last_mask_end = vision_masks.back()[1]; + for (auto it = vision_masks.rbegin(); it != vision_masks.rend(); ++it) { + if ((*it)[0] == (*it)[1] - 1) { + (*it)[1] = last_mask_end; + } + last_mask_end = (*it)[1]; + } + + return vision_masks; +} + +std::vector cross_attention_mask( + const std::vector& tokens, + const std::vector& images, + size_t tile_size, + size_t patch_size, + int image_token_id, + std::vector>& out) { + size_t patch_grid_size = tile_size / patch_size; + size_t patches_per_tile = patch_grid_size * patch_grid_size; + + std::vector> image_intervals = + _get_image_attention_intervals(tokens, image_token_id); + + if (image_intervals.size() != images.size()) { + throw std::runtime_error( + "The number of image tokens (" + + std::to_string(image_intervals.size()) + + ") does not match the number of images (" + + std::to_string(images.size()) + ")"); + } + + // Create mask for each individual image based on its number of tokens, + // which can vary based on number of tiles since they are not yet tile padded. + // The masks are padded and concatenated together in the batch collator. + std::vector cross_attention_masks; + size_t text_seq_len = tokens.size(); + for (size_t image_idx = 0; image_idx < image_intervals.size(); ++image_idx) { + size_t n_tiles = images[image_idx].size(0); + size_t image_seq_len = + n_tiles * (patches_per_tile + 1); // +1 for the CLS token. + + // Mask will be block of 1s at the corresponding interval in the text. + // It is not a causal block because all the image tokens correspond + // to a single image, so text tokens attend to all the image's tokens. + std::vector sizes = { + safe_size_t_to_sizes_type(text_seq_len), + safe_size_t_to_sizes_type(image_seq_len)}; + + // Allocate the underlying data to be handled by the managed tensor. + size_t num_elements = text_seq_len * image_seq_len; + size_t stride = image_seq_len; + std::vector mask_data(num_elements); + + ManagedTensor mask(mask_data.data(), sizes, ScalarType::Int); + cross_attention_masks.emplace_back(std::move(mask)); + + // Add the allocated data to the output vector. + out.emplace_back(std::move(mask_data)); + + // All rows of tensor in the text_seq_len dimension within the interval are + // set to 1 (true). + size_t start = image_intervals[image_idx][0]; + size_t end = image_intervals[image_idx][1]; // End is exclusive. 
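+    // For example (mirroring cross_attention_mask_test.cpp added below): tokens
+    // {1, 1, 9673, 527, 1403, 12875, 13, 1, 1115, 374, 264, 8415} with
+    // image_token_id = 1 give intervals [0, 7), [1, 7) and [7, 12), so the
+    // loop below sets rows 0..6, 1..6 and 7..11 of the three masks to 1.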
+ for (size_t i = start; i < end; ++i) { + for (size_t j = 0; j < image_seq_len; ++j) { + size_t unrolled_index = i * image_seq_len + j; + if (unrolled_index >= out[image_idx].size()) { + throw std::out_of_range( + "Index " + std::to_string(unrolled_index) + + " out of range of output tensor."); + } + out[image_idx][i * stride + j] = 1; + } + } + } + + return cross_attention_masks; +} + +} // namespace torch::executor diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.h b/examples/models/flamingo/cross_attention/cross_attention_mask.h new file mode 100644 index 00000000000..6998d91ad4a --- /dev/null +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +namespace torch { +namespace executor { + +/** + * Computes the cross-attention mask for text + image inputs. Text tokens that + * participate in cross-attention with an image token will show True in the mask + * and follow the interleaved structure laid out in Fig. 7 of the Flamingo paper + * (https://arxiv.org/pdf/2204.14198): + * + * (1) Text tokens immediately following the image token up until the next + * image token (2) Consecutive image tokens attend to subsequent text tokens + * + * :: + * + * ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ + * img1 │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ │ │ │ │ │ │ │ │ │ + * └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ + * ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ + * img2 │ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ │ │ │ │ │ │ │ │ │ + * └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ + * ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ + * img3 │ │ │ │ │ │ │ │ │ │ │ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ + * └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ + * These are two dogs. This is a cat. + * + * + * + * Resultant mask is constructed per image and is of shape (text_seq_len, + * image_seq_len), where True indicates that the token outputted from the image + * encoder attends to the token in the text sequence in cross-attention. A list + * of these masks are returned with length equal to number of images in the + * sample. + * + * @param tokens Vector of tokens participating in the cross attention. + * @param images Vector of images participating in the cross attention. + * @param tile_size The size of the image tiles from the image transform. + * @param patch_size The size of each patch. Used to divide the tiles into + * patches. E.g. for patch_size = 40, a tile of shape (400, 400) will have 10x10 + * grid of patches with shape (40, 40) each. image_token_id (int): Token ID of + * the image special token. + * @param image_token_id The value of the image token. + * @param out Out vector holding the raw data wrapped by the returned cross + * attention masks. + * + * @returns A vector of cross attention masks, as Tensors, one for each image. 
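+ *
+ * Note (restating the implementation in cross_attention_mask.cpp, not a new
+ * contract): image_seq_len for an image is
+ * n_tiles * ((tile_size / patch_size)^2 + 1), where the +1 accounts for the
+ * CLS token, so each returned mask has shape (tokens.size(), image_seq_len).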
+ */ +std::vector cross_attention_mask( + const std::vector& tokens, + const std::vector& images, + size_t tile_size, + size_t patch_size, + int image_token_id, + std::vector>& out); + +} // namespace executor +} // namespace torch diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp new file mode 100644 index 00000000000..5b9e58c216f --- /dev/null +++ b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +using namespace ::testing; +using torch::executor::ManagedTensor; +using torch::executor::ScalarType; +using torch::executor::Tensor; +using torch::executor::TensorImpl; + +TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) { + std::vector tokens = { + 1, 1, 9673, 527, 1403, 12875, 13, 1, 1115, 374, 264, 8415}; + + // Initialize image tensors. + TensorImpl::SizesType sizes[2] = {2, 2}; + TensorImpl::DimOrderType dim_order[2] = {0, 1}; + TensorImpl::StridesType strides[2] = {2, 1}; + + int32_t a_data[4] = {1, 2, 3, 4}; + auto a_impl = + TensorImpl(ScalarType::Int, 2, sizes, a_data, dim_order, strides); + Tensor a(&a_impl); + + int32_t b_data[4] = {5, 6, 7, 8}; + auto b_impl = + TensorImpl(ScalarType::Int, 2, sizes, b_data, dim_order, strides); + Tensor b(&b_impl); + + int32_t c_data[4] = {9, 10, 11, 12}; + auto c_impl = + TensorImpl(ScalarType::Int, 2, sizes, c_data, dim_order, strides); + Tensor c(&c_impl); + + std::vector images = {a, b, c}; + std::vector> mask_data; + std::vector output_masks = + torch::executor::cross_attention_mask( + tokens, + images, + /*tile_size=*/1, + /*patch_size=*/1, + /*image_token_id=*/1, + /*out=*/mask_data); + + // Check contents of the mask. + std::vector> expected_intervals = { + {0, 7}, {1, 7}, {7, 12}}; + for (size_t mask_idx = 0; mask_idx < output_masks.size(); ++mask_idx) { + ManagedTensor& output_mask = output_masks[mask_idx]; + Tensor output_tensor = output_mask.get_aliasing_tensor(); + for (size_t i = 0; i < output_tensor.size(0); ++i) { + for (size_t j = 0; j < output_tensor.strides()[0]; ++j) { + size_t unrolled_index = i * output_tensor.strides()[0] + j; + if (i >= expected_intervals[mask_idx][0] && + i < expected_intervals[mask_idx][1]) { + EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 1); + } else { + EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 0); + } + } + } + } +} diff --git a/examples/models/flamingo/cross_attention/targets.bzl b/examples/models/flamingo/cross_attention/targets.bzl new file mode 100644 index 00000000000..7bc13270aa9 --- /dev/null +++ b/examples/models/flamingo/cross_attention/targets.bzl @@ -0,0 +1,25 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + runtime.cxx_library( + name = "cross_attention_mask", + srcs = ["cross_attention_mask.cpp"], + exported_headers = ["cross_attention_mask.h"], + exported_deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/extension/runner_util:managed_tensor", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], + ) + + runtime.cxx_test( + name = "cross_attention_mask_test", + srcs = ["cross_attention_mask_test.cpp"], + deps = [":cross_attention_mask"], + ) From c2044a425679738d99fe68d5785c332e001030b4 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Thu, 22 Aug 2024 12:42:13 -0700 Subject: [PATCH 008/531] [ET-VK] Register conv_with_clamp custom op Differential Revision: D60205360 Pull Request resolved: https://github.com/pytorch/executorch/pull/4829 --- backends/vulkan/partitioner/supported_ops.py | 6 ++- backends/vulkan/passes/custom_ops_defs.py | 37 +++++++++++++++++++ .../runtime/graph/ops/impl/Convolution.cpp | 1 + backends/vulkan/test/test_vulkan_delegate.py | 36 ++++++++++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py index 08d7f96a6b9..ca7ce72caed 100644 --- a/backends/vulkan/partitioner/supported_ops.py +++ b/backends/vulkan/partitioner/supported_ops.py @@ -8,7 +8,10 @@ import operator -from executorch.backends.vulkan.passes.custom_ops_defs import grid_priors_op # noqa +from executorch.backends.vulkan.passes.custom_ops_defs import ( # noqa + conv_with_clamp_op, + grid_priors_op, +) from executorch.exir.dialects._ops import ops as exir_ops @@ -84,6 +87,7 @@ def __contains__(self, op): CONVOLUTION_OPS = [ exir_ops.edge.aten.convolution.default, + exir_ops.edge.et_vk.conv_with_clamp.default, ] REDUCTION_OPS = [ diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py index 62f21bfee63..fd586b665a0 100644 --- a/backends/vulkan/passes/custom_ops_defs.py +++ b/backends/vulkan/passes/custom_ops_defs.py @@ -48,6 +48,43 @@ def conv_with_clamp_impl( conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name) +def conv_with_clamp_out_impl( + input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + transposed=False, + output_padding=0, + groups=1, + output_min=-float("inf"), + output_max=float("inf"), + out=None, +): + out = conv_with_clamp_impl( + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_min, + output_max, + ) + return out + + +name = "conv_with_clamp.out" +lib.define( + f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max, *, Tensor(a!) 
out) -> Tensor(a!)" +) +lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd") + + # The dimension of x should be larger than 1 def grid_priors_impl( x, diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 52af0542b6a..74113197d46 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -562,6 +562,7 @@ void conv(ComputeGraph& graph, const std::vector& args) { REGISTER_OPERATORS { VK_REGISTER_OP(aten.convolution.default, conv); VK_REGISTER_OP(conv_with_clamp.default, conv); + VK_REGISTER_OP(et_vk.conv_with_clamp.default, conv); } } // namespace vkcompute diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 9f57ec49a89..d80809ec79f 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -1633,6 +1633,42 @@ def forward(self, x): memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + def test_vulkan_backend_conv_with_clamp(self): + class ConvWithClampModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.weight = torch.randn(6, 8, 3, 3) + self.bias = torch.randn(8) + self.stride = (1, 2) + self.padding = (2, 3) + self.dilation = (1, 1) + self.transposed = True + self.output_padding = (0, 1) + self.groups = 1 + self.output_min = 0 + self.output_max = 10 + + def forward(self, x): + return torch.ops.et_vk.conv_with_clamp( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.transposed, + self.output_padding, + self.groups, + self.output_min, + self.output_max, + ) + + self.lower_module_and_test_output( + ConvWithClampModule(), + (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),), + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + def test_vulkan_backend_grid_priors(self): class GridPriorsModule(torch.nn.Module): def __init__(self): From 3af50f9e28ceb6b4cb89813e8cba26bf45ed252e Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 22 Aug 2024 13:13:59 -0700 Subject: [PATCH 009/531] emit metadata Differential Revision: D61625159 Pull Request resolved: https://github.com/pytorch/executorch/pull/4837 --- exir/emit/_emit_program.py | 27 +++++++++++++++++++ exir/tests/test_joint_graph.py | 20 ++++++++++++++ .../training/test/training_loop_test.cpp | 2 +- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index 0aebab649e5..bf40a78bb6e 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -78,6 +78,29 @@ def _remove_non_user_outputs(exported_program: ExportedProgram) -> torch.fx.Grap return gm +# For each entry point in the model, determine if its a joint graph, +# and if it is return a map of the indices in the model output that the +# gradient outputs start at and that the parameter outputs start at. 
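+# For example (matching exir/tests/test_joint_graph.py): a joint graph for a
+# single linear layer places its weight and bias gradients at output indices
+# 1-2 and the weight and bias parameters at indices 3-4, so this returns a map
+# like {"__et_training_gradients_index_forward": 1,
+#       "__et_training_parameters_index_forward": 3}.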
+def _get_training_metadata(methods: Dict[str, ExportedProgram]) -> Dict[str, int]: + gradients_method_prefix = "__et_training_gradients_index_" + parameters_method_prefix = "__et_training_parameters_index_" + training_metadata = {} + for name, method in methods.items(): + found_grad = False + found_param = False + i = 0 + for output_spec in method.graph_signature.output_specs: + if output_spec.kind == OutputKind.GRADIENT_TO_PARAMETER and not found_grad: + training_metadata[gradients_method_prefix + name] = i + found_grad = True + elif output_spec.kind == OutputKind.TOKEN and not found_param: + assert found_grad # Params must come after gradients + training_metadata[parameters_method_prefix + name] = i + found_param = True + i += 1 + return training_metadata + + def emit_program( methods: Union[ExportedProgram, Dict[str, ExportedProgram]], emit_stacktrace: bool = False, @@ -143,6 +166,10 @@ def emit_program( emitter.instr_id_to_delegate_debug_id_map ) + training_metadata = _get_training_metadata(methods) + if len(training_metadata) > 0: + plans.extend(emitter._emit_prim_getters(training_metadata)) + # emit any primitive getters if prim_getters is not None: plans.extend(emitter._emit_prim_getters(prim_getters)) diff --git a/exir/tests/test_joint_graph.py b/exir/tests/test_joint_graph.py index 7c80439610b..0e5a322397d 100644 --- a/exir/tests/test_joint_graph.py +++ b/exir/tests/test_joint_graph.py @@ -108,3 +108,23 @@ def forward(self, x, y): self.assertTrue(torch.allclose(m.linear.bias.grad, et_outputs[2])) self.assertTrue(torch.allclose(m.linear.weight, et_outputs[3])) self.assertTrue(torch.allclose(m.linear.bias, et_outputs[4])) + + self.assertEqual( + len(et.executorch_program.execution_plan), 3 + ) # forward + 2 training metadata functions + + # gradient outputs start at index 1 + self.assertEqual( + et.executorch_program.execution_plan[1] # pyre-ignore + .values[0] + .val.int_val, + 1, + ) + + # parameter outputs start at index 3 + self.assertEqual( + et.executorch_program.execution_plan[2] # pyre-ignore + .values[0] + .val.int_val, + 3, + ) diff --git a/extension/training/test/training_loop_test.cpp b/extension/training/test/training_loop_test.cpp index 28931fbfc0f..8e62663c9f7 100644 --- a/extension/training/test/training_loop_test.cpp +++ b/extension/training/test/training_loop_test.cpp @@ -23,7 +23,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-CArray using namespace ::testing; -using namespace torch::executor::training::optimizer; +using namespace executorch::extension::training::optimizer; using namespace torch::executor::testing; using exec_aten::ScalarType; using exec_aten::Tensor; From 0a211022a625cd07d874fee8ad6c1d5cc964b90e Mon Sep 17 00:00:00 2001 From: Mengtao Yuan Date: Thu, 22 Aug 2024 13:22:17 -0700 Subject: [PATCH 010/531] Add option to generate full logits Differential Revision: D61575026 Pull Request resolved: https://github.com/pytorch/executorch/pull/4828 --- examples/models/llama2/eval_llama.py | 2 ++ examples/models/llama2/export_llama_lib.py | 10 ++++++++++ examples/models/llama2/llama_transformer.py | 9 +++++++++ examples/models/llama2/model.py | 2 ++ 4 files changed, 23 insertions(+) diff --git a/examples/models/llama2/eval_llama.py b/examples/models/llama2/eval_llama.py index 0495c76bbf1..4daeaf7afa5 100644 --- a/examples/models/llama2/eval_llama.py +++ b/examples/models/llama2/eval_llama.py @@ -22,6 +22,8 @@ def main() -> None: modelname = "llama2" parser = build_args_parser() args = parser.parse_args() + # Overrides this arg, because evaluation requires 
full logits. + args.generate_full_logits = True eval_llama(modelname, args) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index c22c0a3c3ce..8ff5d3aa265 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -296,6 +296,13 @@ def build_args_parser() -> argparse.ArgumentParser: help="Generate the ETRecord debug artifact.", ) + parser.add_argument( + "--generate_full_logits", + action="store_true", + required=False, + default=True, + help="Generate logits for all inputs.", + ) return parser @@ -405,6 +412,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: params_path=params_path, use_kv_cache=args.use_kv_cache, use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache, + generate_full_logits=args.generate_full_logits, weight_type=weight_type, enable_dynamic_shape=args.enable_dynamic_shape, verbose=args.verbose, @@ -590,6 +598,7 @@ def _load_llama_model( params_path: str, use_kv_cache: bool = False, use_sdpa_with_kv_cache: bool = False, + generate_full_logits: bool = True, weight_type: WeightType = WeightType.LLAMA, enable_dynamic_shape: bool = False, verbose: bool = False, @@ -616,6 +625,7 @@ def _load_llama_model( params=params_path, use_kv_cache=use_kv_cache, use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, fairseq2=weight_type == WeightType.FAIRSEQ2, max_seq_len=max_seq_len, enable_dynamic_shape=enable_dynamic_shape, diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 4ae12b0f647..81b47a3a5d8 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -96,6 +96,10 @@ class ModelArgs: use_sdpa_with_kv_cache_op: bool = ( False # Use custom sdpa op that updates kv cache in-place ) + # Generate logits for all inputs. When True, this significantly increases memory usage + # at runtime. Enable it only when necessary (e.g., for perplexity tools that require + # logits for all input tokens.)
+ generate_full_logits: bool = True enable_dynamic_shape: bool = False # export model with dynamic shape support use_hf_rope: bool = False # Use HuggingFace's RoPE implementation rope_theta: Optional[float] = ( @@ -442,6 +446,7 @@ def __init__(self, params: ModelArgs): self.norm = RMSNorm(params.dim, eps=params.norm_eps) self.output = nn.Linear(params.dim, params.vocab_size, bias=False) self.use_kv_cache = params.use_kv_cache + self.generate_full_logits = params.generate_full_logits self.max_seq_len = params.max_seq_len if params.use_hf_rope: self.precompute_freqs_cis = hf_precompute_freqs_cis @@ -512,6 +517,10 @@ def forward( input_pos, ) + if not self.generate_full_logits: + # Only the last logit is used for the new generated token + h = h[:, -1, :] + h = self.norm(h) logits = self.output(h) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index fdf0dc707e4..b375399f336 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -61,6 +61,7 @@ def __init__(self, **kwargs): self.use_kv_cache = kwargs.get("use_kv_cache", False) self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False) + self.generate_full_logits = kwargs.get("generate_full_logits", True) self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) self.max_seq_len = kwargs.get("max_seq_len", 128) @@ -145,6 +146,7 @@ def __init__(self, **kwargs): max_batch_size=max_batch_size, use_kv_cache=self.use_kv_cache, use_sdpa_with_kv_cache_op=self.use_sdpa_with_kv_cache_op, + generate_full_logits=self.generate_full_logits, enable_dynamic_shape=self.enable_dynamic_shape, **params, ) From 65473de3b0d1227a9fd493c6978676f487d9f728 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Thu, 22 Aug 2024 16:25:35 -0400 Subject: [PATCH 011/531] [ET-VK][ez] Introduce `check_close` function in `compute_api_test` to account for small numerical differences Differential Revision: D61666459 Pull Request resolved: https://github.com/pytorch/executorch/pull/4841 --- backends/vulkan/test/utils/test_utils.cpp | 6 ++++++ backends/vulkan/test/utils/test_utils.h | 6 ++++++ backends/vulkan/test/vulkan_compute_api_test.cpp | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index ad496873695..6c056cc9d90 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -482,3 +482,9 @@ void execute_graph_and_check_output( } } } + +bool check_close(float a, float b, float atol, float rtol) { + float max = std::max(std::abs(a), std::abs(b)); + float diff = std::abs(a - b); + return diff <= (atol + rtol * max); +} diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index f9969eddbf4..bf549446170 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -242,3 +242,9 @@ void print_vector( } std::cout << std::endl; } + +// +// Misc. 
Utilities +// + +bool check_close(float a, float b, float atol = 1e-4, float rtol = 1e-5); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 307593d8fdb..ee2d119b6be 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -601,7 +601,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { EXPECT_TRUE(data_out.size() == ref_out.size()); for (size_t i = 0; i < data_out.size(); ++i) { - EXPECT_TRUE(data_out[i] == ref_out[i]); + EXPECT_TRUE(check_close(data_out[i], ref_out[i])); } } From 87b38cfdbde3c121f7446c71b7749698e7cf0324 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Thu, 22 Aug 2024 17:27:19 -0400 Subject: [PATCH 012/531] [ET-VK][ez] Empty initialize ShaderInfo and add `bool()` operator Differential Revision: D61666460 Pull Request resolved: https://github.com/pytorch/executorch/pull/4842 --- backends/vulkan/runtime/vk_api/Shader.h | 8 ++++++-- backends/vulkan/test/vulkan_compute_api_test.cpp | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h index 34c2d95c932..1e3b2a799f2 100644 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ b/backends/vulkan/runtime/vk_api/Shader.h @@ -53,8 +53,8 @@ class ShaderLayout final { struct ShaderInfo final { struct { - const uint32_t* bin; - uint32_t size; + const uint32_t* bin = nullptr; + uint32_t size = 0u; } src_code; std::string kernel_name{""}; @@ -71,6 +71,10 @@ struct ShaderInfo final { const uint32_t, std::vector, const utils::uvec3 tile_size); + + operator bool() const { + return src_code.bin != nullptr; + }; }; bool operator==(const ShaderInfo& _1, const ShaderInfo& _2); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index ee2d119b6be..cbd409112ff 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -168,6 +168,13 @@ std::vector get_reference_strides( return {}; } +TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { + vkapi::ShaderInfo empty_shader_info; + EXPECT_FALSE(empty_shader_info); + EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr); + EXPECT_TRUE(empty_shader_info.src_code.size == 0u); +} + TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& sizes : standard_sizes_to_test) { if (sizes.size() < 3) { From bfc5b17393ca301f88116273629b1debfd21544a Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Thu, 22 Aug 2024 18:46:52 -0400 Subject: [PATCH 013/531] [ET-VK][ez] Enable no-op ExecuteNodes for view ops Differential Revision: D61666465 Pull Request resolved: https://github.com/pytorch/executorch/pull/4843 --- .../vulkan/runtime/graph/ops/ExecuteNode.cpp | 15 +++++++++++++++ backends/vulkan/runtime/graph/ops/ExecuteNode.h | 16 +++++++++++++++- backends/vulkan/test/vulkan_compute_api_test.cpp | 13 +++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 3b2a826f87f..2cb00ba65af 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -35,7 +35,22 @@ ExecuteNode::ExecuteNode( graph.update_descriptor_counts(shader, /*execute = */ true); } +ExecuteNode::ExecuteNode( + const ResizeFunction& resize_fn, + const std::vector& 
resize_args) + : shader_(), + global_workgroup_size_({0u, 0u, 0u}), + local_workgroup_size_({0u, 0u, 0u}), + args_(), + params_(), + spec_vars_(), + resize_fn_(resize_fn), + resize_args_(resize_args) {} + void ExecuteNode::encode(ComputeGraph* graph) { + if (!shader_) { + return; + } api::Context* const context = graph->context(); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 1fff14e020e..dece9ddb50d 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -48,7 +48,7 @@ class ExecuteNode final { const std::vector&, const std::vector&)>; - ExecuteNode( + explicit ExecuteNode( ComputeGraph& graph, const vkapi::ShaderInfo& shader, const utils::uvec3& global_workgroup_size, @@ -59,6 +59,15 @@ class ExecuteNode final { const ResizeFunction& resize_fn = nullptr, const std::vector& resize_args = {}); + /* + * This overload of the ExecuteNode constructor is used to register ops which + * update a tensor view. No shader is dispatched, but the node still needs to + * update the view's sizes and strides after a resize. + */ + explicit ExecuteNode( + const ResizeFunction& resize_fn = nullptr, + const std::vector& resize_args = {}); + ~ExecuteNode() = default; void encode(ComputeGraph* graph); @@ -83,6 +92,11 @@ class ExecuteNode final { const vkapi::SpecVarList spec_vars_; const ResizeFunction resize_fn_; const std::vector resize_args_; + + public: + operator bool() const { + return shader_; + } }; } // namespace vkcompute diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index cbd409112ff..af92728cb0c 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -982,6 +982,19 @@ TEST(VulkanComputeGraphTest, test_values_string) { EXPECT_TRUE(stored == "hello, world"); } +TEST(VulkanComputeGraphTest, empty_init_executenode_test) { + ExecuteNode node(nullptr, {}); + EXPECT_FALSE(node); + + GraphConfig config; + ComputeGraph graph(config); + + // Encode an empty ExecuteNode and check that command buffer encoding does not + // crash. 
+ graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {})); + EXPECT_NO_FATAL_FAILURE(graph.encode_execute()); +} + TEST(VulkanComputeGraphTest, test_zero_dim_tensor) { GraphConfig config; ComputeGraph graph(config); From bf6481916cc9d0974e5f4a0011b48d56717a3eaf Mon Sep 17 00:00:00 2001 From: Peixuan Liu Date: Thu, 22 Aug 2024 16:17:59 -0700 Subject: [PATCH 014/531] Rename directory "executorch/sdk" to "executorch/devtools" Differential Revision: D60485375 Pull Request resolved: https://github.com/pytorch/executorch/pull/4823 --- CMakeLists.txt | 4 +- README.md | 2 +- backends/apple/mps/TARGETS | 4 +- backends/apple/mps/targets.bzl | 2 +- backends/apple/mps/test/test_mps_utils.py | 10 ++--- .../cadence/cadence_runner/cadence_runner.cpp | 4 +- backends/cadence/cadence_runner/targets.bzl | 8 ++-- backends/cadence/runtime/TARGETS | 6 +-- backends/cadence/runtime/executor.py | 9 ++-- backends/cadence/runtime/runtime.py | 2 +- backends/qualcomm/tests/utils.py | 4 +- backends/xnnpack/test/TARGETS | 6 +-- backends/xnnpack/test/test_xnnpack_utils.py | 12 +++--- {sdk => devtools}/CMakeLists.txt | 20 ++++----- {sdk => devtools}/TARGETS | 6 +-- {sdk => devtools}/__init__.py | 8 ++-- {sdk => devtools}/backend_debug/TARGETS | 0 {sdk => devtools}/backend_debug/__init__.py | 2 +- .../backend_debug/delegation_info.py | 0 {sdk => devtools}/backend_debug/tests/TARGETS | 2 +- .../tests/test_delegation_info.py | 2 +- {sdk => devtools}/bundled_program/TARGETS | 4 +- .../bundled_program/bundled_program.cpp | 4 +- .../bundled_program/bundled_program.h | 0 {sdk => devtools}/bundled_program/config.py | 0 {sdk => devtools}/bundled_program/core.py | 10 ++--- .../bundled_program/schema/README.md | 0 .../bundled_program/schema/TARGETS | 4 +- .../bundled_program/schema/__init__.py | 0 .../schema/bundled_program_schema.fbs | 0 .../schema/bundled_program_schema.py | 0 .../bundled_program/schema/scalar_type.fbs | 0 .../bundled_program/schema/targets.bzl | 6 +-- .../bundled_program/schema/test/TARGETS | 0 .../schema/test/test_schema.py | 4 +- .../bundled_program/serialize/TARGETS | 8 ++-- .../bundled_program/serialize/__init__.py | 4 +- .../bundled_program/serialize/test/TARGETS | 7 ++-- .../serialize/test/test_serialize.py | 8 ++-- {sdk => devtools}/bundled_program/targets.bzl | 2 +- .../bundled_program/test/TARGETS | 21 +++++----- .../bundled_program/test/test_bundle_data.py | 10 +++-- .../bundled_program/test/test_config.py | 6 +-- .../bundled_program/test/test_end2end.py | 8 ++-- .../bundled_program/util/TARGETS | 4 +- .../bundled_program/util/test_util.py | 6 +-- {sdk => devtools}/bundled_program/version.py | 0 {sdk => devtools}/debug_format/TARGETS | 0 {sdk => devtools}/debug_format/base_schema.py | 0 {sdk => devtools}/debug_format/et_schema.py | 2 +- {sdk => devtools}/etdump/TARGETS | 6 +-- {sdk => devtools}/etdump/emitter.cpp | 2 +- {sdk => devtools}/etdump/emitter.h | 2 +- {sdk => devtools}/etdump/etdump_flatcc.cpp | 8 ++-- {sdk => devtools}/etdump/etdump_flatcc.h | 0 .../etdump/etdump_schema_flatcc.fbs | 0 {sdk => devtools}/etdump/scalar_type.fbs | 0 {sdk => devtools}/etdump/schema_flatcc.py | 2 +- {sdk => devtools}/etdump/serialize.py | 2 +- {sdk => devtools}/etdump/targets.bzl | 0 {sdk => devtools}/etdump/tests/CMakeLists.txt | 0 {sdk => devtools}/etdump/tests/TARGETS | 4 +- .../etdump/tests/etdump_test.cpp | 6 +-- .../etdump/tests/serialize_test.py | 6 +-- {sdk => devtools}/etdump/tests/targets.bzl | 4 +- {sdk => devtools}/etrecord/TARGETS | 4 +- {sdk => devtools}/etrecord/__init__.py | 2 +- {sdk => 
devtools}/etrecord/_etrecord.py | 6 +-- {sdk => devtools}/etrecord/tests/TARGETS | 12 +++--- .../etrecord/tests/etrecord_test.py | 10 ++--- {sdk => devtools}/inspector/TARGETS | 18 ++++---- {sdk => devtools}/inspector/__init__.py | 9 +++- {sdk => devtools}/inspector/_inspector.py | 16 +++++--- .../inspector/_inspector_utils.py | 12 +++--- {sdk => devtools}/inspector/inspector_cli.py | 4 +- devtools/inspector/tests/TARGETS | 41 +++++++++++++++++++ .../inspector/tests/event_blocks_test.py | 8 ++-- .../inspector/tests/inspector_test.py | 23 +++++++---- .../inspector/tests/inspector_utils_test.py | 12 +++--- {sdk => devtools}/size_analysis_tool/TARGETS | 8 ++-- .../size_analysis_tool/size_analysis_tool.py | 2 +- .../size_analysis_tool_test.py | 6 +-- {sdk => devtools}/targets.bzl | 0 docs/source/extension-module.md | 2 +- docs/source/llm/getting-started.md | 6 +-- docs/source/sdk-bundled-io.md | 38 ++++++++--------- docs/source/sdk-debugging.md | 4 +- docs/source/sdk-etdump.md | 2 +- docs/source/sdk-etrecord.rst | 2 +- docs/source/sdk-inspector.rst | 18 ++++---- .../sdk-integration-tutorial.py | 18 ++++---- .../website/docs/tutorials/bundled_program.md | 2 +- examples/apple/coreml/executor_runner/main.mm | 2 +- .../coreml/scripts/build_executor_runner.sh | 2 +- examples/apple/coreml/scripts/export.py | 2 +- .../apple/coreml/scripts/inspector_cli.py | 4 +- .../apple/coreml/scripts/inspector_utils.py | 15 ++++--- examples/apple/mps/CMakeLists.txt | 4 +- .../executor_runner/mps_executor_runner.mm | 4 +- .../apple/mps/executor_runner/targets.bzl | 4 +- examples/apple/mps/scripts/mps_example.py | 10 ++--- examples/models/llama2/TARGETS | 2 +- examples/models/llama2/export_llama_lib.py | 4 +- .../executor_runner/qnn_executor_runner.cpp | 2 +- examples/qualcomm/scripts/export_example.py | 2 +- examples/sdk/CMakeLists.txt | 2 +- examples/sdk/README.md | 4 +- .../sdk/scripts/export_bundled_program.py | 12 +++--- examples/sdk/scripts/gen_sample_etrecord.py | 2 +- .../sdk_example_runner/sdk_example_runner.cpp | 4 +- examples/sdk/sdk_example_runner/targets.bzl | 4 +- examples/xnnpack/aot_compiler.py | 2 +- examples/xnnpack/targets.bzl | 2 +- exir/_serialize/TARGETS | 12 +++--- extension/pybindings/pybindings.cpp | 6 +-- pytest.ini | 3 +- runtime/executor/test/targets.bzl | 4 +- schema/targets.bzl | 2 +- sdk/inspector/tests/TARGETS | 40 ------------------ setup.py | 12 +++--- .../extension/pybindings/pybindings.bzl | 12 +++--- test/end2end/TARGETS | 12 +++--- .../generate_linear_out_bundled_program.py | 10 ++--- test/models/targets.bzl | 6 +-- test/run_oss_cpp_tests.sh | 2 +- 125 files changed, 393 insertions(+), 375 deletions(-) rename {sdk => devtools}/CMakeLists.txt (89%) rename {sdk => devtools}/TARGETS (54%) rename {sdk => devtools}/__init__.py (57%) rename {sdk => devtools}/backend_debug/TARGETS (100%) rename {sdk => devtools}/backend_debug/__init__.py (83%) rename {sdk => devtools}/backend_debug/delegation_info.py (100%) rename {sdk => devtools}/backend_debug/tests/TARGETS (86%) rename {sdk => devtools}/backend_debug/tests/test_delegation_info.py (96%) rename {sdk => devtools}/bundled_program/TARGETS (88%) rename {sdk => devtools}/bundled_program/bundled_program.cpp (98%) rename {sdk => devtools}/bundled_program/bundled_program.h (100%) rename {sdk => devtools}/bundled_program/config.py (100%) rename {sdk => devtools}/bundled_program/core.py (98%) rename {sdk => devtools}/bundled_program/schema/README.md (100%) rename {sdk => devtools}/bundled_program/schema/TARGETS (84%) rename {sdk => 
devtools}/bundled_program/schema/__init__.py (100%) rename {sdk => devtools}/bundled_program/schema/bundled_program_schema.fbs (100%) rename {sdk => devtools}/bundled_program/schema/bundled_program_schema.py (100%) rename {sdk => devtools}/bundled_program/schema/scalar_type.fbs (100%) rename {sdk => devtools}/bundled_program/schema/targets.bzl (93%) rename {sdk => devtools}/bundled_program/schema/test/TARGETS (100%) rename {sdk => devtools}/bundled_program/schema/test/test_schema.py (79%) rename {sdk => devtools}/bundled_program/serialize/TARGETS (76%) rename {sdk => devtools}/bundled_program/serialize/__init__.py (97%) rename {sdk => devtools}/bundled_program/serialize/test/TARGETS (51%) rename {sdk => devtools}/bundled_program/serialize/test/test_serialize.py (82%) rename {sdk => devtools}/bundled_program/targets.bzl (91%) rename {sdk => devtools}/bundled_program/test/TARGETS (68%) rename {sdk => devtools}/bundled_program/test/test_bundle_data.py (93%) rename {sdk => devtools}/bundled_program/test/test_config.py (97%) rename {sdk => devtools}/bundled_program/test/test_end2end.py (88%) rename {sdk => devtools}/bundled_program/util/TARGETS (68%) rename {sdk => devtools}/bundled_program/util/test_util.py (99%) rename {sdk => devtools}/bundled_program/version.py (100%) rename {sdk => devtools}/debug_format/TARGETS (100%) rename {sdk => devtools}/debug_format/base_schema.py (100%) rename {sdk => devtools}/debug_format/et_schema.py (99%) rename {sdk => devtools}/etdump/TARGETS (81%) rename {sdk => devtools}/etdump/emitter.cpp (98%) rename {sdk => devtools}/etdump/emitter.h (92%) rename {sdk => devtools}/etdump/etdump_flatcc.cpp (98%) rename {sdk => devtools}/etdump/etdump_flatcc.h (100%) rename {sdk => devtools}/etdump/etdump_schema_flatcc.fbs (100%) rename {sdk => devtools}/etdump/scalar_type.fbs (100%) rename {sdk => devtools}/etdump/schema_flatcc.py (97%) rename {sdk => devtools}/etdump/serialize.py (98%) rename {sdk => devtools}/etdump/targets.bzl (100%) rename {sdk => devtools}/etdump/tests/CMakeLists.txt (100%) rename {sdk => devtools}/etdump/tests/TARGETS (75%) rename {sdk => devtools}/etdump/tests/etdump_test.cpp (99%) rename {sdk => devtools}/etdump/tests/serialize_test.py (97%) rename {sdk => devtools}/etdump/tests/targets.bzl (82%) rename {sdk => devtools}/etrecord/TARGETS (71%) rename {sdk => devtools}/etrecord/__init__.py (86%) rename {sdk => devtools}/etrecord/_etrecord.py (98%) rename {sdk => devtools}/etrecord/tests/TARGETS (64%) rename {sdk => devtools}/etrecord/tests/etrecord_test.py (96%) rename {sdk => devtools}/inspector/TARGETS (70%) rename {sdk => devtools}/inspector/__init__.py (60%) rename {sdk => devtools}/inspector/_inspector.py (99%) rename {sdk => devtools}/inspector/_inspector_utils.py (97%) rename {sdk => devtools}/inspector/inspector_cli.py (93%) create mode 100644 devtools/inspector/tests/TARGETS rename {sdk => devtools}/inspector/tests/event_blocks_test.py (98%) rename {sdk => devtools}/inspector/tests/inspector_test.py (97%) rename {sdk => devtools}/inspector/tests/inspector_utils_test.py (94%) rename {sdk => devtools}/size_analysis_tool/TARGETS (86%) rename {sdk => devtools}/size_analysis_tool/size_analysis_tool.py (99%) rename {sdk => devtools}/size_analysis_tool/size_analysis_tool_test.py (98%) rename {sdk => devtools}/targets.bzl (100%) delete mode 100644 sdk/inspector/tests/TARGETS diff --git a/CMakeLists.txt b/CMakeLists.txt index afb0437fae4..b5a5b592350 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -617,7 +617,7 @@ if(EXECUTORCH_BUILD_SDK) ON 
CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE ) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) @@ -676,7 +676,7 @@ if(EXECUTORCH_BUILD_PYBIND) endif() if(NOT EXECUTORCH_BUILD_SDK) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output diff --git a/README.md b/README.md index c4e6e0caf75..914eab472e7 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ tools. ├── schema # ExecuTorch PTE file format flatbuffer schemas. ├── scripts # Utility scripts for size management, dependency management, etc. -├── sdk # Model profiling, debugging, and introspection. +├── devtools # Model profiling, debugging, and introspection. ├── shim # Compatibility layer between OSS and Internal builds ├── test # Broad scoped end-to-end tests. ├── third-party # Third-party dependencies. diff --git a/backends/apple/mps/TARGETS b/backends/apple/mps/TARGETS index b8ab3427a9e..1ab92b3fca0 100644 --- a/backends/apple/mps/TARGETS +++ b/backends/apple/mps/TARGETS @@ -95,8 +95,8 @@ runtime.python_test( "//executorch/examples/models:models", "//executorch/exir/tests:models", "//executorch/extension/export_util:export_util", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program/serialize:lib", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program/serialize:lib", "fbsource//third-party/pypi/pytest:pytest", ], ) diff --git a/backends/apple/mps/targets.bzl b/backends/apple/mps/targets.bzl index 8b9c64e143c..74d79448362 100644 --- a/backends/apple/mps/targets.bzl +++ b/backends/apple/mps/targets.bzl @@ -47,7 +47,7 @@ def define_common_targets(is_xplat = False, platforms = []): "//executorch/exir/backend:backend_lib", "//executorch/extension/pybindings/...", "//executorch/runtime/backend/...", - "//executorch/sdk/runners/...", + "//executorch/devtools/runners/...", "//executorch/test/...", "@EXECUTORCH_CLIENTS", ], diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index d7efe8bde41..77c02f533be 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -12,16 +12,16 @@ import torch from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner +from executorch.devtools import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import EdgeCompileConfig, ExirExportedProgram, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge -from executorch.sdk import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from torch.export import export # Config for Capturing the weights, will be moved in the future diff --git a/backends/cadence/cadence_runner/cadence_runner.cpp b/backends/cadence/cadence_runner/cadence_runner.cpp index d76ba004aae..a269ed5a8e8 100644 --- 
a/backends/cadence/cadence_runner/cadence_runner.cpp +++ b/backends/cadence/cadence_runner/cadence_runner.cpp @@ -22,13 +22,13 @@ #include +#include +#include #include #include #include #include #include -#include -#include static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl index 028ff7ad2ef..361fe9712ee 100644 --- a/backends/cadence/cadence_runner/targets.bzl +++ b/backends/cadence/cadence_runner/targets.bzl @@ -19,12 +19,12 @@ def define_common_targets(): visibility = ["PUBLIC"], deps = [ "fbsource//arvr/third-party/gflags:gflags", - "fbsource//xplat/executorch/kernels/portable:generated_lib", - "fbsource//xplat/executorch/runtime/executor:program", + "fbsource//xplat/executorch/devtools/etdump:etdump_flatcc", + "fbsource//xplat/executorch/devtools/bundled_program:runtime", "fbsource//xplat/executorch/extension/data_loader:file_data_loader", "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader", + "fbsource//xplat/executorch/kernels/portable:generated_lib", + "fbsource//xplat/executorch/runtime/executor:program", "fbsource//xplat/executorch/util:util", - "fbsource//xplat/executorch/sdk/etdump:etdump_flatcc", - "fbsource//xplat/executorch/sdk/bundled_program:runtime", ], ) diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 9f30cadf6fd..1b55a7d541b 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -13,9 +13,9 @@ python_library( typing = True, deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", ], ) diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/runtime/executor.py index 7bcf705c034..d07b1b6a52e 100644 --- a/backends/cadence/runtime/executor.py +++ b/backends/cadence/runtime/executor.py @@ -18,14 +18,13 @@ import torch -from executorch.exir import ExecutorchProgram, ExecutorchProgramManager - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) +from executorch.exir import ExecutorchProgram, ExecutorchProgramManager # If quiet is true, suppress the printing of stdout and stderr output. 
quiet = False diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index ec282f8f7b3..33bb20719c8 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -18,10 +18,10 @@ from executorch.backends.cadence.runtime import utils from executorch.backends.cadence.runtime.executor import Executor +from executorch.devtools import Inspector from executorch.exir import ExecutorchProgramManager from executorch.exir._serialize._program import deserialize_pte_binary from executorch.exir.schema import DataLocation -from executorch.sdk import Inspector from numpy import ndarray diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 5fd6d5ad196..b206a7e1330 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -27,6 +27,8 @@ QcomChipset, ) from executorch.backends.qualcomm.utils.utils import capture_program +from executorch.devtools import generate_etrecord +from executorch.devtools.inspector import Inspector from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, @@ -40,8 +42,6 @@ from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.program._program import ExecutorchProgram -from executorch.sdk import generate_etrecord -from executorch.sdk.inspector import Inspector from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index abedffb8e61..629ac8275bc 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -36,10 +36,10 @@ runtime.python_test( deps = [ "//executorch/backends/xnnpack/partition:xnnpack_partitioner", "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir/passes:constant_prop_pass", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/serialize:lib", "//pytorch/ao:torchao", # @manual ], external_deps = [ diff --git a/backends/xnnpack/test/test_xnnpack_utils.py b/backends/xnnpack/test/test_xnnpack_utils.py index c6b1513d317..3f5359a3f45 100644 --- a/backends/xnnpack/test/test_xnnpack_utils.py +++ b/backends/xnnpack/test/test_xnnpack_utils.py @@ -25,6 +25,12 @@ # import the xnnpack backend implementation from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ExecutorchProgram, ExirExportedProgram from executorch.exir.backend.backend_api import to_backend, validation_disabled @@ -34,12 +40,6 @@ _load_for_executorch_from_buffer, ) from executorch.extension.pytree import tree_flatten -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from torch.ao.quantization import ( # @manual default_per_channel_symmetric_qnnpack_qconfig, diff --git a/sdk/CMakeLists.txt b/devtools/CMakeLists.txt similarity index 89% rename from sdk/CMakeLists.txt rename to 
devtools/CMakeLists.txt index 79903fc315e..4c4d15fd733 100644 --- a/sdk/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -78,8 +78,8 @@ set_property(TARGET flatccrt PROPERTY POSITION_INDEPENDENT_CODE ON) include(ExternalProject) # The include directory that will contain the generated schema headers. -set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/include") -set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/bundled_program") +set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/include") +set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/bundled_program") # TODO(dbort): Only enable this when cross-compiling. It can cause build race # conditions (libflatcc.a errors) when enabled. @@ -128,11 +128,11 @@ set(_etdump_schema__outputs) foreach(fbs_file ${_etdump_schema_names}) string(REGEX REPLACE "[.]fbs$" "_reader.h" generated "${fbs_file}") list(APPEND _etdump_schema__outputs - "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}" + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" ) string(REGEX REPLACE "[.]fbs$" "_builder.h" generated "${fbs_file}") list(APPEND _etdump_schema__outputs - "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}" + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" ) endforeach() @@ -143,7 +143,7 @@ foreach(fbs_file ${_bundled_input_schema_names}) list( APPEND _bundled_program_schema__outputs - "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema/${generated}" + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema/${generated}" ) endforeach() @@ -152,9 +152,9 @@ add_library( bundled_program_schema INTERFACE ${_bundled_program_schema__outputs} ) -file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/etdump) +file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/devtools/etdump) file(MAKE_DIRECTORY - ${_program_schema__include_dir}/executorch/sdk/bundled_program + ${_program_schema__include_dir}/executorch/devtools/bundled_program ) add_custom_command( @@ -164,7 +164,7 @@ add_custom_command( # tree instead of under the binary directory, and there's no way to change # that behavior. 
${_flatcc_source_dir}/bin/flatcc -cwr -o - ${_program_schema__include_dir}/executorch/sdk/etdump + ${_program_schema__include_dir}/executorch/devtools/etdump ${_etdump_schema__srcs} COMMAND rm -f ${_etdump_schema_cleanup_paths} DEPENDS ${_etdump_schema_gen_dep} @@ -186,9 +186,9 @@ add_custom_command( OUTPUT ${_bundled_program_schema__outputs} COMMAND ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o - "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema" + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema" ${_bundled_program_schema__srcs} - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/devtools DEPENDS ${FLATC_EXECUTABLE} ${_bundled_program_schema__srcs} COMMENT "Generating bundled_program headers" VERBATIM diff --git a/sdk/TARGETS b/devtools/TARGETS similarity index 54% rename from sdk/TARGETS rename to devtools/TARGETS index 56d38a4ad3b..06964b83876 100644 --- a/sdk/TARGETS +++ b/devtools/TARGETS @@ -6,8 +6,8 @@ python_library( name = "lib", srcs = ["__init__.py"], deps = [ - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", - "//executorch/sdk/inspector:lib", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", + "//executorch/devtools/inspector:lib", ], ) diff --git a/sdk/__init__.py b/devtools/__init__.py similarity index 57% rename from sdk/__init__.py rename to devtools/__init__.py index 11134bf276a..821d75901f2 100644 --- a/sdk/__init__.py +++ b/devtools/__init__.py @@ -4,10 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import executorch.sdk.inspector as inspector -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.etrecord import ETRecord, generate_etrecord, parse_etrecord -from executorch.sdk.inspector import Inspector +import executorch.devtools.inspector as inspector +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import ETRecord, generate_etrecord, parse_etrecord +from executorch.devtools.inspector import Inspector __all__ = [ "ETRecord", diff --git a/sdk/backend_debug/TARGETS b/devtools/backend_debug/TARGETS similarity index 100% rename from sdk/backend_debug/TARGETS rename to devtools/backend_debug/TARGETS diff --git a/sdk/backend_debug/__init__.py b/devtools/backend_debug/__init__.py similarity index 83% rename from sdk/backend_debug/__init__.py rename to devtools/backend_debug/__init__.py index c1c9726b86b..b457b7d11d5 100644 --- a/sdk/backend_debug/__init__.py +++ b/devtools/backend_debug/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from executorch.sdk.backend_debug.delegation_info import ( +from executorch.devtools.backend_debug.delegation_info import ( DelegationBreakdown, get_delegation_info, ) diff --git a/sdk/backend_debug/delegation_info.py b/devtools/backend_debug/delegation_info.py similarity index 100% rename from sdk/backend_debug/delegation_info.py rename to devtools/backend_debug/delegation_info.py diff --git a/sdk/backend_debug/tests/TARGETS b/devtools/backend_debug/tests/TARGETS similarity index 86% rename from sdk/backend_debug/tests/TARGETS rename to devtools/backend_debug/tests/TARGETS index 3c9f6c2e64e..ae234df8ce4 100644 --- a/sdk/backend_debug/tests/TARGETS +++ b/devtools/backend_debug/tests/TARGETS @@ -10,8 +10,8 @@ python_unittest( deps = [ "fbsource//third-party/pypi/pandas:pandas", "//caffe2:torch", + "//executorch/devtools/backend_debug:delegation_info", "//executorch/exir:lib", "//executorch/exir/backend/test:op_partitioner_demo", - "//executorch/sdk/backend_debug:delegation_info", ], ) diff --git a/sdk/backend_debug/tests/test_delegation_info.py b/devtools/backend_debug/tests/test_delegation_info.py similarity index 96% rename from sdk/backend_debug/tests/test_delegation_info.py rename to devtools/backend_debug/tests/test_delegation_info.py index 2d98e9a5950..6ff5169094b 100644 --- a/sdk/backend_debug/tests/test_delegation_info.py +++ b/devtools/backend_debug/tests/test_delegation_info.py @@ -9,9 +9,9 @@ import pandas as pd import torch +from executorch.devtools.backend_debug import DelegationBreakdown, get_delegation_info from executorch.exir import to_edge from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo -from executorch.sdk.backend_debug import DelegationBreakdown, get_delegation_info from pandas.testing import assert_frame_equal diff --git a/sdk/bundled_program/TARGETS b/devtools/bundled_program/TARGETS similarity index 88% rename from sdk/bundled_program/TARGETS rename to devtools/bundled_program/TARGETS index c731606217f..27560f70877 100644 --- a/sdk/bundled_program/TARGETS +++ b/devtools/bundled_program/TARGETS @@ -18,10 +18,10 @@ runtime.python_library( ":config", ":version", "//caffe2:torch", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir:schema", "//executorch/exir:tensor", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", ], ) @@ -46,6 +46,6 @@ runtime.python_library( "version.py", ], visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], ) diff --git a/sdk/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp similarity index 98% rename from sdk/bundled_program/bundled_program.cpp rename to devtools/bundled_program/bundled_program.cpp index 63affa5c7f7..d174cbdcdad 100644 --- a/sdk/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include #include @@ -16,12 +16,12 @@ #include #endif // USE_ATEN_LIB +#include #include #include #include #include #include -#include namespace torch { namespace executor { diff --git a/sdk/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h similarity index 100% rename from sdk/bundled_program/bundled_program.h rename to devtools/bundled_program/bundled_program.h diff --git a/sdk/bundled_program/config.py b/devtools/bundled_program/config.py similarity index 100% rename from sdk/bundled_program/config.py rename to devtools/bundled_program/config.py diff --git a/sdk/bundled_program/core.py b/devtools/bundled_program/core.py similarity index 98% rename from sdk/bundled_program/core.py rename to devtools/bundled_program/core.py index 56fc817bbee..c775fb1510d 100644 --- a/sdk/bundled_program/core.py +++ b/devtools/bundled_program/core.py @@ -8,19 +8,19 @@ import typing from typing import Dict, List, Optional, Sequence, Type, Union -import executorch.exir.schema as core_schema +import executorch.devtools.bundled_program.schema as bp_schema -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.exir.schema as core_schema import torch import torch.fx +from executorch.devtools.bundled_program.config import ConfigValue, MethodTestSuite + +from executorch.devtools.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION from executorch.exir import ExecutorchProgram, ExecutorchProgramManager from executorch.exir._serialize import _serialize_pte_binary from executorch.exir.tensor import get_scalar_type, scalar_type_enum, TensorSpec -from executorch.sdk.bundled_program.config import ConfigValue, MethodTestSuite - -from executorch.sdk.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION # pyre-ignore supported_program_type_table: Dict[Type[core_schema.KernelTypes], ConfigValue] = { diff --git a/sdk/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md similarity index 100% rename from sdk/bundled_program/schema/README.md rename to devtools/bundled_program/schema/README.md diff --git a/sdk/bundled_program/schema/TARGETS b/devtools/bundled_program/schema/TARGETS similarity index 84% rename from sdk/bundled_program/schema/TARGETS rename to devtools/bundled_program/schema/TARGETS index e9bd642069d..51c004cbec0 100644 --- a/sdk/bundled_program/schema/TARGETS +++ b/devtools/bundled_program/schema/TARGETS @@ -15,8 +15,8 @@ runtime.python_library( "bundled_program_schema.py", ], visibility = [ - "//executorch/sdk/bundled_program/...", - "//executorch/sdk/etrecord/...", + "//executorch/devtools/bundled_program/...", + "//executorch/devtools/etrecord/...", ], deps = [ "//executorch/exir:scalar_type", diff --git a/sdk/bundled_program/schema/__init__.py b/devtools/bundled_program/schema/__init__.py similarity index 100% rename from sdk/bundled_program/schema/__init__.py rename to devtools/bundled_program/schema/__init__.py diff --git a/sdk/bundled_program/schema/bundled_program_schema.fbs b/devtools/bundled_program/schema/bundled_program_schema.fbs similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.fbs rename to devtools/bundled_program/schema/bundled_program_schema.fbs diff --git a/sdk/bundled_program/schema/bundled_program_schema.py b/devtools/bundled_program/schema/bundled_program_schema.py similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.py rename to devtools/bundled_program/schema/bundled_program_schema.py diff --git 
a/sdk/bundled_program/schema/scalar_type.fbs b/devtools/bundled_program/schema/scalar_type.fbs similarity index 100% rename from sdk/bundled_program/schema/scalar_type.fbs rename to devtools/bundled_program/schema/scalar_type.fbs diff --git a/sdk/bundled_program/schema/targets.bzl b/devtools/bundled_program/schema/targets.bzl similarity index 93% rename from sdk/bundled_program/schema/targets.bzl rename to devtools/bundled_program/schema/targets.bzl index a25d792c5a3..532a01e039e 100644 --- a/sdk/bundled_program/schema/targets.bzl +++ b/devtools/bundled_program/schema/targets.bzl @@ -49,14 +49,14 @@ def define_common_targets(): runtime.export_file( name = INPUT_BUNDLED, visibility = [ - "//executorch/sdk/bundled_program/serialize/...", + "//executorch/devtools/bundled_program/serialize/...", ], ) runtime.export_file( name = INPUT_SCALAR_TYPE, visibility = [ - "//executorch/sdk/bundled_program/serialize/...", + "//executorch/devtools/bundled_program/serialize/...", ], ) @@ -72,7 +72,7 @@ def define_common_targets(): name = BUNDLED_LIBRARY_NAME, srcs = [], visibility = [ - "//executorch/sdk/bundled_program/...", + "//executorch/devtools/bundled_program/...", "//executorch/extension/pybindings/...", ], exported_headers = { diff --git a/sdk/bundled_program/schema/test/TARGETS b/devtools/bundled_program/schema/test/TARGETS similarity index 100% rename from sdk/bundled_program/schema/test/TARGETS rename to devtools/bundled_program/schema/test/TARGETS diff --git a/sdk/bundled_program/schema/test/test_schema.py b/devtools/bundled_program/schema/test/test_schema.py similarity index 79% rename from sdk/bundled_program/schema/test/test_schema.py rename to devtools/bundled_program/schema/test/test_schema.py index ab3d2760d29..c2a19adef79 100644 --- a/sdk/bundled_program/schema/test/test_schema.py +++ b/devtools/bundled_program/schema/test/test_schema.py @@ -20,8 +20,8 @@ def test_schema_sync(self) -> None: self.assertTrue( filecmp.cmp( - prefix + "sdk/bundled_program/schema/scalar_type.fbs", + prefix + "devtools/bundled_program/schema/scalar_type.fbs", prefix + "schema/scalar_type.fbs", ), - 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/sdk/bundled_program/schema/scalar_type.fbs" to sync schema changes.', + 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/devtools/bundled_program/schema/scalar_type.fbs" to sync schema changes.', ) diff --git a/sdk/bundled_program/serialize/TARGETS b/devtools/bundled_program/serialize/TARGETS similarity index 76% rename from sdk/bundled_program/serialize/TARGETS rename to devtools/bundled_program/serialize/TARGETS index 20abccd7fda..11c58399778 100644 --- a/sdk/bundled_program/serialize/TARGETS +++ b/devtools/bundled_program/serialize/TARGETS @@ -10,8 +10,8 @@ runtime.python_library( "__init__.py", ], resources = { - "//executorch/sdk/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", - "//executorch/sdk/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", + "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", }, # Currently serialization API should only be used in some dedicated targets, # to avoid ODR violation when linking with another Flatbuffers library. 
@@ -20,18 +20,18 @@ runtime.python_library( "//executorch/bacends/...", "//executorch/backends/xnnpack/test/...", "//executorch/codegen/...", + "//executorch/devtools/bundled_program/tests/...", "//executorch/examples/async_exec:emit_program_lib", "//executorch/exir:lib", "//executorch/extension/pybindings/test:test", "//executorch/extension/pybindings/test:test-library", "//executorch/profiler/...", - "//executorch/sdk/bundled_program/tests/...", "//executorch/test/...", "@EXECUTORCH_CLIENTS", ], deps = [ "fbsource//third-party/pypi/setuptools:setuptools", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", ], ) diff --git a/sdk/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py similarity index 97% rename from sdk/bundled_program/serialize/__init__.py rename to devtools/bundled_program/serialize/__init__.py index e0c75574c93..075436e9c11 100644 --- a/sdk/bundled_program/serialize/__init__.py +++ b/devtools/bundled_program/serialize/__init__.py @@ -12,14 +12,14 @@ import os import tempfile -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.devtools.bundled_program.schema as bp_schema # @manual=fbsource//third-party/pypi/setuptools:setuptools import pkg_resources +from executorch.devtools.bundled_program.core import BundledProgram from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.sdk.bundled_program.core import BundledProgram # The prefix of schema files used for bundled program BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema" diff --git a/sdk/bundled_program/serialize/test/TARGETS b/devtools/bundled_program/serialize/test/TARGETS similarity index 51% rename from sdk/bundled_program/serialize/test/TARGETS rename to devtools/bundled_program/serialize/test/TARGETS index 85f55c02f8d..dd92f63f2dd 100644 --- a/sdk/bundled_program/serialize/test/TARGETS +++ b/devtools/bundled_program/serialize/test/TARGETS @@ -10,9 +10,8 @@ python_unittest( "test_serialize.py", ], deps = [ - "//executorch/exir:print_program", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", - "//executorch/sdk/bundled_program/util:test_util", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", ], ) diff --git a/sdk/bundled_program/serialize/test/test_serialize.py b/devtools/bundled_program/serialize/test/test_serialize.py similarity index 82% rename from sdk/bundled_program/serialize/test/test_serialize.py rename to devtools/bundled_program/serialize/test/test_serialize.py index 1db6871fc06..48a914d1447 100644 --- a/sdk/bundled_program/serialize/test/test_serialize.py +++ b/devtools/bundled_program/serialize/test/test_serialize.py @@ -8,13 +8,15 @@ import unittest -from executorch.sdk.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( deserialize_from_flatbuffer_to_bundled_program, serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program +from 
executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) class TestSerialize(unittest.TestCase): diff --git a/sdk/bundled_program/targets.bzl b/devtools/bundled_program/targets.bzl similarity index 91% rename from sdk/bundled_program/targets.bzl rename to devtools/bundled_program/targets.bzl index a3268dff2c5..7035b3b31f6 100644 --- a/sdk/bundled_program/targets.bzl +++ b/devtools/bundled_program/targets.bzl @@ -19,7 +19,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, - "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", ], exported_deps = [ "//executorch/runtime/core:memory_allocator", diff --git a/sdk/bundled_program/test/TARGETS b/devtools/bundled_program/test/TARGETS similarity index 68% rename from sdk/bundled_program/test/TARGETS rename to devtools/bundled_program/test/TARGETS index caf69be60e1..652c74b8f43 100644 --- a/sdk/bundled_program/test/TARGETS +++ b/devtools/bundled_program/test/TARGETS @@ -1,4 +1,5 @@ # @noautodeps + load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") oncall("executorch") @@ -10,11 +11,11 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", - "//executorch/sdk/bundled_program/util:test_util", ], ) @@ -25,9 +26,9 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/extension/pytree:pylib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/util:test_util", ], ) @@ -38,6 +39,10 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/exir:dynamic_shape", "//executorch/exir:lib", "//executorch/exir:memory", @@ -54,9 +59,5 @@ python_unittest( "//executorch/extension/pybindings:portable_lib", "//executorch/extension/pytree:pybindings", "//executorch/kernels/portable:custom_ops_generated_lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", - "//executorch/sdk/bundled_program/util:test_util", ], ) diff --git a/sdk/bundled_program/test/test_bundle_data.py b/devtools/bundled_program/test/test_bundle_data.py similarity index 93% rename from sdk/bundled_program/test/test_bundle_data.py rename to devtools/bundled_program/test/test_bundle_data.py index a8d9485c5ff..565539cbf15 100644 --- a/sdk/bundled_program/test/test_bundle_data.py +++ b/devtools/bundled_program/test/test_bundle_data.py @@ -9,13 +9,15 @@ import unittest from typing import List -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.devtools.bundled_program.schema as bp_schema import torch +from executorch.devtools.bundled_program.config import ConfigValue +from 
executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) from executorch.exir._serialize import _serialize_pte_binary -from executorch.sdk.bundled_program.config import ConfigValue -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program class TestBundle(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_config.py b/devtools/bundled_program/test/test_config.py similarity index 97% rename from sdk/bundled_program/test/test_config.py rename to devtools/bundled_program/test/test_config.py index 3183ad907fe..21f3d480423 100644 --- a/sdk/bundled_program/test/test_config.py +++ b/devtools/bundled_program/test/test_config.py @@ -10,14 +10,14 @@ from typing import get_args, List, Union import torch -from executorch.extension.pytree import tree_flatten -from executorch.sdk.bundled_program.config import DataContainer +from executorch.devtools.bundled_program.config import DataContainer -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_random_test_suites, get_random_test_suites_with_eager_model, SampleModel, ) +from executorch.extension.pytree import tree_flatten class TestConfig(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_end2end.py b/devtools/bundled_program/test/test_end2end.py similarity index 88% rename from sdk/bundled_program/test/test_end2end.py rename to devtools/bundled_program/test/test_end2end.py index 99d58ee15ca..7cee073be0e 100644 --- a/sdk/bundled_program/test/test_end2end.py +++ b/devtools/bundled_program/test/test_end2end.py @@ -21,12 +21,12 @@ import torch -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_common_executorch_program, SampleModel, ) @@ -45,7 +45,7 @@ pass try: - from executorch.extension.pybindings.aten_lib import ( + from executorch.extension.pybindings.aten_lib import ( # @manual=//executorch/extension/pybindings:aten_lib _load_bundled_program_from_buffer, _load_for_executorch_from_buffer, _load_for_executorch_from_bundled_program, diff --git a/sdk/bundled_program/util/TARGETS b/devtools/bundled_program/util/TARGETS similarity index 68% rename from sdk/bundled_program/util/TARGETS rename to devtools/bundled_program/util/TARGETS index 17d19dfb29a..7d019ce30fb 100644 --- a/sdk/bundled_program/util/TARGETS +++ b/devtools/bundled_program/util/TARGETS @@ -7,10 +7,10 @@ python_library( srcs = [ "test_util.py", ], - visibility = ["//executorch/sdk/bundled_program/..."], + visibility = ["//executorch/devtools/bundled_program/..."], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", "//executorch/exir:lib", - "//executorch/sdk/bundled_program:config", ], ) diff --git a/sdk/bundled_program/util/test_util.py b/devtools/bundled_program/util/test_util.py similarity index 99% rename from sdk/bundled_program/util/test_util.py rename to devtools/bundled_program/util/test_util.py index bfea8158acb..505186f3a08 100644 --- a/sdk/bundled_program/util/test_util.py +++ 
b/devtools/bundled_program/util/test_util.py @@ -10,14 +10,14 @@ from typing import List, Tuple import torch - -from executorch.exir import ExecutorchProgramManager, to_edge -from executorch.sdk.bundled_program.config import ( +from executorch.devtools.bundled_program.config import ( MethodInputType, MethodOutputType, MethodTestCase, MethodTestSuite, ) + +from executorch.exir import ExecutorchProgramManager, to_edge from torch.export import export from torch.export.unflatten import _assign_attr, _AttrKind diff --git a/sdk/bundled_program/version.py b/devtools/bundled_program/version.py similarity index 100% rename from sdk/bundled_program/version.py rename to devtools/bundled_program/version.py diff --git a/sdk/debug_format/TARGETS b/devtools/debug_format/TARGETS similarity index 100% rename from sdk/debug_format/TARGETS rename to devtools/debug_format/TARGETS diff --git a/sdk/debug_format/base_schema.py b/devtools/debug_format/base_schema.py similarity index 100% rename from sdk/debug_format/base_schema.py rename to devtools/debug_format/base_schema.py diff --git a/sdk/debug_format/et_schema.py b/devtools/debug_format/et_schema.py similarity index 99% rename from sdk/debug_format/et_schema.py rename to devtools/debug_format/et_schema.py index 9a6af4edba9..abe155233ae 100644 --- a/sdk/debug_format/et_schema.py +++ b/devtools/debug_format/et_schema.py @@ -21,7 +21,7 @@ import torch from executorch import exir -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( Node, OperatorGraph, OperatorNode, diff --git a/sdk/etdump/TARGETS b/devtools/etdump/TARGETS similarity index 81% rename from sdk/etdump/TARGETS rename to devtools/etdump/TARGETS index 22d07478cbe..7dcc4c1e84b 100644 --- a/sdk/etdump/TARGETS +++ b/devtools/etdump/TARGETS @@ -11,7 +11,7 @@ runtime.python_library( "schema_flatcc.py", ], visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], deps = [ "//executorch/exir:scalar_type", @@ -24,11 +24,11 @@ runtime.python_library( "serialize.py", ], resources = { + "//executorch/devtools/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs", "//executorch/schema:scalar_type.fbs": "scalar_type.fbs", - "//executorch/sdk/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs", }, visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], deps = [ "fbsource//third-party/pypi/setuptools:setuptools", diff --git a/sdk/etdump/emitter.cpp b/devtools/etdump/emitter.cpp similarity index 98% rename from sdk/etdump/emitter.cpp rename to devtools/etdump/emitter.cpp index 1b3cba9d196..dfca6295306 100644 --- a/sdk/etdump/emitter.cpp +++ b/devtools/etdump/emitter.cpp @@ -9,8 +9,8 @@ #include #include +#include "executorch/devtools/etdump/emitter.h" #include "executorch/runtime/platform/assert.h" -#include "executorch/sdk/etdump/emitter.h" namespace torch { namespace executor { diff --git a/sdk/etdump/emitter.h b/devtools/etdump/emitter.h similarity index 92% rename from sdk/etdump/emitter.h rename to devtools/etdump/emitter.h index 3910d3bd27b..bf8ab0b1e1c 100644 --- a/sdk/etdump/emitter.h +++ b/devtools/etdump/emitter.h @@ -9,7 +9,7 @@ #include #include -#include +#include #include #pragma once diff --git a/sdk/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp similarity index 98% rename from sdk/etdump/etdump_flatcc.cpp rename to devtools/etdump/etdump_flatcc.cpp index dab1443b55f..ca46c12f51c 100644 --- a/sdk/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -6,16 
+6,16 @@ * LICENSE file in the root directory of this source tree. */ -#include "executorch/sdk/etdump/etdump_flatcc.h" -#include -#include +#include "executorch/devtools/etdump/etdump_flatcc.h" +#include +#include #include #include #include +#include "executorch/devtools/etdump/emitter.h" #include "executorch/runtime/core/exec_aten/exec_aten.h" #include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" #include "executorch/runtime/platform/assert.h" -#include "executorch/sdk/etdump/emitter.h" namespace torch { namespace executor { diff --git a/sdk/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h similarity index 100% rename from sdk/etdump/etdump_flatcc.h rename to devtools/etdump/etdump_flatcc.h diff --git a/sdk/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs similarity index 100% rename from sdk/etdump/etdump_schema_flatcc.fbs rename to devtools/etdump/etdump_schema_flatcc.fbs diff --git a/sdk/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs similarity index 100% rename from sdk/etdump/scalar_type.fbs rename to devtools/etdump/scalar_type.fbs diff --git a/sdk/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py similarity index 97% rename from sdk/etdump/schema_flatcc.py rename to devtools/etdump/schema_flatcc.py index eaad876a536..f19f328d3fa 100644 --- a/sdk/etdump/schema_flatcc.py +++ b/devtools/etdump/schema_flatcc.py @@ -7,7 +7,7 @@ # pyre-strict """ This file is the python representation of the schema contained in -executorch/sdk/etdump/etdump_schema.fbs. Any changes made to that +executorch/devtools/etdump/etdump_schema.fbs. Any changes made to that flatbuffer schema should accordingly be reflected here also. """ diff --git a/sdk/etdump/serialize.py b/devtools/etdump/serialize.py similarity index 98% rename from sdk/etdump/serialize.py rename to devtools/etdump/serialize.py index 0cc6682bfcb..4ed63bc385b 100644 --- a/sdk/etdump/serialize.py +++ b/devtools/etdump/serialize.py @@ -11,11 +11,11 @@ import tempfile import pkg_resources +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC # The prefix of schema files used for etdump ETDUMP_FLATCC_SCHEMA_NAME = "etdump_schema_flatcc" diff --git a/sdk/etdump/targets.bzl b/devtools/etdump/targets.bzl similarity index 100% rename from sdk/etdump/targets.bzl rename to devtools/etdump/targets.bzl diff --git a/sdk/etdump/tests/CMakeLists.txt b/devtools/etdump/tests/CMakeLists.txt similarity index 100% rename from sdk/etdump/tests/CMakeLists.txt rename to devtools/etdump/tests/CMakeLists.txt diff --git a/sdk/etdump/tests/TARGETS b/devtools/etdump/tests/TARGETS similarity index 75% rename from sdk/etdump/tests/TARGETS rename to devtools/etdump/tests/TARGETS index ad48948c48a..51e807891df 100644 --- a/sdk/etdump/tests/TARGETS +++ b/devtools/etdump/tests/TARGETS @@ -11,8 +11,8 @@ python_unittest( "serialize_test.py", ], deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", "//executorch/exir/_serialize:lib", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etdump:serialize", ], ) diff --git a/sdk/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp similarity index 99% rename from sdk/etdump/tests/etdump_test.cpp rename to devtools/etdump/tests/etdump_test.cpp index 
d30bd9a3037..de8c0abc39d 100644 --- a/sdk/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -9,12 +9,12 @@ #include #include +#include +#include +#include #include #include #include -#include -#include -#include #include #include #include diff --git a/sdk/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py similarity index 97% rename from sdk/etdump/tests/serialize_test.py rename to devtools/etdump/tests/serialize_test.py index 2b1497f5974..1a7f3bd93f5 100644 --- a/sdk/etdump/tests/serialize_test.py +++ b/devtools/etdump/tests/serialize_test.py @@ -12,13 +12,13 @@ from pprint import pformat from typing import List -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.exir._serialize._dataclass import _DataclassEncoder +import executorch.devtools.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.serialize import ( +from executorch.devtools.etdump.serialize import ( deserialize_from_etdump_flatcc, serialize_to_etdump_flatcc, ) +from executorch.exir._serialize._dataclass import _DataclassEncoder def diff_jsons(a: str, b: str) -> List[str]: diff --git a/sdk/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl similarity index 82% rename from sdk/etdump/tests/targets.bzl rename to devtools/etdump/tests/targets.bzl index 41b19ca65ef..5299b7c1cb7 100644 --- a/sdk/etdump/tests/targets.bzl +++ b/devtools/etdump/tests/targets.bzl @@ -13,8 +13,8 @@ def define_common_targets(): "etdump_test.cpp", ], deps = [ - "//executorch/sdk/etdump:etdump_flatcc", - "//executorch/sdk/etdump:etdump_schema_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_schema_flatcc", "//executorch/runtime/platform:platform", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ], diff --git a/sdk/etrecord/TARGETS b/devtools/etrecord/TARGETS similarity index 71% rename from sdk/etrecord/TARGETS rename to devtools/etrecord/TARGETS index c7de63a81f4..09fc3212bf8 100644 --- a/sdk/etrecord/TARGETS +++ b/devtools/etrecord/TARGETS @@ -9,10 +9,10 @@ python_library( "_etrecord.py", ], deps = [ + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir:lib", "//executorch/exir/emit:emit", "//executorch/exir/serde:serialize", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", ], ) diff --git a/sdk/etrecord/__init__.py b/devtools/etrecord/__init__.py similarity index 86% rename from sdk/etrecord/__init__.py rename to devtools/etrecord/__init__.py index 29c29462a7e..59ff4e44c2f 100644 --- a/sdk/etrecord/__init__.py +++ b/devtools/etrecord/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from executorch.sdk.etrecord._etrecord import ( +from executorch.devtools.etrecord._etrecord import ( ETRecord, generate_etrecord, parse_etrecord, diff --git a/sdk/etrecord/_etrecord.py b/devtools/etrecord/_etrecord.py similarity index 98% rename from sdk/etrecord/_etrecord.py rename to devtools/etrecord/_etrecord.py index 1ae46f27aaa..cd213254980 100644 --- a/sdk/etrecord/_etrecord.py +++ b/devtools/etrecord/_etrecord.py @@ -12,6 +12,9 @@ from zipfile import BadZipFile, ZipFile from executorch import exir +from executorch.devtools.bundled_program.core import BundledProgram + +from executorch.devtools.bundled_program.schema.bundled_program_schema import Value from executorch.exir import ( EdgeProgramManager, ExecutorchProgram, @@ -23,9 +26,6 @@ from executorch.exir.serde.export_serialize import SerializedArtifact from executorch.exir.serde.serialize import deserialize, serialize -from executorch.sdk.bundled_program.core import BundledProgram - -from executorch.sdk.bundled_program.schema.bundled_program_schema import Value ProgramOutput = List[Value] diff --git a/sdk/etrecord/tests/TARGETS b/devtools/etrecord/tests/TARGETS similarity index 64% rename from sdk/etrecord/tests/TARGETS rename to devtools/etrecord/tests/TARGETS index 0984c755a4e..fffa7f18341 100644 --- a/sdk/etrecord/tests/TARGETS +++ b/devtools/etrecord/tests/TARGETS @@ -8,11 +8,11 @@ python_unittest( srcs = ["etrecord_test.py"], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", "//executorch/exir/tests:models", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", ], ) @@ -21,10 +21,10 @@ python_library( srcs = ["etrecord_test.py"], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", "//executorch/exir/tests:models", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", ], ) diff --git a/sdk/etrecord/tests/etrecord_test.py b/devtools/etrecord/tests/etrecord_test.py similarity index 96% rename from sdk/etrecord/tests/etrecord_test.py rename to devtools/etrecord/tests/etrecord_test.py index bc534fd4871..b8e08dfe8c1 100644 --- a/sdk/etrecord/tests/etrecord_test.py +++ b/devtools/etrecord/tests/etrecord_test.py @@ -12,14 +12,14 @@ import executorch.exir.tests.models as models import torch from executorch import exir -from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.etrecord import generate_etrecord, parse_etrecord -from executorch.sdk.etrecord._etrecord import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import generate_etrecord, parse_etrecord +from executorch.devtools.etrecord._etrecord import ( _get_reference_outputs, ETRecordReservedFileNames, ) +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from torch.export import export diff --git a/sdk/inspector/TARGETS b/devtools/inspector/TARGETS similarity index 70% rename from 
sdk/inspector/TARGETS rename to devtools/inspector/TARGETS index bc53c90c115..2b1cbecff32 100644 --- a/sdk/inspector/TARGETS +++ b/devtools/inspector/TARGETS @@ -14,10 +14,10 @@ python_library( "fbsource//third-party/pypi/pandas:pandas", "fbsource//third-party/pypi/tabulate:tabulate", ":inspector_utils", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etrecord:etrecord", ], ) @@ -27,7 +27,7 @@ python_binary( main_src = "inspector_cli.py", deps = [ ":inspector_utils", - "//executorch/sdk:lib", + "//executorch/devtools:lib", ], ) @@ -40,11 +40,11 @@ python_library( "fbsource//third-party/pypi/matplotlib:matplotlib", "fbsource//third-party/pypi/numpy:numpy", "//caffe2:torch", - "//executorch/sdk/debug_format:base_schema", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etdump:serialize", - "//executorch/sdk/etrecord:etrecord", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", + "//executorch/devtools/etrecord:etrecord", ], ) diff --git a/sdk/inspector/__init__.py b/devtools/inspector/__init__.py similarity index 60% rename from sdk/inspector/__init__.py rename to devtools/inspector/__init__.py index bef3d363d58..ff9bb814791 100644 --- a/sdk/inspector/__init__.py +++ b/devtools/inspector/__init__.py @@ -4,7 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from executorch.sdk.inspector._inspector import Event, EventBlock, Inspector, PerfData -from executorch.sdk.inspector._inspector_utils import TimeScale +from executorch.devtools.inspector._inspector import ( + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector_utils import TimeScale __all__ = ["Event", "EventBlock", "Inspector", "PerfData", "TimeScale"] diff --git a/sdk/inspector/_inspector.py b/devtools/inspector/_inspector.py similarity index 99% rename from sdk/inspector/_inspector.py rename to devtools/inspector/_inspector.py index 5f9bfafee70..f98e3cd3a56 100644 --- a/sdk/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -26,16 +26,19 @@ Union, ) -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import numpy as np import pandas as pd -from executorch.exir import ExportedProgram -from executorch.sdk.debug_format.et_schema import OperatorGraph, OperatorNode -from executorch.sdk.etdump.schema_flatcc import DebugEvent, ETDumpFlatCC, ProfileEvent -from executorch.sdk.etrecord import ETRecord, parse_etrecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.debug_format.et_schema import OperatorGraph, OperatorNode +from executorch.devtools.etdump.schema_flatcc import ( + DebugEvent, + ETDumpFlatCC, + ProfileEvent, +) +from executorch.devtools.etrecord import ETRecord, parse_etrecord +from executorch.devtools.inspector._inspector_utils import ( create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, EXCLUDED_COLUMNS_WHEN_PRINTING, @@ -53,6 +56,7 @@ TimeScale, verify_debug_data_equivalence, ) +from executorch.exir import ExportedProgram from tabulate import tabulate diff --git a/sdk/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py similarity index 97% rename from sdk/inspector/_inspector_utils.py rename to devtools/inspector/_inspector_utils.py index 6879e855057..98b5fdc722f 100644 --- a/sdk/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -8,14 +8,14 @@ from enum import Enum from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import torch -from executorch.sdk.debug_format.base_schema import OperatorNode +from executorch.devtools.debug_format.base_schema import OperatorNode -from executorch.sdk.debug_format.et_schema import FXOperatorGraph, OperatorGraph -from executorch.sdk.etdump.schema_flatcc import ( +from executorch.devtools.debug_format.et_schema import FXOperatorGraph, OperatorGraph +from executorch.devtools.etdump.schema_flatcc import ( DebugEvent, ETDumpFlatCC, ProfileEvent, @@ -25,8 +25,8 @@ ValueType, ) -from executorch.sdk.etdump.serialize import deserialize_from_etdump_flatcc -from executorch.sdk.etrecord import ETRecord +from executorch.devtools.etdump.serialize import deserialize_from_etdump_flatcc +from executorch.devtools.etrecord import ETRecord FORWARD = "forward" EDGE_DIALECT_GRAPH_KEY = "edge_dialect_graph_module" diff --git a/sdk/inspector/inspector_cli.py b/devtools/inspector/inspector_cli.py similarity index 93% rename from sdk/inspector/inspector_cli.py rename to devtools/inspector/inspector_cli.py index d6c8d5442f3..bd76607a944 100644 --- a/sdk/inspector/inspector_cli.py +++ b/devtools/inspector/inspector_cli.py @@ -6,8 +6,8 @@ import argparse -from executorch.sdk import Inspector -from 
executorch.sdk.inspector._inspector_utils import compare_results, TimeScale +from executorch.devtools import Inspector +from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale def main() -> None: diff --git a/devtools/inspector/tests/TARGETS b/devtools/inspector/tests/TARGETS new file mode 100644 index 00000000000..eada6817bcb --- /dev/null +++ b/devtools/inspector/tests/TARGETS @@ -0,0 +1,41 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "inspector_test", + srcs = ["inspector_test.py"], + deps = [ + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + "//executorch/exir:lib", + ], +) + +python_unittest( + name = "event_blocks_test", + srcs = ["event_blocks_test.py"], + deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + ], +) + +python_unittest( + name = "inspector_utils_test", + srcs = ["inspector_utils_test.py"], + deps = [ + "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector_utils", + ], +) diff --git a/sdk/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py similarity index 98% rename from sdk/inspector/tests/event_blocks_test.py rename to devtools/inspector/tests/event_blocks_test.py index 7c7da001860..4101035f99b 100644 --- a/sdk/inspector/tests/event_blocks_test.py +++ b/devtools/inspector/tests/event_blocks_test.py @@ -8,10 +8,10 @@ import unittest from typing import List, Optional, Tuple, Union -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent -from executorch.sdk.inspector import Event, EventBlock, PerfData -from executorch.sdk.inspector._inspector import ( +import executorch.devtools.etdump.schema_flatcc as flatcc +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent +from executorch.devtools.inspector import Event, EventBlock, PerfData +from executorch.devtools.inspector._inspector import ( DelegateMetadata, EventSignature, InstructionEvent, diff --git a/sdk/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py similarity index 97% rename from sdk/inspector/tests/inspector_test.py rename to devtools/inspector/tests/inspector_test.py index a372c7c569c..55f0cd10ae9 100644 --- a/sdk/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -14,14 +14,19 @@ from unittest.mock import patch -from executorch.exir import ExportedProgram -from executorch.sdk import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.et_schema import OperatorNode -from executorch.sdk.etdump.schema_flatcc import ProfileEvent -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord - -from executorch.sdk.inspector import _inspector, Event, EventBlock, Inspector, PerfData -from executorch.sdk.inspector._inspector import ( +from executorch.devtools import generate_etrecord, parse_etrecord +from 
executorch.devtools.debug_format.et_schema import OperatorNode +from executorch.devtools.etdump.schema_flatcc import ProfileEvent +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord + +from executorch.devtools.inspector import ( + _inspector, + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector import ( DebugEventSignature, flatcc, InstructionEvent, @@ -29,6 +34,8 @@ ProfileEventSignature, ) +from executorch.exir import ExportedProgram + OP_TYPE = "aten::add" EVENT_BLOCK_NAME = "block_0" diff --git a/sdk/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py similarity index 94% rename from sdk/inspector/tests/inspector_utils_test.py rename to devtools/inspector/tests/inspector_utils_test.py index b5b9b54d6c4..d853732fcc7 100644 --- a/sdk/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -10,19 +10,19 @@ import torch -from executorch.sdk import generate_etrecord, parse_etrecord +from executorch.devtools import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( OperatorGraph, OperatorNode, ValueNode, ) -from executorch.sdk.debug_format.et_schema import FXOperatorGraph -from executorch.sdk.etdump import schema_flatcc as flatcc +from executorch.devtools.debug_format.et_schema import FXOperatorGraph +from executorch.devtools.etdump import schema_flatcc as flatcc -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord +from executorch.devtools.inspector._inspector_utils import ( create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, find_populated_event, diff --git a/sdk/size_analysis_tool/TARGETS b/devtools/size_analysis_tool/TARGETS similarity index 86% rename from sdk/size_analysis_tool/TARGETS rename to devtools/size_analysis_tool/TARGETS index 44ae0aa6f8b..c365ba152d5 100644 --- a/sdk/size_analysis_tool/TARGETS +++ b/devtools/size_analysis_tool/TARGETS @@ -12,9 +12,9 @@ python_library( visibility = ["PUBLIC"], deps = [ "//caffe2:torch", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", - "//executorch/sdk:lib", ], ) @@ -23,13 +23,13 @@ python_binary( srcs = [ "size_analysis_tool.py", ], - main_function = "executorch.sdk.size_analysis_tool.size_analysis_tool.main", + main_function = "executorch.devtools.size_analysis_tool.size_analysis_tool.main", visibility = ["PUBLIC"], deps = [ "//caffe2:torch", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", - "//executorch/sdk:lib", ], ) @@ -43,9 +43,9 @@ python_unittest( "//caffe2:torch", "//executorch/backends/xnnpack/partition:xnnpack_partitioner", "//executorch/backends/xnnpack/utils:xnnpack_utils", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", "//executorch/exir/passes:spec_prop_pass", - "//executorch/sdk:lib", ], ) diff --git a/sdk/size_analysis_tool/size_analysis_tool.py b/devtools/size_analysis_tool/size_analysis_tool.py similarity index 99% rename from sdk/size_analysis_tool/size_analysis_tool.py rename to devtools/size_analysis_tool/size_analysis_tool.py index d17ec5ac477..8ea8ddbbf49 100644 --- a/sdk/size_analysis_tool/size_analysis_tool.py +++ b/devtools/size_analysis_tool/size_analysis_tool.py @@ 
-9,10 +9,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch +from executorch.devtools import parse_etrecord from executorch.exir import ExportedProgram from executorch.exir.backend.backend_api import LoweredBackendModule -from executorch.sdk import parse_etrecord def _get_tensor_data(node: torch.fx.Node, tensor: torch.Tensor) -> Dict[str, Any]: diff --git a/sdk/size_analysis_tool/size_analysis_tool_test.py b/devtools/size_analysis_tool/size_analysis_tool_test.py similarity index 98% rename from sdk/size_analysis_tool/size_analysis_tool_test.py rename to devtools/size_analysis_tool/size_analysis_tool_test.py index 3e1efec77b5..96feae7e423 100644 --- a/sdk/size_analysis_tool/size_analysis_tool_test.py +++ b/devtools/size_analysis_tool/size_analysis_tool_test.py @@ -14,12 +14,12 @@ get_xnnpack_executorch_backend_config, ) from executorch.backends.xnnpack.utils.utils import capture_graph_for_xnnpack -from executorch.exir.backend.backend_api import to_backend, validation_disabled -from executorch.exir.passes.spec_prop_pass import SpecPropPass -from executorch.sdk.size_analysis_tool.size_analysis_tool import ( +from executorch.devtools.size_analysis_tool.size_analysis_tool import ( generate_model_size_information, ) +from executorch.exir.backend.backend_api import to_backend, validation_disabled +from executorch.exir.passes.spec_prop_pass import SpecPropPass class SizeAnalysisToolTest(unittest.TestCase): diff --git a/sdk/targets.bzl b/devtools/targets.bzl similarity index 100% rename from sdk/targets.bzl rename to devtools/targets.bzl diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 9e236e8e489..97528c95405 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -132,7 +132,7 @@ Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. Create an instanc #include #include #include -#include +#include using namespace ::torch::executor; diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 5fffb7e8caf..6d79e1e0fd4 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -763,7 +763,7 @@ In your export script, after calling `to_edge()` and `to_executorch()`, call `ge ``` import copy -from executorch.sdk import generate_etrecord +from executorch.devtools import generate_etrecord # Make the deep copy immediately after to to_edge() edge_manager_copy = copy.deepcopy(edge_manager) @@ -784,7 +784,7 @@ Include the ETDump header in your code. ```cpp // main.cpp -#include +#include ``` Create an Instance of the ETDumpGen class and pass it to the Module constructor. @@ -835,7 +835,7 @@ Run the runner, you will see “etdump.etdp” generated. Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information. ```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector inspector = Inspector(etdump_path="etdump.etdp") # If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")` diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index 33deae3904b..288fce93df6 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -28,7 +28,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestCase` ```{eval-rst} -.. 
autofunction:: executorch.sdk.bundled_program.config.MethodTestCase.__init__ +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestCase.__init__ :noindex: ``` ::: @@ -38,7 +38,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestSuite` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.config.MethodTestSuite +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestSuite :noindex: ``` ::: @@ -48,13 +48,13 @@ Since each model may have multiple inference methods, we need to generate `List[ ### Step 3: Generate `BundledProgram` -We provide `BundledProgram` class under `executorch/sdk/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including +We provide `BundledProgram` class under `executorch/devtools/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`: :::{dropdown} `BundledProgram` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.core.BundledProgram.__init__ +.. autofunction:: executorch.devtools.bundled_program.core.BundledProgram.__init__ :noindex: ``` ::: @@ -65,18 +65,18 @@ Construtor of `BundledProgram `will do sannity check internally to see if the gi ### Step 4: Serialize `BundledProgram` to Flatbuffer. -To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/sdk/bundled_program/serialize/__init__.py`. +To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/devtools/bundled_program/serialize/__init__.py`. :::{dropdown} Serialize and Deserialize ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. autofunction:: serialize_from_bundled_program_to_flatbuffer :noindex: ``` ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. 
autofunction:: deserialize_from_flatbuffer_to_bundled_program :noindex: ``` @@ -90,10 +90,10 @@ Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch m import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) from torch._export import capture_pre_autograd_graph @@ -187,7 +187,7 @@ with open(save_path, "wb") as f: We can also regenerate `BundledProgram` from flatbuffer file if needed: ```python -from executorch.sdk.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program +from executorch.devtools.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program save_path = "bundled_program.bpte" with open(save_path, "rb") as f: serialized_bundled_program = f.read() @@ -313,9 +313,9 @@ Here's the example of the dtype of test input not meet model's requirement: import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -400,7 +400,7 @@ Cell In[1], line 72 68 ] 70 # Step 3: Generate BundledProgram ---> 72 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 
265 266 Args: @@ -411,7 +411,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) 215 # type of tensor input should match execution plan 216 if type(cur_plan_test_inputs[j]) == torch.Tensor: 217 # pyre-fixme[16]: Undefined attribute [16]: Item `bool` of `typing.Union[bool, float, int, torch._tensor.Tensor]` @@ -449,9 +449,9 @@ Another common error would be the method name in any `MethodTestSuite` does not import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -532,7 +532,7 @@ Cell In[3], line 73 70 method_test_suites[0].method_name = "MISSING_METHOD_NAME" 72 # Generate BundledProgram ---> 73 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 265 266 Args: @@ -543,7 +543,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) 138 method_name_of_program = {e.name for e in program.execution_plan} 139 method_name_of_test_suites = {t.method_name for t in method_test_suites} --> 141 assert method_name_of_test_suites.issubset( diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 45e50b44e87..14d4af0f153 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -38,7 +38,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Once a model has been run, using the generated ETDump and debug buffers, users can leverage the [Inspector API's](./sdk-inspector.rst) to inspect these debug outputs. ```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector # Create an Inspector instance with etdump and the debug buffer. inspector = Inspector(etdump_path=etdump_path, @@ -67,7 +67,7 @@ We've also provided a simple set of utilities that let users perform quality ana ```python -from executorch.sdk.inspector._inspector_utils import compare_results +from executorch.devtools.inspector._inspector_utils import compare_results # Run a simple quality analysis between the model outputs sourced from the # runtime and a set of reference outputs. 
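For readers following the debugging-doc changes above, a minimal sketch of how the renamed imports compose after this patch is shown below. It only assumes the two imports the patch itself rewrites; the artifact file name, the placeholder tensors, and the exact calling convention of `compare_results` are illustrative assumptions, not part of the patch.

```python
import torch

from executorch.devtools import Inspector
from executorch.devtools.inspector._inspector_utils import compare_results

# Load the ETDump produced by an instrumented runtime run.
# The file name is a placeholder.
inspector = Inspector(etdump_path="etdump.etdp")

# Placeholder tensors standing in for runtime outputs and eager references.
runtime_outputs = [torch.randn(1, 10)]
reference_outputs = [torch.randn(1, 10)]

# Quality analysis between runtime and reference outputs; the positional
# calling convention here is an assumption about compare_results' signature.
compare_results(reference_outputs, runtime_outputs)
```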
diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index 4eacb18b14c..aad623efc8a 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -9,7 +9,7 @@ Generating an ETDump is a relatively straightforward process. Users can follow t 1. ***Include*** the ETDump header in your code. ```C++ -#include +#include ``` 2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime. diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index 43ed5095c64..b3b7f042cc4 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -31,7 +31,7 @@ they are interested in working with via our tooling. .. warning:: Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. -.. currentmodule:: executorch.sdk.etrecord._etrecord +.. currentmodule:: executorch.devtools.etrecord._etrecord .. autofunction:: generate_etrecord Using an ``ETRecord`` diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst index e15c1f2a395..448f30cfb55 100644 --- a/docs/source/sdk-inspector.rst +++ b/docs/source/sdk-inspector.rst @@ -26,26 +26,26 @@ Inspector Methods Constructor ~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.__init__ +.. autofunction:: executorch.devtools.Inspector.__init__ **Example Usage:** .. code:: python - from executorch.sdk import Inspector + from executorch.devtools import Inspector inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") to_dataframe ~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.to_dataframe +.. autofunction:: executorch.devtools.Inspector.to_dataframe print_data_tabular ~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.print_data_tabular +.. autofunction:: executorch.devtools.Inspector.print_data_tabular .. _example-usage-1: @@ -62,7 +62,7 @@ Note that the unit of delegate profiling events is "cycles". We're working on pr find_total_for_module ~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.find_total_for_module +.. autofunction:: executorch.devtools.Inspector.find_total_for_module .. _example-usage-2: @@ -80,7 +80,7 @@ find_total_for_module get_exported_program ~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.get_exported_program +.. autofunction:: executorch.devtools.Inspector.get_exported_program .. _example-usage-3: @@ -119,7 +119,7 @@ of an ``Inspector`` instance, for example: inspector.event_blocks -.. autoclass:: executorch.sdk.inspector.EventBlock +.. autoclass:: executorch.devtools.inspector.EventBlock ``Event`` Class ~~~~~~~~~~~~~~~ @@ -127,7 +127,7 @@ of an ``Inspector`` instance, for example: Access ``Event`` instances through the ``events`` attribute of an ``EventBlock`` instance. -.. autoclass:: executorch.sdk.inspector.Event +.. autoclass:: executorch.devtools.inspector.Event **Example Usage:** @@ -152,7 +152,7 @@ table. This command produces the identical table output as calling the .. code:: bash - python3 -m sdk.inspector.inspector_cli --etdump_path --etrecord_path + python3 -m devtools.inspector.inspector_cli --etdump_path --etrecord_path Note that the `etrecord_path` argument is optional. 
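To make the Inspector doc changes above concrete, here is a rough post-rename usage sketch. The constructor arguments and `print_data_tabular` come straight from the docs being edited; the paths are placeholders, and the attribute names accessed inside the loop are assumptions about the `EventBlock`/`Event` classes beyond what this patch shows.

```python
from executorch.devtools import Inspector

# Placeholder paths; pass the ETDump/ETRecord produced by your own run.
inspector = Inspector(
    etdump_path="/path/to/etdump.etdp",
    etrecord="/path/to/etrecord.bin",
)

# Tabular view of all runtime events, as in the docs above.
inspector.print_data_tabular()

# Walk event blocks and their events. The `name` fields below are assumed;
# adjust to the actual Event/EventBlock schema.
for block in inspector.event_blocks:
    for event in block.events:
        print(block.name, event.name)
```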
diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index ccc2e480ad0..35d200204cb 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -38,9 +38,9 @@ # # The first step is to generate an ``ETRecord``. ``ETRecord`` contains model # graphs and metadata for linking runtime results (such as profiling) to -# the eager model. This is generated via ``executorch.sdk.generate_etrecord``. +# the eager model. This is generated via ``executorch.devtools.generate_etrecord``. # -# ``executorch.sdk.generate_etrecord`` takes in an output file path (str), the +# ``executorch.devtools.generate_etrecord`` takes in an output file path (str), the # edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model # (``ExecutorchProgramManager``), and an optional dictionary of additional models. # @@ -51,6 +51,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from executorch.devtools import generate_etrecord from executorch.exir import ( EdgeCompileConfig, @@ -58,7 +59,6 @@ ExecutorchProgramManager, to_edge, ) -from executorch.sdk import generate_etrecord from torch.export import export, ExportedProgram @@ -129,14 +129,14 @@ def forward(self, x): # In this tutorial, a `Bundled Program` is created from the example model above. import torch +from executorch.devtools import BundledProgram -from executorch.exir import to_edge -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) + +from executorch.exir import to_edge from torch.export import export # Step 1: ExecuTorch Program Export @@ -188,7 +188,7 @@ def forward(self, x): # # To visualize all runtime events, call Inspector's ``print_data_tabular``. -from executorch.sdk import Inspector +from executorch.devtools import Inspector # sphinx_gallery_start_ignore inspector_patch = patch.object(Inspector, "__init__", return_value=None) diff --git a/docs/website/docs/tutorials/bundled_program.md b/docs/website/docs/tutorials/bundled_program.md index ac67d6f6285..fb119df7310 100644 --- a/docs/website/docs/tutorials/bundled_program.md +++ b/docs/website/docs/tutorials/bundled_program.md @@ -122,7 +122,7 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( ### Example -Here we provide an example about how to run the bundled program step by step. Most of the code are borrowed from "fbcode/executorch/sdk/fb/runners/executor_runner.cpp" and please review that file if you need more info and context: +Here we provide an example about how to run the bundled program step by step. 
Most of the code are borrowed from "fbcode/executorch/devtools/fb/runners/executor_runner.cpp" and please review that file if you need more info and context: ```c++ // method_name is the name for the method we want to test diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm index 4cc21ba30a2..2475d68fa9b 100644 --- a/examples/apple/coreml/executor_runner/main.mm +++ b/examples/apple/coreml/executor_runner/main.mm @@ -13,7 +13,7 @@ #import #import #import -#import +#import #import #import #import diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index 16c5dea02a4..b57a8f12e7c 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -56,7 +56,7 @@ mkdir -p "$EXECUTORCH_INCLUDE_DIR_PATH" find extension \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find runtime \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find util \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; -find sdk \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; +find devtools \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" # Copy required libraries diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index 4bf26a7f3ea..5a8c9b227f6 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -17,10 +17,10 @@ from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.devtools.etrecord import generate_etrecord from executorch.exir import to_edge from executorch.exir.backend.backend_api import to_backend -from executorch.sdk.etrecord import generate_etrecord from torch.export import export REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py index 768465f770a..e0b81d4affb 100644 --- a/examples/apple/coreml/scripts/inspector_cli.py +++ b/examples/apple/coreml/scripts/inspector_cli.py @@ -8,8 +8,8 @@ from pathlib import Path -from executorch.sdk import Inspector -from executorch.sdk.inspector._inspector_utils import compare_results +from executorch.devtools import Inspector +from executorch.devtools.inspector._inspector_utils import compare_results def get_root_dir_path() -> Path: diff --git a/examples/apple/coreml/scripts/inspector_utils.py b/examples/apple/coreml/scripts/inspector_utils.py index 1736c2cefbf..c5674ec520b 100644 --- a/examples/apple/coreml/scripts/inspector_utils.py +++ b/examples/apple/coreml/scripts/inspector_utils.py @@ -20,6 +20,13 @@ from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.devtools import BundledProgram, generate_etrecord, Inspector +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) +from executorch.devtools.inspector import Event + from executorch.exir import ( 
EdgeProgramManager, ExecutorchBackendConfig, @@ -30,14 +37,6 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.tracer import Value -from executorch.sdk import BundledProgram, generate_etrecord, Inspector - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from executorch.sdk.inspector import Event - from torch.export import export, ExportedProgram COREML_METADATA_KEYS: Final[List[Tuple[str, str]]] = [ diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index d1dd8e93d7e..319d8159ced 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -92,8 +92,8 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") include(${EXECUTORCH_SRCS_FILE}) target_include_directories( bundled_program - INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/include - ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/bundled_program + INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/include + ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/bundled_program ${EXECUTORCH_ROOT}/third-party/flatbuffers/include ${EXECUTORCH_ROOT}/third-party/flatcc/include ${_mps_schema_headers} diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm index 604419a620e..040b2fcd996 100644 --- a/examples/apple/mps/executor_runner/mps_executor_runner.mm +++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm @@ -30,8 +30,8 @@ #include #include #include -#include -#include +#include +#include #include using namespace std::chrono; diff --git a/examples/apple/mps/executor_runner/targets.bzl b/examples/apple/mps/executor_runner/targets.bzl index fd0a7a50468..14399411ae3 100644 --- a/examples/apple/mps/executor_runner/targets.bzl +++ b/examples/apple/mps/executor_runner/targets.bzl @@ -28,9 +28,9 @@ def define_common_targets(): "//executorch/extension/data_loader:file_data_loader", "//executorch/kernels/portable:generated_lib", "//executorch/extension/data_loader:file_data_loader", - "//executorch/sdk/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/sdk/bundled_program:runtime", + "//executorch/devtools/bundled_program:runtime", ], external_deps = [ "gflags", diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index e561afb1858..636444e2b78 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -14,6 +14,11 @@ from executorch import exir from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner +from executorch.devtools import BundledProgram, generate_etrecord +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ( EdgeCompileConfig, @@ -24,11 +29,6 @@ from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge, save_pte_program -from executorch.sdk import BundledProgram, generate_etrecord -from executorch.sdk.bundled_program.config import 
MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from ....models import MODEL_NAME_TO_MODEL from ....models.model_factory import EagerModelFactory diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index 9bdbff5fbb3..467949a5ebf 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -93,7 +93,7 @@ runtime.python_library( # "//executorch/extension/pybindings:aten_lib", # "//executorch/extension/pybindings:portable_lib", # "//executorch/extension/pybindings:portable_lib_plus_custom", - "//executorch/sdk/etrecord:etrecord", + "//executorch/devtools/etrecord:etrecord", "//executorch/util:memory_profiler", "//executorch/util:python_profiler", "fbsource//third-party/pypi/coremltools:coremltools", diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 8ff5d3aa265..221f2f75bc6 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -22,6 +22,8 @@ import torch +from executorch.devtools.etrecord import generate_etrecord + from executorch.examples.models.llama2.llama_transformer import ModelArgs from executorch.extension.llm.export.builder import DType, LLMEdgeManager @@ -40,8 +42,6 @@ get_pt2e_quantizers, get_qnn_quantizer, ) - -from executorch.sdk.etrecord import generate_etrecord from executorch.util.activation_memory_profiler import generate_memory_trace from ..model_factory import EagerModelFactory diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 7cd3709b950..c2a6c2c46c6 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index b12a44993de..8339b9f5b58 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -15,12 +15,12 @@ generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) +from executorch.devtools import generate_etrecord from executorch.examples.models import MODEL_NAME_TO_MODEL from executorch.examples.models.model_factory import EagerModelFactory from executorch.exir.backend.backend_api import to_backend, validation_disabled from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import save_pte_program -from executorch.sdk import generate_etrecord from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e diff --git a/examples/sdk/CMakeLists.txt b/examples/sdk/CMakeLists.txt index 76034b07601..af7e9b15bc5 100644 --- a/examples/sdk/CMakeLists.txt +++ b/examples/sdk/CMakeLists.txt @@ -49,7 +49,7 @@ add_executable(sdk_example_runner sdk_example_runner/sdk_example_runner.cpp) target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED) target_include_directories( - etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../sdk/include + etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../devtools/include ${EXECUTORCH_ROOT}/third-party/flatcc/include ) target_link_libraries( diff --git a/examples/sdk/README.md b/examples/sdk/README.md index 68043517fba..096f90864e7 100644 --- a/examples/sdk/README.md 
+++ b/examples/sdk/README.md @@ -59,11 +59,11 @@ Running the program will generate an `ETDump` file (`.etdp`) at the location spe Once an `ETDump` has been generated, it can be viewed using the CLI inspector. This will print a tabular view of the data recorded in the ETDump. ```bash - python3 -m sdk.inspector.inspector_cli --etdump_path mv2_etdump.etdp + python3 -m devtools.inspector.inspector_cli --etdump_path mv2_etdump.etdp ``` ### ETDump C++ API -ETDump profiling can also be used in a custom C++ program. `ETDumpGen` is an implementation of the abstract `EventTracer` class. Include the header file located at `sdk/etdump/etdump_flatcc.h`. To initialize the ETDump generator, construct it before loading the method from the program. +ETDump profiling can also be used in a custom C++ program. `ETDumpGen` is an implementation of the abstract `EventTracer` class. Include the header file located at `devtools/etdump/etdump_flatcc.h`. To initialize the ETDump generator, construct it before loading the method from the program. ```cpp torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); diff --git a/examples/sdk/scripts/export_bundled_program.py b/examples/sdk/scripts/export_bundled_program.py index a34a0ab4d34..052f5e99629 100644 --- a/examples/sdk/scripts/export_bundled_program.py +++ b/examples/sdk/scripts/export_bundled_program.py @@ -11,19 +11,19 @@ from typing import List import torch - -from executorch.exir import ExecutorchProgramManager -from executorch.extension.export_util.utils import export_to_exec_prog -from executorch.sdk import BundledProgram -from executorch.sdk.bundled_program.config import ( +from executorch.devtools import BundledProgram +from executorch.devtools.bundled_program.config import ( MethodInputType, MethodTestCase, MethodTestSuite, ) -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) +from executorch.exir import ExecutorchProgramManager +from executorch.extension.export_util.utils import export_to_exec_prog + from ...models import MODEL_NAME_TO_MODEL from ...models.model_factory import EagerModelFactory diff --git a/examples/sdk/scripts/gen_sample_etrecord.py b/examples/sdk/scripts/gen_sample_etrecord.py index c219ed4094f..d2c4913b035 100644 --- a/examples/sdk/scripts/gen_sample_etrecord.py +++ b/examples/sdk/scripts/gen_sample_etrecord.py @@ -10,6 +10,7 @@ from typing import Any import torch +from executorch.devtools import generate_etrecord from executorch.exir import ( EdgeCompileConfig, EdgeProgramManager, @@ -18,7 +19,6 @@ to_edge, ) from executorch.exir.capture._config import ExecutorchBackendConfig -from executorch.sdk import generate_etrecord from torch.export import export from ...models import MODEL_NAME_TO_MODEL diff --git a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp b/examples/sdk/sdk_example_runner/sdk_example_runner.cpp index e2e42ab670a..7e979937d1b 100644 --- a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp +++ b/examples/sdk/sdk_example_runner/sdk_example_runner.cpp @@ -22,13 +22,13 @@ #include +#include +#include #include #include #include #include #include -#include -#include #include static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB diff --git a/examples/sdk/sdk_example_runner/targets.bzl b/examples/sdk/sdk_example_runner/targets.bzl index a5e8feb33c3..025d42fee1a 100644 --- a/examples/sdk/sdk_example_runner/targets.bzl +++ b/examples/sdk/sdk_example_runner/targets.bzl @@ -20,8 +20,8 
@@ def define_common_targets(): "//executorch/extension/data_loader:file_data_loader", "//executorch/extension/data_loader:buffer_data_loader", "//executorch/util:util", - "//executorch/sdk/etdump:etdump_flatcc", - "//executorch/sdk/bundled_program:runtime", + "//executorch/devtools/etdump:etdump_flatcc", + "//executorch/devtools/bundled_program:runtime", ], external_deps = [ "gflags", diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py index a816c4f0e74..32d67e0cd4a 100644 --- a/examples/xnnpack/aot_compiler.py +++ b/examples/xnnpack/aot_compiler.py @@ -12,9 +12,9 @@ import torch from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.devtools import generate_etrecord from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge, save_pte_program -from executorch.sdk import generate_etrecord from ..models import MODEL_NAME_TO_MODEL from ..models.model_factory import EagerModelFactory diff --git a/examples/xnnpack/targets.bzl b/examples/xnnpack/targets.bzl index 30cafa56fa9..35df8999b47 100644 --- a/examples/xnnpack/targets.bzl +++ b/examples/xnnpack/targets.bzl @@ -32,7 +32,7 @@ def define_common_targets(): "//executorch/examples/xnnpack/quantization:quant_utils", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", - "//executorch/sdk:lib", + "//executorch/devtools:lib", ], ) diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS index 8ddf8300395..49419a4159c 100644 --- a/exir/_serialize/TARGETS +++ b/exir/_serialize/TARGETS @@ -14,8 +14,8 @@ cpp_python_extension( "//executorch/backends/fb/qnnpack/...", "//executorch/backends/vulkan/...", "//executorch/backends/xnnpack/...", - "//executorch/sdk/bundled_program/...", - "//executorch/sdk/etdump/...", + "//executorch/devtools/bundled_program/...", + "//executorch/devtools/etdump/...", ], deps = [ "fbsource//third-party/flatbuffers:flatc_library", @@ -45,6 +45,10 @@ runtime.python_library( visibility = [ "//executorch/backends/...", "//executorch/codegen/...", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/tests/...", + "//executorch/devtools/experimental/...", "//executorch/examples/async_exec:emit_program_lib", "//executorch/exir/...", "//executorch/exir/tests/...", @@ -52,10 +56,6 @@ runtime.python_library( "//executorch/extension/pybindings/test:test", "//executorch/extension/pybindings/test:test-library", "//executorch/profiler/...", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program/serialize:lib", - "//executorch/sdk/bundled_program/tests/...", - "//executorch/sdk/experimental/...", "//executorch/test/...", "@EXECUTORCH_CLIENTS", ], diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 83cec280b89..7c98ee4aa06 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -17,6 +17,9 @@ #include #include +#include +#include +#include #include #include #include @@ -28,9 +31,6 @@ #include #include #include -#include -#include -#include #include #include diff --git a/pytest.ini b/pytest.ini index 5ed1780e611..7298773255a 100644 --- a/pytest.ini +++ b/pytest.ini @@ -13,8 +13,7 @@ addopts = --ignore-glob=backends/arm/**/* # explicitly list out tests that are running successfully in oss examples/models/test - # sdk/ - sdk/ + devtools/ # examples examples/models/llama2/tests # examples/models/llava/test TODO: 
enable this diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index 401581421df..5ba989ef86a 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -19,7 +19,7 @@ def define_common_targets(is_fbcode = False): "//executorch/exir/backend/test/...", "//executorch/runtime/backend/...", "//executorch/extension/pybindings/...", - "//executorch/sdk/fb/runners/...", + "//executorch/devtools/fb/runners/...", "//executorch/test/...", "//executorch/examples/...", ], @@ -43,7 +43,7 @@ def define_common_targets(is_fbcode = False): "//executorch/exir/backend/test/...", "//executorch/runtime/backend/...", "//executorch/extension/pybindings/...", - "//executorch/sdk/fb/runners/...", + "//executorch/devtools/fb/runners/...", "//executorch/test/...", "//executorch/examples/...", ], diff --git a/schema/targets.bzl b/schema/targets.bzl index 2c797baa16b..40c6d8d5c8d 100644 --- a/schema/targets.bzl +++ b/schema/targets.bzl @@ -57,7 +57,7 @@ def define_common_targets(): name = INPUT_SCALAR_TYPE, visibility = [ "//executorch/exir/_serialize/...", - "//executorch/sdk/etdump/...", + "//executorch/devtools/etdump/...", ], ) diff --git a/sdk/inspector/tests/TARGETS b/sdk/inspector/tests/TARGETS deleted file mode 100644 index 374d2ea7538..00000000000 --- a/sdk/inspector/tests/TARGETS +++ /dev/null @@ -1,40 +0,0 @@ -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") - -oncall("executorch") - -python_unittest( - name = "inspector_test", - srcs = ["inspector_test.py"], - deps = [ - "//executorch/exir:lib", - "//executorch/sdk:lib", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etrecord/tests:etrecord_test_library", - "//executorch/sdk/inspector:inspector", - "//executorch/sdk/inspector:lib", - ], -) - -python_unittest( - name = "event_blocks_test", - srcs = ["event_blocks_test.py"], - deps = [ - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/inspector:inspector", - "//executorch/sdk/inspector:lib", - ], -) - -python_unittest( - name = "inspector_utils_test", - srcs = ["inspector_utils_test.py"], - deps = [ - "//executorch/sdk:lib", - "//executorch/sdk/debug_format:base_schema", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etrecord/tests:etrecord_test_library", - "//executorch/sdk/inspector:inspector_utils", - ], -) diff --git a/setup.py b/setup.py index 58a9973c9f9..75b3ece526e 100644 --- a/setup.py +++ b/setup.py @@ -360,12 +360,12 @@ def run(self): ("schema/scalar_type.fbs", "exir/_serialize/scalar_type.fbs"), ("schema/program.fbs", "exir/_serialize/program.fbs"), ( - "sdk/bundled_program/schema/bundled_program_schema.fbs", - "sdk/bundled_program/serialize/bundled_program_schema.fbs", + "devtools/bundled_program/schema/bundled_program_schema.fbs", + "devtools/bundled_program/serialize/bundled_program_schema.fbs", ), ( - "sdk/bundled_program/schema/scalar_type.fbs", - "sdk/bundled_program/serialize/scalar_type.fbs", + "devtools/bundled_program/schema/scalar_type.fbs", + "devtools/bundled_program/serialize/scalar_type.fbs", ), ] for src, dst in src_to_dst: @@ -606,8 +606,8 @@ def get_ext_modules() -> List[Extension]: "executorch/extension": "extension", "executorch/kernels/quantized": "kernels/quantized", "executorch/schema": "schema", - "executorch/sdk": "sdk", - "executorch/sdk/bundled_program": "sdk/bundled_program", + "executorch/devtools": "devtools", + 
"executorch/devtools/bundled_program": "devtools/bundled_program", "executorch/util": "util", # Note: This will install a top-level module called "serializer", # which seems too generic and might conflict with other pip packages. diff --git a/shim/xplat/executorch/extension/pybindings/pybindings.bzl b/shim/xplat/executorch/extension/pybindings/pybindings.bzl index f62c567ba40..813b420dbaa 100644 --- a/shim/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim/xplat/executorch/extension/pybindings/pybindings.bzl @@ -10,29 +10,29 @@ MODELS_ATEN_OPS_LEAN_MODE_GENERATED_LIB = [ PORTABLE_MODULE_DEPS = [ "//executorch/runtime/kernel:operator_registry", "//executorch/runtime/executor:program", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", "//executorch/extension/aten_util:aten_bridge", - "//executorch/sdk/bundled_program:runtime", + "//executorch/devtools/bundled_program:runtime", "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/util:util", "//executorch/runtime/executor/test:test_backend_compiler_lib", - "//executorch/sdk/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", ] + get_all_cpu_backend_targets() ATEN_MODULE_DEPS = [ "//executorch/runtime/kernel:operator_registry", "//executorch/runtime/executor:program_aten", "//executorch/runtime/core/exec_aten:lib", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/util:read_file", - "//executorch/sdk/bundled_program:runtime_aten", + "//executorch/devtools/bundled_program:runtime_aten", "//executorch/runtime/executor/test:test_backend_compiler_lib_aten", - "//executorch/sdk/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", ] # Generated lib for all ATen ops with aten kernel used by models in model inventory diff --git a/test/end2end/TARGETS b/test/end2end/TARGETS index 8c0885e32eb..fdac0e4887a 100644 --- a/test/end2end/TARGETS +++ b/test/end2end/TARGETS @@ -42,6 +42,9 @@ python_unittest( ":exported_module", ":register_scratch_meta_fns", "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir:dynamic_shape", "//executorch/exir:lib", "//executorch/exir:memory", @@ -57,9 +60,6 @@ python_unittest( "//executorch/exir/tests:transformer", "//executorch/extension/pybindings:aten_lib", "//executorch/extension/pytree:pybindings", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/serialize:lib", ], ) @@ -73,6 +73,9 @@ python_unittest( ":exported_module", ":register_scratch_meta_fns", "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir:dynamic_shape", "//executorch/exir:lib", "//executorch/exir:memory", @@ -88,8 +91,5 @@ python_unittest( "//executorch/exir/tests:transformer", "//executorch/extension/pybindings:portable_lib", "//executorch/extension/pytree:pybindings", - 
"//executorch/sdk:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/serialize:lib", ], ) diff --git a/test/models/generate_linear_out_bundled_program.py b/test/models/generate_linear_out_bundled_program.py index 9201e43adff..93fd1445ef5 100644 --- a/test/models/generate_linear_out_bundled_program.py +++ b/test/models/generate_linear_out_bundled_program.py @@ -17,15 +17,15 @@ from typing import List import torch +from executorch.devtools import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ExecutorchBackendConfig, to_edge from executorch.exir.passes import MemoryPlanningPass, ToOutVarPass from executorch.exir.print_program import pretty_print -from executorch.sdk import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from executorch.test.models.linear_model import LinearModel from torch.export import export diff --git a/test/models/targets.bzl b/test/models/targets.bzl index ad907304edf..aea47c9e036 100644 --- a/test/models/targets.bzl +++ b/test/models/targets.bzl @@ -22,9 +22,9 @@ def define_common_targets(): deps = [ ":linear_model", "//caffe2:torch", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir:lib", "//executorch/exir/_serialize:lib", ], diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 2d2f8162092..3f17a9ead69 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -121,7 +121,7 @@ probe_tests() { kernels runtime schema - sdk + devtools test ) From d7c069f495e24d3919cc27ae25ceed8b042e2eed Mon Sep 17 00:00:00 2001 From: mcremon-meta <134334895+mcremon-meta@users.noreply.github.com> Date: Thu, 22 Aug 2024 16:29:15 -0700 Subject: [PATCH 015/531] Fix SDPA decomp problem Differential Revision: D61639074 Pull Request resolved: https://github.com/pytorch/executorch/pull/4851 --- backends/cadence/aot/compiler.py | 18 +++++++++++++----- backends/cadence/aot/passes.py | 26 ++++++++++++++++++++++++++ backends/cadence/aot/utils.py | 8 ++++++++ 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 509e254b550..405f8b5db4e 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -18,12 +18,13 @@ ReplaceLogicalNotBooleanWhereWithWherePass, ReplacePT2DequantWithCadenceDequantPass, ReplacePT2QuantWithCadenceQuantPass, + ReplaceSafeSoftmaxWithSoftmax, ReplaceScalarTensorWithFullPass, ReplaceSqueezeAndUnsqueezeWithViewPass, ) from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer -from executorch.backends.cadence.aot.utils import model_is_quantized +from executorch.backends.cadence.aot.utils import model_gm_has_SDPA, model_is_quantized from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, ) @@ -57,13 +58,20 @@ def convert_pt2( """ # Export with dynamo - model_exp = 
capture_pre_autograd_graph(model, inputs) + model_gm = capture_pre_autograd_graph(model, inputs) - # Decompose SDPA - DecomposeScaledDotProductAttention(False)(model_exp) + if model_gm_has_SDPA(model_gm): + # Decompose SDPA + DecomposeScaledDotProductAttention(False)(model_gm) + + # Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882 + # for details). + result = ReplaceSafeSoftmaxWithSoftmax()(model_gm) + assert result is not None + model_gm = result.graph_module # Prepare - prepared_model = prepare_pt2e(model_exp, quantizer) + prepared_model = prepare_pt2e(model_gm, quantizer) # Calibrate prepared_model(*inputs) diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index db419bfb5e1..83ef43d1510 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -266,3 +266,29 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: result = SpecPropPass()(graph_module) assert result is not None return result + + +class ReplaceSafeSoftmaxWithSoftmax(ExportPass): + """ + Replace _safe_softmax with _softmax + """ + + def call_operator( + self, + op, # pyre-ignore + args: tuple[Argument, ...], + kwargs: dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != torch.ops.aten._safe_softmax.default: + return super().call_operator(op, args, kwargs, meta) + + # Add False for the half_to_float argument of softmax + softmax_args = list(args) + [False] + + return super().call_operator( + torch.ops.aten._softmax.default, + tuple(softmax_args), + kwargs, + meta, + ) diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py index f0c294260a7..b710f7d4e57 100644 --- a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -177,3 +177,11 @@ def print_ops_info( tablefmt="outline", ) ) + + +def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool: + for node in model_gm.graph.nodes: + if node.op == "call_function": + if node.target == torch.ops.aten.scaled_dot_product_attention.default: + return True + return False From 891521a7836b545a6cbad5f6c8ca56aa2bdf886d Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Thu, 22 Aug 2024 21:08:38 -0400 Subject: [PATCH 016/531] [ET-VK] Use dim order as the source of truth for tensor strides Differential Revision: D61666464 Pull Request resolved: https://github.com/pytorch/executorch/pull/4844 --- .../vulkan/runtime/api/containers/Tensor.cpp | 204 ++++++++++++------ .../vulkan/runtime/api/containers/Tensor.h | 88 ++++++-- .../vulkan/runtime/graph/ComputeGraph.cpp | 7 + backends/vulkan/runtime/graph/ComputeGraph.h | 9 +- .../vulkan/test/vulkan_compute_api_test.cpp | 83 ++++++- 5 files changed, 298 insertions(+), 93 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 78aa4796aa5..be44679f3b0 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -13,36 +13,15 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a buffer-backed tensor, find the index of the "fastest - * moving" dimension in WHCN dimension order. If multiple dims have the lowest - * stride, then the "earlier" dim is assumed to be the fastest moving (width is - * "earlier" than height). 
- */ -int32_t find_fastest_whcn_dim(const std::vector& strides) { - if (strides.size() == 0) { - return 0; - } - int32_t fastest_dim = 0; - int64_t min_stride = strides.at(0); - for (int d = strides.size() - 1; d >= 0; --d) { - if (strides.at(d) < min_stride) { - fastest_dim = d; - min_stride = strides.at(d); - } - } - return (strides.size() - 1 - fastest_dim); -} - /* * Given the strides of a buffer-backed tensor, estimate the equivalent memory * layout enum value by identifying the fastest moving dimension. */ utils::GPUMemoryLayout estimate_memory_layout( - const std::vector& strides) { - int32_t fastest_dim = find_fastest_whcn_dim(strides); - if (fastest_dim <= 3) { - return utils::GPUMemoryLayout(fastest_dim); + const std::vector& dim_order) { + int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back(); + if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) { + return utils::GPUMemoryLayout(fastest_dim_whcn); } // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding @@ -51,41 +30,70 @@ utils::GPUMemoryLayout estimate_memory_layout( VK_THROW("No compatible GPUMemoryLayout value"); } +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout) { + // Special case for zero dim tensors + if (ndim == 0) { + return {0}; + } + std::vector dim_order(ndim); + int64_t last_dim = + ndim - utils::to_packed_dim_nchw_offset(memory_layout); + + int64_t cur_dim = 0; + for (int d = 0; d < ndim; ++d) { + if (d == last_dim) { + cur_dim++; + } + dim_order[d] = cur_dim; + cur_dim++; + } + if (last_dim >= 0) { + dim_order[ndim - 1] = last_dim; + } + + return dim_order; +} + std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout) { + const std::vector& dim_order) { // For zero dim tensors if (sizes.size() == 0) { return {1}; } - const int64_t dim_offset = - utils::to_packed_dim_nchw_offset(memory_layout); - int64_t last_dim = sizes.size() - dim_offset; - if (last_dim < 0) { - last_dim = sizes.size() - 1; - } - size_t ndim = sizes.size(); std::vector strides(ndim); - const int64_t last_dim_size = sizes.at(last_dim); - - for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) { - strides.at(stride_d) = 1; - if (stride_d == last_dim) { - continue; - } - strides.at(stride_d) = last_dim_size; - for (int size_d = ndim - 1; size_d > stride_d; size_d--) { - if (size_d != last_dim) { - strides.at(stride_d) *= sizes.at(size_d); - } + strides[dim_order[ndim - 1]] = 1; + for (int32_t i = ndim - 2; i >= 0; --i) { + if (sizes[dim_order[i + 1]] == 0) { + strides[dim_order[i]] = strides[dim_order[i + 1]]; + } else { + strides[dim_order[i]] = + strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; } } + return strides; } +bool dim_order_is_valid(const std::vector& dim_order) { + int64_t sum = 0; + for (size_t i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] < 0 || dim_order[i] >= dim_order.size()) { + return false; + } + sum += dim_order[i]; + } + int64_t n = static_cast(dim_order.size() - 1); + // Sanity check that the sum of the indices in the vector is equal to the sum + // of 0 + 1 + 2 + ... 
+ (ndim - 1) + return sum == n * (n + 1) / 2; +} + std::vector unsqueeze_strides( const std::vector& strides, const int64_t numel) { @@ -170,7 +178,8 @@ vTensor::vTensor( memory_layout_(memory_layout), // Calculate tensor size metadata sizes_(sizes.begin(), sizes.end()), - strides_(calculate_strides(sizes, memory_layout_)), + dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, @@ -189,6 +198,9 @@ vTensor::vTensor( padded_sizes_, dtype_, allocate_memory) { + VK_CHECK_COND( + dim_order_is_valid(dim_order_), "computed dim order is invalid"); + if (storage_type != utils::kBuffer) { texture_limits_.limits = utils::ivec3{ utils::safe_downcast(storage_.image_extents_[0]), @@ -204,16 +216,39 @@ vTensor::vTensor( } } +vTensor::vTensor(const vTensor& other) + : dtype_(other.dtype_), + memory_layout_(other.memory_layout_), + // Copy tensor size metadata + sizes_(other.sizes_.begin(), other.sizes_.end()), + dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + strides_(other.strides_.begin(), other.strides_.end()), + numel_(other.numel_), + padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, + unsqueezed_strides_{ + other.unsqueezed_strides_.begin(), + other.unsqueezed_strides_.end()}, + padded_numel_(other.padded_numel_), + texture_limits_{other.texture_limits_}, + // Empty initialize Utility Uniform Buffers + sizes_uniform_(), + strides_uniform_(), + numel_uniform_(), + texture_limits_uniform_(), + // Copy Tensor storage + storage_(other.storage_) {} + vTensor::vTensor( const vTensor& other, const std::vector& sizes, - const std::vector& strides, - const size_t offset_numel) + const std::vector& dim_order, + const int64_t offset_numel) : dtype_(other.dtype_), - memory_layout_(estimate_memory_layout(strides)), + memory_layout_(estimate_memory_layout(dim_order)), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), - strides_(strides.begin(), strides.end()), + dim_order_(dim_order.begin(), dim_order.end()), + strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, @@ -226,6 +261,8 @@ vTensor::vTensor( texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { + VK_CHECK_COND( + dim_order_is_valid(dim_order_), "new dim order provided is invalid"); VK_CHECK_COND( offset_numel + numel_ <= other.numel(), "Tensor alias cannot access more elements than available in the original" @@ -339,9 +376,17 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_size_metadata(const std::vector& new_sizes) { +void vTensor::update_metadata( + const std::vector& new_sizes, + const std::vector& new_dim_order) { sizes_ = new_sizes; - strides_ = calculate_strides(new_sizes, memory_layout_); + dim_order_ = new_dim_order; + strides_ = calculate_strides(sizes_, dim_order_); + // Only update the memory layout for buffer-backed tensors. Strides are + // meaningless for texture-backed tensors and do not impact the memory layout. 
+ if (storage_type() == utils::kBuffer) { + memory_layout_ = estimate_memory_layout(dim_order_); + } numel_ = utils::multiply_integers(sizes_); padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); @@ -373,15 +418,7 @@ void vTensor::update_size_metadata(const std::vector& new_sizes) { } } -void vTensor::reallocate(const std::vector& new_sizes) { - update_size_metadata(new_sizes); - storage_.discard_and_reallocate( - calculate_padded_sizes(new_sizes, memory_layout_), - memory_layout_, - dtype_); -} - -void vTensor::virtual_resize(const std::vector& new_sizes) { +void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. @@ -394,10 +431,47 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { VK_CHECK_COND( valid_resize, - "Cannot use virtual resize if new sizes requires a larger texture."); + "tensor sizes requires a larger texture than the current one."); + } else { + // For buffer storage check that the current buffer is large enough for the + // new sizes of the tensor. + int64_t numel = utils::multiply_integers(sizes); + bool valid_resize = + numel + storage_.buffer_offset_ <= storage_.buffer_length_; + VK_CHECK_COND( + valid_resize, + "tensor sizes requires a larger buffer than the current one."); } +} + +void vTensor::virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order) { + VK_CHECK_COND( + storage_type() == utils::kBuffer, + "virtual_reconfigure is only applicable for buffer backed tensors"); + VK_CHECK_COND(new_sizes.size() == new_dim_order.size()); + VK_CHECK_COND(dim_order_is_valid(new_dim_order)); - update_size_metadata(new_sizes); + check_sizes(new_sizes); + update_metadata(new_sizes, new_dim_order); +} + +void vTensor::virtual_resize(const std::vector& new_sizes) { + VK_CHECK_COND( + new_sizes.size() == dim_order_.size(), + "new sizes cannot modify the dimensionality of the tensor "); + + check_sizes(new_sizes); + update_metadata(new_sizes, dim_order_); +} + +void vTensor::reallocate(const std::vector& new_sizes) { + update_metadata(new_sizes, dim_order_); + storage_.discard_and_reallocate( + calculate_padded_sizes(new_sizes, memory_layout_), + memory_layout_, + dtype_); } // @@ -480,6 +554,7 @@ vTensorStorage::vTensorStorage( storage_type_{storage_type}, image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), buffer_length_{utils::multiply_integers(padded_sizes)}, + buffer_offset_{0}, image_(allocate_image( context_, image_extents_, @@ -496,11 +571,12 @@ vTensorStorage::vTensorStorage( vTensorStorage::vTensorStorage( const vTensorStorage& other, - const size_t buffer_offset) + const int64_t buffer_offset) : context_(other.context_), storage_type_{other.storage_type_}, image_extents_(other.image_extents_), buffer_length_{other.buffer_length_}, + buffer_offset_{buffer_offset}, image_(), buffer_(other.buffer_, buffer_offset), last_access_{other.last_access_} { diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 5a4598291c0..8186ef1bd66 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -20,14 +20,21 @@ namespace vkcompute { namespace api { /* - * Given the sizes of a tensor and the GPU memory layout, calculate the strides - * of the tensor in NCHW dimension order. 
The GPU memory layout will be used to - * determine which dimension is packed along a texel; that dimension will be - * used as the "fasted moving" dimension with a stride of 1. + * Given a GPUMemoryLayout value, produce a dim order vector that matches the + * given memory layout. The produced dim order vector will be in the NCHW + * dimension order + */ +std::vector calculate_dim_order( + const size_t ndim, + const utils::GPUMemoryLayout memory_layout); + +/* + * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) + * dimension order, calculate the strides of the tensor. */ std::vector calculate_strides( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout); + const std::vector& dim_order); std::vector unsqueeze_strides( const std::vector& strides, @@ -96,7 +103,7 @@ class vTensorStorage final { * because this behaviour is unsafe, since the original tensor may be * destroyed before the copy is destroyed. */ - vTensorStorage(const vTensorStorage& other, const size_t buffer_offset = 0); + vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0); public: // To discourage creating copies, the assignment operator is still deleted. @@ -118,6 +125,7 @@ class vTensorStorage final { // Resource sizings utils::uvec3 image_extents_{}; int64_t buffer_length_{}; + int64_t buffer_offset_{}; // GPU Storage mutable vkapi::VulkanImage image_; @@ -167,8 +175,16 @@ class vTensor final { const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, const bool allocate_memory = true); - vTensor(const vTensor& other) = delete; - vTensor& operator=(const vTensor& other) = delete; + /* + * This constructor allows for the creation of a vTensor that references the + * same buffer resource of another vTensor, with the same sizes and strides + * metadata. The created vTensor will not own the underlying resource. This is + * only applicable for buffer backed tensors at the moment. + * + * Once created, the sizes and strides of the aliased vTensor can be changed + * using the `virtual_reconfigure` member function. + */ + vTensor(const vTensor& other); /* * This constructor allows for the creation of a vTensor that references the @@ -176,6 +192,10 @@ class vTensor final { * strides metatdata. The created vTensor will not own the underlying * resource. This is only applicable for buffer backed tensors at the moment. * + * Note that dim order is used as the source of truth regarding the strides, + * and the new strides are computed from the new sizes and new dim order. + * Thus only the dim order is provided as an argument to this function. + * * The offset_numel argument allows the aliased tensor's memory region to * begin at an offset of N elements from the start of the original tensor's * buffer. @@ -183,8 +203,11 @@ class vTensor final { vTensor( const vTensor& other, const std::vector& sizes, - const std::vector& strides, - const size_t offset_numel = 0); + const std::vector& dim_order, + const int64_t offset_numel = 0); + + // To discourage making copies, the copy assignment operator is still deleted + vTensor& operator=(const vTensor& other) = delete; vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; @@ -195,6 +218,11 @@ class vTensor final { // sizes of the tensor in NCHW dimension order std::vector sizes_; + // dim order of the tensor; dimension indices are in NCHW dimension order + // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. 
The dims with larger + // strides precede the dims with smaller strides in the dim order. The last + // dim is always the fastest moving dim with a stride of 1. + std::vector dim_order_; // strides of the tensor in NCHW dimension order std::vector strides_; // Contains the number of elements in the tensor according to the canonical @@ -305,6 +333,10 @@ class vTensor final { return sizes_.size(); } + inline const std::vector& dim_order() const { + return dim_order_; + } + inline const std::vector& strides() const { return strides_; } @@ -386,24 +418,46 @@ class vTensor final { private: /* - * Update the size metadata of the vTensor to be new sizes. Should not be used - * directly, reallocate() or virtual_resize() should be used instead. + * Given new sizes and new strides of the dim order, update the sizes and dim + * order metadata of the vTensor. New strides are computed using the new sizes + * and new dim order. + */ + void update_metadata( + const std::vector& new_sizes, + const std::vector& new_dim_order); + + /* + * Check that tensor sizes are valid given the current storage resource's + * limits. */ - void update_size_metadata(const std::vector& new_sizes); + void check_sizes(const std::vector& sizes) const; public: /* - * Discard the underlying VkImage or VkBuffer and re-allocate based on new - * tensor sizes + * Change how the tensor should be interpreted by compute shaders via updating + * the size and dim order of the tensor. The new sizes and dim order may have + * different dimensionality than the current dimensionality of the tensor. + * + * This function can only be used for buffer-backed tensors, since texture + * backed buffers cannot change dimensionality or memory layout. */ - void reallocate(const std::vector& new_sizes); + void virtual_reconfigure( + const std::vector& new_sizes, + const std::vector& new_dim_order); /* * Perform a virtual resize of the vTensor by modifying the size metadata that * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. + * underlying resource as if it were a different size. The new sizes cannot + * modify the dimensionality of the tensor. */ void virtual_resize(const std::vector& new_sizes); + + /* + * Discard the underlying VkImage or VkBuffer and re-allocate based on new + * tensor sizes + */ + void reallocate(const std::vector& new_sizes); }; } // namespace api diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 50d927a913f..48e1ebf0a83 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -203,6 +203,13 @@ ValueRef ComputeGraph::add_tensor( sizes, dtype, suggested_memory_layout(sizes), shared_object_idx); } +ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) { + const vTensorPtr t = get_tensor(vref); + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(api::vTensor(*t)); + return idx; +} + ValueRef ComputeGraph::add_tensor_view( const ValueRef vref, const std::vector& sizes, diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index b432be83881..faa2f4107ec 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -356,10 +356,17 @@ class ComputeGraph final { * `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for * more details. 
*/ + ValueRef add_tensor_view(const ValueRef vref); + + /* + * Use the copy constructor of `api::vTensor` to create a "view" of the + * `vTensor` value at `vref` with different sizes and dim order. See the copy + * constructor of `api::vTensor` for more details. + */ ValueRef add_tensor_view( const ValueRef vref, const std::vector& sizes, - const std::vector& strides, + const std::vector& dim_order, const size_t offset_numel = 0); /* diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index af92728cb0c..1ac74e29ef4 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -175,7 +175,43 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { EXPECT_TRUE(empty_shader_info.src_code.size == 0u); } +TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { + // ndim, GPUMemoryLayout, expected dim order pairs + std::vector>> + test_cases = { + {1, utils::kWidthPacked, {0}}, + {1, utils::kHeightPacked, {0}}, + {1, utils::kChannelsPacked, {0}}, + {2, utils::kWidthPacked, {0, 1}}, + {2, utils::kHeightPacked, {1, 0}}, + {2, utils::kChannelsPacked, {0, 1}}, + {3, utils::kWidthPacked, {0, 1, 2}}, + {3, utils::kHeightPacked, {0, 2, 1}}, + {3, utils::kChannelsPacked, {1, 2, 0}}, + {4, utils::kWidthPacked, {0, 1, 2, 3}}, + {4, utils::kHeightPacked, {0, 1, 3, 2}}, + {4, utils::kChannelsPacked, {0, 2, 3, 1}}, + }; + + for (const auto& test_case : test_cases) { + const size_t& ndim = std::get<0>(test_case); + const utils::GPUMemoryLayout& layout = std::get<1>(test_case); + const auto& expected_dim_order = std::get<2>(test_case); + std::vector dim_order = calculate_dim_order(ndim, layout); + + ASSERT_TRUE(dim_order == expected_dim_order); + } +} + TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { + vTensor v_tensor_to_resize( + context(), + {25, 25, 25, 25}, + vkapi::kFloat, + utils::kBuffer, + utils::kWidthPacked, + /*allocate_memory = */ false); + for (const auto& sizes : standard_sizes_to_test) { if (sizes.size() < 3) { continue; @@ -183,7 +219,9 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { - std::vector strides = calculate_strides(sizes, layout); + std::vector dim_order = + calculate_dim_order(sizes.size(), layout); + std::vector strides = calculate_strides(sizes, dim_order); std::vector ref_strides = get_reference_strides(sizes, layout); ASSERT_TRUE(strides == ref_strides); @@ -194,6 +232,25 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { get_reference_strides(sizes, layout, true); ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); + + // Create new vTensor and check that the strides are correct + vTensor new_v_tensor( + context(), + sizes, + vkapi::kFloat, + utils::kBuffer, + layout, + /*allocate_memory = */ false); + + ASSERT_TRUE(new_v_tensor.strides() == ref_strides); + ASSERT_TRUE( + new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); + + // Resize vtensor and check that updated metadata is correct + v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); + ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); + ASSERT_TRUE( + v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } } } @@ -549,9 +606,10 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { std::vector sizes = {9, 9}; std::vector strides = get_reference_strides(sizes, utils::kWidthPacked); + std::vector dim_order = {0, 1}; 
vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); - vTensor copy = vTensor(original, sizes, strides); + vTensor copy = vTensor(original, sizes, dim_order); EXPECT_TRUE(get_vma_allocation_count() == 1); // Fill original tensor with some data @@ -564,7 +622,6 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { for (size_t i = 0; i < data_out.size(); ++i) { CHECK_VALUE(data_out, i, 2.5f + i); } - std::cout << std::endl; } TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { @@ -576,7 +633,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { std::vector mat2_t_sizes = {K, N}; std::vector out_sizes = {M, N}; - std::vector transposed_strides = {1, K}; + std::vector transposed_dim_order = {1, 0}; vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true); vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true); @@ -588,8 +645,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { std::vector mat2_data = create_random_float_buffer(mat2.staging_buffer_numel()); - vTensor mat2_t = vTensor(mat2, mat2_t_sizes, transposed_strides); - EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); + // Create direct view and modify sizes and strides later + vTensor mat2_t = vTensor(mat2); std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); std::vector ref_out = @@ -601,6 +658,10 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { record_reference_matmul(api::context(), out, mat1, mat2_t); + // Update sizes and strides of mat2_t to be that of a transposed tensor + mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order); + EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); + std::vector data_out(out.staging_buffer_numel()); // Extract the copy tensor; should contain the data of the original tensor extract_vtensor(out, data_out); @@ -622,7 +683,7 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) { constexpr int L_S2 = 7; constexpr int O_S2 = 3; - std::vector strides = {1}; + std::vector dim_order = {0}; std::vector t_sizes = {L}; std::vector s1_sizes = {L_S1}; @@ -632,8 +693,8 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_slice_test) { fill_vtensor(orig, 0); - vTensor s1 = vTensor(orig, s1_sizes, strides, O_S1); - vTensor s2 = vTensor(s1, s2_sizes, strides, O_S2); + vTensor s1 = vTensor(orig, s1_sizes, dim_order, O_S1); + vTensor s2 = vTensor(s1, s2_sizes, dim_order, O_S2); record_scalar_add_buffer(api::context(), s1, 4.5f); record_scalar_add_buffer(api::context(), s2, 7.5f); @@ -1093,7 +1154,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { config.set_storage_type_override(utils::kBuffer); ComputeGraph graph(config); - std::vector strides = {W, 1}; + std::vector dim_order = {0, 1}; std::vector orig_sizes = {H, W}; std::vector slice_sizes = {S_H, W}; @@ -1103,7 +1164,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { IOValueRef orig = graph.add_input_tensor(orig_sizes, vkapi::kFloat); ValueRef slice = - graph.add_tensor_view(orig.value, slice_sizes, strides, offset); + graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset); IOValueRef out = {}; From 02a76ea672a76a2dc648bff22064ccca38fab229 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Thu, 22 Aug 2024 18:35:43 -0700 Subject: [PATCH 017/531] Show warning if java code needs formatting Add a workflow for java formatting Pull Request resolved: https://github.com/pytorch/executorch/pull/4855 --- .ci/docker/common/install_linter.sh | 
4 ++++ .github/workflows/lint.yml | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index 4a796a72d54..d262176e49b 100755 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -13,3 +13,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" # NB: Install all linter dependencies, the caching of lintrunner init could be # done after Executorch becomes public pip_install -r requirements-lintrunner.txt + +# Install google-java-format +curl -L --retry 3 https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format +chmod +x /opt/google-java-format diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7cb2cf69b8b..ea068f65e1a 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,3 +54,20 @@ jobs: lint.json || true exit $RC + + android-java-format: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \ + examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java) + if [ -n "$FILES_NEEDS_FORMAT" ]; then + echo "Warning: The following files need formatting. Please use google-java-format." + echo "$FILES_NEEDS_FORMAT" + exit 1 + fi From 6c26a872323e13c723e1544282fafb51f880742b Mon Sep 17 00:00:00 2001 From: Dalton Flanagan <6599399+dltn@users.noreply.github.com> Date: Thu, 22 Aug 2024 18:43:58 -0700 Subject: [PATCH 018/531] Fix Llama demo app after move in #4460 Differential Revision: D61693469 Pull Request resolved: https://github.com/pytorch/executorch/pull/4853 --- .../demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index e3a74456b38..fd5cdc7117f 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -94,7 +94,7 @@ 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LLaMARunner.framework; sourceTree = BUILT_PRODUCTS_DIR; }; 03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = runner.cpp; sourceTree = ""; }; 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = runner.h; sourceTree = ""; }; - 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = ""; }; + 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = util.h; path = ../../../../extension/llm/runner/util.h; sourceTree = ""; }; 03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer.cpp; path = ../../../../extension/llm/tokenizer/bpe_tokenizer.cpp; sourceTree = ""; }; @@ -264,7 +264,7 @@ 03729F102BB2042B00152F2E /* sampler.h */, ); name = sampler; - path = ../../../../../models/llama2/sampler; + path = ../../../../../../extension/llm/sampler; sourceTree = ""; }; /* End PBXGroup section */ From 11e8ed33bed08b4cb893ab8dda201f683017dee7 Mon Sep 17 00:00:00 2001 From: Mengtao Yuan Date: Thu, 22 Aug 2024 19:03:04 -0700 Subject: [PATCH 019/531] Reduce the memory usage of logits from O(context_length) to O(1) Differential Revision: D61246566 Pull Request resolved: https://github.com/pytorch/executorch/pull/4688 --- examples/models/llama2/export_llama_lib.py | 4 +-- examples/models/llama2/llama_transformer.py | 2 +- examples/models/llama2/model.py | 2 +- examples/models/llava/model.py | 7 ++-- .../llava/runner/llava_image_prefiller.h | 6 +++- examples/models/llava/runner/llava_runner.cpp | 4 +-- examples/models/llava/test/test_llava.py | 34 ++++++++++++------- extension/llm/runner/image_prefiller.h | 5 +-- extension/llm/runner/text_decoder_runner.h | 29 ++++++++++------ extension/llm/runner/text_prefiller.cpp | 5 --- 10 files changed, 59 insertions(+), 39 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 221f2f75bc6..172a1d72fd7 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -300,7 +300,7 @@ def build_args_parser() -> argparse.ArgumentParser: "--generate_full_logits", action="store_true", required=False, - default=True, + default=False, help="Generate logits for all inputs.", ) return parser @@ -598,7 +598,7 @@ def _load_llama_model( params_path: str, use_kv_cache: bool = False, use_sdpa_with_kv_cache: bool = False, - generate_full_logits: bool = True, + generate_full_logits: bool = False, weight_type: WeightType = WeightType.LLAMA, enable_dynamic_shape: bool = False, verbose: bool = False, diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 81b47a3a5d8..0c93115ee3b 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -99,7 +99,7 @@ class ModelArgs: # Generate logits for all inputs. When it's True, it would take big memory usage # at runtime. Enable it only necessary (e.g., use perplexity tools that requires # logits for all input tokens.) 
- generate_full_logits: bool = True + generate_full_logits: bool = False enable_dynamic_shape: bool = False # export model with dynamic shape support use_hf_rope: bool = False # Use HuggingFace's RoPE implementation rope_theta: Optional[float] = ( diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index b375399f336..f58a2a2def9 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -61,7 +61,7 @@ def __init__(self, **kwargs): self.use_kv_cache = kwargs.get("use_kv_cache", False) self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False) - self.generate_full_logits = kwargs.get("generate_full_logits", True) + self.generate_full_logits = kwargs.get("generate_full_logits", False) self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) self.max_seq_len = kwargs.get("max_seq_len", 128) diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 9f6d8d32e8e..4f975e2ed4b 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -216,16 +216,19 @@ def prefill_embedding( result = torch.cat((embeds_before_img, image_embeds, embeds_after_img), dim=1) return result + # prefill using the in house text_model of llama transformer def prefill( self, prompt_before_image: torch.Tensor, images: torch.Tensor, prompt_after_image: torch.Tensor, - ) -> torch.Tensor: + ) -> (int, torch.Tensor): """Avoiding the torch.where() call to find placeholder and insert image embedding. Taking 3 inputs instead.""" embeds = self.prefill_embedding(prompt_before_image, images, prompt_after_image) - return self.text_model.forward(None, torch.tensor([0]), embeds) + # returns the prefilled token length too, because the text model generates one logits in each forward call. + return embeds.shape[1], self.text_model.forward(None, torch.tensor([0]), embeds) + # reference prefill using the text model in HF def prefill_ref( self, prompt_before_image: torch.Tensor, diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index e8453299085..4d0a07b9a66 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -24,7 +24,7 @@ class LlavaImagePrefiller : public ImagePrefiller { * @param start_pos The starting position in KV cache of the input in the LLM * @return logits of the image prefill. */ - inline Result prefill(Image& image, int64_t start_pos = 0) + inline Result prefill(Image& image, int64_t& start_pos) override { ManagedTensor managed_images( image.data.data(), {3, image.height, image.width}, ScalarType::Byte); @@ -43,6 +43,10 @@ class LlavaImagePrefiller : public ImagePrefiller { outputs_res[0].isTensor(), "Non Tensor Output returned from executing image prefill"); + // Update the start_pos, which is only available inside this function. + // outputs_res can have only one logits. + start_pos += image_encoder_outputs[0].toTensor().size(1); + return outputs_res[0].toTensor(); } diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index a58fdfd5e59..b186af892f1 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -106,8 +106,8 @@ Error LlavaRunner::generate( // prefill images for (auto& image : images) { - auto logits = ET_UNWRAP(image_prefiller_->prefill(image, pos)); - pos += logits.size(1); + // pos is updated inside image prefill. 
+ ET_UNWRAP(image_prefiller_->prefill(image, pos)); } // prefill user prompt. No BOS because preset prompt already has it. diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index ef503a88fc3..f464a580a87 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -35,12 +35,14 @@ def setUp(self): ) def test_prefill_logits(self): - prefill_logits = self.llava.prefill( + # For efficiency, the implemented prefill function only outputs the last logits. + _, prefill_logits = self.llava.prefill( self.prompt_before_image, self.resized, self.prompt_after_image ) + # The reference implementation in HF genetates the full logits. Get the last one. prefill_logits_ref = self.llava.prefill_ref( self.prompt_before_image, self.resized, self.prompt_after_image - )[0] + )[0][:, -1, :] self.assertTrue(torch.allclose(prefill_logits, prefill_logits_ref, atol=3e-2)) def test_generated_output(self): @@ -62,11 +64,11 @@ def test_generated_output(self): )[0].strip() # being tested, using llama_transformer - prefill_logits = self.llava.prefill( + context_len, prefill_logits = self.llava.prefill( self.prompt_before_image, self.resized, self.prompt_after_image ) - context_len = prefill_logits.shape[1] - new_tokens = [torch.argmax(prefill_logits[..., -1, :]).item()] + # Always generate one token at a time. + new_tokens = [torch.argmax(prefill_logits).item()] for i in range(4): logits = self.llava.step( torch.tensor([new_tokens[i]]), torch.tensor([context_len + i]) @@ -93,24 +95,27 @@ def test_llava_export(self): pte_embeds_before_img = llava_module.run_method( "token_embedding", (prompt_before_image,) )[0] - pte_prefill_before_img = llava_module.run_method( + llava_module.run_method( "text_model", (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img), - )[0] + ) - start_pos += pte_prefill_before_img.shape[1] + # Update the start_pos. start_pos is used in kv cache. The source of truth + # of the delta length is from the embeddings, not from the logits. + start_pos += pte_embeds_before_img.shape[1] # pte prefill image pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0] - pte_prefill_img = llava_module.run_method( + llava_module.run_method( "text_model", ( torch.tensor([start_pos], dtype=torch.int64), pte_embeds_img, ), - )[0] + ) - start_pos += pte_prefill_img.shape[1] + # Update the logits for each prefill (kv cache) step. + start_pos += pte_embeds_img.shape[1] # pte prefill prompt after img pte_embeds_after_img = llava_module.run_method( @@ -121,8 +126,11 @@ def test_llava_export(self): (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img), )[0] + # Update the logits for each prefill (kv cache) step. 
+ start_pos += pte_embeds_after_img.shape[1] + # being tested, using llama_transformer - new_tokens = [torch.argmax(pte_prefill_after_img[..., -1, :]).item()] + new_tokens = [torch.argmax(pte_prefill_after_img).item()] # TODO: uncomment this line # self.assertEquals(new_tokens[0], 1932) # When for i in range(4): @@ -134,7 +142,7 @@ def test_llava_export(self): "text_model", (torch.tensor([start_pos + i], dtype=torch.int64), token_embeds), )[0] - new_tokens.append(torch.argmax(logits[..., -1, :]).item()) + new_tokens.append(torch.argmax(logits).item()) outputs = llava_model.tokenizer.batch_decode( torch.tensor([new_tokens]), skip_special_tokens=True diff --git a/extension/llm/runner/image_prefiller.h b/extension/llm/runner/image_prefiller.h index 879b0a6e21a..93bb9a030bb 100644 --- a/extension/llm/runner/image_prefiller.h +++ b/extension/llm/runner/image_prefiller.h @@ -26,12 +26,13 @@ class ImagePrefiller { /** * Prefill an LLM Module with the given image input. * @param image The image input to the multimodal LLM. - * @param start_pos The starting position in KV cache of the input in the LLM + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. * @return The next token of the LLM Module after prefill. */ virtual ::executorch::runtime::Result prefill( Image& image, - int64_t start_pos = 0) = 0; + int64_t& start_pos) = 0; virtual ::executorch::runtime::Error load() = 0; virtual bool is_method_loaded() = 0; diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 6a8e3396fef..70ee1d01364 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -67,23 +67,32 @@ class TextDecoderRunner { * @return The next token. */ inline int32_t logits_to_token(const exec_aten::Tensor& logits_tensor) { - ET_CHECK_MSG(logits_tensor.dim() == 3, "Logits tensor must be 3D"); - auto num_tokens = logits_tensor.size(1); - auto vocab_size = logits_tensor.size(2); - switch (logits_tensor.scalar_type()) { + // If the logit_tensor rank is 3, the shape is [batch, seq_length, + // vocab_size], get the last logits, sample and return. Else the model + // outputs the last logit, directly sample and return. 
case exec_aten::ScalarType::Float: { float* logits = logits_tensor.mutable_data_ptr(); - float* logits_last = logits; - logits_last += (num_tokens - 1) * vocab_size; - return sampler_->sample(logits_last); + if (logits_tensor.dim() == 3) { + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + float* logits_last = logits; + logits_last += (num_tokens - 1) * vocab_size; + return sampler_->sample(logits_last); + } + return sampler_->sample(logits); } case exec_aten::ScalarType::Half: { exec_aten::Half* logits = logits_tensor.mutable_data_ptr(); - exec_aten::Half* logits_last = logits; - logits_last += (num_tokens - 1) * vocab_size; - return sampler_->sample(logits_last); + if (logits_tensor.dim() == 3) { + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + exec_aten::Half* logits_last = logits; + logits_last += (num_tokens - 1) * vocab_size; + return sampler_->sample(logits_last); + } + return sampler_->sample(logits); } default: ET_CHECK_MSG( diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp index 19fc2d59363..4b9afb8326d 100644 --- a/extension/llm/runner/text_prefiller.cpp +++ b/extension/llm/runner/text_prefiller.cpp @@ -55,11 +55,6 @@ ::executorch::runtime::Result TextPrefiller::prefill( ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_LOG( Info, "Prefill token result numel(): %zu", outputs_res.get().numel()); - ET_CHECK_MSG( - outputs_res.get().size(1) == num_prompt_tokens, - "Expected number of output tokens %d does not match returned value %zu.", - num_prompt_tokens, - outputs_res.get().size(1)); // insert new token into prompt_tokens // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) uint64_t prev = prompt_tokens[0]; From f492d9624710fb37b195c91121c84949245f6b48 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 23 Aug 2024 01:42:55 -0700 Subject: [PATCH 020/531] Add prim ops neg.Scalar Differential Revision: D61686697 Pull Request resolved: https://github.com/pytorch/executorch/pull/4849 --- examples/models/llava/export_llava.py | 7 +++ .../selective_build/test_selective_build.sh | 8 ++-- exir/passes/executorch_prim_ops_registry.py | 6 +++ install_requirements.py | 3 +- kernels/prim_ops/register_prim_ops.cpp | 16 +++++++ kernels/prim_ops/test/CMakeLists.txt | 31 ++++++++++++ kernels/prim_ops/test/prim_ops_test.cpp | 47 +++++++++++++++++++ 7 files changed, 112 insertions(+), 6 deletions(-) create mode 100644 kernels/prim_ops/test/CMakeLists.txt diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 50df2e038f2..390528844f7 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -253,6 +253,13 @@ def main(): with open(args.pte_name, "wb") as f: executorch_program.write_to_file(f) + logging.info( + "Required memory for activation in bytes: {}".format( + executorch_program._emitter_output.program.execution_plan[ + 0 + ].non_const_buffer_sizes + ), + ) logging.info(f"Exported ExecuTorch program to {args.pte_name}") # artifacts diff --git a/examples/selective_build/test_selective_build.sh b/examples/selective_build/test_selective_build.sh index 72333ac23cc..fd2ae421e22 100644 --- a/examples/selective_build/test_selective_build.sh +++ b/examples/selective_build/test_selective_build.sh @@ -48,9 +48,9 @@ test_buck2_select_ops_in_list() { ${PYTHON_EXECUTABLE} -m examples.portable.scripts.export --model_name="add_mul" echo "Running selective build test" - # set max_kernel_num=20: 18 
primops, add, mul + # set max_kernel_num=21: 19 primops, add, mul $BUCK run //examples/selective_build:selective_build_test \ - --config=executorch.max_kernel_num=20 \ + --config=executorch.max_kernel_num=21 \ --config=executorch.select_ops=list \ -- --model_path=./add_mul.pte @@ -117,11 +117,11 @@ test_cmake_select_ops_in_list() { local example_dir=examples/selective_build local build_dir=cmake-out/${example_dir} - # set MAX_KERNEL_NUM=20: 18 primops, add, mul + # set MAX_KERNEL_NUM=21: 19 primops, add, mul rm -rf ${build_dir} retry cmake -DBUCK2="$BUCK" \ -DCMAKE_BUILD_TYPE=Release \ - -DMAX_KERNEL_NUM=20 \ + -DMAX_KERNEL_NUM=21 \ -DEXECUTORCH_SELECT_OPS_LIST="aten::convolution.out,\ aten::_native_batch_norm_legit_no_training.out,aten::hardtanh.out,aten::add.out,\ aten::mean.out,aten::view_copy.out,aten::permute_copy.out,aten::addmm.out,\ diff --git a/exir/passes/executorch_prim_ops_registry.py b/exir/passes/executorch_prim_ops_registry.py index 5159f630fe7..6362a471121 100644 --- a/exir/passes/executorch_prim_ops_registry.py +++ b/exir/passes/executorch_prim_ops_registry.py @@ -86,6 +86,11 @@ def mod(a: SymInt, b: SymInt) -> SymInt: return SymInt(int(a) % int(b)) +@bind_pattern_to_op(executorch_prims_lib, "neg.Scalar(Scalar a) -> Scalar") +def neg(a: _SymScalar) -> _SymScalar: + return -a # pyre-ignore + + _PYTHON_SYM_OPS_TO_EXECUTORCH_SYM_OPS: Dict[OpOverload, OpOverload] = { operator.sub: ops.backend.executorch_prim.sub.Scalar, operator.mul: ops.backend.executorch_prim.mul.Scalar, @@ -98,6 +103,7 @@ def mod(a: SymInt, b: SymInt) -> SymInt: operator.ge: ops.backend.executorch_prim.ge.Scalar, operator.le: ops.backend.executorch_prim.le.Scalar, operator.mod: ops.backend.executorch_prim.mod.Scalar, + operator.neg: ops.backend.executorch_prim.neg.Scalar, torch.sym_float: ops.backend.executorch_prim.sym_float.Scalar, } diff --git a/install_requirements.py b/install_requirements.py index c61bdd02dbe..085b48ab370 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -1,11 +1,10 @@ -#!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # -# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + import os import platform import re diff --git a/kernels/prim_ops/register_prim_ops.cpp b/kernels/prim_ops/register_prim_ops.cpp index 721dc8e47a0..7010e4507d5 100644 --- a/kernels/prim_ops/register_prim_ops.cpp +++ b/kernels/prim_ops/register_prim_ops.cpp @@ -248,6 +248,22 @@ static Kernel prim_ops[] = { [](RuntimeContext& context, EValue** stack) { BOOLEAN_ET_PRIM_OP(<=, stack, context); }), + // executorch_prim::neg.Scalar(Scalar) -> Scalar + Kernel( + "executorch_prim::neg.Scalar", + [](RuntimeContext& context, EValue** stack) { + (void)context; + EValue& a = *stack[0]; + EValue& out = *stack[1]; + if (a.isInt()) { + out = EValue(-a.toInt()); + } else if (a.isDouble()) { + out = EValue(-a.toDouble()); + } else { + // TODO Fail using runtime context + ET_CHECK_MSG(false, "%zu", (size_t)a.tag); + } + }), // executorch_prim::floordiv.int(int, int) -> int Kernel( diff --git a/kernels/prim_ops/test/CMakeLists.txt b/kernels/prim_ops/test/CMakeLists.txt new file mode 100644 index 00000000000..93d53523a0b --- /dev/null +++ b/kernels/prim_ops/test/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
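For intuition, here is a rough Python model of what the new neg.Scalar prim op does: it negates an int or float scalar and fatally rejects every other EValue tag, matching the C++ kernel registered above. The function name below is illustrative and not part of the patch.

    def neg_scalar(a):
        # Mirrors the C++ kernel: only Int and Double EValues are handled;
        # any other tag (a bool, a tensor, ...) trips ET_CHECK_MSG.
        if isinstance(a, bool):
            raise TypeError("neg.Scalar does not accept bools")
        if isinstance(a, (int, float)):
            return -a
        raise TypeError(f"neg.Scalar expects an int or float, got {type(a)!r}")

    assert neg_scalar(5) == -5
    assert neg_scalar(5.0) == -5.0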
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +cmake_minimum_required(VERSION 3.19) +project(prim_ops_test) + +# Use C++17 for test. +set(CMAKE_CXX_STANDARD 17) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) + +include(${EXECUTORCH_ROOT}/build/Test.cmake) + +target_link_options_shared_lib(executorch) + +set(_test_srcs prim_ops_test.cpp) + +et_cxx_test( + kernels_prim_ops_test SOURCES ${_test_srcs} +) diff --git a/kernels/prim_ops/test/prim_ops_test.cpp b/kernels/prim_ops/test/prim_ops_test.cpp index 50802a3e321..c7f63e2843c 100644 --- a/kernels/prim_ops/test/prim_ops_test.cpp +++ b/kernels/prim_ops/test/prim_ops_test.cpp @@ -280,6 +280,53 @@ TEST_F(RegisterPrimOpsTest, LocalScalarDenseReturnsCorrectValue) { EXPECT_EQ(stack[1]->toInt(), expected); } +TEST_F(RegisterPrimOpsTest, NegScalarReturnsCorrectValue) { + EValue values[2]; + + // Test with float + values[0] = EValue(5.0f); + values[1] = EValue(0.0f); + + EValue* stack[2]; + for (size_t i = 0; i < 2; i++) { + stack[i] = &values[i]; + } + + getOpsFn("executorch_prim::neg.Scalar")(context, stack); + + EXPECT_EQ(stack[1]->toDouble(), -5.0f); + + // Test with int + int64_t a = 5; + int64_t b = 0; + values[0] = EValue(a); + values[1] = EValue(b); + + getOpsFn("executorch_prim::neg.Scalar")(context, stack); + + EXPECT_EQ(stack[1]->toInt(), -5l); +} + +TEST_F(RegisterPrimOpsTest, TestNegScalarWithTensorDies) { + testing::TensorFactory tf; + + EValue values[2]; + + auto tensor = tf.make({2, 3}, {1, 2, 3, 4, 5, 6}); + + int64_t zero = 0; + values[0] = EValue(tensor); + values[1] = EValue(zero); + + EValue* stack[2]; + for (size_t i = 0; i < 2; i++) { + stack[i] = &values[i]; + } + + // Try to negate a tensor, which should cause a runtime error. 
+ ET_EXPECT_DEATH(getOpsFn("executorch_prim::neg.Scalar")(context, stack), ""); +} + TEST_F(RegisterPrimOpsTest, TestETView) { EXPECT_TRUE(hasOpsFn("executorch_prim::et_view.default")); From 4915c9f91f6b5e4a1b07ce84fb6b8ab206114def Mon Sep 17 00:00:00 2001 From: Max Ren <40742183+mcr229@users.noreply.github.com> Date: Fri, 23 Aug 2024 10:52:17 -0700 Subject: [PATCH 021/531] Move towards quantize_ api for blockwise Differential Revision: D61363183 Pull Request resolved: https://github.com/pytorch/executorch/pull/4839 --- .../operators/op_dynamic_dequantize_ops.py | 47 +- .../operators/op_dynamic_quantize_ops.py | 57 ++ backends/xnnpack/operators/op_skip_ops.py | 9 + backends/xnnpack/operators/quant_params.py | 87 +- backends/xnnpack/partition/config/__init__.py | 9 + .../xnnpack/partition/config/gemm_configs.py | 14 +- .../partition/config/quant_affine_configs.py | 65 ++ backends/xnnpack/passes/TARGETS | 1 + .../xnnpack/passes/tag_implicit_q_dq_pass.py | 21 +- backends/xnnpack/test/ops/linear.py | 813 ++++++------------ backends/xnnpack/test/tester/tester.py | 3 +- backends/xnnpack/utils/quant_utils.py | 114 ++- 12 files changed, 626 insertions(+), 614 deletions(-) create mode 100644 backends/xnnpack/partition/config/quant_affine_configs.py diff --git a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py index d47f9f479e4..f8f0c54ee68 100644 --- a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py @@ -12,7 +12,15 @@ register_node_visitor, ) from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph -from executorch.backends.xnnpack.utils.utils import get_input_node +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) +from executorch.backends.xnnpack.utils.utils import ( + check_or_raise, + get_input_node, + is_param_node, +) @register_node_visitor @@ -65,3 +73,40 @@ def define_node( dq_input = get_input_node(node, 0) if dq_input in vals_to_ids: vals_to_ids[node] = vals_to_ids[dq_input] + + +@register_node_visitor +class OpDequantizeAffine(NodeVisitor): + target = "quant.dequantize_affine.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define dequantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + check_or_raise( + is_param_node(self._exported_program, node.all_input_nodes[0]), + f"Expected quantize affine node with per-token semantics to be used " + f"in front of a weight node, but found node {node.all_input_nodes[0]}", + ) + # Affine dequantize was recognized as per channel group which means that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Expecting Affine Dequantized Op to have per-token semantics", + ) + # This must be a per-token affine dequantized node, so let us serialize as such + dq_input = get_input_node(node, 0) + if dq_input in vals_to_ids: + vals_to_ids[node] = vals_to_ids[dq_input] diff --git a/backends/xnnpack/operators/op_dynamic_quantize_ops.py b/backends/xnnpack/operators/op_dynamic_quantize_ops.py index bf5f3b7b092..23047e731f7 100644 --- a/backends/xnnpack/operators/op_dynamic_quantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_quantize_ops.py @@ 
-17,6 +17,10 @@ XNNGraph, XNode, ) +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) from executorch.backends.xnnpack.utils.utils import check_or_raise, get_input_node @@ -118,3 +122,56 @@ def define_node( debug_handle=debug_handle, ) xnn_graph.xnodes.append(ser_node) + + +@register_node_visitor +class OpQuantizeAffine(NodeVisitor): + target = "quant.quantize_affine.default" + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define quantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + # Affine quantized was recognized as per channel group which means that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Encountered affine quantized op which does not have per-token semantics", + ) + # Treat this node as dynamic per-token quantization + q_input = get_input_node(node, 0) + + # fp32 input + self.define_tensor(q_input, xnn_graph, vals_to_ids) + input_id = vals_to_ids[q_input] + + # dynamic quantized output + input_quant_params = QuantParams.from_q_dq_node(node) + # qinput isn't needed for dynamically quantized nodes since it will always be + # the output of a convert node. Instead we set q_input to the node itself so + # we can extract the shape from the dq output + input_quant_params.q_input = node + input_quant_params.is_input = False + check_or_raise( + input_quant_params.is_dynamic, + "Internal Error, dynamically quantized node expected dynamic quantized params", + ) + self.define_tensor( + node, xnn_graph, vals_to_ids, quant_params=input_quant_params + ) + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py index d6a54c901eb..6597c0568e3 100644 --- a/backends/xnnpack/operators/op_skip_ops.py +++ b/backends/xnnpack/operators/op_skip_ops.py @@ -97,6 +97,15 @@ class OpSymSizeInt(OpSkipOps): target = "sym_size.int" +@register_node_visitor +class OpChooseQparamsAffine(OpSkipOps): + """ + do nothing if node is choose_qparams_affine.default + """ + + target = "quant.choose_qparams_affine.default" + + @register_node_visitor class OpChooseQparamsToken(OpSkipOps): """ diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index d60c300276f..44908ac7fca 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -10,7 +10,15 @@ import torch from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, + is_dequant, + is_dynamic_qdq, + is_per_channel, + is_per_channel_group, + is_quant, +) from executorch.backends.xnnpack.utils.utils import ( check_or_raise, get_param_tensor, @@ -154,30 +162,18 @@ def from_q_dq_node( q_input = quant_node.all_input_nodes[0] # TODO: Use presence of choose_qparam node to determine if this is a dynamic quantization - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - 
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ]: + if is_dynamic_qdq(quant_node): return cls._from_dynamic_input_node(quant_node) - per_channel = quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - ] - - _groupwise = False - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default, - ]: - # This is a sub-category of per channel quantization - per_channel = True - _groupwise = True - - scale = quant_node.args[1] - zp = quant_node.args[2] + per_channel = is_per_channel(quant_node) + + _groupwise = is_per_channel_group(quant_node) + quant_node_args = quant_node.args + if _groupwise and is_affine_qdq(quant_node): + quant_node_args = extract_qdq_affine_op_args_for_decomposed_ops(quant_node) + + scale = quant_node_args[1] + zp = quant_node_args[2] axis = 0 if per_channel: assert isinstance(scale, torch.fx.Node) and isinstance(scale.target, str) @@ -193,10 +189,15 @@ def _get_tensor(node): scale = _get_tensor(scale) zp = _get_tensor(zp) - axis = cast(int, quant_node.args[3]) + axis = cast(int, quant_node_args[3]) if _groupwise: scale_tensor = cast(torch.Tensor, scale) + if scale_tensor.ndim == 1: + scale_tensor = scale_tensor.reshape(-1, 1) + zp = zp.reshape(-1, 1) + scale = scale_tensor + assert ( scale_tensor.ndim == 2 ), "Weight scale must be 2D for per_channel_group [de]quant node, got {scale.ndim}D" @@ -204,23 +205,23 @@ def _get_tensor(node): check_or_raise( bool( - quant_node.args[-1] != torch.uint8 - or quant_node.args[-1] != torch.quint8 + quant_node_args[-1] != torch.uint8 + or quant_node_args[-1] != torch.quint8 ), "XNNPACK does not support unsigned quantization", ) if _groupwise: - _ = quant_node.args[-1] # output dtype - not used - group_size = cast(int, quant_node.args[-2]) - dtype = cast(torch.dtype, quant_node.args[-3]) - qmax = cast(int, quant_node.args[-4]) - qmin = cast(int, quant_node.args[-5]) + _ = quant_node_args[-1] # output dtype - not used + group_size = cast(int, quant_node_args[-2]) + dtype = cast(torch.dtype, quant_node_args[-3]) + qmax = cast(int, quant_node_args[-4]) + qmin = cast(int, quant_node_args[-5]) else: group_size = 0 - dtype = cast(torch.dtype, quant_node.args[-1]) - qmax = cast(int, quant_node.args[-2]) - qmin = cast(int, quant_node.args[-3]) + dtype = cast(torch.dtype, quant_node_args[-1]) + qmax = cast(int, quant_node_args[-2]) + qmin = cast(int, quant_node_args[-3]) is_output = any( user_node.op == "output" for user_node in quant_node.users.keys() @@ -244,26 +245,14 @@ def _get_tensor(node): def from_weights( cls, tensor_node: torch.fx.Node, ep: Optional[ExportedProgram] = None ) -> Optional[QuantParams]: - # Ignore transpose for weights - # TODO:T148540997 remove the t_copy/permute_copy check when convert addmm to linear - dq = ( - tensor_node.all_input_nodes[0] - if tensor_node.target - in ( - exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.t_copy.default, - ) - else tensor_node - ) - # check input of t_copy/permute_copy is dequant - if not is_dequant(dq): + if not is_dequant(tensor_node): return None # source node for quant params - src = dq + src = tensor_node # is input of dq is q? 
- dq_input = dq.all_input_nodes[0] + dq_input = src.all_input_nodes[0] if is_quant(dq_input): src = dq_input diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py index 141ccf9802e..ed105dc1f53 100644 --- a/backends/xnnpack/partition/config/__init__.py +++ b/backends/xnnpack/partition/config/__init__.py @@ -53,6 +53,11 @@ MaxDimConfig, PreluConfig, ) +from executorch.backends.xnnpack.partition.config.quant_affine_configs import ( + ChooseQParamsAffineConfig, + DeQuantizeAffineConfig, + QuantizeAffineConfig, +) from executorch.backends.xnnpack.partition.config.xnnpack_config import ( XNNPartitionerConfig, ) @@ -98,4 +103,8 @@ # Quant/Dequant Op Configs QuantizedPerTensorConfig, DeQuantizedPerTensorConfig, + # Quant Affine Configs to preserve decomp + QuantizeAffineConfig, + DeQuantizeAffineConfig, + ChooseQParamsAffineConfig, ] diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index 3c4d446a6b4..a20285483b2 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -13,9 +13,12 @@ XNNPartitionerConfig, ) from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, is_dequant, is_dynamic_qdq, is_per_channel, + is_per_channel_group, is_qparam, is_quant, ) @@ -131,7 +134,7 @@ def _get_weight_deps( return False, [] gemm_deps.append(weight) - if is_per_channel(dequant_node): + if is_per_channel(dequant_node) or is_per_channel_group(dequant_node): if len(dequant_node.all_input_nodes) < 2: # Expected channel quantized to have scale/zp nodes return False, [] @@ -214,12 +217,15 @@ def _get_act_deps( return (False, []) gemm_deps.append(q_input) - if not (is_node(q_input.args[1]) and is_node(q_input.args[2])): + q_input_args = q_input.args + if is_affine_qdq(q_input): + q_input_args = extract_qdq_affine_op_args_for_decomposed_ops(q_input) + if not (is_node(q_input_args[1]) and is_node(q_input_args[2])): # expected to find getitem node from choose qparam return (False, []) - getitem1 = get_input_node(q_input, 1) - getitem2 = get_input_node(q_input, 2) + getitem1 = q_input_args[1] + getitem2 = q_input_args[2] if not (is_getitem(getitem1) and is_getitem(getitem2)): # expected getitem node from choose qparam diff --git a/backends/xnnpack/partition/config/quant_affine_configs.py b/backends/xnnpack/partition/config/quant_affine_configs.py new file mode 100644 index 00000000000..d9e789104b6 --- /dev/null +++ b/backends/xnnpack/partition/config/quant_affine_configs.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import List, Optional + +import torch +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, +) +from torch.export import ExportedProgram + + +class QDQAffineConfigs(XNNPartitionerConfig): + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + return True + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + # Do not return anything from this because we only use this to + # preserve the decomposition + return [] + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.DYNAMIC_QUANT] + + +class QuantizeAffineConfig(QDQAffineConfigs): + target_name = "quantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.quantize_affine.default + except: + return None + + +class DeQuantizeAffineConfig(QDQAffineConfigs): + target_name = "dequantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.dequantize_affine.default + except: + return None + + +class ChooseQParamsAffineConfig(QDQAffineConfigs): + target_name = "choose_qparams_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.choose_qparams_affine.default + except: + return None diff --git a/backends/xnnpack/passes/TARGETS b/backends/xnnpack/passes/TARGETS index e91614c735b..6bc3742abe6 100644 --- a/backends/xnnpack/passes/TARGETS +++ b/backends/xnnpack/passes/TARGETS @@ -30,6 +30,7 @@ python_library( "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", "//executorch/exir/passes:const_prop_pass", + "//executorch/exir/passes:memory_format_ops_pass", "//executorch/exir/program:program", ], ) diff --git a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py index 0aa2e1291e3..ac6ccc9b89d 100644 --- a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py @@ -12,7 +12,11 @@ SUPPORTED_IMPLICIT_Q_DQ_OP_NAMES_SET, ) from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + is_dequant, + is_dynamic_qdq, + is_quant, +) from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult @@ -76,18 +80,7 @@ def is_output_node(self, node: torch.fx.Node) -> bool: return node.op == "output" def is_dynamically_quantized(self, node: torch.fx.Node) -> bool: - return any( - is_dequant(input_node) - and ( - cast( - torch._ops.OpOverload, input_node.target - )._schema.schema.overload_name - == "tensor" - or input_node.target - == exir_ops.edge.quantized_decomposed.dequantize_per_token.default - ) - for input_node in node.all_input_nodes - ) + return is_dynamic_qdq(node) def is_supported_quant_op(self, node: torch.fx.Node) -> bool: return ( @@ -191,7 +184,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: ending_implicit_q_nodes = [] for user in first_node.users: - if self.is_dynamically_quantized(user): + if 
self.is_dynamically_quantized(first_node): # if the dq is a dynamic dq, then it is implicit break user_end_nodes = self.get_ending_implicit_q_nodes(user) diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py index a9459050e79..d886ce26694 100644 --- a/backends/xnnpack/test/ops/linear.py +++ b/backends/xnnpack/test/ops/linear.py @@ -26,8 +26,167 @@ ) from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig +try: + from torchao.quantization.quant_api import ( + int8_dynamic_activation_int4_weight, + quantize_, + unwrap_tensor_subclass, + ) + + torchao_installed = True +except: + torchao_installed = False + + +# Pytorch Modules Used for Testing +class BaseLinear(torch.nn.Module): + def __init__( + self, + in_size: int = 2, + input_channels: int = 4, + output_channels: int = 4, + dtype: torch.dtype = torch.float, + use_bias: bool = False, + ): + super().__init__() + self.linear = torch.nn.Linear( + input_channels, output_channels, bias=use_bias + ).to(dtype=dtype) + + self.ic = input_channels + self.oc = output_channels + + assert dtype in [torch.float, torch.half], "Unsupported op dtype" + self.op_dtype = dtype + self.in_size = in_size + + def forward(self, x): + return self.linear(x) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class AddMMModule(torch.nn.Module): + def __init__(self, in_size, out_size): + super().__init__() + self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) + self.bias = torch.nn.Parameter(torch.randn(1, out_size)) + + def forward(self, x): + return torch.addmm(self.bias, x, self.mat) + + +class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias, dtype=torch.float): + super().__init__() + self.dtype = dtype + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias).to(dtype=dtype) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class LinearParallelSequentialModule(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear3_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x, y): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(y, self.linear2_weight, self.linear2_bias) + c = torch.nn.functional.linear(b, self.linear3_weight, self.linear3_bias) + return (a, c) + + def get_inputs(self): + return ( + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + ) + + +class LinearSequential(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + 
) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(a, self.linear2_weight, self.linear2_bias) + return b + + def get_inputs(self): + return (torch.rand(self.in_size, self.input_size, dtype=torch.float),) + class TestLinear(unittest.TestCase): + """ + Test Class for XNNPACK Linear Operators. + + Notes: + - XNNPACK Does not support Per Tensor Quantized Weights with Dynamic Activations + - XNNPACK Only supports Per-Token Activation, so Dynamic per-tensor Quantization + As done by the default dynamic quantization flow does Per-Token Quantization + Activation under the hood, where the torch.nn.Module is doing Per-Tensor Quantization + on the Activation. This is sufficient because Per-Token Quantization on Activations + should produce strictly better results compared to Per-Tensor Quantization + """ + + @staticmethod + def _get_4b_dqconfig() -> QuantizationConfig: + # Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. + qconfig: QuantizationConfig = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=True, + weight_qmin=-8, + weight_qmax=7, + ) + return qconfig + def test_fp16_linear(self): for use_bias in (True, False): for num_batch_dims in range(1, 3): @@ -65,33 +224,13 @@ def test_qc8_linear(self): ) def test_fp32_addmm(self): - """ - Note that the ConvertToLinear pass requires the weight matrix to be transposed. - """ - - class AddMMModule(torch.nn.Module): - def __init__(self, in_size, out_size): - super().__init__() - self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) - self.bias = torch.nn.Parameter(torch.randn(1, out_size)) - - def forward(self, x): - return torch.addmm(self.bias, x, self.mat) - + # Note that the ConvertToLinear pass requires the weight matrix to be transposed. 
self._test_linear( lambda in_size, out_size: AddMMModule(in_size, out_size), uses_bias=True, ) def test_fp32_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -105,14 +244,6 @@ def forward(self, x): ) def test_qs8_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -138,21 +269,6 @@ def test_qs8_linear(self): quant_type="per_tensor", ) - @unittest.skip("XNNPACK currently only supports per-channel dynamic quantization.") - def _test_qd8_per_tensor_linear(self): - for uses_bias in (False, True): - inputs = (torch.randn(2, 4),) - module = torch.nn.Linear(4, 5, bias=uses_bias) - dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) - - self._test_dqlinear( - module, - inputs, - dynamic_shapes=dynamic_shapes, - is_per_channel=False, - uses_bias=uses_bias, - ) - def test_qd8_per_channel_linear(self): for uses_bias in (False, True): inputs = (torch.randn(2, 4),) @@ -166,19 +282,6 @@ def test_qd8_per_channel_linear(self): uses_bias=uses_bias, ) - @staticmethod - def _get_4b_dqconfig() -> QuantizationConfig: - """ - Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. - """ - qconfig: QuantizationConfig = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=True, - weight_qmin=-8, - weight_qmax=7, - ) - return qconfig - def test_qd8_per_channel_4w_linear(self): qconfig = self._get_4b_dqconfig() input_channels = [2, 63] @@ -267,38 +370,12 @@ def test_qd8_per_channel_linear_with_two_batch(self): ) def test_qd8_per_channel_linear_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearSequential(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - a, self.linear2_weight, self.linear2_bias - ) - return b - - inputs = (torch.rand(in_size, input_size, dtype=torch.float),) + lin_mod = LinearSequential() + inputs = lin_mod.get_inputs() dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) self._test_dqlinear( - LinearSequential(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=2, @@ -307,53 +384,16 @@ def forward(self, x): atol=1e-1, ) - def test_qd8_per_channel_linear_parellel_and_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = 
torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear3_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x, y): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - y, self.linear2_weight, self.linear2_bias - ) - c = torch.nn.functional.linear( - b, self.linear3_weight, self.linear3_bias - ) - return (a, c) - - inputs = ( - torch.rand(in_size, input_size, dtype=torch.float), - torch.rand(in_size, input_size, dtype=torch.float), - ) + def test_qd8_per_channel_linear_parallel_and_sequential(self): + lin_mod = LinearParallelSequentialModule() + inputs = lin_mod.get_inputs() dynamic_shapes = ( {0: torch.export.Dim("batch", max=100)}, {0: torch.export.Dim("batch2", max=100)}, ) self._test_dqlinear( - LinearModule(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=3, @@ -362,29 +402,9 @@ def forward(self, x, y): atol=1e-1, ) - def test_qd8_fp32_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float) - - def test_qd8_fp32_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float) - - # This fails because the output tensor dtype is different, but if you squint and ignore that and look at the values, - # it is not too bad. - # Difference: max: 0.042601585388183594, abs: 0.042601585388183594. - # -- Model vs. Reference -- - # Numel: 68, 68 - # Median: -0.7754800915718079, -0.7755751013755798 - # Mean: -0.6128872036933899, -0.6143574714660645 - # Max: 12.518657684326172, 12.516003608703613 - # Min: -20.070953369140625, -20.077701568603516 - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float16) - - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float16) - + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] K_sizes = [8, 32, 64, 128] @@ -392,60 +412,49 @@ def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) inputs = (torch.randn(1, M, K),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias ) - @unittest.skip("Need to fix the dq_per_channel_group output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_group_int4(self): + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) + def test_qd8_fp16_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] K_sizes = [8, 
32, 64, 128] bl_sizes = [8, 16, 16, 32] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( + in_size=M, input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float16, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) - inputs = (torch.randn(1, M, K, dtype=torch.float16),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, - atol=0.1, - rtol=0.1, + inputs = lin_mod.get_inputs() + # This requires slightly higher atol, but if you look at error it is not that bad: + # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375. + # -- Model vs. Reference -- + # Numel: 4, 4 + # Median: -0.05023193359375, -0.0516357421875 + # Mean: 0.2373046875, 0.237060546875 + # Max: 1.0078125, 1.0078125 + # Min: -0.08465576171875, -0.08441162109375 + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=1e-2 ) def _test_linear( @@ -467,7 +476,20 @@ def _test_linear( input_sizes = [4, 37, 17] output_sizes = [4, 17, 37] - quant = quant_type is not None + quant_config = None + if quant_type is not None: + if quant_type == "per_channel": + quant_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=False, + ) + elif quant_type == "per_tensor": + quant_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=False, + ) + else: + raise ValueError(f"Unsupported quant type {quant_type}") """ Note that torch.nn.Linear maps to aten.mm.default (no bias) or aten.addmm.default (bias), @@ -478,7 +500,6 @@ def _test_linear( input_size = int(input_sizes[i]) output_size = int(output_sizes[i]) input_shape = [in_size] * num_batch_dims + [input_size] - print(f"Testing input_shape {input_shape} with {output_size} out_channels") module = make_module(input_size, output_size).eval().to(dtype) inputs = (torch.randn(input_shape).to(dtype),) @@ -487,28 +508,15 @@ def _test_linear( dynamic_shape[i] = torch.export.Dim(f"batch{i}", min=2, max=in_size) dynamic_shape = (dynamic_shape,) - print(dynamic_shape) for legacy_mode in (True, False): tester = Tester(module, inputs, dynamic_shapes=dynamic_shape) - if quant: - if quant_type == "per_channel": - quant_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - elif quant_type == "per_tensor": - quant_config = get_symmetric_quantization_config( - is_per_channel=False, - is_dynamic=False, - ) - else: - raise ValueError(f"Unsupported quant type {quant_type}") + if quant_config: tester.quantize(Quantize(quantization_config=quant_config)) tester.export() - if quant: + if quant_config: tester.check(["torch.ops.quantized_decomposed"]) if legacy_mode: @@ -522,12 +530,19 @@ def _test_linear( ) tester.check_not([edge_op]) - if quant: - tester.check_not([edge_op, "torch.ops.quantized_decomposed"]) + if quant_config: + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) tester.to_executorch() tester.serialize() - tester.run_method_and_compare_outputs(qtol=quant, atol=atol) + tester.run_method_and_compare_outputs( + qtol=bool(quant_config), atol=atol + ) def _test_dqlinear( self, @@ -540,24 +555,19 @@ def _test_dqlinear( qconfig: 
Optional[QuantizationConfig] = None, atol=5e-02, ): - edge_op = ( - "executorch_exir_dialects_edge__ops_aten_addmm_default" - if uses_bias - else "executorch_exir_dialects_edge__ops_aten_mm_default" - ) - quant_config = qconfig or get_symmetric_quantization_config( is_per_channel=is_per_channel, is_dynamic=True, ) for legacy_partitioner in (True, False): for per_op_mode in (True, False): - tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) - tester.quantize(Quantize(quantization_config=quant_config)) DynamicallyQuantizedPartitioner = XnnpackPartitioner( config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=per_op_mode, ) + + tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) tester.export() if legacy_partitioner: @@ -567,357 +577,74 @@ def _test_dqlinear( tester.to_edge_transform_and_lower( ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) ) - num_call_delegates = linear_count if per_op_mode else 1 tester.check_count( { - "torch.ops.higher_order.executorch_call_delegate": num_call_delegates + "torch.ops.higher_order.executorch_call_delegate": ( + linear_count if per_op_mode else 1 + ) } ) - tester.check_not([edge_op]) + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) tester.to_executorch() tester.serialize() tester.run_method_and_compare_outputs(atol=atol) - class ManualDQLinear(torch.nn.Module): - def __init__( - self, - input_channels: int = 4, - output_channels: int = 4, - dtype: torch.dtype = torch.float, - weight_n_bit: int = 4, - group_size: int = 0, - force_groupwise_quant: bool = False, - use_bias: bool = False, - ): - super().__init__() - - self.ic = input_channels - self.oc = output_channels - - assert dtype in [torch.float, torch.half], "Unsupported op dtype" - self.op_dtype = dtype - - self.group_size = self.ic if group_size == 0 else group_size - self.num_groups = 1 - if self.group_size != self.ic: - assert self.ic % self.group_size == 0 - assert self.group_size % 8 == 0 # TODO make this 16 - self.num_groups = self.ic // self.group_size - - assert weight_n_bit in [4, 8], "Unsupported weight_n_bit" - self.w_n_bit = weight_n_bit - self.w_quant_min, self.w_quant_max = self.get_min_max(self.w_n_bit) - - self.w = torch.nn.Parameter( - torch.randn(self.oc, self.ic), requires_grad=False - ) - self.w_q = torch.nn.Parameter( - torch.zeros(self.oc, self.ic), requires_grad=False - ) - # Quantize the weights as per folded setup - if self.group_size != self.ic or force_groupwise_quant: - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.quant_weight_per_channel_group() - else: # per_channel quantization - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.quant_weight_per_channel() - - self.bias = ( - torch.nn.Parameter( - torch.randn(self.oc).to(self.op_dtype), requires_grad=False - ) - if use_bias - else None - ) - - def get_min_max(self, n_bit: int = 4): - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - return min_int, max_int - - def get_channel_qparams_symmetric( - self, - w: torch.Tensor, - n_bit: int = 4, - precision: torch.dtype = torch.float32, - ): - assert w.dim() == 2 - 
- to_quant = w.to(precision) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - min_int, max_int = self.get_min_max(n_bit) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0]), zeros.to( - precision - ).reshape(w.shape[0]).reshape(w.shape[0]) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def get_group_qparams_symmetric( - self, w, n_bit=4, groupsize=128, precision=torch.float32 - ): - # needed for GPTQ with padding - if groupsize > w.shape[-1]: - groupsize = w.shape[-1] - assert groupsize > 1 - assert w.shape[-1] % groupsize == 0 - assert w.dim() == 2 - - to_quant = w.reshape(-1, groupsize) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - # TODO: make sure abs(scales) is not too small? - zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0], -1), zeros.to( - precision - ).reshape(w.shape[0], -1) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def group_quantize_tensor_symmetric( - self, w, n_bit=4, group_size=128, precision=torch.float32 - ): - scales, zeros = self.get_group_qparams_symmetric( - w, n_bit, group_size, precision - ) - n_bit = 4 - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - # TODO: currently we don't know how to express torch.int4, we'll - # add torch.int4 to core later - w_int8 = torch.ops.quantized_decomposed.quantize_per_channel_group( - w, scales, zeros, min_int, max_int, torch.int8, group_size - ) - - return w_int8, scales, zeros - - def fwd_input_per_token(self, input: torch.Tensor) -> torch.Tensor: - ip_quant_min = -128 - ip_quant_max = 127 - ( - ip_scales, - ip_zero_points, - ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric( - input, torch.int8 - ) - - input = torch.ops.quantized_decomposed.quantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - ) - input = torch.ops.quantized_decomposed.dequantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - self.op_dtype, - ) - return input - - def quant_weight_per_channel(self): - ( - self.w_scales.data, - self.w_zero_points.data, - ) = self.get_channel_qparams_symmetric( - self.w, n_bit=self.w_n_bit, precision=self.op_dtype - ) - self.w_q.data = torch.ops.quantized_decomposed.quantize_per_channel( - self.w, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - dtype=torch.int8, - ) - - def quant_weight_per_channel_group(self): - self.w_q.data, w, zp 
= self.group_quantize_tensor_symmetric( - self.w, - n_bit=self.w_n_bit, - group_size=self.group_size, - ) - expected_min, expected_max = self.get_min_max(self.w_n_bit) - assert ( - torch.min(self.w_q.data) >= expected_min - ), "Found smaller than min element in quantized weight tensor" - assert ( - torch.max(self.w_q.data) <= expected_max - ), "Found larger than max element in quantized weight tensor" - assert ( - w.ndim == 2 and zp.ndim == 2 - ), f"Expecting 2d scales and zp tensors, but got {w.shape}, {zp.shape}" - self.w_scales.data, self.w_zero_points.data = w, zp - - def fwd_weight_per_channel(self) -> torch.Tensor: - # This is HACKY because the dequant will produce fp32 - return torch.ops.quantized_decomposed.dequantize_per_channel( - self.w_q, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - ) - - def fwd_weight_per_channel_group(self) -> torch.Tensor: - return torch.ops.quantized_decomposed.dequantize_per_channel_group( - self.w_q, - self.w_scales, - self.w_zero_points, - self.w_quant_min, - self.w_quant_max, - dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - group_size=self.group_size, - output_dtype=self.op_dtype, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # Input - input = self.fwd_input_per_token(input) - - # Weights - w = ( - self.fwd_weight_per_channel_group() - if self.w_scales.ndim == 2 - else self.fwd_weight_per_channel() - ) - assert isinstance(w, torch.Tensor) - return torch.nn.functional.linear(input, w, self.bias) - - def _test_manual_dq_linear( + def _test_groupwise_dq_linear( self, mod: torch.nn.Module, inputs: Tuple[torch.Tensor], - weight_groupwise: bool = False, use_bias: bool = False, + group_size: int = 8, + num_linears: int = 1, atol: float = 1e-3, rtol: float = 1e-3, ): - linear_edge_op = ( - "executorch_exir_dialects_edge__ops_aten_addmm_default" - if use_bias - else "executorch_exir_dialects_edge__ops_aten_mm_default" + quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size)) + unwrap_tensor_subclass(mod) + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=True, ) - - weight_dq_edge_op = ( - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_group_default" - if weight_groupwise - else "torch.ops.quantized_decomposed.dequantize_per_channel.default" - ) - - weight_dq_aten_op = ( - "torch.ops.quantized_decomposed.dequantize_per_channel_group.default" - if weight_groupwise - else "torch.ops.quantized_decomposed.dequantize_per_channel.default" + tester = ( + Tester(mod, inputs) + .export() + .check_count( + { + "torch.ops.quant.choose_qparams_affine.default": 1 * num_linears, + "torch.ops.quant.quantize_affine.default": 1 * num_linears, + "torch.ops.quant.dequantize_affine.default": 2 * num_linears, + "torch.ops.aten.linear.default": 1 * num_linears, + } + ) ) - for legacy_partitioner in (True, False): - tester = ( - Tester(mod, inputs) - .export() - .check_count( - { - "torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default": 1, - "torch.ops.quantized_decomposed.quantize_per_token.default": 1, - "torch.ops.quantized_decomposed.dequantize_per_token.default": 1, - weight_dq_aten_op: 1, - "torch.ops.aten.linear.default": 1, - } - ) + ( + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) ) + ) - 
DynamicallyQuantizedPartitioner = XnnpackPartitioner( - config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, - per_op_mode=True, + ( + tester.check_count( + { + "torch.ops.higher_order.executorch_call_delegate": 1, + } ) - if legacy_partitioner: - tester.to_edge() - tester.partition(Partition(DynamicallyQuantizedPartitioner)) - else: - ( - tester.to_edge_transform_and_lower( - ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) - ) - ) - - ( - tester.check_count( - { - "torch.ops.higher_order.executorch_call_delegate": 1, - } - ) - .check_not( - [ - "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_per_token_asymmetric_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_token_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_token_default", - weight_dq_edge_op, - linear_edge_op, - ] - ) - .to_executorch() - .serialize() - .run_method_and_compare_outputs(atol=atol, rtol=rtol) + .check_not( + [ + "executorch_exir_dialects_edge__ops_quant_choose_qparams_affine_default", + "executorch_exir_dialects_edge__ops_quant_quantize_affine_default", + "executorch_exir_dialects_edge__ops_quant_dequantize_affine_default", + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] ) - - def _run_manual_dqlinear_tests(self, weight_n_bit: int, op_dtype: torch.dtype): - in_sizes = [1, 4, 4] - input_sizes = [4, 37, 17] - output_sizes = [4, 17, 37] - - for use_bias in [True, False]: - for i, _ in enumerate(in_sizes): - in_size = int(in_sizes[i]) - input_size = int(input_sizes[i]) - output_size = int(output_sizes[i]) - mod = self.ManualDQLinear( - input_channels=input_size, - output_channels=output_size, - weight_n_bit=weight_n_bit, - dtype=op_dtype, - use_bias=use_bias, - ) - - inputs = (torch.randn(1, in_size, input_size).to(op_dtype),) - self._test_manual_dq_linear(mod, inputs, use_bias=use_bias) + .to_executorch() + .serialize() + .run_method_and_compare_outputs(atol=atol, rtol=rtol) + ) diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index 6fdf1615215..eb25a14cfea 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -561,7 +561,8 @@ def to_edge(self, to_edge_stage: Optional[ToEdge] = None): if not to_edge_stage: to_edge_stage = ToEdge() to_edge_stage.edge_compile_conf._skip_dim_order = True - return self._run_stage(to_edge_stage) + res = self._run_stage(to_edge_stage) + return res def to_edge_transform_and_lower( self, to_edge_and_transform_stage: Optional[ToEdgeTransformAndLower] = None diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index d5a7ec7fd0d..7c035757a6f 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
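For reference, the torchao flow that these groupwise linear tests rely on can be reproduced standalone. A minimal sketch, assuming torchao is installed; the module and sizes below are placeholders, not values from the test suite:

    import torch
    from torchao.quantization.quant_api import (
        int8_dynamic_activation_int4_weight,
        quantize_,
        unwrap_tensor_subclass,
    )

    model = torch.nn.Sequential(torch.nn.Linear(64, 32)).eval()
    # 4-bit groupwise weights (each group of `group_size` input channels shares a
    # scale) with 8-bit dynamically quantized activations.
    quantize_(model, int8_dynamic_activation_int4_weight(group_size=32))
    # Replace the tensor-subclass weights so torch.export can trace the module.
    unwrap_tensor_subclass(model)
    exported = torch.export.export(model, (torch.randn(1, 64),))
    # The exported graph now carries quant.choose_qparams_affine / quantize_affine /
    # dequantize_affine nodes, which is what the XNNPACK partitioner keys on.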
+import operator +from itertools import accumulate +from typing import cast + import torch from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, @@ -15,6 +19,7 @@ "quantize_per_channel.default", "quantize_per_channel_group.default", "quantize_per_token.default", + "quantize_affine.default", } _DQ_OPS = { @@ -23,12 +28,14 @@ "dequantize_per_channel.default", "dequantize_per_channel_group.default", "dequantize_per_token.default", + "dequantize_affine.default", } _QPARAM_OPS = { "choose_qparams.tensor", "choose_qparams_per_token_asymmetric.default", + "choose_qparams_affine.default", } _DYNAMIC_OPS = { @@ -43,8 +50,9 @@ def is_dynamic_qdq(node: torch.fx.Node) -> bool: if node.op != "call_function": return False node_name = format_target_name(node.target.__name__) # pyre-ignore + is_dynamic_affine = is_per_token(node) and not is_per_channel_group(node) - return node_name in _DYNAMIC_OPS + return node_name in _DYNAMIC_OPS or is_dynamic_affine def is_qparam(node: torch.fx.Node) -> bool: @@ -75,4 +83,106 @@ def is_per_channel(node: torch.fx.Node) -> bool: if not (is_quant(node) or is_dequant(node)): return False - return "per_channel" in node.target.__name__ # pyre-ignore + is_affine_per_channel_group = is_per_channel_group(node) + is_per_channel = "per_channel" in node.target.__name__ # pyre-ignore + + return is_per_channel or is_affine_per_channel_group + + +def is_affine_qdq(node: torch.fx.Node) -> bool: + if not (is_quant(node) or is_dequant(node)): + return False + + return "quantize_affine" in node.target.__name__ # pyre-ignore + + +def _get_block_size_input_scale(node: torch.fx.Node): + assert is_affine_qdq(node) + block_size = node.args[1] + input_val = node.all_input_nodes[0].meta["val"] + scale_val = node.all_input_nodes[1].meta["val"] + return block_size, input_val, scale_val + + +def is_per_token(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_token" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): + block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + scale_numel_expected = 1 + for i in range(len(block_size) - 1): + flag &= block_size[i] == 1 + scale_numel_expected *= input_val.shape[i] + + flag &= block_size[-1] == input_val.shape[-1] + flag &= scale_val.numel() == scale_numel_expected + return flag + + return False + + +def is_per_channel_group(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_channel_group" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): + block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + flag &= len(block_size) == 2 + flag &= block_size[0] == 1 + group_size = block_size[1] + scale_numel = list(accumulate(scale_val.shape, operator.mul))[-1] + input_numel = list(accumulate(input_val.shape, operator.mul))[-1] + flag &= input_numel == group_size * scale_numel + return flag + + return False + + +def extract_qdq_affine_op_args_for_decomposed_ops(node: torch.fx.Node): + if not is_affine_qdq(node): + return None, None + # make sure input_dtype and zero_point_domain have expected values + input_node = node.args[0] + scale_node = node.args[2] + zero_point_node = node.args[3] + args = [input_node, scale_node, zero_point_node] + assert ( + len(node.args) > 4 + ), f"expecting at least 6 args, got node: {node.format_node()}" + + if node.args[4] != torch.int8: + return None, None + target_dtype = 
cast(torch.dtype, node.args[4]) + + if len(node.args) > 6: + # quant_min + args.append(node.args[5]) + # quant_max + args.append(node.args[6]) + else: + dtype_info = torch.iinfo(target_dtype) + quant_min = dtype_info.min + quant_max = dtype_info.max + args.append(quant_min) + args.append(quant_max) + + # add target_dtype_node after quant_min/quant_max + args.append(target_dtype) + # zero_point_domain + if len(node.args) > 7 and node.args[7] != "INT": + return None, None + + if is_per_channel_group(node): + block_sizes = cast(list[int], node.args[1]) + args.append(block_sizes[-1]) + + args.append(node.args[-1]) + + return args From 0add885d20555b930a27361c79d053b1ca95b3a3 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Fri, 23 Aug 2024 11:04:42 -0700 Subject: [PATCH 022/531] Update Xcode project to include new sources. Differential Revision: D61695409 Pull Request resolved: https://github.com/pytorch/executorch/pull/4857 --- .../LLaMA/LLaMA.xcodeproj/project.pbxproj | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index fd5cdc7117f..54ae7d33198 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -48,6 +48,10 @@ 03BADE202BD2E88600DDFDC2 /* bpe_tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */; }; 03BADE232BD2EB6700DDFDC2 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */; }; 03BADE242BD2EB6700DDFDC2 /* tiktoken.h in Headers */ = {isa = PBXBuildFile; fileRef = 03BADE222BD2EB6700DDFDC2 /* tiktoken.h */; }; + 03D03DA72C7823620088D6A7 /* text_prefiller.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03D03DA52C7823620088D6A7 /* text_prefiller.cpp */; }; + 03D03DA82C7823620088D6A7 /* text_prefiller.h in Headers */ = {isa = PBXBuildFile; fileRef = 03D03DA62C7823620088D6A7 /* text_prefiller.h */; }; + 03D03DAB2C7823830088D6A7 /* text_decoder_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */; }; + 03D03DAC2C7823830088D6A7 /* text_decoder_runner.h in Headers */ = {isa = PBXBuildFile; fileRef = 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */; }; 03DDA0FB2BD6368100D234B3 /* base64.h in Headers */ = {isa = PBXBuildFile; fileRef = 03DDA0FA2BD6368100D234B3 /* base64.h */; }; /* End PBXBuildFile section */ @@ -92,8 +96,8 @@ 035A5E942BB4B523001E0553 /* LLaMA.entitlements */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.entitlements; path = LLaMA.entitlements; sourceTree = ""; }; 036CAF9D2BB1444500D6C2D5 /* LLaMA.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LLaMA.app; sourceTree = BUILT_PRODUCTS_DIR; }; 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LLaMARunner.framework; sourceTree = BUILT_PRODUCTS_DIR; }; - 03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = runner.cpp; sourceTree = ""; }; - 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = runner.h; sourceTree = ""; }; + 03729F072BB203B300152F2E /* runner.cpp 
*/ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../examples/models/llama2/runner/runner.cpp; sourceTree = ""; }; + 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../examples/models/llama2/runner/runner.h; sourceTree = ""; }; 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = util.h; path = ../../../../extension/llm/runner/util.h; sourceTree = ""; }; 03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; @@ -104,6 +108,10 @@ 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = bpe_tokenizer.h; path = ../../../../extension/llm/tokenizer/bpe_tokenizer.h; sourceTree = ""; }; 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = tiktoken.cpp; path = ../../../../extension/llm/tokenizer/tiktoken.cpp; sourceTree = ""; }; 03BADE222BD2EB6700DDFDC2 /* tiktoken.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = tiktoken.h; path = ../../../../extension/llm/tokenizer/tiktoken.h; sourceTree = ""; }; + 03D03DA52C7823620088D6A7 /* text_prefiller.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = text_prefiller.cpp; sourceTree = ""; }; + 03D03DA62C7823620088D6A7 /* text_prefiller.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = text_prefiller.h; sourceTree = ""; }; + 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = text_decoder_runner.cpp; sourceTree = ""; }; + 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = text_decoder_runner.h; sourceTree = ""; }; 03DDA0FA2BD6368100D234B3 /* base64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = base64.h; path = ../../../../extension/llm/tokenizer/base64.h; sourceTree = ""; }; /* End PBXFileReference section */ @@ -235,10 +243,14 @@ children = ( 03729F072BB203B300152F2E /* runner.cpp */, 03729F082BB203B300152F2E /* runner.h */, + 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */, + 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */, + 03D03DA52C7823620088D6A7 /* text_prefiller.cpp */, + 03D03DA62C7823620088D6A7 /* text_prefiller.h */, 03729F092BB203B300152F2E /* util.h */, ); name = runner; - path = ../../../../../models/llama2/runner; + path = ../../../../../../extension/llm/runner; sourceTree = ""; }; 03729F0E2BB203D700152F2E /* tokenizer */ = { @@ -277,6 +289,8 @@ 03BADE202BD2E88600DDFDC2 /* bpe_tokenizer.h in Headers */, 03729F172BB2043600152F2E /* tokenizer.h in Headers */, 03729EE22BB1F93E00152F2E /* LLaMARunner.h in Headers */, + 03D03DA82C7823620088D6A7 /* text_prefiller.h in Headers */, + 03D03DAC2C7823830088D6A7 /* text_decoder_runner.h in Headers */, 03DDA0FB2BD6368100D234B3 /* base64.h in Headers */, 
03BADE242BD2EB6700DDFDC2 /* tiktoken.h in Headers */, 03729F122BB2042B00152F2E /* sampler.h in Headers */, @@ -450,9 +464,11 @@ 03729EE12BB1F93800152F2E /* LLaMARunner.mm in Sources */, 03BADE232BD2EB6700DDFDC2 /* tiktoken.cpp in Sources */, 038D678C2C482C1E00B88CF2 /* llama_tiktoken.cpp in Sources */, + 03D03DAB2C7823830088D6A7 /* text_decoder_runner.cpp in Sources */, 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */, 03729F0A2BB203B300152F2E /* runner.cpp in Sources */, 03729F132BB2042B00152F2E /* sampler.cpp in Sources */, + 03D03DA72C7823620088D6A7 /* text_prefiller.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; From b0d67c22e07b144ddee46dcb8bc83ce90a08a0e5 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Fri, 23 Aug 2024 11:06:08 -0700 Subject: [PATCH 023/531] [Build] Remove .exe suffix on windows (#4864) --- build/pip_data_bin_init.py.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build/pip_data_bin_init.py.in b/build/pip_data_bin_init.py.in index 9644c5621df..0c9d60e0498 100644 --- a/build/pip_data_bin_init.py.in +++ b/build/pip_data_bin_init.py.in @@ -21,7 +21,9 @@ def _find_executable_files_under(dir): for filename in os.listdir(dir): filepath = os.path.join(dir, filename) if os.path.isfile(filepath) and os.access(filepath, os.X_OK): - bin_names.append(filename) + # Remove .exe suffix on windows. + filename_without_ext = os.path.splitext(filename)[0] + bin_names.append(filename_without_ext) return bin_names # The list of binaries to create wrapper functions for. From c7bc7e0de981d2de13c94a4790dde96a8589ce1f Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Fri, 23 Aug 2024 11:37:18 -0700 Subject: [PATCH 024/531] [Build] Support windows in setup.py (#4858) Fix windows build issues in setup.py 1. Build executable name and dynamic lib name based on platform. 2. Set the src directory based on build config for windows. 3. Avoid using os.geteuid() on windows. For #4661 --- setup.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index 75b3ece526e..f6adb4f86c3 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ import contextlib import os +import platform import re import sys @@ -162,6 +163,31 @@ def write_to_python_file(cls, path: str) -> None: fp.write("\n".join(lines) + "\n") +# The build type is determined by the DEBUG environment variable. If DEBUG is +# set to a non-empty value, the build type is Debug. Otherwise, the build type +# is Release. +def get_build_type(is_debug=None) -> str: + debug = int(os.environ.get("DEBUG", 0)) if is_debug is None else is_debug + cfg = "Debug" if debug else "Release" + return cfg + + +def get_dynamic_lib_name(name: str) -> str: + if platform.system() == "Windows": + return name + ".dll" + elif platform.system() == "Darwin": + return "lib" + name + ".dylib" + else: + return "lib" + name + ".so" + + +def get_executable_name(name: str) -> str: + if platform.system() == "Windows": + return name + ".exe" + else: + return name + + class _BaseExtension(Extension): """A base class that maps an abstract source to an abstract destination.""" @@ -189,9 +215,17 @@ def src_path(self, installer: "InstallerBuildExt") -> Path: installer: The InstallerBuildExt instance that is installing the file. """ - # TODO(dbort): share the cmake-out location with CustomBuild. 
Can get a - # handle with installer.get_finalized_command('build') - cmake_cache_dir: Path = Path().cwd() / installer.build_temp / "cmake-out" + # Share the cmake-out location with CustomBuild. + cmake_cache_dir = Path(installer.get_finalized_command("build").cmake_cache_dir) + + cfg = get_build_type(installer.debug) + + if os.name == "nt": + # Replace %BUILD_TYPE% with the current build type. + self.src = self.src.replace("%BUILD_TYPE%", cfg) + else: + # Remove %BUILD_TYPE% from the path. + self.src = self.src.replace("/%BUILD_TYPE%", "") # Construct the full source path, resolving globs. If there are no glob # pattern characters, this will just ensure that the source file exists. @@ -212,17 +246,39 @@ class BuiltFile(_BaseExtension): `ext_modules`. """ - def __init__(self, src: str, dst: str): + def __init__( + self, + src_dir: str, + src_name: str, + dst: str, + is_executable: bool = False, + is_dynamic_lib: bool = False, + ): """Initializes a BuiltFile. Args: - src: The path to the file to install, relative to the cmake-out - directory. May be an fnmatch-style glob that matches exactly one - file. + src_dir: The directory of the file to install, relative to the cmake-out + directory. A placeholder %BUILD_TYPE% will be replaced with the build + type for multi-config generators (like Visual Studio) where the build + output is in a subdirectory named after the build type. For single- + config generators (like Makefile Generators or Ninja), this placeholder + will be removed. + src_name: The name of the file to install dst: The path to install to, relative to the root of the pip package. If dst ends in "/", it is treated as a directory. Otherwise it is treated as a filename. + is_executable: If True, the file is an executable. This is used to + determine the destination filename for executable. + is_dynamic_lib: If True, the file is a dynamic library. This is used + to determine the destination filename for dynamic library. """ + if is_executable and is_dynamic_lib: + raise ValueError("is_executable and is_dynamic_lib cannot be both True.") + if is_executable: + src_name = get_executable_name(src_name) + elif is_dynamic_lib: + src_name = get_dynamic_lib_name(src_name) + src = os.path.join(src_dir, src_name) # This is not a real extension, so use a unique name that doesn't look # like a module path. Some of setuptools's autodiscovery will look for # extension names with prefixes that match certain module paths. @@ -397,7 +453,7 @@ def __init__(self): self.saved_env = {} def __enter__(self): - if os.geteuid() == 0 and "HOME" in os.environ: + if os.name != "nt" and os.geteuid() == 0 and "HOME" in os.environ: log.info("temporarily unsetting HOME while running as root") self.saved_env["HOME"] = os.environ.pop("HOME") return self @@ -432,8 +488,7 @@ def initialize_options(self): def run(self): self.dump_options() - debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug - cfg = "Debug" if debug else "Release" + cfg = get_build_type(self.debug) # get_python_lib() typically returns the path to site-packages, where # all pip packages in the environment are installed. @@ -508,6 +563,14 @@ def run(self): item for item in os.environ["CMAKE_BUILD_ARGS"].split(" ") if item ] + # CMAKE_BUILD_TYPE variable specifies the build type (configuration) for + # single-configuration generators (e.g., Makefile Generators or Ninja). + # For multi-config generators (like Visual Studio), CMAKE_BUILD_TYPE + # isn’t directly applicable. 
+ # During the build step, --config specifies the configuration to build + # for multi-config generators. + build_args += ["--config", cfg] + # Put the cmake cache under the temp directory, like # "pip-out/temp./cmake-out". cmake_cache_dir = os.path.join(repo_root, self.build_temp, "cmake-out") @@ -545,6 +608,8 @@ def run(self): "build/pip_data_bin_init.py.in", os.path.join(bin_dir, "__init__.py"), ) + # Share the cmake-out location with _BaseExtension. + self.cmake_cache_dir = cmake_cache_dir # Finally, run the underlying subcommands like build_py, build_ext. build.run(self) @@ -552,11 +617,15 @@ def run(self): def get_ext_modules() -> List[Extension]: """Returns the set of extension modules to build.""" - ext_modules = [] if ShouldBuild.flatc(): ext_modules.append( - BuiltFile("third-party/flatbuffers/flatc", "executorch/data/bin/") + BuiltFile( + src_dir="third-party/flatbuffers/%BUILD_TYPE%/", + src_name="flatc", + dst="executorch/data/bin/", + is_executable=True, + ) ) if ShouldBuild.pybindings(): @@ -570,17 +639,20 @@ def get_ext_modules() -> List[Extension]: ) if ShouldBuild.llama_custom_ops(): ext_modules.append( - # Install the prebuilt library for custom ops used in llama. BuiltFile( - "extension/llm/custom_ops/libcustom_ops_aot_lib.*", - "executorch/extension/llm/custom_ops/", + src_dir="extension/llm/custom_ops/%BUILD_TYPE%/", + src_name="custom_ops_aot_lib", + dst="executorch/extension/llm/custom_ops", + is_dynamic_lib=True, ) ) ext_modules.append( # Install the prebuilt library for quantized ops required by custom ops. BuiltFile( - "kernels/quantized/libquantized_ops_aot_lib.*", - "executorch/kernels/quantized/", + src_dir="kernels/quantized/%BUILD_TYPE%/", + src_name="quantized_ops_aot_lib", + dst="executorch/kernels/quantized/", + is_dynamic_lib=True, ) ) From ee6e4e9af502bf5b9d89fba7d3345e7b6d415b99 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Fri, 23 Aug 2024 11:39:21 -0700 Subject: [PATCH 025/531] [Build] Add install_requirements.bat for windows build (#4862) Add install_requirements.bat on windows like install_requirements.sh. Also force use ClangCL on windows in install_requirements.py. For #4661 --- install_requirements.bat | 21 +++++++++++++++++++++ install_requirements.py | 6 ++++++ 2 files changed, 27 insertions(+) create mode 100644 install_requirements.bat diff --git a/install_requirements.bat b/install_requirements.bat new file mode 100644 index 00000000000..4cfe4b21c4b --- /dev/null +++ b/install_requirements.bat @@ -0,0 +1,21 @@ +@ECHO OFF + +rem Copyright (c) Meta Platforms, Inc. and affiliates. +rem All rights reserved. + +rem This batch file provides a basic functionality similar to the bash script. + +cd /d "%~dp0" + +rem Find the names of the python tools to use (replace with your actual python installation) +if "%PYTHON_EXECUTABLE%"=="" ( + if "%CONDA_DEFAULT_ENV%"=="" OR "%CONDA_DEFAULT_ENV%"=="base" OR NOT EXIST "python" ( + set PYTHON_EXECUTABLE=python3 + ) else ( + set PYTHON_EXECUTABLE=python + ) +) + +"%PYTHON_EXECUTABLE%" install_requirements.py %* + +exit /b %ERRORLEVEL% \ No newline at end of file diff --git a/install_requirements.py b/install_requirements.py index 085b48ab370..4b7dedc0a49 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -82,6 +82,12 @@ def python_is_compatible(): print(f"Error: Unknown option {arg}") sys.exit(1) +# Use ClangCL on Windows. +# ClangCL is an alias to Clang that configures it to work in an MSVC-compatible +# mode. Using it on Windows to avoid compiler compatibility issues for MSVC. 
+if os.name == "nt": + CMAKE_ARGS += " -T ClangCL" + # Since ExecuTorch often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. # From bc66ff2789582b596fa02725c616373ed55a7b26 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Fri, 23 Aug 2024 11:41:15 -0700 Subject: [PATCH 026/531] [Build] add kernel_link_options for windows (#4865) Add msvc version of kernel_link_options. For #4661 --- build/Utils.cmake | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/build/Utils.cmake b/build/Utils.cmake index 56fc1e104b0..55f5892a55e 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -143,11 +143,21 @@ function(macos_kernel_link_options target_name) ) endfunction() +# Same as kernel_link_options but it's for MSVC linker +function(msvc_kernel_link_options target_name) + target_link_options( + ${target_name} INTERFACE + "SHELL:LINKER:/WHOLEARCHIVE:$" + ) +endfunction() + # Ensure that the load-time constructor functions run. By default, the linker # would remove them since there are no other references to them. function(target_link_options_shared_lib target_name) if(APPLE) macos_kernel_link_options(${target_name}) + elseif(MSVC) + msvc_kernel_link_options(${target_name}) else() kernel_link_options(${target_name}) endif() From d8be9b1c52ad9d9ee2f1ab35840aab0e382f57b9 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Fri, 23 Aug 2024 11:50:35 -0700 Subject: [PATCH 027/531] [ET-VK] Set export log level INFO Differential Revision: D61723563 Pull Request resolved: https://github.com/pytorch/executorch/pull/4870 --- backends/vulkan/partitioner/vulkan_partitioner.py | 9 ++++++--- backends/vulkan/serialization/vulkan_graph_builder.py | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index 4d24877b631..c4fbaabdbc5 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -38,6 +38,9 @@ torch.ops.aten.upsample_nearest2d.vec, ] +logger: logging.Logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + class VulkanSupportedOperators(OperatorSupportBase): _ops: OpList = enumerate_supported_ops() @@ -110,7 +113,7 @@ def is_node_supported( ) -> bool: r = self._is_node_supported(submodules, node) if not r and node.op == "call_function": - logging.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") + logger.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") return r def _is_node_supported( @@ -179,9 +182,9 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: pl = len(partition_list) if pl == 0: - logging.warning("No Vulkan subgraphs can be partitioned!") + logger.warning("No Vulkan subgraphs can be partitioned!") else: - logging.info(f"Found {pl} Vulkan subgraphs to be partitioned.") + logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") tag_constant_data(exported_program) diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index da40f0a720b..fcbf3edb7e5 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -24,6 +24,9 @@ Node, NoneType, _ScalarType, TensorSpec, List[_ScalarType], List[Node], str ] +logger: logging.Logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + class 
VkGraphBuilder: def __init__( @@ -351,9 +354,9 @@ def build_graph(self) -> vk_graph_schema.VkGraph: self.process_node(node, call_node_debug_hdl) call_node_debug_hdl += 1 - logging.info("Operators included in this Vulkan partition: ") + logger.info("Operators included in this Vulkan partition: ") for op in self.seen_ops: - logging.info(f" {op.__name__}") + logger.info(f" {op.__name__}") return vk_graph_schema.VkGraph( version="0", From 33fbe03fce42296e05c787da94d83afea61a0518 Mon Sep 17 00:00:00 2001 From: lucylq Date: Fri, 23 Aug 2024 12:17:58 -0700 Subject: [PATCH 028/531] Remove custom op pad tests Differential Revision: D61725011 Pull Request resolved: https://github.com/pytorch/executorch/pull/4871 --- .../custom_ops/test_preprocess_custom_ops.py | 22 +------------------ 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/extension/llm/custom_ops/test_preprocess_custom_ops.py b/extension/llm/custom_ops/test_preprocess_custom_ops.py index 50e149ece16..c3922782ea2 100644 --- a/extension/llm/custom_ops/test_preprocess_custom_ops.py +++ b/extension/llm/custom_ops/test_preprocess_custom_ops.py @@ -7,7 +7,7 @@ # pyre-unsafe import unittest -from typing import List, Tuple +from typing import Tuple import torch @@ -17,33 +17,13 @@ class PreprocessTest(unittest.TestCase): def setUp(self): - # pad - self.pad_input = torch.ones(3, 200, 300) - # tile_crop self.tile_size = 224 - def _compare_pad(self, image: torch.Tensor, padding: List[int]) -> None: - output = torch.ops.preprocess.pad.default(image, padding) - output_ref = torch.nn.functional.pad(image, padding) - self.assertTrue(torch.allclose(output_ref, output, 1e-6)) - def _test_tile_crop(self, image: torch.Tensor, expected_shape: Tuple[int]) -> None: output = torch.ops.preprocess.tile_crop.default(image, self.tile_size) self.assertTrue(output.shape == expected_shape) - def test_op_pad_without_padding(self): - self._compare_pad(self.pad_input, [0, 0, 0, 0]) - - def test_op_pad_with_right_bottom_padding(self): - self._compare_pad(self.pad_input, [0, 124, 0, 148]) - - def test_op_pad_with_right_padding(self): - self._compare_pad(self.pad_input, [0, 124, 0, 0]) - - def test_op_pad_with_bottom_padding(self): - self._compare_pad(self.pad_input, [0, 0, 0, 148]) - def test_op_tile_crop_2x2(self): self._test_tile_crop(torch.ones(3, 448, 448), (4, 3, 224, 224)) From 4afc4fb45c58995b079b99c03d66304cca1b6ecf Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Fri, 23 Aug 2024 15:37:59 -0400 Subject: [PATCH 029/531] [ET-VK] Add buffer implementation for matrix multiplication Differential Revision: D61666461 Pull Request resolved: https://github.com/pytorch/executorch/pull/4845 --- .../vulkan/runtime/graph/ComputeGraph.cpp | 2 +- backends/vulkan/runtime/graph/ComputeGraph.h | 15 +++ .../graph/ops/glsl/matmul_naive_buffer.glsl | 60 ++++++++++ .../graph/ops/glsl/matmul_naive_buffer.yaml | 16 +++ ...naive.glsl => matmul_naive_texture3d.glsl} | 16 +-- ...naive.yaml => matmul_naive_texture3d.yaml} | 10 +- .../vulkan/runtime/graph/ops/impl/MatMul.cpp | 54 ++++++++- backends/vulkan/test/op_tests/cases.py | 1 + .../vulkan/test/vulkan_compute_api_test.cpp | 106 ++++++++++-------- 9 files changed, 211 insertions(+), 69 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml rename backends/vulkan/runtime/graph/ops/glsl/{matmul_naive.glsl => matmul_naive_texture3d.glsl} (72%) rename 
backends/vulkan/runtime/graph/ops/glsl/{matmul_naive.yaml => matmul_naive_texture3d.yaml} (71%) diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 48e1ebf0a83..9fa0091b298 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -368,7 +368,7 @@ utils::uvec3 ComputeGraph::create_local_wg_size( } utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) { - return create_local_wg_size(image_extents_of(idx)); + return create_local_wg_size(create_global_wg_size(idx)); } void ComputeGraph::copy_into_staging( diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index faa2f4107ec..58a97c9e255 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -186,6 +186,21 @@ class ComputeGraph final { std::vector sizes_of(const ValueRef idx) const; + /* + * Returns the size of the tensor at `idx` along the specified dimension. + * Negative indexing is allowed. + */ + template + T size_at(const int64_t dim, const ValueRef idx) const { + const Value& val = values_.at(idx); + if (val.isTensor()) { + return static_cast(utils::val_at(dim, val.toConstTensor().sizes())); + } else if (val.isTensorRef()) { + return static_cast(utils::val_at(dim, val.toConstTensorRef().sizes)); + } + VK_THROW("Could not get sizes of value with type ", val.type()); + } + vkapi::ScalarType dtype_of(const ValueRef idx) const; inline utils::uvec3 image_extents_of(const ValueRef idx) const { diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl new file mode 100644 index 00000000000..25a6a742779 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(1, "r", "t_mat1", DTYPE, "buffer")} +${layout_declare_tensor(2, "r", "t_mat2", DTYPE, "buffer")} +${layout_declare_ubo(3, "ivec4", "out_sizes")} +${layout_declare_ubo(4, "ivec4", "out_strides")} +${layout_declare_ubo(5, "ivec4", "mat1_sizes")} +${layout_declare_ubo(6, "ivec4", "mat1_strides")} +${layout_declare_ubo(7, "ivec4", "mat2_sizes")} +${layout_declare_ubo(8, "ivec4", "mat2_strides")} +${layout_declare_ubo(9, "int", "out_numel")} + +#include "indexing_utils.h" + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec4 out_idx = ivec4( + gl_GlobalInvocationID.x, + gl_GlobalInvocationID.y, + gl_GlobalInvocationID.z % out_sizes.z, + gl_GlobalInvocationID.z / out_sizes.z); + + if (any(greaterThanEqual(out_idx, out_sizes))) { + return; + } + + int mat1_id = to_buffer_id( + ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); + int mat2_id = to_buffer_id( + ivec4(out_idx.x, 0, out_idx.z, out_idx.w), mat2_strides); + + T sum = T(0.0); + for (int i = 0; i < mat1_sizes.x; ++i) { + sum += t_mat1[mat1_id] * t_mat2[mat2_id]; + + mat1_id += mat1_strides.x; + mat2_id += mat2_strides.y; + } + + const int out_id = to_buffer_id(out_idx, out_strides); + t_out[out_id] = T(sum); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml new file mode 100644 index 00000000000..54eb444f73d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +matmul_naive_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: matmul_naive_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl similarity index 72% rename from backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl rename to backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl index 37a9b60f3c5..7225f2c64a0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl @@ -16,17 +16,11 @@ $if MAT2_IS_TRANSPOSED: #include "indexing_utils.h" #include "matmul.h" -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; +${layout_declare_tensor(0, "w", "im_out", DTYPE, "texture3d")} +${layout_declare_tensor(1, "r", "im_mat1", DTYPE, "texture3d")} +${layout_declare_tensor(2, "r", "im_mat2", DTYPE, "texture3d")} +${layout_declare_ubo(3, "ivec3", "out_limits")} +${layout_declare_ubo(4, "ivec4", "in_sizes")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml similarity index 71% rename from backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml rename to backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml index 1c4db3f0ce9..bb1eed494a5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml @@ -4,10 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-matmul_naive: +matmul_naive_texture3d: parameter_names_with_default_values: DTYPE: float - NDIM: 3 + STORAGE: texture3d MAT1_PACKING: W_packed MAT2_PACKING: H_packed MAT2_IS_TRANSPOSED: false @@ -16,9 +16,9 @@ matmul_naive: - VALUE: float - VALUE: half shader_variants: - - NAME: matmul_naive_W_packed_H_packed - - NAME: matmul_naive_W_packed_W_packed + - NAME: matmul_naive_texture3d_W_packed_H_packed + - NAME: matmul_naive_texture3d_W_packed_W_packed MAT2_PACKING: W_packed - - NAME: matmul_transposed_naive_W_packed_W_packed + - NAME: matmul_transposed_naive_texture3d_W_packed_W_packed MAT2_PACKING: W_packed MAT2_IS_TRANSPOSED: true diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index d1d3ad47d76..a25a602e38f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -62,7 +62,48 @@ void resize_matmul_node( out->virtual_resize(new_out_sizes); } -void add_matmul_naive_node( +void add_matmul_naive_buffer_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef out, + const ValueRef mat2_is_transposed) { + ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked); + + std::string kernel_name = "matmul_naive_buffer"; + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + utils::uvec3 global_size = { + graph.size_at(-1, out), + graph.size_at(-2, out), + graph.size_at(-3, out) * graph.size_at(-4, out)}; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_size, + graph.create_local_wg_size(global_size), + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + { + graph.sizes_ubo(out), + graph.strides_ubo(out), + graph.sizes_ubo(mat1), + graph.strides_ubo(mat1), + graph.sizes_ubo(mat2), + graph.strides_ubo(mat2), + graph.numel_ubo(out), + }, + // Specialization Constants + {}, + // Resizing Logic + resize_matmul_node, + {mat2_is_transposed})); +} + +void add_matmul_naive_texture3d_node( ComputeGraph& graph, const ValueRef mat1, const ValueRef mat2_data, @@ -74,6 +115,7 @@ void add_matmul_naive_node( ? 
"matmul_transposed_naive" : "matmul_naive"; kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); @@ -174,12 +216,16 @@ void add_matmul_node( const ValueRef mat2_data, const ValueRef out, const ValueRef mat2_is_transposed) { - if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + if (graph.is_buffer_storage(out)) { + add_matmul_naive_buffer_node( + graph, mat1, mat2_data, out, mat2_is_transposed); + } else if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { - add_matmul_naive_node(graph, mat1, mat2_data, out, mat2_is_transposed); + add_matmul_naive_texture3d_node( + graph, mat1, mat2_data, out, mat2_is_transposed); } else { - VK_THROW("Input should be channel packed or width packed."); + VK_THROW("Input texture should be channel packed or width packed."); } } diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index ff5c7a60e0f..7f9f1842adf 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -70,6 +70,7 @@ def get_mm_inputs(): test_suite.prepacked_args = ["mat2"] # ATen matmul doesn't support half test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] test_suite.layouts = [ "utils::kWidthPacked", "utils::kChannelsPacked", diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 1ac74e29ef4..bba0b246491 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2282,24 +2282,28 @@ void test_binary_op( } } -#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, true) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true) - -#define CALL_TEST_FN_FOR_W_PACKED(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true) - -#define CALL_TEST_FN_FOR_C_PACKED(_) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \ - _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true) +#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, true) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) + +#define CALL_TEST_FN_FOR_W_PACKED(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ + _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false) \ 
+ _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true) + +#define CALL_TEST_FN_FOR_C_PACKED(_) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) \ + _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false) \ + _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true) TEST(VulkanComputeGraphOpsTest, add_smoke_test) { -#define RUN_TESTS(dtype, layout, prepack) \ +#define RUN_TESTS(dtype, storage, layout, prepack) \ test_binary_op("add", {17, 21}, {17, 21}, dtype, layout, prepack); \ test_binary_op("add", {17, 21}, {1, 1}, dtype, layout, prepack); \ test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout, prepack); \ @@ -2320,9 +2324,11 @@ void test_mm( int K, int N, vkapi::ScalarType dtype, + utils::StorageType storage_type, utils::GPUMemoryLayout memory_layout, bool prepack = true) { GraphConfig config; + config.set_storage_type_override(storage_type); ComputeGraph graph(config); std::vector mat1_size = {M, K}; @@ -2379,38 +2385,42 @@ void test_mm( } TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { -#define RUN_TESTS(dtype, layout, prepack) \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 5, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 7, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - layout, \ +#define RUN_TESTS(dtype, storage_type, layout, prepack) \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 5, \ + /*M = */ 31, \ + /*K = */ 127, \ + /*N = */ 23, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 7, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ + prepack); \ + test_mm( \ + /*B = */ 1, \ + /*M = */ 13, \ + /*K = */ 89, \ + /*N = */ 17, \ + dtype, \ + storage_type, \ + layout, \ prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); From 7fe3a69bb649593c557d63a4306c7dba062c63fa Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Fri, 23 Aug 2024 17:49:19 -0400 Subject: [PATCH 030/531] [ET-VK][Ez] Add utilities to check if one vTensor is a view of another Differential Revision: D61666458 Pull Request resolved: https://github.com/pytorch/executorch/pull/4846 --- backends/vulkan/runtime/api/containers/Tensor.cpp | 11 +++++++++++ backends/vulkan/runtime/api/containers/Tensor.h | 12 ++++++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 7 +++++++ backends/vulkan/runtime/vk_api/memory/Buffer.h | 4 ++++ backends/vulkan/test/vulkan_compute_api_test.cpp | 3 +++ 5 files changed, 37 insertions(+) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index be44679f3b0..578898ad194 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -652,6 +652,17 @@ void vTensorStorage::transition( last_access_.access = cur_access; } +bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { + if (storage_type_ != other.storage_type_) { + return false; + } + // Copies are only enabled for buffer storage at the moment + if (storage_type_ != utils::kBuffer) { + return false; + } + 
return buffer_.is_copy_of(other.buffer_); +} + void vTensorStorage::discard_and_reallocate( const std::vector& padded_sizes, const utils::GPUMemoryLayout gpu_memory_layout, diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 8186ef1bd66..d37628e4adc 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -152,6 +152,11 @@ class vTensorStorage final { return image_.format(); } + /* + * Used for checking if this vTensorStorage is a copy of another instance + */ + bool is_copy_of(const vTensorStorage& other) const; + void discard_and_reallocate( const std::vector& padded_sizes, const utils::GPUMemoryLayout gpu_memory_layout, @@ -458,6 +463,13 @@ class vTensor final { * tensor sizes */ void reallocate(const std::vector& new_sizes); + + /* + * Check if this vTensor instance is a view of another vTensor instance + */ + inline bool is_view_of(const vTensor& other) const { + return storage_.is_copy_of(other.storage_); + } }; } // namespace api diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 58a97c9e255..5740d24a448 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -219,6 +219,13 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().has_buffer_storage(); } + inline bool val_is_view_of(const ValueRef maybe_view, const ValueRef base) + const { + return values_.at(maybe_view) + .toConstTensor() + .is_view_of(values_.at(base).toConstTensor()); + } + inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().gpu_memory_layout(); } diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 3f69d1f2237..9302048f861 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -150,6 +150,10 @@ class VulkanBuffer final { return (handle_ != VK_NULL_HANDLE); } + inline bool is_copy_of(const VulkanBuffer& other) const { + return (handle_ == other.handle_) && is_copy_; + } + inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_)); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index bba0b246491..e24e2ea4e06 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -611,6 +611,7 @@ TEST_F(VulkanComputeAPITest, tensor_copy_test) { vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); vTensor copy = vTensor(original, sizes, dim_order); EXPECT_TRUE(get_vma_allocation_count() == 1); + EXPECT_TRUE(copy.is_view_of(original)); // Fill original tensor with some data fill_vtensor(original, 2.5f, true); @@ -1166,6 +1167,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) { ValueRef slice = graph.add_tensor_view(orig.value, slice_sizes, dim_order, offset); + EXPECT_TRUE(graph.val_is_view_of(slice, orig.value)); + IOValueRef out = {}; out.value = graph.add_tensor(slice_sizes, vkapi::kFloat); From 047656b3683da5da74c8624d49bed7207ad77208 Mon Sep 17 00:00:00 2001 From: lucylq Date: Fri, 23 Aug 2024 15:21:56 -0700 Subject: [PATCH 031/531] Add quantized ops to pybindings Differential Revision: D61308890 Pull 
Request resolved: https://github.com/pytorch/executorch/pull/4696 --- CMakeLists.txt | 70 ++++++++++---------- build/Codegen.cmake | 5 +- examples/models/llama2/eval_llama_lib.py | 9 +++ examples/models/llama2/runner/generation.py | 5 -- examples/models/llama2/runner/native.py | 7 ++ extension/pybindings/test/TARGETS | 3 + extension/pybindings/test/make_test.py | 41 ++++++++++++ extension/pybindings/test/test_pybindings.py | 2 + kernels/quantized/CMakeLists.txt | 31 +++++++++ kernels/quantized/cpu/targets.bzl | 5 +- 10 files changed, 133 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5a5b592350..4f4ad1ddf8d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,10 +548,6 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized) endif() -if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) -endif() - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations) # @@ -582,30 +578,6 @@ cmake_dependent_option( EXECUTORCH_BUILD_EXECUTOR_RUNNER "Build the executor_runner executable" ON EXECUTORCH_BUILD_HOST_TARGETS OFF ) -if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) - # Baseline libraries that executor_runner will link against. - set(_executor_runner_libs executorch gflags) - - if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) - elseif(EXECUTORCH_BUILD_CADENCE) - list(APPEND _executor_runner_libs cadence_ops_lib) - else() - list(APPEND _executor_runner_libs portable_ops_lib) - endif() - - # Generate lib to register quantized ops - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - list(APPEND _executor_runner_libs quantized_ops_lib) - endif() - - add_executable(executor_runner ${_executor_runner__srcs}) - if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE) - target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") - endif() - target_link_libraries(executor_runner ${_executor_runner_libs}) - target_compile_options(executor_runner PUBLIC ${_common_compile_options}) -endif() # Add googletest if any test targets should be built if(EXECUTORCH_BUILD_GTESTS) @@ -644,10 +616,6 @@ if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() -if(EXECUTORCH_BUILD_VULKAN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) -endif() - if(EXECUTORCH_BUILD_QNN) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) endif() @@ -710,10 +678,6 @@ if(EXECUTORCH_BUILD_PYBIND) list(APPEND _dep_libs xnnpack_backend XNNPACK) endif() - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - target_link_options_shared_lib(quantized_ops_lib) - endif() - # compile options for pybind set(_pybind_compile_options -Wno-deprecated-declarations @@ -799,5 +763,39 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM) ) endif() +if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) + target_link_options_shared_lib(quantized_ops_lib) +endif() + +if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) + # Baseline libraries that executor_runner will link against. 
+ set(_executor_runner_libs executorch gflags) + + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) + elseif(EXECUTORCH_BUILD_CADENCE) + list(APPEND _executor_runner_libs cadence_ops_lib) + else() + list(APPEND _executor_runner_libs portable_ops_lib) + endif() + + # Generate lib to register quantized ops + if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + list(APPEND _executor_runner_libs quantized_ops_lib) + endif() + + add_executable(executor_runner ${_executor_runner__srcs}) + if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE) + target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") + endif() + target_link_libraries(executor_runner ${_executor_runner_libs}) + target_compile_options(executor_runner PUBLIC ${_common_compile_options}) +endif() + +if(EXECUTORCH_BUILD_VULKAN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/build/Codegen.cmake b/build/Codegen.cmake index 1c309cf3bce..818deb17581 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -150,9 +150,8 @@ function(gen_custom_ops_aot_lib) include(${EXECUTORCH_ROOT}/build/Utils.cmake) target_link_options_shared_lib(${GEN_LIB_NAME}) - if(EXECUTORCH_BUILD_PYBIND AND APPLE) - target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) - target_link_options(${GEN_LIB_NAME} PRIVATE -undefined dynamic_lookup) + if(TARGET portable_lib) + target_link_libraries(${GEN_LIB_NAME} PRIVATE portable_lib) else() target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) endif() diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 5959ba6a386..9e27b987bb8 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -46,6 +46,15 @@ def __init__( from executorch.extension.pybindings.portable_lib import _load_for_executorch + # Load custom ops and quantized ops. + from executorch.extension.pybindings import portable_lib # noqa # usort: skip + + # Note: import this after portable_lib + from executorch.extension.llm.custom_ops import ( # noqa + sdpa_with_kv_cache, # usort: skip + ) + from executorch.kernels import quantized # noqa + self._et_model = _load_for_executorch(self._model) self._use_kv_cache = self._et_model.run_method("use_kv_cache")[0] diff --git a/examples/models/llama2/runner/generation.py b/examples/models/llama2/runner/generation.py index f1a6b54d88f..6d43c84932f 100644 --- a/examples/models/llama2/runner/generation.py +++ b/examples/models/llama2/runner/generation.py @@ -12,11 +12,6 @@ from executorch.examples.models.llama2.llama_transformer import ModelArgs from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer -from executorch.extension.pybindings import portable_lib # noqa # usort: skip - -# Note: import this after portable_lib -from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip - class CompletionPrediction(TypedDict, total=False): generation: str diff --git a/examples/models/llama2/runner/native.py b/examples/models/llama2/runner/native.py index cefafc1a88d..b0d6c20e961 100644 --- a/examples/models/llama2/runner/native.py +++ b/examples/models/llama2/runner/native.py @@ -13,6 +13,13 @@ from examples.models.llama2.llama_transformer import ModelArgs from executorch.extension.pybindings.portable_lib import _load_for_executorch +# Load custom ops and quantized ops. 
+from executorch.extension.pybindings import portable_lib # noqa # usort: skip + +# Note: import this after portable_lib +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +from executorch.kernels import quantized # noqa + from .generation import LlamaRunner diff --git a/extension/pybindings/test/TARGETS b/extension/pybindings/test/TARGETS index c569b97dcb5..feb4779a05e 100644 --- a/extension/pybindings/test/TARGETS +++ b/extension/pybindings/test/TARGETS @@ -28,6 +28,7 @@ runtime.python_library( runtime.python_test( name = "test_pybindings_portable_lib", srcs = ["test_pybindings.py"], + preload_deps = ["//executorch/kernels/quantized:aot_lib"], deps = [ ":make_test", "//executorch/extension/pybindings:portable_lib", @@ -37,8 +38,10 @@ runtime.python_test( runtime.python_test( name = "test_pybindings_aten_lib", srcs = ["test_pybindings.py"], + preload_deps = ["//executorch/kernels/quantized:aot_lib"], deps = [ ":make_test", "//executorch/extension/pybindings:aten_lib", + "//executorch/kernels/quantized:aot_lib", ], ) diff --git a/extension/pybindings/test/make_test.py b/extension/pybindings/test/make_test.py index a977b373a45..708e67e4309 100644 --- a/extension/pybindings/test/make_test.py +++ b/extension/pybindings/test/make_test.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import unittest from typing import Any, Callable, Tuple @@ -211,11 +213,50 @@ def __str__(self): except Exception: tester.assertTrue(str(out).find("The length of given input array")) + def test_quantized_ops(tester): + eager_module = ModuleAdd() + + from executorch.exir import EdgeCompileConfig + from executorch.exir.passes.quant_fusion_pass import QuantFusionPass + from torch.ao.quantization import get_default_qconfig_mapping + from torch.ao.quantization.backend_config.executorch import ( + get_executorch_backend_config, + ) + from torch.ao.quantization.quantize_fx import ( + _convert_to_reference_decomposed_fx, + prepare_fx, + ) + + qconfig_mapping = get_default_qconfig_mapping("qnnpack") + example_inputs = ( + torch.ones(1, 5, dtype=torch.float32), + torch.ones(1, 5, dtype=torch.float32), + ) + m = prepare_fx( + eager_module, + qconfig_mapping, + example_inputs, + backend_config=get_executorch_backend_config(), + ) + m = _convert_to_reference_decomposed_fx(m) + config = EdgeCompileConfig(_check_ir_validity=False) + m = to_edge(export(m, example_inputs), compile_config=config) + m = m.transform([QuantFusionPass(_fix_node_meta_val=True)]) + + exec_prog = m.to_executorch() + + executorch_module = load_fn(exec_prog.buffer) + executorch_output = executorch_module.forward(example_inputs)[0] + + expected = example_inputs[0] + example_inputs[1] + tester.assertEqual(str(expected), str(executorch_output)) + test_e2e(tester) test_multiple_entry(tester) test_output_lifespan(tester) test_module_callable(tester) test_module_single_input(tester) test_stderr_redirect(tester) + test_quantized_ops(tester) return wrapper diff --git a/extension/pybindings/test/test_pybindings.py b/extension/pybindings/test/test_pybindings.py index dbc2b057bfb..d4ce2af0390 100644 --- a/extension/pybindings/test/test_pybindings.py +++ b/extension/pybindings/test/test_pybindings.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import unittest kernel_mode = None # either aten mode or portable mode diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt index c649a793ed3..dbc9edcb973 100644 --- a/kernels/quantized/CMakeLists.txt +++ b/kernels/quantized/CMakeLists.txt @@ -87,6 +87,37 @@ if(NOT CMAKE_GENERATOR STREQUAL "Xcode" gen_custom_ops_aot_lib( LIB_NAME "quantized_ops_aot_lib" KERNEL_SOURCES "${_quantized_sources}" ) + + # Register quantized ops to portable_lib, so that they're available + # via pybindings. + if(TARGET portable_lib) + add_library(quantized_pybind_kernels_lib ${_quantized_kernels__srcs}) + target_link_libraries(quantized_pybind_kernels_lib PRIVATE portable_lib) + target_compile_options( + quantized_pybind_kernels_lib PUBLIC ${_common_compile_options}) + target_include_directories( + quantized_pybind_kernels_lib PUBLIC "${_common_include_directories}" + ) + gen_selected_ops( + LIB_NAME "quantized_ops_pybind_lib" OPS_SCHEMA_YAML "${_yaml_file}" + ) + generate_bindings_for_kernels( + LIB_NAME "quantized_ops_pybind_lib" CUSTOM_OPS_YAML "${_yaml_file}" + ) + # Build a library for pybind usage. + # quantized_ops_pybind_lib: Register quantized ops kernels into + # Executorch runtime for pybind. + gen_operators_lib( + LIB_NAME "quantized_ops_pybind_lib" + KERNEL_LIBS quantized_pybind_kernels_lib + DEPS portable_lib + ) + target_link_libraries( + quantized_ops_aot_lib + PUBLIC + quantized_ops_pybind_lib + ) + endif() endif() endif() diff --git a/kernels/quantized/cpu/targets.bzl b/kernels/quantized/cpu/targets.bzl index 39552aaaf10..d2bbbfebe04 100644 --- a/kernels/quantized/cpu/targets.bzl +++ b/kernels/quantized/cpu/targets.bzl @@ -58,7 +58,10 @@ def define_common_targets(): runtime.cxx_library( name = "quantized_cpu", srcs = [], - visibility = ["//executorch/kernels/quantized/..."], + visibility = [ + "//executorch/kernels/quantized/...", + "//executorch/extension/pybindings/test/...", + ], exported_deps = quant_op_targets, ) From bf71fd4bb20271fbf1d68d046e65954814e42f60 Mon Sep 17 00:00:00 2001 From: lucylq Date: Fri, 23 Aug 2024 15:52:20 -0700 Subject: [PATCH 032/531] Clean up install scripts Differential Revision: D61725249 Pull Request resolved: https://github.com/pytorch/executorch/pull/4826 --- .ci/scripts/setup-linux.sh | 1 - .ci/scripts/setup-macos.sh | 2 -- .ci/scripts/utils.sh | 36 ------------------------------------ .github/workflows/trunk.yml | 2 -- 4 files changed, 41 deletions(-) diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh index 4bccabad5cf..5df4668f65c 100755 --- a/.ci/scripts/setup-linux.sh +++ b/.ci/scripts/setup-linux.sh @@ -20,6 +20,5 @@ fi # As Linux job is running inside a Docker container, all of its dependencies # have already been installed -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 2be7d9efe83..833ba0aafe6 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -128,7 +128,5 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then fi print_cmake_info -install_pytorch_and_domains -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index ebc5361d00a..64c512cdccd 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -33,42 +33,6 @@ install_pip_dependencies() { popd || return } -install_domains() { - echo "Install torchvision and torchaudio" - pip install --no-use-pep517 --user 
"git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}" - pip install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${TORCHVISION_VERSION}" -} - -install_pytorch_and_domains() { - pushd .ci/docker || return - TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) - popd || return - - git clone https://github.com/pytorch/pytorch.git - - # Fetch the target commit - pushd pytorch || return - git checkout "${TORCH_VERSION}" - git submodule update --init --recursive - - export _GLIBCXX_USE_CXX11_ABI=0 - # Then build and install PyTorch - python setup.py bdist_wheel - pip install "$(echo dist/*.whl)" - - # Grab the pinned audio and vision commits from PyTorch - TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) - export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) - export TORCHVISION_VERSION - - install_domains - - popd || return - # Print sccache stats for debugging - sccache --show-stats || true -} - install_flatc_from_source() { # NB: This function could be used to install flatbuffer from source pushd third-party/flatbuffers || return diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 98d14824638..9c45406fa80 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -143,7 +143,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm @@ -169,7 +168,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm From b157e5f3ec6c63b8c8c3241c3ff1d3e304ae9656 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Fri, 23 Aug 2024 15:53:40 -0700 Subject: [PATCH 033/531] [Build] Link pthreadpool and cpuinfo for windows. There is no direct replacement for libpthread.so on Windows. Link pthreadpool and cpuinfo staticly for windows. For #4661 Pull Request resolved: https://github.com/pytorch/executorch/pull/4860 --- extension/llm/custom_ops/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 3f242e3d7d7..f057825ec80 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -101,6 +101,11 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) endif() target_link_libraries(custom_ops_aot_lib PUBLIC cpublas torch) + if(WIN32) + # There is no direct replacement for libpthread.so on Windows. + # For the Windows build, link directly against pthreadpool and cpuinfo. + target_link_libraries(custom_ops_aot_lib PUBLIC pthreadpool cpuinfo) + endif() target_compile_options( custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions From 6d29c1da4a6aa5de1d32c944787b23e47b5f7425 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Fri, 23 Aug 2024 15:54:04 -0700 Subject: [PATCH 034/531] [Build] Define ssize_t for windows build. Move size_t and ssize_t header to runtime/platform/compiler.h. Define ssize_t for windows as ptrdiff_t. 
For #4661 Pull Request resolved: https://github.com/pytorch/executorch/pull/4859 --- runtime/core/portable_type/tensor.h | 1 - runtime/core/portable_type/tensor_impl.h | 2 -- runtime/platform/compiler.h | 8 ++++++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/runtime/core/portable_type/tensor.h b/runtime/core/portable_type/tensor.h index 6006bddaaa6..6e952e30b9b 100644 --- a/runtime/core/portable_type/tensor.h +++ b/runtime/core/portable_type/tensor.h @@ -9,7 +9,6 @@ #pragma once #include -#include // TODO(T126923429): Include size_t, ssize_t #include diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h index 09ee744ae7f..19977b71e09 100644 --- a/runtime/core/portable_type/tensor_impl.h +++ b/runtime/core/portable_type/tensor_impl.h @@ -8,8 +8,6 @@ #pragma once -#include // TODO(T126923429): Include size_t, ssize_t - #include #include #include diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h index f370cd110be..c7f603756c8 100644 --- a/runtime/platform/compiler.h +++ b/runtime/platform/compiler.h @@ -138,6 +138,14 @@ #endif #endif // ifndef +// Define size_t and ssize_t. +#ifndef _WIN32 +#include +#else +#include +using ssize_t = ptrdiff_t; +#endif + // DEPRECATED: Use the non-underscore-prefixed versions instead. // TODO(T199005537): Remove these once all users have stopped using them. #define __ET_DEPRECATED ET_DEPRECATED From 26e921e54472ddaa1e7086a8b6c7519af1cf9e40 Mon Sep 17 00:00:00 2001 From: Nathanael See Date: Fri, 23 Aug 2024 16:20:52 -0700 Subject: [PATCH 035/531] exir dialect view to squeeze/unsqueeze pass Differential Revision: D61732548 Pull Request resolved: https://github.com/pytorch/executorch/pull/4877 --- backends/transforms/TARGETS | 14 ++ .../view_copy_to_squeeze_unsqueeze.py | 128 ++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 backends/transforms/view_copy_to_squeeze_unsqueeze.py diff --git a/backends/transforms/TARGETS b/backends/transforms/TARGETS index d461eb49788..df50e45f099 100644 --- a/backends/transforms/TARGETS +++ b/backends/transforms/TARGETS @@ -88,6 +88,20 @@ runtime.python_library( ], ) +runtime.python_library( + name = "view_copy_to_squeeze_unsqueeze", + srcs = ["view_copy_to_squeeze_unsqueeze.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir/dialects:lib", + ], +) + runtime.python_library( name = "fuse_view_copy", srcs = ["fuse_view_copy.py"], diff --git a/backends/transforms/view_copy_to_squeeze_unsqueeze.py b/backends/transforms/view_copy_to_squeeze_unsqueeze.py new file mode 100644 index 00000000000..094ec6a3340 --- /dev/null +++ b/backends/transforms/view_copy_to_squeeze_unsqueeze.py @@ -0,0 +1,128 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from typing import List, Optional, Union + +import torch + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ViewCopyToSqueezeUnsqueezePass(ExportPass): + """ + Replaces view_copy nodes with squeeze_copy.dims nodes if the view node reduces dims of size 1. + Replaces view_copy nodes with unsqueeze_copy.default nodes if the view node adds a dim of size 1. 
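+
+    For example (using the static shapes recorded in node meta):
+      view_copy(x, [4, 8])    where x has shape [1, 4, 1, 8] -> squeeze_copy.dims(x, [0, 2])
+      view_copy(x, [4, 1, 8]) where x has shape [4, 8]       -> unsqueeze_copy.default(x, 1)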
+ """ + + def __init__(self) -> None: + super().__init__() + self.view_copy_op: torch._ops.OpOverload = exir_ops.edge.aten.view_copy.default + self.squeeze_op: torch._ops.OpOverload = exir_ops.edge.aten.squeeze_copy.dims + self.unsqueeze_op: torch._ops.OpOverload = ( + exir_ops.edge.aten.unsqueeze_copy.default + ) + + def is_node_target( + self, node: torch.fx.Node, target: torch._ops.OperatorBase + ) -> bool: + return node.op == "call_function" and node.target == target + + def find_squeeze_dims( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[List[int]]: + # view_shape should be a subset of input_shape + if len(input_shape) <= len(view_shape): + return None + + # check that all dims are equal except the removed dims + i = 0 + j = 0 + idx = [] + while i < len(input_shape): + if input_shape[i] != view_shape[j]: + if input_shape[i] == 1: + idx.append(i) + j -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def find_unsqueeze_dim( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[int]: + # unsqueeze should increase the length of input_shape by 1 + if len(view_shape) - len(input_shape) != 1: + return None + + # check that all dims are equal except the added dim + i = 0 + j = 0 + idx = -1 + while j < len(view_shape): + if input_shape[i] != view_shape[j]: + if view_shape[j] == 1: + idx = j + i -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def replace_view_copy_node( + self, + graph_module: torch.fx.GraphModule, + view_node: torch.fx.Node, + op: torch._ops.OpOverload, + arg: Union[List[int], int], + ) -> None: + with graph_module.graph.inserting_before(view_node): + new_node = graph_module.graph.create_node( + "call_function", + op, + (view_node.args[0], arg), + ) + new_node.meta = view_node.meta + view_node.replace_all_uses_with(new_node) + graph_module.graph.erase_node(view_node) + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + for node in graph_module.graph.nodes: + if self.is_node_target(node, self.view_copy_op): + input_node = node.args[0] + input_shape = input_node.meta["val"].shape + view_shape = node.args[1] + squeeze_dims = self.find_squeeze_dims(input_shape, view_shape) + if squeeze_dims: + self.replace_view_copy_node( + graph_module, node, self.squeeze_op, squeeze_dims + ) + modified = True + continue + unsqueeze_dim = self.find_unsqueeze_dim(input_shape, view_shape) + if unsqueeze_dim: + self.replace_view_copy_node( + graph_module, node, self.unsqueeze_op, unsqueeze_dim + ) + modified = True + continue + + if modified: + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) From 48f4eee474073b9bcc1208c718a76cc53323d8b3 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 23 Aug 2024 16:37:28 -0700 Subject: [PATCH 036/531] [exir] Enable dict for sym shape eval pass Differential Revision: D61728068 Pull Request resolved: https://github.com/pytorch/executorch/pull/4872 --- examples/models/llava/export_llava.py | 35 ++++++++++++++++------ examples/models/llava/test/test_pte.py | 11 +++++++ exir/capture/_config.py | 7 ++++- exir/program/_program.py | 41 ++++++++++++++++++++------ 4 files changed, 75 insertions(+), 19 deletions(-) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 390528844f7..4f2aa6576b9 100644 --- a/examples/models/llava/export_llava.py +++ 
b/examples/models/llava/export_llava.py @@ -23,7 +23,15 @@ replace_sdpa_with_custom_op, ) from executorch.examples.models.llava.model import LlavaModel -from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) + +from executorch.exir.passes import MemoryPlanningPass +from executorch.exir.passes.quant_fusion_pass import QuantFusionPass +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.extension.llm.export.builder import DType, LLMEdgeManager from executorch.extension.llm.tokenizer.tokenizer import Tokenizer @@ -199,7 +207,23 @@ def export_all(llava_model: LlavaModel): compile_config=EdgeCompileConfig(_check_ir_validity=False), ) - executorch_program = lowered_and_edge.to_executorch() + executorch_program = lowered_and_edge.to_executorch( + ExecutorchBackendConfig( + extract_constant_segment=True, + extract_delegate_segments=True, + passes=[ + QuantFusionPass(), + ], + memory_planning_pass=MemoryPlanningPass("greedy", alloc_graph_input=False), + sym_shape_eval_pass={ + "image_encoder": ConstraintBasedSymShapeEvalPass(), + }, + ) + ) + for execution_plan in executorch_program._emitter_output.program.execution_plan: + logging.info( + f"Required memory for activation in bytes: {execution_plan.non_const_buffer_sizes}" + ) return executorch_program @@ -253,13 +277,6 @@ def main(): with open(args.pte_name, "wb") as f: executorch_program.write_to_file(f) - logging.info( - "Required memory for activation in bytes: {}".format( - executorch_program._emitter_output.program.execution_plan[ - 0 - ].non_const_buffer_sizes - ), - ) logging.info(f"Exported ExecuTorch program to {args.pte_name}") # artifacts diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py index cdf24761c59..d793b2ae221 100644 --- a/examples/models/llava/test/test_pte.py +++ b/examples/models/llava/test/test_pte.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import logging import sys import torch @@ -17,6 +18,10 @@ from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.DEBUG, format=FORMAT) + + def main(): args = sys.argv[1:] llava_module = _load_for_executorch(args[0]) @@ -41,7 +46,10 @@ def main(): start_pos += pte_prefill_before_img.shape[1] # pte prefill image + logging.warning("Image encoder started") pte_embeds_img = llava_module.run_method("image_encoder", (resized,))[0] + logging.warning("Image encoder finished") + logging.warning("Image token prefill started") pte_prefill_img = llava_module.run_method( "text_model", ( @@ -49,11 +57,13 @@ def main(): pte_embeds_img, ), )[0] + logging.warning("Image token prefill finished") print(pte_prefill_img) start_pos += pte_prefill_img.shape[1] # pte prefill prompt after img + logging.warning("Text token prefill started") pte_embeds_after_img = llava_module.run_method( "token_embedding", (prompt_after_image,) )[0] @@ -61,6 +71,7 @@ def main(): "text_model", (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img), )[0] + logging.warning("Text token prefill finished") print(pte_prefill_after_img) # being tested, using llama_transformer diff --git a/exir/capture/_config.py b/exir/capture/_config.py index d959f10403d..42dc170c19d 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -82,7 +82,12 @@ class ExecutorchBackendConfig: # If provided, the minimum alignment of delegate data in the program. Must # be a power of 2. If not provided, uses the value in the schema file. delegate_alignment: Optional[int] = None - sym_shape_eval_pass: PassType = HintBasedSymShapeEvalPass() + + # A single sym shape eval pass can be defined for all the programs in the + # EdgeProgramManager or can be defined per program. + sym_shape_eval_pass: Union[PassType, Dict[str, PassType]] = ( + HintBasedSymShapeEvalPass() + ) # If set to true, view_copy operations will be converted to lightweight # view operations in the ET runtime diff --git a/exir/program/_program.py b/exir/program/_program.py index 9031ce39e66..849eae4f6f0 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -13,7 +13,6 @@ import torch import torch._export - from executorch.exir._serialize import _serialize_pte_binary from executorch.exir._serialize._cord import Cord from executorch.exir.backend.backend_api import to_backend @@ -23,6 +22,7 @@ from executorch.exir.emit._emitter import _DelegateDebugIdentifierMap from executorch.exir.error import ExportError from executorch.exir.graph_module import get_control_flow_submodules +from executorch.exir.pass_base import PassBase from executorch.exir.pass_manager import PassType from executorch.exir.passes import ( base_post_op_replace_passes, @@ -641,25 +641,48 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram": return new_ep -def pre_memory_planning_passes(config: ExecutorchBackendConfig) -> List[PassType]: +def pre_memory_planning_passes( + config: ExecutorchBackendConfig, name: Optional[str] = None +) -> List[PassType]: + """ + Returns a list of passes to run before memory planning. + Get the sym shape eval pass based on the method name, if the pass is not in the dict, use the default pass. 
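+
+    For example, `sym_shape_eval_pass={"image_encoder": ConstraintBasedSymShapeEvalPass()}`
+    in `ExecutorchBackendConfig` applies the constraint-based pass to the
+    "image_encoder" method and the default `HintBasedSymShapeEvalPass` to every
+    other method.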
+ """ + # Handle symbolic shape eval pass + if isinstance(config.sym_shape_eval_pass, dict): + default_pass = ExecutorchBackendConfig().sym_shape_eval_pass + if not name: + sym_shape_eval_pass = default_pass + # pyre-ignore: Undefined attribute [16] + sym_shape_eval_pass = config.sym_shape_eval_pass.get(name, default_pass) + elif isinstance(config.sym_shape_eval_pass, PassBase): + sym_shape_eval_pass = config.sym_shape_eval_pass + else: + raise RuntimeError( + f"sym_shape_eval_pass must be a dict or a PassBase, got {config.sym_shape_eval_pass}" + ) if config.remove_view_copy: - # pyre-ignore return [ NormalizeViewCopyBasePass(), dead_code_elimination_pass, ReplaceViewCopyWithViewPass(), - config.sym_shape_eval_pass, + sym_shape_eval_pass, config.to_out_var_pass, ] else: - # pyre-ignore return [ - config.sym_shape_eval_pass, + sym_shape_eval_pass, config.to_out_var_pass, ] -def edge_to_executorch_passes(config: ExecutorchBackendConfig) -> List[PassType]: +def edge_to_executorch_passes( + config: ExecutorchBackendConfig, name: Optional[str] = None +) -> List[PassType]: + """ + Returns a list of passes to lower from edge to executorch. + Get the pre memory planning passes based on the method name, if the pass is not in the dict, use the default pass. + """ passes: List[PassType] = [ *config.passes, SpecPropPass(), @@ -668,7 +691,7 @@ def edge_to_executorch_passes(config: ExecutorchBackendConfig) -> List[PassType] # there exists an unbacked symint operation. EdgeToBackendOpsPass(), RemoveGraphAssertsPass(), - ] + pre_memory_planning_passes(config) + ] + pre_memory_planning_passes(config, name) return passes @@ -1234,7 +1257,7 @@ def to_executorch( program = unsafe_remove_auto_functionalized_pass(program) gm, new_signature = insert_write_back_for_buffers_pass(program) new_gm = program.graph_module - for p in edge_to_executorch_passes(config): + for p in edge_to_executorch_passes(config, name): new_gm_res = p(new_gm) assert new_gm_res is not None new_gm = new_gm_res.graph_module From 7b4be5431eb57d0eed189d5a5cb55518130e3cb8 Mon Sep 17 00:00:00 2001 From: winskuo-quic <143469905+winskuo-quic@users.noreply.github.com> Date: Sat, 24 Aug 2024 11:30:48 +0800 Subject: [PATCH 037/531] Qualcomm AI Engine Direct - Use AIHub's context binary file for Stable Diffusion (#4836) Summary: - Add script for the export and runtime of AIHUB Stable Diffusion. 
- Add AIHUB Stable Diffusion runner - Add README tutorial --- backends/qualcomm/tests/test_qnn_delegate.py | 49 ++ examples/qualcomm/CMakeLists.txt | 5 + .../stable_diffusion/CMakeLists.txt | 26 + .../qaihub_scripts/stable_diffusion/README.md | 35 + .../stable_diffusion/install_requirements.sh | 3 + .../qaihub_stable_diffusion.py | 472 +++++++++++++ .../qaihub_stable_diffusion_runner.cpp | 140 ++++ .../stable_diffusion/runner/runner.cpp | 621 ++++++++++++++++++ .../stable_diffusion/runner/runner.h | 141 ++++ .../stable_diffusion/stable_diffusion_lib.py | 22 + 10 files changed, 1514 insertions(+) create mode 100644 examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt create mode 100644 examples/qualcomm/qaihub_scripts/stable_diffusion/README.md create mode 100755 examples/qualcomm/qaihub_scripts/stable_diffusion/install_requirements.sh create mode 100644 examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py create mode 100644 examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp create mode 100644 examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp create mode 100644 examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h create mode 100644 examples/qualcomm/qaihub_scripts/stable_diffusion/stable_diffusion_lib.py diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index dd704c35c08..08fd907c40a 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -1998,6 +1998,55 @@ def test_llama3_8b(self): model_out = msg["result"] self.assertTrue(model_out.startswith(prompt)) + def test_stable_diffusion(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "a photo of an astronaut riding a horse on mars" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--text_encoder_bin", + f"{self.artifact_dir}/text_encoder.serialized.bin", + "--unet_bin", + f"{self.artifact_dir}/unet.serialized.bin", + "--vae_bin", + f"{self.artifact_dir}/vae.serialized.bin", + "--vocab_json", + f"{self.artifact_dir}/vocab.json", + "--num_time_steps", + "20", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--fix_latents", + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + # For the default settings and prompt, the expected results will be {PSNR: 23.258, SSIM: 0.852} + self.assertGreaterEqual(msg["PSNR"], 20) + self.assertGreaterEqual(msg["SSIM"], 0.8) + class TestExampleScript(TestQNN): def required_envs(self, conditions=None) -> bool: diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index fd9c1388b2d..94af209cb6c 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -81,3 +81,8 @@ add_subdirectory( add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama ) + +# build qaihub_stable_diffusion_runner +add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/stable_diffusion +) diff --git 
a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt new file mode 100644 index 00000000000..c897f5f9f84 --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# preprocess qaihub_stable_diffusion_runner_src files +set(_qaihub_stable_diffusion_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qaihub_stable_diffusion_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h +) + +# build qaihub_stable_diffusion_runner +add_executable(qaihub_stable_diffusion_runner ${_qaihub_stable_diffusion_runner__srcs}) +target_include_directories(qaihub_stable_diffusion_runner + PUBLIC ${_common_include_directories} +) +target_link_libraries(qaihub_stable_diffusion_runner + qnn_executorch_backend + executorch_no_prim_ops + extension_data_loader + extension_module + gflags +) +target_compile_options(qaihub_stable_diffusion_runner PUBLIC ${_common_compile_options}) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md new file mode 100644 index 00000000000..21b3370df70 --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md @@ -0,0 +1,35 @@ +# Summary + +## Overview +This file provides you the instructions to run Stable-Diffusion-v2.1 with different parameters via Qualcomm HTP backend. We will demonstrate how to run Stable Diffusion v2.1 on mobile devices using context binaries from Qualcomm AI Hub’s Stable Diffusion v2.1 + +Please check corresponding section for more information. + +## Stable-Diffusion-v2.1 +The model architecture, scheduler, and time embedding are from the [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base). + +### Instructions +#### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +#### Step2: Prepare Model +1. Download the context binaries for TextEncoder, UNet, and VAEDecoder under https://huggingface.co/qualcomm/Stable-Diffusion-v2.1/tree/main +2. Download vocab.json under https://huggingface.co/openai/clip-vit-base-patch32/tree/main + + +#### Step3: Install Requirements +Before running the code, you need to install the necessary Python packages. + +We have verified the code with `diffusers`==0.29.0 and `piq`==0.8.0. 
Please follow the instructions here to install the required items: +```bash +sh examples/qualcomm/qaihub_scripts/stable_diffusion/install_requirements.sh +``` + +#### Step4: Run default example +In this example, we execute the script for 20 time steps with the `prompt` 'a photo of an astronaut riding a horse on mars': +```bash +python examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py -a ${ARTIFACTS} -b build_android -m ${SOC_MODEL} --s ${SERIAL_NUM} --text_encoder_bin ${PATH_TO_TEXT_ENCODER_CONTEXT_BINARY} --unet_bin ${PATH_TO_UNET_CONTEXT_BINARY} --vae_bin ${PATH_TO_VAE_CONTEXT_BINARY} --vocab_json ${PATH_TO_VOCAB_JSON_FILE} --num_time_steps 20 --prompt "a photo of an astronaut riding a horse on mars" +``` +- Please replace `${PATH_TO_TEXT_ENCODER_CONTEXT_BINARY}`, `${PATH_TO_UNET_CONTEXT_BINARY}`, and `${PATH_TO_VAE_CONTEXT_BINARY}` with the actual paths to your AI Hub context binary files. +- Please replace `${PATH_TO_VOCAB_JSON_FILE}` with the actual path to your vocab.json file. diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/install_requirements.sh b/examples/qualcomm/qaihub_scripts/stable_diffusion/install_requirements.sh new file mode 100755 index 00000000000..bbb4767bee3 --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/install_requirements.sh @@ -0,0 +1,3 @@ +# For Stable Diffusion V2.1 +pip install diffusers==0.29.0 +pip install piq==0.8.0 diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py new file mode 100644 index 00000000000..862db31f174 --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py @@ -0,0 +1,472 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
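+
+# This script lowers the AI Hub context binaries (text encoder, UNet, VAE
+# decoder) into .pte files and drives qaihub_stable_diffusion_runner on device
+# over adb to generate an image from the given prompt.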
+ +import gc +import json +import os +from multiprocessing.connection import Client + +import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor +import numpy as np +import piq +import torch +from diffusers import EulerDiscreteScheduler, UNet2DConditionModel +from diffusers.models.embeddings import get_timestep_embedding +from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QcomChipset, +) +from executorch.backends.qualcomm.utils.utils import ( + canonicalize_program, + from_context_binary, + generate_htp_compiler_spec, + generate_qnn_executorch_compiler_spec, + generate_qnn_executorch_option, +) + +from executorch.examples.qualcomm.qaihub_scripts.stable_diffusion.stable_diffusion_lib import ( + StableDiffusion, +) +from executorch.examples.qualcomm.utils import ( + setup_common_args_and_variables, + SimpleADB, +) +from executorch.exir.backend.backend_api import to_backend +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass +from PIL import Image +from torchvision.transforms import ToTensor + +target_names = ("text_encoder", "unet", "vae") + + +def get_quant_data( + encoding: dict, data: torch.Tensor, input_model: str, input_index: int +): + scale = encoding[f"{input_model}_input"]["scale"][input_index] + offset = encoding[f"{input_model}_input"]["offset"][input_index] + if offset < 0: + quant_data = data.div(scale).sub(offset).clip(min=0, max=65535).detach() + else: + quant_data = data.div(scale).add(offset).clip(min=0, max=65535).detach() + + return quant_data.to(dtype=torch.uint16) + + +def get_encoding( + path_to_shard: str, + compiler_specs: str, + get_input: bool, + get_output: bool, + num_input: int, + num_output: int, +): + encoding_list = [] + with open(path_to_shard, "rb") as f: + ctx_bin = f.read() + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), ctx_bin + ) + assert qnn_mgr.Init().value == 0, "failed to load context binary" + qnn_mgr.AllocateTensor() + if get_input: + encoding_input = {"scale": [], "offset": []} + for i in range(num_input): + inputs = qnn_mgr.GetGraphInputs()[i] + encoding = inputs.GetEncodings() + encoding_input["scale"].append(encoding.data["scale"].item()) + encoding_input["offset"].append(encoding.data["offset"].item()) + encoding_list.append(encoding_input) + if get_output: + encoding_output = {"scale": [], "offset": []} + for i in range(num_output): + outputs = qnn_mgr.GetGraphOutputs()[i] + encoding = outputs.GetEncodings() + encoding_output["scale"].append(encoding.data["scale"].item()) + encoding_output["offset"].append(encoding.data["offset"].item()) + encoding_list.append(encoding_output) + qnn_mgr.Destroy() + return encoding_list + + +def get_encodings( + path_to_shard_encoder: str, + path_to_shard_unet: str, + path_to_shard_vae: str, + compiler_specs, +): + text_encoder_encoding = get_encoding( + path_to_shard=path_to_shard_encoder, + compiler_specs=compiler_specs, + get_input=False, + get_output=True, + num_input=1, + num_output=1, + ) + unet_encoding = get_encoding( + path_to_shard=path_to_shard_unet, + compiler_specs=compiler_specs, + get_input=True, + get_output=True, + num_input=3, + num_output=1, + ) + vae_encoding = get_encoding( + path_to_shard=path_to_shard_vae, + compiler_specs=compiler_specs, + get_input=True, + get_output=True, + num_input=1, + num_output=1, + ) + + return ( + text_encoder_encoding[0], + unet_encoding[0], + unet_encoding[1], + vae_encoding[0], + vae_encoding[1], + ) + + +def 
get_time_embedding(timestep, time_embedding): + timestep = torch.tensor([timestep]) + t_emb = get_timestep_embedding(timestep, 320, True, 0) + emb = time_embedding(t_emb) + + return emb + + +def build_args_parser(): + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="Path for storing generated artifacts by this example. Default ./stable_diffusion_qai_hub", + default="./stable_diffusion_qai_hub", + type=str, + ) + + parser.add_argument( + "--pte_prefix", + help="Prefix of pte files name. Default qaihub_stable_diffusion", + default="qaihub_stable_diffusion", + type=str, + ) + + parser.add_argument( + "--text_encoder_bin", + type=str, + default=None, + help="[For AI hub ctx binary] Path to Text Encoder.", + required=True, + ) + + parser.add_argument( + "--unet_bin", + type=str, + default=None, + help="[For AI hub ctx binary] Path to UNet.", + required=True, + ) + + parser.add_argument( + "--vae_bin", + type=str, + default=None, + help="[For AI hub ctx binary] Path to Vae Decoder.", + required=True, + ) + + parser.add_argument( + "--prompt", + default="a photo of an astronaut riding a horse on mars", + type=str, + help="Prompt to generate image from.", + ) + + parser.add_argument( + "--num_time_steps", + default=20, + type=int, + help="The number of diffusion time steps.", + ) + + parser.add_argument( + "--guidance_scale", + type=float, + default=7.5, + help="Strength of guidance (higher means more influence from prompt).", + ) + + parser.add_argument( + "--vocab_json", + type=str, + help="Path to tokenizer vocab.json file. Can get vocab.json under https://huggingface.co/openai/clip-vit-base-patch32/tree/main", + required=True, + ) + + parser.add_argument( + "--pre_gen_pte", + help="folder path to pre-compiled ptes", + default=None, + type=str, + ) + + parser.add_argument( + "--fix_latents", + help="Enable this option to fix the latents in the unet diffuse step.", + action="store_true", + ) + + return parser + + +def broadcast_ut_result(output_image, seed): + sd = StableDiffusion(seed) + to_tensor = ToTensor() + target = sd(args.prompt, 512, 512, args.num_time_steps) + target = to_tensor(target).unsqueeze(0) + output_tensor = to_tensor( + Image.fromarray(np.round(output_image[0] * 255).astype(np.uint8)[0]) + ).unsqueeze(0) + + psnr_piq = piq.psnr(target, output_tensor) + ssim_piq = piq.ssim(target, output_tensor) + print(f"PSNR: {round(psnr_piq.item(), 3)}, SSIM: {round(ssim_piq.item(), 3)}") + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"PSNR": psnr_piq.item(), "SSIM": ssim_piq.item()})) + + +def save_result(output_image): + img = Image.fromarray(np.round(output_image[0] * 255).astype(np.uint8)[0]) + save_path = f"{args.artifact}/outputs/output_image.jpg" + img.save(save_path) + print(f"Output image saved at {save_path}") + + +def gen_pte_from_ctx_bin(args, compiler_specs): + # Create custom operators as context loader + bundle_programs = [ + from_context_binary(args.text_encoder_bin, "ctx_loader_0"), + from_context_binary(args.unet_bin, "ctx_loader_1"), + from_context_binary(args.vae_bin, "ctx_loader_2"), + ] + + # Lower with QnnBackend + lowered_modules = [ + to_backend("QnnBackend", prog["edge_program"], compiler_specs) + for prog in bundle_programs + ] + # Setup spill-fill buffer for relieving runtime memory usage + canonicalize_program(lowered_modules) + # export pte files + pte_files = [] + for target_name in target_names: + memory_planning_pass = MemoryPlanningPass( + 
memory_planning_algo="greedy", + alloc_graph_input=False, + alloc_graph_output=False, + ) + pte_files.append(f"{args.artifact}/{args.pte_prefix}_{target_name}.pte") + with open(pte_files[-1], "wb") as file: + file.write( + lowered_modules[0].buffer( + extract_delegate_segments=True, memory_planning=memory_planning_pass + ) + ) + # GC for reducing host memory consuming + bundle_programs.pop(0) + lowered_modules.pop(0) + gc.collect() + + return pte_files + + +def inference(args, compiler_specs, pte_files): + # Loading a pretrained EulerDiscreteScheduler from the https://huggingface.co/stabilityai/stable-diffusion-2-1-base. + scheduler = EulerDiscreteScheduler.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="scheduler", revision="main" + ) + + # Loading a pretrained UNet2DConditionModel (which includes the time embedding) from the https://huggingface.co/stabilityai/stable-diffusion-2-1-base. + time_embedding = UNet2DConditionModel.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="unet", revision="main" + ).time_embedding + + scheduler.set_timesteps(args.num_time_steps) + scheduler.config.prediction_type = "epsilon" + # Get encoding of unet and vae + ( + encoder_output, + unet_input, + unet_output, + vae_input, + vae_output, + ) = get_encodings( + args.text_encoder_bin, + args.unet_bin, + args.vae_bin, + compiler_specs, + ) + encoding = { + "encoder_output": encoder_output, + "unet_input": unet_input, + "unet_output": unet_output, + "vae_input": vae_input, + "vae_output": vae_output, + } + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=args.build_folder, + pte_path=pte_files, + workspace=f"/data/local/tmp/executorch/{args.pte_prefix}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + runner="examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner", + ) + + input_unet = () + input_list_unet = "" + + for i, t in enumerate(scheduler.timesteps): + time_emb = get_quant_data( + encoding, get_time_embedding(t, time_embedding), "unet", 1 + ) + input_list_unet += f"input_{i}_0.raw\n" + input_unet = input_unet + (time_emb,) + + qnn_executor_runner_args = [ + f"--text_encoder_path {adb.workspace}/{args.pte_prefix}_text_encoder.pte", + f"--unet_path {adb.workspace}/{args.pte_prefix}_unet.pte", + f"--vae_path {adb.workspace}/{args.pte_prefix}_vae.pte", + f"--input_list_path {adb.workspace}/input_list.txt", + f"--output_folder_path {adb.output_folder}", + f'--prompt "{args.prompt}"', + f"--guidance_scale {args.guidance_scale}", + f"--num_time_steps {args.num_time_steps}", + f"--vocab_json {adb.workspace}/vocab.json", + ] + if args.fix_latents: + qnn_executor_runner_args.append("--fix_latents") + + text_encoder_output_scale = encoding["encoder_output"]["scale"][0] + text_encoder_output_offset = encoding["encoder_output"]["offset"][0] + unet_input_latent_scale = encoding["unet_input"]["scale"][0] + unet_input_latent_offset = encoding["unet_input"]["offset"][0] + unet_input_text_emb_scale = encoding["unet_input"]["scale"][2] + unet_input_text_emb_offset = encoding["unet_input"]["offset"][2] + unet_output_scale = encoding["unet_output"]["scale"][0] + unet_output_offset = encoding["unet_output"]["offset"][0] + vae_input_scale = encoding["vae_input"]["scale"][0] + vae_input_offset = encoding["vae_input"]["offset"][0] + vae_output_scale = encoding["vae_output"]["scale"][0] + vae_output_offset = encoding["vae_output"]["offset"][0] + + qnn_executor_runner_args = qnn_executor_runner_args + [ + 
f"--text_encoder_output_scale {text_encoder_output_scale}", + f"--text_encoder_output_offset {text_encoder_output_offset}", + f"--unet_input_latent_scale {unet_input_latent_scale}", + f"--unet_input_latent_offset {unet_input_latent_offset}", + f"--unet_input_text_emb_scale {unet_input_text_emb_scale}", + f"--unet_input_text_emb_offset {unet_input_text_emb_offset}", + f"--unet_output_scale {unet_output_scale}", + f"--unet_output_offset {unet_output_offset}", + f"--vae_input_scale {vae_input_scale}", + f"--vae_input_offset {vae_input_offset}", + f"--vae_output_scale {vae_output_scale}", + f"--vae_output_offset {vae_output_offset}", + ] + + qnn_executor_runner_args = " ".join( + [ + f"cd {adb.workspace} &&", + "export ADSP_LIBRARY_PATH=. &&", + "export LD_LIBRARY_PATH=. &&", + f"./qaihub_stable_diffusion_runner {' '.join(qnn_executor_runner_args)}", + ] + ) + + files = [args.vocab_json] + + if args.fix_latents: + seed = 42 + latents = torch.randn((1, 4, 64, 64), generator=torch.manual_seed(seed)).to( + "cpu" + ) + # We need to explicitly permute after init tensor or else the random value will be different + latents = latents.permute(0, 2, 3, 1).contiguous() + latents = latents * scheduler.init_noise_sigma + flattened_tensor = latents.view(-1) + # Save the flattened tensor to a .raw file + with open(os.path.join(args.artifact, "latents.raw"), "wb") as file: + file.write(flattened_tensor.numpy().tobytes()) + files.append(os.path.join(args.artifact, "latents.raw")) + + adb.push(inputs=input_unet, input_list=input_list_unet, files=files) + adb.execute(custom_runner_cmd=qnn_executor_runner_args) + + output_image = [] + + def post_process_vae(): + with open(f"{args.artifact}/outputs/output_0_0.raw", "rb") as f: + output_image.append( + np.fromfile(f, dtype=np.float32).reshape(1, 512, 512, 3) + ) + + adb.pull(output_path=args.artifact, callback=post_process_vae) + + if args.fix_latents: + broadcast_ut_result(output_image, seed) + else: + save_result(output_image) + + +def main(args): + os.makedirs(args.artifact, exist_ok=True) + + # common part for compile & inference + backend_options = generate_htp_compiler_spec( + use_fp16=False, + use_multi_contexts=True, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=getattr(QcomChipset, args.model), + backend_options=backend_options, + is_from_context_binary=True, + ) + + if args.pre_gen_pte is None: + pte_files = gen_pte_from_ctx_bin(args, compiler_specs) + assert ( + len(pte_files) == 3 + ), f"Error: Expected 3 PTE files, but got {len(pte_files)} files." + + else: + pte_files = [ + f"{args.pre_gen_pte}/{args.pte_prefix}_{target_name}.pte" + for target_name in target_names + ] + if args.compile_only: + return + + inference(args, compiler_specs, pte_files) + + +if __name__ == "__main__": # noqa: C901 + parser = build_args_parser() + args = parser.parse_args() + + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp new file mode 100644 index 00000000000..687a260c4a5 --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion_runner.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +DEFINE_string( + text_encoder_path, + "qaihub_stable_diffusion_text_encoder.pte", + "Text Encoder Model serialized in flatbuffer format."); +DEFINE_string( + unet_path, + "qaihub_stable_diffusion_unet.pte", + "Unet Model serialized in flatbuffer format."); +DEFINE_string( + vae_path, + "qaihub_stable_diffusion_vae.pte", + "Vae Model serialized in flatbuffer format."); +DEFINE_string( + output_folder_path, + "outputs", + "Executorch inference data output path."); +DEFINE_string( + input_list_path, + "input_list.txt", + "Input list storing time embedding."); +DEFINE_string( + vocab_json, + "vocab.json", + "Json path to retrieve a list of vocabs."); +DEFINE_string( + prompt, + "a photo of an astronaut riding a horse on mars", + "User input prompt"); +DEFINE_int32(num_time_steps, 20, "Number of time steps."); +DEFINE_double(guidance_scale, 7.5, "Guidance Scale"); + +DEFINE_double(text_encoder_output_scale, 0.0, "Text encoder output scale"); +DEFINE_int32(text_encoder_output_offset, 0, "Text encoder output offset"); +DEFINE_double(unet_input_latent_scale, 0.0, "Unet input latent scale"); +DEFINE_int32(unet_input_latent_offset, 0, "Unet input latent offset"); +DEFINE_double(unet_input_text_emb_scale, 0.0, "Unet input text emb scale"); +DEFINE_int32(unet_input_text_emb_offset, 0, "Unet input text emb offset"); +DEFINE_double(unet_output_scale, 0.0, "Unet output scale"); +DEFINE_int32(unet_output_offset, 0, "Unet output offset"); +DEFINE_double(vae_input_scale, 0.0, "Vae input scale"); +DEFINE_int32(vae_input_offset, 0, "Vae input offset"); +DEFINE_double(vae_output_scale, 0.0, "Vae output scale"); +DEFINE_int32(vae_output_offset, 0, "Vae output offset"); +DEFINE_bool( + fix_latents, + false, + "Enable this option to fix the latents in the unet diffuse step."); + +void usage_message() { + std::string usage_message = + "This is a sample executor runner capable of executing stable diffusion models." + "Users will need binary .pte program files for text_encoder, unet, and vae. 
Below are the options to retrieve required .pte program files:\n" + "For further information on how to generate the .pte program files and example command to execute this runner, please refer to qaihub_stable_diffsion.py."; + gflags::SetUsageMessage(usage_message); +} + +int main(int argc, char** argv) { + using namespace torch::executor; + runtime_init(); + usage_message(); + gflags::ParseCommandLineFlags(&argc, &argv, true); + bool is_default = + gflags::GetCommandLineFlagInfoOrDie("text_encoder_output_scale") + .is_default || + gflags::GetCommandLineFlagInfoOrDie("text_encoder_output_offset") + .is_default || + gflags::GetCommandLineFlagInfoOrDie("unet_input_latent_scale") + .is_default || + gflags::GetCommandLineFlagInfoOrDie("unet_input_latent_offset") + .is_default || + gflags::GetCommandLineFlagInfoOrDie("unet_input_text_emb_scale") + .is_default || + gflags::GetCommandLineFlagInfoOrDie("unet_input_text_emb_offset") + .is_default || + gflags::GetCommandLineFlagInfoOrDie("unet_output_scale").is_default || + gflags::GetCommandLineFlagInfoOrDie("unet_output_offset").is_default || + gflags::GetCommandLineFlagInfoOrDie("vae_input_scale").is_default || + gflags::GetCommandLineFlagInfoOrDie("vae_input_offset").is_default || + gflags::GetCommandLineFlagInfoOrDie("vae_output_scale").is_default || + gflags::GetCommandLineFlagInfoOrDie("vae_output_offset").is_default; + + ET_CHECK_MSG( + !is_default, + "Please provide scale and offset for unet latent input, unet output, and vae input/output." + "Please refer to qaihub_stable_diffusion.py if you are unsure how to retrieve these values."); + + ET_LOG(Info, "Stable Diffusion runner started"); + std::vector models_path = { + FLAGS_text_encoder_path, FLAGS_unet_path, FLAGS_vae_path}; + + // Create stable_diffusion_runner + Runner runner( + models_path, + FLAGS_num_time_steps, + FLAGS_guidance_scale, + FLAGS_text_encoder_output_scale, + FLAGS_text_encoder_output_offset, + FLAGS_unet_input_latent_scale, + FLAGS_unet_input_latent_offset, + FLAGS_unet_input_text_emb_scale, + FLAGS_unet_input_text_emb_offset, + FLAGS_unet_output_scale, + FLAGS_unet_output_offset, + FLAGS_vae_input_scale, + FLAGS_vae_input_offset, + FLAGS_vae_output_scale, + FLAGS_vae_output_offset, + FLAGS_output_folder_path, + FLAGS_fix_latents); + + ET_CHECK_MSG( + runner.init_tokenizer(FLAGS_vocab_json) == Error::Ok, + "Runner failed to init tokenizer"); + + ET_CHECK_MSG(runner.load() == Error::Ok, "Runner failed to load method"); + + ET_CHECK_MSG( + runner.parse_input_list(FLAGS_input_list_path) == Error::Ok, + "Failed to parse time embedding input list"); + ET_CHECK_MSG( + runner.generate(FLAGS_prompt) == Error::Ok, "Runner failed to generate"); + + ET_CHECK_MSG( + runner.print_performance() == Error::Ok, + "Runner failed to print performance"); + + return 0; +} diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp new file mode 100644 index 00000000000..a997397855b --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -0,0 +1,621 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple stable diffusion runner that includes preprocessing and post +// processing logic. The module takes in a string as input and emits a tensor as +// output. 
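+// generate() runs the full pipeline: tokenize the prompt, run the text encoder
+// for the conditional and unconditional embeddings, iterate the UNet for
+// num_time_steps_ denoising steps with classifier-free guidance, then decode
+// the final latent with the VAE and write the image to output_path_.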
+ +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace torch { +namespace executor { + +Runner::Runner( + const std::vector& models_path, + const int num_time_steps, + const float guidance_scale, + const float text_encoder_output_scale, + const int text_encoder_output_offset, + const float unet_input_latent_scale, + const int unet_input_latent_offset, + const float unet_input_text_emb_scale, + const float unet_input_text_emb_offset, + const float unet_output_scale, + const int unet_output_offset, + const float vae_input_scale, + const int vae_input_offset, + const float vae_output_scale, + const int vae_output_offset, + const std::string output_path, + const bool fix_latents) + : num_time_steps_(num_time_steps), + guidance_scale_(guidance_scale), + text_encoder_output_scale_(text_encoder_output_scale), + text_encoder_output_offset_(text_encoder_output_offset), + unet_input_latent_scale_(unet_input_latent_scale), + unet_input_latent_offset_(unet_input_latent_offset), + unet_input_text_emb_scale_(unet_input_text_emb_scale), + unet_input_text_emb_offset_(unet_input_text_emb_offset), + unet_output_scale_(unet_output_scale), + unet_output_offset_(unet_output_offset), + vae_input_scale_(vae_input_scale), + vae_input_offset_(vae_input_offset), + vae_output_scale_(vae_output_scale), + vae_output_offset_(vae_output_offset), + output_path_(output_path), + fix_latents_(fix_latents) { + for (int i = 0; i < models_path.size(); i++) { + modules_.push_back(std::make_unique( + models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); + ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); + } +} + +std::vector> Runner::get_methods_meta() { + std::vector> methods_meta; + for (std::unique_ptr& module : modules_) { + methods_meta.emplace_back(module->method_meta("forward")); + } + return methods_meta; +} + +bool Runner::is_loaded() const { + bool loaded = true; + for (const std::unique_ptr& module : modules_) { + loaded &= module->is_loaded(); + } + return loaded; +} + +Error Runner::load() { + if (is_loaded()) { + return Error::Ok; + } + stats_.model_load_start_ms = util::time_in_ms(); + for (auto& module : modules_) { + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + } + stats_.model_load_end_ms = util::time_in_ms(); + return Error::Ok; +} + +Error Runner::parse_input_list(std::string& path) { + // Fill in data for input + std::ifstream input_list(path); + time_emb_list_.reserve(num_time_steps_); + ET_CHECK_MSG(input_list.is_open(), "Input list error opening file"); + std::string time_emb_file; + for (int i = 0; i < num_time_steps_; i++) { + std::getline(input_list, time_emb_file); + std::ifstream is; + is.open(time_emb_file, std::ios::binary); + is.seekg(0, std::ios::end); + size_t filesize = is.tellg(); + is.seekg(0, std::ios::beg); + std::vector time_emb; + time_emb.resize(filesize / sizeof(uint16_t)); + is.read(reinterpret_cast(time_emb.data()), filesize); + time_emb_list_.push_back(time_emb); + } + return Error::Ok; +} + +Error Runner::init_tokenizer(const std::string& vocab_json_path) { + ET_LOG(Info, "Loading Tokenizer from json"); + stats_.tokenizer_load_start_ms = util::time_in_ms(); + std::ifstream fin(vocab_json_path); + auto update_map = [this](std::string& target, std::regex& re) { + std::smatch sm; + std::regex_search(target, sm, re); + // replace special character, please extend this if any cornor case found + std::string text = sm[1]; + std::unordered_map post_process = { + {"\"", 
std::regex(R"(\\\")")}, + {" ", std::regex(R"()")}, + {"\\", std::regex(R"(\\\\)")}}; + for (auto& p : post_process) { + text = std::regex_replace(text, p.second, p.first); + } + vocab_to_token_map_[text] = std::stoi(sm[2]); + }; + + if (fin.is_open()) { + std::string line, text; + while (getline(fin, line)) { + text += line; + } + fin.close(); + + std::regex re_anchor(R"(\d,\")"); + std::regex re_pattern(R"(\{?\"(.*)\":([\d]+)\}?)"); + auto begin = std::sregex_iterator(text.begin(), text.end(), re_anchor); + auto end = std::sregex_iterator(); + size_t pos = 0; + for (std::sregex_iterator iter = begin; iter != end; ++iter) { + std::smatch match; + size_t len = iter->position() - pos + 1; + std::string target = text.substr(pos, len); + update_map(target, re_pattern); + pos = iter->position() + 1; + } + // process last vocabulary + std::string target = text.substr(pos); + update_map(target, re_pattern); + } + stats_.tokenizer_load_end_ms = util::time_in_ms(); + return Error::Ok; +} + +std::vector Runner::tokenize(std::string prompt) { + std::string bos("<|startoftext|>"), eos("<|endoftext|>"); + std::vector vocabs; + vocabs.reserve(max_tokens_); + std::vector tokens(1, vocab_to_token_map_[bos]); + + // pretokenize + // ref: https://github.com/monatis/clip.cpp + // https://huggingface.co/openai/clip-vit-base-patch32 + std::string text; + std::regex re( + R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"); + std::smatch sm; + while (std::regex_search(prompt, sm, re)) { + for (auto& v : sm) { + vocabs.push_back(v); + } + prompt = sm.suffix(); + } + for (std::string& v : vocabs) { + std::string word = (v[0] == ' ') ? v.substr(1) : v; + word += " "; + auto iter = vocab_to_token_map_.find(word); + if (iter != vocab_to_token_map_.end()) { + tokens.push_back(iter->second); + continue; + } + for (int i = 0; i < v.size(); ++i) { + for (int j = v.size() - 1; j >= i; --j) { + std::string token = v.substr(i, j - 1 + 1); + auto iter = vocab_to_token_map_.find(token); + if (iter != vocab_to_token_map_.end()) { + tokens.push_back(iter->second); + i = j + 1; + break; + } else if (j == i) { + ET_LOG(Error, "unknown token found: %s", token.c_str()); + } + } + } + } + tokens.push_back(vocab_to_token_map_[eos]); + return tokens; +} + +std::vector Runner::gen_latent_from_file() { + std::vector tensor_vector; + std::ifstream file("latents.raw", std::ios::binary); + if (!file.is_open()) { + ET_LOG(Error, "Error opening file!"); + return tensor_vector; + } + + // Read the tensor data + float value; + while (file.read(reinterpret_cast(&value), sizeof(float))) { + tensor_vector.push_back(value); + } + file.close(); + return tensor_vector; +} + +std::vector Runner::gen_random_latent(float sigma) { + std::random_device rnd_device; + std::mt19937 mersenne_engine{rnd_device()}; + std::normal_distribution dist{0.0f, 1.0f}; + + constexpr int latent_size = 1 * 64 * 64 * 4; + std::vector random_vector(latent_size); + + for (float& value : random_vector) { + value = dist(mersenne_engine) * sigma; + } + return random_vector; +} + +std::vector Runner::get_time_steps() { + std::vector time_steps(num_time_steps_); + for (int i = 0; i < num_time_steps_; ++i) { + time_steps[i] = (num_train_timesteps_ - 1) * + (1.0f - static_cast(i) / (num_time_steps_ - 1)); + } + return time_steps; +} + +std::vector Runner::get_sigmas(const std::vector& time_steps) { + float start = std::sqrt(beta_start_); + float end = std::sqrt(beta_end_); + std::vector betas(num_train_timesteps_); + float step = 
(end - start) / (num_train_timesteps_ - 1); + for (int i = 0; i < num_train_timesteps_; ++i) { + float value = start + i * step; + betas[i] = 1 - (value * value); + } + + std::vector alphas_cumprod(num_train_timesteps_); + float cumprod = 1.0; + for (int i = 0; i < num_train_timesteps_; ++i) { + cumprod *= betas[i]; + alphas_cumprod[i] = cumprod; + } + + std::vector sigmas(num_train_timesteps_); + for (int i = 0; i < num_train_timesteps_; ++i) { + sigmas[i] = std::sqrt((1.0 - alphas_cumprod[i]) / alphas_cumprod[i]); + } + + std::vector res(time_steps.size()); + for (size_t i = 0; i < time_steps.size(); ++i) { + float index = + static_cast(i) * (sigmas.size() - 1) / (time_steps.size() - 1); + size_t lower_index = static_cast(std::floor(index)); + size_t upper_index = static_cast(std::ceil(index)); + + float weight = index - lower_index; + res[i] = + (1.0 - weight) * sigmas[lower_index] + weight * sigmas[upper_index]; + } + std::reverse(res.begin(), res.end()); + res.push_back(0); + + return res; +} + +void Runner::scale_model_input( + const std::vector& latents, + std::vector& latent_model_input, + float sigma) { + for (int i = 0; i < latents.size(); i++) { + latent_model_input[i] = (latents[i] / std::sqrt(sigma * sigma + 1)); + } +} + +void Runner::quant_tensor( + const std::vector& fp_vec, + std::vector& quant_vec, + float scale, + int offset) { + offset = abs(offset); + for (int i = 0; i < fp_vec.size(); i++) { + quant_vec[i] = static_cast((fp_vec[i] / scale) + offset); + } +} + +void Runner::dequant_tensor( + const std::vector& quant_vec, + std::vector& fp_vec, + float scale, + int offset) { + offset = abs(offset); + for (int i = 0; i < quant_vec.size(); i++) { + fp_vec[i] = (quant_vec[i] - offset) * scale; + } +} + +// Using the same algorithm as EulerDiscreteScheduler in python. 
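+// For each element: pred_original = sample - sigma * model_output, the Euler
+// derivative is (sample - pred_original) / sigma, and the update is
+// prev_sample = sample + derivative * dt, with dt = sigmas[step_index + 1] - sigma.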
+void Runner::step( + const std::vector& model_output, + const std::vector& sigmas, + std::vector& sample, + std::vector& prev_sample, + int step_index) { + float sigma = sigmas[step_index]; + float dt = sigmas[step_index + 1] - sigma; + + for (int i = 0; i < sample.size(); ++i) { + float sigma_hat = sample[i] - (sigma * model_output[i]); + prev_sample[i] = (sample[i] - sigma_hat) / sigma; + prev_sample[i] = sample[i] + (prev_sample[i] * dt); + } + sample = prev_sample; +} + +Error Runner::generate(std::string prompt) { + ET_LOG(Info, "Start generating"); + stats_.generate_start_ms = util::time_in_ms(); + + // Start tokenize + stats_.tokenizer_parsing_start_ms = util::time_in_ms(); + std::vector cond_tokens = tokenize(prompt); + cond_tokens.resize(max_tokens_); + std::vector uncond_tokens = tokenize(""); + uncond_tokens.resize(max_tokens_); + stats_.tokenizer_parsing_end_ms = util::time_in_ms(); + + std::vector> method_metas = get_methods_meta(); + + MethodMeta encoder_method_meta = method_metas[0].get(); + // Initialize text_encoder input tensors: cond/uncond tokenized_input[1,77] + ManagedTensor managed_cond_tokens( + cond_tokens.data(), + {1, 77}, + encoder_method_meta.input_tensor_meta(0)->scalar_type()); + ManagedTensor managed_uncond_tokens( + uncond_tokens.data(), + {1, 77}, + encoder_method_meta.input_tensor_meta(0)->scalar_type()); + Tensor cond_tokens_tensor = managed_cond_tokens.get_aliasing_tensor(); + Tensor uncond_tokens_tensor = managed_uncond_tokens.get_aliasing_tensor(); + // Initialize text_encoder output tensors: cond/uncond embedding[1, 77, 1024] + constexpr int emb_size = 1 * 77 * 1024; + std::vector cond_emb_vec(emb_size); + std::vector uncond_emb_vec(emb_size); + std::vector fp_emb_vec(emb_size); + ManagedTensor managed_cond_emb( + cond_emb_vec.data(), + {1, 77, 1024}, + encoder_method_meta.output_tensor_meta(0)->scalar_type()); + ManagedTensor managed_uncond_emb( + uncond_emb_vec.data(), + {1, 77, 1024}, + encoder_method_meta.output_tensor_meta(0)->scalar_type()); + Tensor cond_emb_tensor = managed_cond_emb.get_aliasing_tensor(); + Tensor uncond_emb_tensor = managed_uncond_emb.get_aliasing_tensor(); + modules_[0]->set_output_data_ptr(cond_emb_tensor, 0); + long encoder_start = util::time_in_ms(); + auto cond_res = modules_[0]->forward({cond_tokens_tensor}); + stats_.text_encoder_execution_time += (util::time_in_ms() - encoder_start); + modules_[0]->set_output_data_ptr(uncond_emb_tensor, 0); + encoder_start = util::time_in_ms(); + auto uncond_res = modules_[0]->forward({uncond_tokens_tensor}); + stats_.text_encoder_execution_time += (util::time_in_ms() - encoder_start); + + // Initialize unet parameters + MethodMeta unet_method_meta = method_metas[1].get(); + std::vector time_steps = get_time_steps(); + std::vector sigmas = get_sigmas(time_steps); + float max_sigma = *std::max_element(sigmas.begin(), sigmas.end()); + std::vector latent; + if (fix_latents_) { + latent = gen_latent_from_file(); + } else { + latent = gen_random_latent(max_sigma); + } + std::vector prev_sample(latent.size()); + + // Initialize unet input tensors + // 1. latent[1,64,64,4] + // 2. time_embedding[1,1280] + // 3. 
cond/uncond embedding[1,77,1024] + std::vector latent_model_input(latent.size()); + std::vector fp_latent_model_input(latent.size()); + ManagedTensor managed_latent( + latent_model_input.data(), + {1, 64, 64, 4}, + unet_method_meta.input_tensor_meta(0)->scalar_type()); + Tensor latent_tensor = managed_latent.get_aliasing_tensor(); + std::vector managed_time_emb_tensors; + std::vector time_emb_tensors; + managed_time_emb_tensors.reserve(num_time_steps_); + time_emb_tensors.reserve(num_time_steps_); + for (int step_index = 0; step_index < num_time_steps_; step_index++) { + managed_time_emb_tensors.emplace_back(ManagedTensor( + time_emb_list_[step_index].data(), + {1, 1280}, + unet_method_meta.input_tensor_meta(1)->scalar_type())); + time_emb_tensors.emplace_back( + managed_time_emb_tensors.back().get_aliasing_tensor()); + } + // requantize text encoders output + dequant_tensor( + cond_emb_vec, + fp_emb_vec, + text_encoder_output_scale_, + text_encoder_output_offset_); + quant_tensor( + fp_emb_vec, + cond_emb_vec, + unet_input_text_emb_scale_, + unet_input_text_emb_offset_); + dequant_tensor( + uncond_emb_vec, + fp_emb_vec, + text_encoder_output_scale_, + text_encoder_output_offset_); + quant_tensor( + fp_emb_vec, + uncond_emb_vec, + unet_input_text_emb_scale_, + unet_input_text_emb_offset_); + + // Initialize unet output tensors: text/uncond noise_pred[1,64,64,4] + std::vector noise_pred_text(latent.size()); + std::vector noise_pred_uncond(latent.size()); + std::vector fp_noise_pred_text(noise_pred_text.size()); + std::vector fp_noise_pred_uncond(noise_pred_uncond.size()); + ManagedTensor managed_noise_pred_text( + noise_pred_text.data(), + {1, 64, 64, 4}, + unet_method_meta.output_tensor_meta(0)->scalar_type()); + Tensor noise_pred_text_tensor = managed_noise_pred_text.get_aliasing_tensor(); + ManagedTensor managed_noise_pred_uncond( + noise_pred_uncond.data(), + {1, 64, 64, 4}, + unet_method_meta.output_tensor_meta(0)->scalar_type()); + Tensor noise_pred_uncond_tensor = + managed_noise_pred_uncond.get_aliasing_tensor(); + + // Execute unet + for (int step_index = 0; step_index < num_time_steps_; step_index++) { + long start_post_process = util::time_in_ms(); + scale_model_input(latent, fp_latent_model_input, sigmas[step_index]); + + quant_tensor( + fp_latent_model_input, + latent_model_input, + unet_input_latent_scale_, + unet_input_latent_offset_); + + stats_.unet_aggregate_post_processing_time += + (util::time_in_ms() - start_post_process); + modules_[1]->set_output_data_ptr(noise_pred_text_tensor, 0); + long start_unet_execution = util::time_in_ms(); + auto cond_res = modules_[1]->forward( + {latent_tensor, time_emb_tensors[step_index], cond_emb_tensor}); + stats_.unet_aggregate_execution_time += + (util::time_in_ms() - start_unet_execution); + modules_[1]->set_output_data_ptr(noise_pred_uncond_tensor, 0); + start_unet_execution = util::time_in_ms(); + auto uncond_res = modules_[1]->forward( + {latent_tensor, + time_emb_tensors[step_index], + uncond_emb_tensor}); // results in noise_pred_uncond_vec + stats_.unet_aggregate_execution_time += + (util::time_in_ms() - start_unet_execution); + + // start unet post processing + start_post_process = util::time_in_ms(); + + dequant_tensor( + noise_pred_text, + fp_noise_pred_text, + unet_output_scale_, + unet_output_offset_); + dequant_tensor( + noise_pred_uncond, + fp_noise_pred_uncond, + unet_output_scale_, + unet_output_offset_); + + for (int i = 0; i < fp_noise_pred_text.size(); i++) { + fp_noise_pred_text[i] = fp_noise_pred_uncond[i] + + 
          guidance_scale_ * (fp_noise_pred_text[i] - fp_noise_pred_uncond[i]);
+    }
+    step(fp_noise_pred_text, sigmas, latent, prev_sample, step_index);
+    stats_.unet_aggregate_post_processing_time +=
+        (util::time_in_ms() - start_post_process);
+  }
+
+  // Start VAE
+  MethodMeta vae_method_meta = method_metas[2].get();
+  // Initialize vae input tensor: latent[1,64,64,4]
+  std::vector vae_input(latent.size());
+  ManagedTensor managed_vae_input(
+      vae_input.data(),
+      {1, 64, 64, 4},
+      vae_method_meta.input_tensor_meta(0)->scalar_type());
+  Tensor vae_input_tensor = managed_vae_input.get_aliasing_tensor();
+  // Initialize vae output tensor: output[1,512,512,3]
+  constexpr int image_size = 1 * 512 * 512 * 3;
+  std::vector q_out(image_size);
+  std::vector out(image_size);
+  ManagedTensor managed_output(
+      q_out.data(),
+      {1, 512, 512, 3},
+      vae_method_meta.output_tensor_meta(0)->scalar_type());
+  Tensor output_tensor = managed_output.get_aliasing_tensor();
+
+  quant_tensor(latent, vae_input, vae_input_scale_, vae_input_offset_);
+
+  modules_[2]->set_output_data_ptr(output_tensor, 0);
+  long start_vae_execution = util::time_in_ms();
+  auto vae_res = modules_[2]->forward({vae_input_tensor});
+  stats_.vae_execution_time = (util::time_in_ms() - start_vae_execution);
+  stats_.generate_end_ms = util::time_in_ms();
+
+  // Dequant uint16 output to fp32 output
+  dequant_tensor(q_out, out, vae_output_scale_, vae_output_offset_);
+
+  // Saving outputs
+  auto output_file_name = output_path_ + "/output_0_0.raw";
+  std::ofstream fout(output_file_name.c_str(), std::ios::binary);
+  fout.write(
+      reinterpret_cast(out.data()), out.size() * sizeof(float));
+  fout.close();
+
+  return Error::Ok;
+}
+
+Error Runner::print_performance() {
+  ET_LOG(Info, "\tTotal Number of steps:\t\t\t\t%d", num_time_steps_);
+
+  ET_LOG(
+      Info,
+      "\tTokenizer Load Time:\t\t\t\t%f (seconds)",
+      ((double)(stats_.tokenizer_load_end_ms - stats_.tokenizer_load_start_ms) /
+       stats_.SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tModel Load Time:\t\t\t\t%f (seconds)",
+      ((double)(stats_.model_load_end_ms - stats_.model_load_start_ms) /
+       stats_.SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tGenerate Time(Tokenize + Encoder + UNet + VAE):\t%f (seconds)",
+      ((double)(stats_.generate_end_ms - stats_.generate_start_ms) /
+       stats_.SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tTokenize Time:\t\t\t\t\t%f (seconds)",
+      ((double)(stats_.tokenizer_parsing_end_ms -
+                stats_.tokenizer_parsing_start_ms) /
+       stats_.SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tText Encoder Execution Time:\t\t\t%f (seconds)",
+      ((double)(stats_.text_encoder_execution_time) /
+       stats_.SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tUnet Aggregate (Cond + Uncond) Execution Time:\t%f (seconds)",
+      ((double)stats_.unet_aggregate_execution_time /
+       (stats_.SCALING_FACTOR_UNITS_PER_SECOND)));
+
+  ET_LOG(
+      Info,
+      "\tUnet Average Execution Time:\t\t\t%f (seconds)",
+      ((double)(stats_.unet_aggregate_execution_time / (num_time_steps_ * 2)) /
+       (stats_.SCALING_FACTOR_UNITS_PER_SECOND)));
+
+  ET_LOG(
+      Info,
+      "\tUnet Aggregate Post-Processing Time:\t\t%f (seconds)",
+      ((double)(stats_.unet_aggregate_post_processing_time) /
+       stats_.SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tUnet Average Post-Processing Time:\t\t%f (seconds)",
+      ((double)(stats_.unet_aggregate_post_processing_time /
+                (num_time_steps_ * 2)) /
+       (stats_.SCALING_FACTOR_UNITS_PER_SECOND)));
+
+  ET_LOG(
+      Info,
+      "\tVAE Execution Time:\t\t\t\t%f (seconds)",
+      ((double)(stats_.vae_execution_time) /
+       stats_.SCALING_FACTOR_UNITS_PER_SECOND));
+  return Error::Ok;
+}
+
+} // namespace executor
+} // namespace torch
diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h
new file mode 100644
index 00000000000..e081ab80ccc
--- /dev/null
+++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A simple diffusion runner that includes preprocessing and post processing
+// logic. The module takes in a string as input and emits a tensor as output.
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace torch {
+namespace executor {
+
+class Runner {
+ public:
+  explicit Runner(
+      const std::vector& models_path,
+      const int num_time_steps,
+      const float guidance_scale,
+      const float text_encoder_output_scale,
+      const int text_encoder_output_offset,
+      const float unet_input_latent_scale,
+      const int unet_input_latent_offset,
+      const float unet_input_text_emb_scale,
+      const float unet_input_text_emb_offset,
+      const float unet_output_scale,
+      const int unet_output_offset,
+      const float vae_input_scale,
+      const int vae_input_offset,
+      const float vae_output_scale,
+      const int vae_output_offset,
+      const std::string output_path,
+      const bool fix_latents);
+
+  struct Stats {
+    // Scaling factor for timestamps - in this case, we use ms.
+    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
+    // Time stamps for the different stages of the execution
+    // model_load_start_ms: Model loading time
+    long model_load_start_ms;
+    long model_load_end_ms;
+
+    // tokenizer loading time
+    long tokenizer_load_start_ms = 0;
+    long tokenizer_load_end_ms = 0;
+
+    // tokenizer parsing time
+    long tokenizer_parsing_start_ms = 0;
+    long tokenizer_parsing_end_ms = 0;
+
+    // Total time to run generate
+    long generate_start_ms = 0;
+    long generate_end_ms = 0;
+
+    // text encoder execution time
+    long text_encoder_execution_time = 0;
+
+    // Unet aggregation execution time over n steps for cond + uncond
+    long unet_aggregate_execution_time = 0;
+
+    // UNet aggregation post processing time over n steps for cond + uncond.
+    // This is the time from processing unet's output until feeding it into the
+    // next iteration.
+ long unet_aggregate_post_processing_time = 0; + + // VAE execution time + long vae_execution_time = 0; + }; + + bool is_loaded() const; + Error load(); + Error init_tokenizer(const std::string& vocab_json_path); + Error print_performance(); + std::vector tokenize(std::string prompt); + std::vector gen_latent_from_file(); + std::vector gen_random_latent(float sigma); + void step( + const std::vector& model_output, + const std::vector& sigmas, + std::vector& sample, + std::vector& prev_sample, + int step_index); + std::vector> get_methods_meta(); + std::vector get_time_steps(); + std::vector get_sigmas(const std::vector& time_steps); + void scale_model_input( + const std::vector& vec, + std::vector& latent_model_input, + float sigma); + Error parse_input_list(std::string& path); + Error generate(std::string prompt); + void quant_tensor( + const std::vector& fp_vec, + std::vector& quant_vec, + float scale, + int offset); + void dequant_tensor( + const std::vector& quant_vec, + std::vector& fp_vec, + float scale, + int offset); + + private: + Stats stats_; + std::vector> modules_; + std::vector> time_emb_list_; + std::unordered_map vocab_to_token_map_; + + std::string output_path_; + int num_time_steps_; + float guidance_scale_; + float text_encoder_output_scale_; + int text_encoder_output_offset_; + float unet_input_latent_scale_; + int unet_input_latent_offset_; + float unet_input_text_emb_scale_; + int unet_input_text_emb_offset_; + float unet_output_scale_; + int unet_output_offset_; + float vae_input_scale_; + int vae_input_offset_; + float vae_output_scale_; + int vae_output_offset_; + const float beta_start_ = 0.00085; + const float beta_end_ = 0.012; + const int num_train_timesteps_ = 1000; + const int max_tokens_ = 77; + const bool fix_latents_ = false; +}; + +} // namespace executor +} // namespace torch diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/stable_diffusion_lib.py b/examples/qualcomm/qaihub_scripts/stable_diffusion/stable_diffusion_lib.py new file mode 100644 index 00000000000..8ec5783131d --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/stable_diffusion_lib.py @@ -0,0 +1,22 @@ +import torch +from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline + + +class StableDiffusion: + def __init__(self, seed=42): + self.model_id: str = "stabilityai/stable-diffusion-2-1-base" + self.generator = torch.manual_seed(seed) + self.scheduler = EulerDiscreteScheduler.from_pretrained( + self.model_id, subfolder="scheduler" + ) + + self.pipe = StableDiffusionPipeline.from_pretrained( + self.model_id, scheduler=self.scheduler, torch_dtype=torch.float32 + ) + self.pipe = self.pipe.to("cpu") + + def __call__(self, prompt, height, width, num_time_steps): + image = self.pipe( + prompt, height, width, num_time_steps, generator=self.generator + ).images[0] + return image From 8471c22fa3683d28b6677d81f1926e99d483429e Mon Sep 17 00:00:00 2001 From: meta-emilian <162623112+meta-emilian@users.noreply.github.com> Date: Sat, 24 Aug 2024 02:34:34 -0700 Subject: [PATCH 038/531] Enable MKL on x86 to get around long-context discrepancies with torch.nn.functional.scaled_dot_product_attention Differential Revision: D61290864 Pull Request resolved: https://github.com/pytorch/executorch/pull/4758 --- extension/llm/custom_ops/targets.bzl | 81 +++++++++++++------------ kernels/optimized/lib_defs.bzl | 91 ++++++++++++++++------------ 2 files changed, 92 insertions(+), 80 deletions(-) diff --git a/extension/llm/custom_ops/targets.bzl 
b/extension/llm/custom_ops/targets.bzl index fe93f6a422d..0976202d452 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -6,47 +6,48 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ - runtime.cxx_library( - name = "custom_ops", - srcs = ["op_sdpa.cpp"], - exported_headers = ["op_sdpa.h"], - exported_deps = [ - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/kernels/optimized:libblas", - "//executorch/kernels/optimized:libvec", - "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", - "//executorch/backends/xnnpack/threadpool:threadpool", - ], - compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], - visibility = [ - "//executorch/...", - "//executorch/extension/llm/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - # @lint-ignore BUCKLINT link_whole - link_whole = True, - force_static = True, - ) + for mkl_dep in ["", "_mkl_lp64_omp"]: + runtime.cxx_library( + name = "custom_ops" + mkl_dep, + srcs = ["op_sdpa.cpp"], + exported_headers = ["op_sdpa.h"], + exported_deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/kernels/optimized:libblas{}".format(mkl_dep), + "//executorch/kernels/optimized:libvec", + "//executorch/extension/kernel_util:kernel_util", + "//executorch/extension/parallel:thread_parallel", + "//executorch/backends/xnnpack/threadpool:threadpool", + ], + compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], + visibility = [ + "//executorch/...", + "//executorch/extension/llm/custom_ops/...", + "@EXECUTORCH_CLIENTS", + ], + # @lint-ignore BUCKLINT link_whole + link_whole = True, + force_static = True, + ) - runtime.cxx_library( - name = "custom_ops_aot_lib", - srcs = [ - "op_sdpa_aot.cpp", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - external_deps = [ - "libtorch", - ], - deps = [ - ":custom_ops", - "//executorch/extension/aten_util:aten_bridge", - ], - ) + runtime.cxx_library( + name = "custom_ops_aot_lib" + mkl_dep, + srcs = [ + "op_sdpa_aot.cpp", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + external_deps = [ + "libtorch", + ], + deps = [ + ":custom_ops" + mkl_dep, + "//executorch/extension/aten_util:aten_bridge", + ], + ) runtime.python_library( name = "custom_ops_aot_py", diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 5af9b423ad0..045e8684217 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -99,44 +99,55 @@ def define_libs(): ], ) - runtime.cxx_library( - name = "libblas", - srcs = native.glob([ - "blas/**/*.cpp", - ]), - exported_headers = native.glob([ - "blas/**/*.h", - ]), - header_namespace = "executorch/kernels/optimized", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - fbandroid_platform_preprocessor_flags = [ - ( - "^android-arm64.*$", - [ + for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl"), ("libblas_mkl_lp64_omp", "fbsource//third-party/mkl:mkl_lp64_omp")]: + runtime.cxx_library( + name = libblas_name, + srcs = native.glob([ + "blas/**/*.cpp", + ]), + exported_headers = native.glob([ + "blas/**/*.h", + ]), + header_namespace = "executorch/kernels/optimized", + visibility = [ + "//executorch/...", + 
"@EXECUTORCH_CLIENTS", + ], + preprocessor_flags = select({ + "DEFAULT": [], + "ovr_config//os:linux-x86_64": [ "-DET_BUILD_WITH_BLAS", - ], - ), - ], - fbandroid_platform_deps = [ - ( - "^android-arm64.*$", - [ - "fbsource//third-party/openblas:openblas", - ], - ), - ], - fbobjc_exported_preprocessor_flags = [ - "-DET_BUILD_WITH_BLAS", - "-DET_BUILD_FOR_APPLE", - ], - fbobjc_frameworks = [ - "Accelerate", - ], - exported_deps = [ - "//executorch/kernels/optimized:libutils", - "//executorch/runtime/core/exec_aten:lib", - ], - ) + ] if not runtime.is_oss else [], + }), + fbandroid_platform_preprocessor_flags = [ + ( + "^android-arm64.*$", + [ + "-DET_BUILD_WITH_BLAS", + ], + ), + ], + fbandroid_platform_deps = [ + ( + "^android-arm64.*$", + [ + "fbsource//third-party/openblas:openblas", + ], + ), + ], + fbobjc_exported_preprocessor_flags = [ + "-DET_BUILD_WITH_BLAS", + "-DET_BUILD_FOR_APPLE", + ], + fbobjc_frameworks = [ + "Accelerate", + ], + deps = select({ + "DEFAULT": [], + "ovr_config//os:linux-x86_64": [mkl_dep] if not runtime.is_oss else [], + }), + exported_deps = [ + "//executorch/kernels/optimized:libutils", + "//executorch/runtime/core/exec_aten:lib", + ], + ) From ebe9075ad572a546816c1913e6b49069be921a65 Mon Sep 17 00:00:00 2001 From: Aaron Orenstein Date: Sun, 25 Aug 2024 23:57:33 -0400 Subject: [PATCH 039/531] typing for decorators - fx/_compatibility Differential Revision: D61493706 Pull Request resolved: https://github.com/pytorch/executorch/pull/4810 --- backends/cadence/aot/compiler.py | 6 +++--- backends/transforms/addmm_mm_to_linear.py | 4 ++-- backends/transforms/decompose_sdpa.py | 2 +- .../passes/channels_last_tagged_reshape_pass.py | 2 +- backends/xnnpack/passes/convert_to_sdpa.py | 2 +- .../llama2/source_transformation/quantize.py | 4 ++-- examples/models/phi-3-mini/export_phi-3-mini.py | 2 +- exir/emit/_emitter.py | 14 +++++++------- exir/lowered_backend_module.py | 4 ++-- exir/pass_base.py | 8 ++++---- exir/passes/__init__.py | 1 + exir/passes/remove_noop_pass.py | 2 +- exir/tests/test_passes.py | 2 +- exir/tests/test_quantization.py | 2 +- exir/tracer.py | 6 +++--- exir/verification/arg_validator.py | 6 +++--- extension/llm/export/builder.py | 4 +++- extension/llm/export/partitioner_lib.py | 15 ++++++++------- extension/llm/export/quantizer_lib.py | 8 ++++---- 19 files changed, 49 insertions(+), 45 deletions(-) diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 405f8b5db4e..e1494f8d20d 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -60,13 +60,13 @@ def convert_pt2( # Export with dynamo model_gm = capture_pre_autograd_graph(model, inputs) - if model_gm_has_SDPA(model_gm): + if model_gm_has_SDPA(model_gm): # pyre-fixme[6] # Decompose SDPA - DecomposeScaledDotProductAttention(False)(model_gm) + DecomposeScaledDotProductAttention(False)(model_gm) # pyre-fixme[6] # Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882 # for details). 
- result = ReplaceSafeSoftmaxWithSoftmax()(model_gm) + result = ReplaceSafeSoftmaxWithSoftmax()(model_gm) # pyre-fixme[6] assert result is not None model_gm = result.graph_module diff --git a/backends/transforms/addmm_mm_to_linear.py b/backends/transforms/addmm_mm_to_linear.py index 7855de617b7..358cbb7ac14 100644 --- a/backends/transforms/addmm_mm_to_linear.py +++ b/backends/transforms/addmm_mm_to_linear.py @@ -130,7 +130,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"], args[2].meta["val"] ) else: @@ -147,7 +147,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"] ) linear_node.meta = node.meta diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index 6dbbf564f56..329dab96df2 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -34,7 +34,7 @@ def call( # refer to pytorch/test/test_decomp.py decomposed_module = make_fx( node.target, - decomposition_table=get_decompositions( + decomposition_table=get_decompositions( # pyre-fixme[6] [ torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default, ] diff --git a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py index f1f9a69acca..692f1a9d145 100644 --- a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py @@ -124,7 +124,7 @@ def create_call_function_node( "call_function", target=target, args=args, - kwargs=( + kwargs=( # pyre-fixme[6] {"memory_format": memory_format} if memory_format is not None else {} ), ) diff --git a/backends/xnnpack/passes/convert_to_sdpa.py b/backends/xnnpack/passes/convert_to_sdpa.py index 76bb24cc949..97aca5491dd 100644 --- a/backends/xnnpack/passes/convert_to_sdpa.py +++ b/backends/xnnpack/passes/convert_to_sdpa.py @@ -83,7 +83,7 @@ def create_sdpa( kwargs={"scale": scale}, ) - sdpa_node.meta["val"] = sdpa_node.target( + sdpa_node.meta["val"] = sdpa_node.target( # pyre-fixme[29] *[n.meta["val"] for n in match.placeholder_nodes], scale=scale, ) diff --git a/examples/models/llama2/source_transformation/quantize.py b/examples/models/llama2/source_transformation/quantize.py index bb014145bd8..4f3eaf1125b 100644 --- a/examples/models/llama2/source_transformation/quantize.py +++ b/examples/models/llama2/source_transformation/quantize.py @@ -96,7 +96,7 @@ def quantize( try: # torchao 0.3+ - from torchao._eval import InputRecorder + from torchao._eval import InputRecorder # pyre-fixme[21] except ImportError: from torchao.quantization.GPTQ import InputRecorder # pyre-ignore @@ -110,7 +110,7 @@ def quantize( ) inputs = ( - InputRecorder( + InputRecorder( # pyre-fixme[16] tokenizer, calibration_seq_length, None, # input_prep_func diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py index ab5e04c3073..553fded67fb 100644 --- a/examples/models/phi-3-mini/export_phi-3-mini.py +++ b/examples/models/phi-3-mini/export_phi-3-mini.py @@ -67,7 +67,7 @@ def 
export(args) -> None: model = capture_pre_autograd_graph( model, example_inputs, dynamic_shapes=dynamic_shapes ) - model = prepare_pt2e(model, xnnpack_quantizer) + model = prepare_pt2e(model, xnnpack_quantizer) # pyre-fixme[6] model(*example_inputs) model = convert_pt2e(model, fold_quantize=False) DuplicateDynamicQuantChainPass()(model) diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index f51b4113c8c..2d2cc0f3f18 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -1270,7 +1270,7 @@ def _emit_prim_getters(self, prim_getters: Dict[str, Any]) -> List[ExecutionPlan def fetch_attr(self, target: _Target) -> _AbstractValue: """Fetch weights and other module parameters. If the attribute is a tensor, emit it.""" - attr = super().fetch_attr(target) + attr = super().fetch_attr(target) # pyre-fixme[6] if isinstance(attr, torch.Tensor): return self._emit_evalue( @@ -1286,7 +1286,7 @@ def fetch_attr(self, target: _Target) -> _AbstractValue: else: return attr - def call_module( + def call_module( # pyre-fixme[14] self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> None: """Unsupported in execution IR, so unhandled by the emitter.""" @@ -1294,7 +1294,7 @@ def call_module( self._emit_node_specific_error(self.node, "call_module is not supported") ) - def call_method( + def call_method( # pyre-fixme[14] self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> _EmitterValue: """Unsupported in execution IR, so unhandled by the emitter.""" @@ -1302,7 +1302,7 @@ def call_method( self._emit_node_specific_error(self.node, "call_method is not supported") ) - def placeholder( + def placeholder( # pyre-fixme[14] self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> _AbstractValue: """Performs actions for the placeholder node of a graph module. @@ -1324,7 +1324,7 @@ def placeholder( self.placeholder_count += 1 return value - def output( + def output( # pyre-fixme[14] self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> None: """Performs actions for the output node of a graph module. @@ -1354,7 +1354,7 @@ def output( ) self.chain.instructions.append(instruction) - def call_function( + def call_function( # pyre-fixme[14] self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> _EmitterValue: """Performs actions for the call_function node of a graph module. @@ -1412,7 +1412,7 @@ def call_function( ) ) - def run( + def run( # pyre-fixme[14] self, *args: _Argument, initial_env: Optional[Dict[torch.fx.Node, _Argument]] = None, diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 2c2cd8eb0dd..4d07fdcdf06 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -139,7 +139,7 @@ def buffer( segment_alignment: int = 4096, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, - memory_planning: MemoryPlanningPass = None, + memory_planning: MemoryPlanningPass = None, # pyre-fixme[9] ) -> bytes: """ Returns a buffer containing the serialized ExecuTorch binary. 
@@ -161,7 +161,7 @@ def buffer( def program( self, emit_stacktrace: bool = False, - memory_planning: MemoryPlanningPass = None, + memory_planning: MemoryPlanningPass = None, # pyre-fixme[9] ) -> Program: # Fix autodpes introuces cyclic dependencies: # program -> verifier -> lowered_backend_module -> program diff --git a/exir/pass_base.py b/exir/pass_base.py index dd55641f257..3b1a2928e2c 100644 --- a/exir/pass_base.py +++ b/exir/pass_base.py @@ -177,7 +177,7 @@ def __init__(self, callback: "_ExportPassBase", codegen: CodeGen) -> None: self.fake_tensor_mode: Optional[FakeTensorMode] = None self.submodules: Dict[torch.nn.Module, str] = {} - def trace(self) -> None: + def trace(self) -> None: # pyre-fixme[14,15] raise ExportPassBaseError("ExportTracer doesn't support trace().") def create_arg(self, a: Argument) -> torch.fx.Node: @@ -290,7 +290,7 @@ def __init__(self, callback: "_ExportPassBase", gm: fx.GraphModule) -> None: self.callback = callback self.node: torch.fx.Node = next(iter(gm.graph.nodes)) - def placeholder( + def placeholder( # pyre-fixme[14] self, target: str, args: Tuple[Argument, ...], @@ -351,7 +351,7 @@ def call_function( else: raise ExportPassBaseError(f"Unsupported target type: {target}") - def get_attr( + def get_attr( # pyre-fixme[14] self, target: str, args: Tuple[Argument, ...], kwargs: Dict[str, Argument] ) -> Argument: return super().get_attr(target, args, kwargs) @@ -364,7 +364,7 @@ def call_module( ) -> None: raise ExportPassBaseError("call_module is not supported.") - def call_method( + def call_method( # pyre-fixme[14] self, target: str, args: Tuple[Argument, ...], kwargs: Dict[str, Argument] ) -> None: raise ExportPassBaseError("call_method is not supported.") diff --git a/exir/passes/__init__.py b/exir/passes/__init__.py index 99507ccdc9d..7a0623040f8 100644 --- a/exir/passes/__init__.py +++ b/exir/passes/__init__.py @@ -302,6 +302,7 @@ def make_alloc_node( "Memory allocator node needs FakeTensor val or TensorMetadata to proceed" ) + # pyre-fixme[6] alloc = graph_module.graph.call_function(memory.alloc, (alloc_spec,)) alloc.meta["val"] = val alloc.meta["tensor_meta"] = tensor_meta diff --git a/exir/passes/remove_noop_pass.py b/exir/passes/remove_noop_pass.py index c834ca92947..d9b99556636 100644 --- a/exir/passes/remove_noop_pass.py +++ b/exir/passes/remove_noop_pass.py @@ -40,7 +40,7 @@ def eliminate_dq_q( qparams_q = list(user.args)[1:] if qparams_dq != qparams_q: continue - user.replace_all_uses_with(node.args[0]) + user.replace_all_uses_with(node.args[0]) # pyre-fixme[6] class RemoveNoopPass(ExportPass): diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index 99ec6481458..a167a67dd94 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -1421,7 +1421,7 @@ def quantize_model( quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config() quantizer.set_global(quantization_config) - m = prepare_pt2e(m, quantizer) + m = prepare_pt2e(m, quantizer) # pyre-fixme[6] m = convert_pt2e(m, fold_quantize=True) ep = torch.export.export(m, example_inputs) dq_nodes_pre = count_dq_nodes(ep.graph_module) diff --git a/exir/tests/test_quantization.py b/exir/tests/test_quantization.py index ca85386db64..ebe94775221 100644 --- a/exir/tests/test_quantization.py +++ b/exir/tests/test_quantization.py @@ -58,7 +58,7 @@ def test_resnet(self) -> None: quantizer = XNNPACKQuantizer() operator_config = get_symmetric_quantization_config(is_per_channel=True) quantizer.set_global(operator_config) - m = prepare_pt2e(m, quantizer) 
+ m = prepare_pt2e(m, quantizer) # pyre-fixme[6] self.assertEqual( id(m.activation_post_process_3), id(m.activation_post_process_2) ) diff --git a/exir/tracer.py b/exir/tracer.py index 1a8709a2372..c4593cca8e3 100644 --- a/exir/tracer.py +++ b/exir/tracer.py @@ -272,7 +272,7 @@ def __torch_function__( kwargs = {} if torch.is_inference_mode_enabled(): if func is torch.nn.functional.layer_norm: - args, kwargs = normalize_function(func, args, kwargs) + args, kwargs = normalize_function(func, args, kwargs) # pyre-fixme[23] input, normalized_shape = args normalized_shape = list(normalized_shape) return cls.__torch_dispatch__( @@ -470,13 +470,13 @@ def create_arg(self, a: Value) -> torch.fx.Node: # noqa: C901 self.submodules[a] = name_submodule return self.create_node("get_attr", self.submodules[a], (), {}) - return super().create_arg(a) + return super().create_arg(a) # pyre-fixme[7] @staticmethod def get() -> "DispatchTracer": return TRACER - def trace( + def trace( # pyre-fixme[14,15] self, root: Callable[..., Value], concrete_args: Tuple[Value, ...] = (), diff --git a/exir/verification/arg_validator.py b/exir/verification/arg_validator.py index 65ab146782c..c087944b12d 100644 --- a/exir/verification/arg_validator.py +++ b/exir/verification/arg_validator.py @@ -62,7 +62,7 @@ def _get_kernel_arg(self, schema_arg, schema_arg_idx, args, kwargs): return kernel_arg - def call_function( # noqa: C901 + def call_function( # noqa: C901 # pyre-fixme[14] self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> Any: """ @@ -73,7 +73,7 @@ def call_function( # noqa: C901 ): if isinstance(target, HigherOrderOperator): raise RunHigherOrderOperatorError("Can't run delegate") - return super().call_function(target, args, kwargs) + return super().call_function(target, args, kwargs) # pyre-fixme[6] # TODO(gasoonjia): Update Optional[torch.dtype] to a concrete class to support mixed dtypes in tensorlist. tensor_arg_types: Dict[str, Optional[torch.dtype]] = {} @@ -126,4 +126,4 @@ def call_function( # noqa: C901 valid = target._schema.dtype_constraint.validate(tensor_arg_types) if not valid: self.violating_ops[target] = tensor_arg_types - return super().call_function(target, args, kwargs) + return super().call_function(target, args, kwargs) # pyre-fixme[6] diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 28afef20d04..eccb3317e7f 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -161,6 +161,7 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + # pyre-fixme[8] self.pre_autograd_graph_module = capture_pre_autograd_graph( self.model, self.example_inputs, dynamic_shapes=dynamic_shape ) @@ -209,11 +210,12 @@ def export_to_edge(self) -> "LLMEdgeManager": # 2. 
torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): if self.pre_autograd_graph_module is None: + # pyre-fixme[8] self.pre_autograd_graph_module = capture_pre_autograd_graph( self.model, self.example_inputs, dynamic_shapes=dynamic_shape ) self.edge_manager = export_to_edge( - self.pre_autograd_graph_module, + self.pre_autograd_graph_module, # pyre-fixme[6] self.example_inputs, dynamic_shapes=dynamic_shape, edge_constant_methods=self.metadata, diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 501ef6fa6bb..ab98f2543f7 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -52,7 +52,7 @@ def get_mps_partitioner(use_kv_cache: bool = False): ) compile_specs = [CompileSpec("use_fp16", bytes([True]))] - return MPSPartitioner(compile_specs) + return MPSPartitioner(compile_specs) # pyre-fixme[16] def get_coreml_partitioner( @@ -92,14 +92,14 @@ def get_coreml_partitioner( # if use_kv_cache: # minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) - compile_specs = CoreMLBackend.generate_compile_specs( + compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] minimum_deployment_target=minimum_deployment_target, compute_precision=ct.precision(ct.precision.FLOAT16.value), # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU` compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()], - model_type=CoreMLBackend.MODEL_TYPE.MODEL, + model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16] ) - return CoreMLPartitioner( + return CoreMLPartitioner( # pyre-fixme[16] compile_specs=compile_specs, ) @@ -136,9 +136,10 @@ def get_qnn_partitioner( if pt2e_quantize is not None: use_fp16 = False - return QnnPartitioner( - generate_qnn_executorch_compiler_spec( - soc_model=QcomChipset.SM8650, # default to SM8650 + return QnnPartitioner( # pyre-fixme[16] + generate_qnn_executorch_compiler_spec( # pyre-fixme[16] + soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] + # pyre-fixme[16] backend_options=generate_htp_compiler_spec(use_fp16=use_fp16), debug=False, saver=False, diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 8514e5d2558..36d2f630b03 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -146,7 +146,7 @@ def get_qnn_quantizer( quantization_mode: Optional[str] = None, ): try: - from executorch.backends.qualcomm.quantizer.custom_annotation import ( + from executorch.backends.qualcomm.quantizer.custom_annotation import ( # pyre-fixme[21] custom_annotate_llama_matmul_16a8w, ) @@ -168,15 +168,15 @@ def get_qnn_quantizer( assert ( backend == "qnn" ), f"The quantization config is for backend {backend} instead of qnn." - qnn_quantizer = QnnQuantizer() + qnn_quantizer = QnnQuantizer() # pyre-fixme[16] qnn_quantizer.set_per_channel_conv_quant(enable=True) qnn_quantizer.set_per_channel_linear_quant(enable=True) # more custom quantization are supported including 16a4w etc. 
default to 8bit quantized custom_annotations = () if quant_config == "8a8w": - quant_dtype = QuantDtype.use_8a8w + quant_dtype = QuantDtype.use_8a8w # pyre-fixme[16] elif quant_config == "16a16w": - quant_dtype = QuantDtype.use_16a16w + quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS) qnn_quantizer.set_bit16_op_quant_config( # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. From 316bd15ebcf78025243ecafcd106f17a08bd7104 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 26 Aug 2024 17:19:01 +0200 Subject: [PATCH 040/531] Add cat op to Arm backend Differential Revision: D61542537 Pull Request resolved: https://github.com/pytorch/executorch/pull/4787 --- backends/arm/arm_partitioner.py | 1 + backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_cat.py | 45 ++++++ backends/arm/quantizer/arm_quantizer.py | 1 + .../quantization_annotation/__init__.py | 1 + .../quantization_annotation/cat_annotator.py | 66 +++++++++ backends/arm/test/ops/test_cat.py | 131 ++++++++++++++++++ backends/arm/test/tester/arm_tester.py | 16 ++- 8 files changed, 255 insertions(+), 7 deletions(-) create mode 100644 backends/arm/operators/op_cat.py create mode 100644 backends/arm/quantizer/quantization_annotation/cat_annotator.py create mode 100644 backends/arm/test/ops/test_cat.py diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index f73d97480bc..d16a2699c01 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -39,6 +39,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.expand_copy.default, + exir_ops.edge.aten.cat.default, exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.convolution.default, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 94a16d8c941..176931c3607 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -9,6 +9,7 @@ op_addmm, op_avg_pool2d, op_batch_norm, + op_cat, op_conv2d, op_dequant, op_div, diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py new file mode 100644 index 00000000000..f2b41656572 --- /dev/null +++ b/backends/arm/operators/op_cat.py @@ -0,0 +1,45 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class CatVisitor(NodeVisitor): + target = "aten.cat.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + tensors = inputs[0].special + dim = 0 if len(inputs) < 2 else inputs[1].number + rank = len(output.shape) + dim = (dim + rank) % rank + dim = output.dim_order.index(dim) + + attr = ts.TosaSerializerAttribute() + attr.AxisAttribute(dim) + + tosa_graph.addOperator( + TosaOp.Op().CONCAT, [tensor.name for tensor in tensors], [output.name], attr + ) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 8d5edf386a0..26920383524 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -267,6 +267,7 @@ class ArmQuantizer(Quantizer): "mul", "sigmoid", "mm", + "cat", ] def __init__(self) -> None: diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py index 60808d2f234..68ad522feeb 100644 --- a/backends/arm/quantizer/quantization_annotation/__init__.py +++ b/backends/arm/quantizer/quantization_annotation/__init__.py @@ -49,6 +49,7 @@ def decorator(annotator: AnnotatorType): from . import ( # noqa adaptive_ang_pool2d_annotator, add_annotator, + cat_annotator, conv_annotator, linear_annotator, max_pool2d_annotator, diff --git a/backends/arm/quantizer/quantization_annotation/cat_annotator.py b/backends/arm/quantizer/quantization_annotation/cat_annotator.py new file mode 100644 index 00000000000..40dd19526b3 --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/cat_annotator.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import itertools +from typing import Callable, List, Optional + +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer import ( + QuantizationAnnotation, + SharedQuantizationSpec, +) +from torch.fx import Node +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +@register_annotator("cat") +def _annotate_cat( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn) + cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values())) + annotated_partitions = [] + for cat_partition in cat_partitions: + annotated_partitions.append(cat_partition.nodes) + cat_node = cat_partition.output_nodes[0] + if arm_quantizer_utils.is_annotated(cat_node): + continue + + input_acts = cat_node.args[0] + input_act0 = input_acts[0] + + input_act_qspec = quantization_config.get_input_act_qspec() + shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node)) + + input_qspec_map = {} + + # First input is set to input qspec from the quantization config. + if isinstance(input_act0, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act0, gm): + continue + input_qspec_map[input_act0] = input_act_qspec + + # For the rest of the inputs, share qspec with first. + # If we can't quantize any of the inputs, abort annotation. + for input_act in input_acts[1:]: + if isinstance(input_act, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act, gm): + continue + if input_act is not input_act0: + input_qspec_map[input_act] = shared_with_input0_qspec + + if input_qspec_map is not None: + cat_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_with_input0_qspec, + _annotated=True, + ) + return annotated_partitions diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py new file mode 100644 index 00000000000..f677aa5590c --- /dev/null +++ b/backends/arm/test/ops/test_cat.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + + +class TestCat(unittest.TestCase): + + class Cat(torch.nn.Module): + test_parameters = [ + ((torch.ones(1), torch.ones(1)), 0), + ((torch.ones(1, 2), torch.randn(1, 5), torch.randn(1, 1)), 1), + ( + ( + torch.ones(1, 2, 5), + torch.randn(1, 2, 4), + torch.randn(1, 2, 2), + torch.randn(1, 2, 1), + ), + -1, + ), + ((torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), 3), + ( + ( + 10000 * torch.randn(2, 3, 1, 4), + torch.randn(2, 7, 1, 4), + torch.randn(2, 1, 1, 4), + ), + -3, + ), + ] + + def __init__(self): + super().__init__() + + def forward(self, tensors: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor: + return torch.cat(tensors, dim=dim) + + def _test_cat_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_cat_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_cat_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + def test_cat_4d_tosa_MI(self): + square = torch.ones((2, 2, 2, 2)) + for dim in range(-3, 3): + test_data = ((square, square), dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) + + @parameterized.expand(Cat.test_parameters) + def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_u55_BI_pipeline(self.Cat(), test_data) diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 41fc907fdfe..8a02c63d7a6 
100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -242,16 +242,18 @@ def run_method_and_compare_outputs( # Loop inputs and compare reference stage with the compared stage. for run_iteration in range(num_runs): reference_input = inputs if inputs else next(self.generate_random_inputs()) - if is_nhwc: - test_input = self.transpose_data_format(reference_input, "NHWC") - else: - test_input = reference_input # Test parameters can include constants that are used in eager mode but are already set as attributes # in TOSA. Therefore, only accept torch.Tensor inputs. - test_input = [ - tensor for tensor in test_input if isinstance(tensor, torch.Tensor) - ] + test_input: list[torch.Tensor] = [] + for arg in reference_input: + if isinstance(arg, torch.Tensor): + test_input.append(arg) + if isinstance(arg, tuple) and isinstance(arg[0], torch.Tensor): + test_input.extend(list(arg)) + + if is_nhwc: + test_input = self.transpose_data_format(test_input, "NHWC") input_shapes = [ generated_input.shape if hasattr(generated_input, "shape") else (1,) From 2b7aa2b0c8479cbdee8b0a9f0363274d99227b31 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 26 Aug 2024 17:24:25 +0200 Subject: [PATCH 041/531] Improve logic for getting submodules from target name Differential Revision: D61718685 Pull Request resolved: https://github.com/pytorch/executorch/pull/4835 --- backends/arm/quantizer/arm_quantizer_utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index c5da32a40ad..b49f6b52e66 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -102,12 +102,19 @@ def is_input_ok_for_quantization(input_act: Node, gm: GraphModule): ) +def get_node_target(module: torch.nn.Module | GraphModule, target_str: str): + targets = target_str.split(".") + for target in targets[:-1]: + module = module.get_submodule(target) + return getattr(module, targets[-1]) + + def is_input_large_scalar(node: Node, gm: GraphModule): """Check if input is a large scalar value. 
So that we can skip quantization for the node since histc op (in HistogramObserver) only works for values up to certain upper bound """ if node.op == "get_attr" and isinstance(node.target, str): - tensor = getattr(gm, node.target) + tensor = get_node_target(gm, node.target) # torch.histc works until this upper bound HISTC_UPPER_BOUND = 3.4028235e15 return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND From e3ac39a123e6a74511ae46c664e653481853cc80 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 26 Aug 2024 17:31:05 +0200 Subject: [PATCH 042/531] Add ReLU operator to Arm backend Differential Revision: D61718601 Pull Request resolved: https://github.com/pytorch/executorch/pull/4834 --- backends/arm/arm_partitioner.py | 1 + backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_relu.py | 55 ++++++++ backends/arm/quantizer/arm_quantizer_utils.py | 1 + backends/arm/test/ops/test_conv_combos.py | 14 +- backends/arm/test/ops/test_relu.py | 120 ++++++++++++++++++ 6 files changed, 185 insertions(+), 7 deletions(-) create mode 100644 backends/arm/operators/op_relu.py create mode 100644 backends/arm/test/ops/test_relu.py diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index d16a2699c01..bab08d749f1 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -52,6 +52,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten.mm.default, exir_ops.edge.aten.repeat.default, + exir_ops.edge.aten.relu.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 176931c3607..4fdaeaba62b 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -21,6 +21,7 @@ op_mul, op_permute, op_quant, + op_relu, op_repeat, op_sigmoid, op_slice, diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py new file mode 100644 index 00000000000..5afe1ac7bce --- /dev/null +++ b/backends/arm/operators/op_relu.py @@ -0,0 +1,55 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import executorch.backends.arm.tosa_quant_utils as tqutils +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class ReluVisitor(NodeVisitor): + target = "aten.relu.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + attr = ts.TosaSerializerAttribute() + + clamp_min_fp = 0.0 + clamp_max_fp = 0.0 + clamp_min_qs = 0 + clamp_max_qs = 0 + if is_quant_node: + out_qargs = tqutils.get_quant_node_args(list(node.users)[0]) + clamp_min_qs = tqutils.quantize_value(0, out_qargs) + clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs) + + else: + clamp_min_fp = 0 + clamp_max_fp = float("inf") + + attr.ClampAttribute( + tosa_graph.builder, + clamp_min_qs, + clamp_max_qs, + clamp_min_fp, + clamp_max_fp, + ) + + tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index b49f6b52e66..417aa454a8e 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -138,6 +138,7 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: return op in [ torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default, + torch.ops.aten.relu.default, torch.ops.aten.mean.default, torch.ops.aten.mean.dim, torch.ops.aten.permute.default, diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 88006df1a01..31051ef8f7d 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -102,7 +102,7 @@ def forward(self, x): return self.adaptive_avg_pool2d(x) -class ComboConvBatchnormRelu(torch.nn.Module): +class ComboConvBatchnormRelu6(torch.nn.Module): edge_op_list = [ "executorch_exir_dialects_edge__ops_aten_convolution_default", "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", @@ -235,16 +235,16 @@ def test_conv_meandim_u55_BI(self): ############################## ## Conv + batch norm + relu ## ############################## - def test_conv_batchnorm_relu_tosa_MI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_MI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_tosa_BI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_BI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_u55_BI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_u55_BI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) ################## diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py new file mode 100644 index 00000000000..d2ca8540f4c --- /dev/null +++ b/backends/arm/test/ops/test_relu.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestRelu(unittest.TestCase): + class Relu(torch.nn.Module): + def __init__(self): + super().__init__() + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + def _test_relu_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.relu.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_relu_tosa_MI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_tosa_BI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_tosa_u55_BI_pipeline(self.Relu(), (test_data,)) From ef9c07f5db0ec29e8a514c007060879a98b2766c Mon 
Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 26 Aug 2024 17:31:09 +0200 Subject: [PATCH 043/531] Add unsqueeze op to Arm backend Differential Revision: D61718607 Pull Request resolved: https://github.com/pytorch/executorch/pull/4833 --- backends/arm/arm_partitioner.py | 1 + backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_unsqueeze.py | 51 ++++++++++++ backends/arm/test/ops/test_unsqueeze.py | 103 ++++++++++++++++++++++++ 4 files changed, 156 insertions(+) create mode 100644 backends/arm/operators/op_unsqueeze.py create mode 100644 backends/arm/test/ops/test_unsqueeze.py diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index bab08d749f1..0dc3d36b5c6 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -59,6 +59,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten.unsqueeze_copy.default, operator.getitem, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 4fdaeaba62b..dc1fcc8e2c3 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -27,5 +27,6 @@ op_slice, op_softmax, op_sub, + op_unsqueeze, op_view, ) diff --git a/backends/arm/operators/op_unsqueeze.py b/backends/arm/operators/op_unsqueeze.py new file mode 100644 index 00000000000..a7ff8ce0b40 --- /dev/null +++ b/backends/arm/operators/op_unsqueeze.py @@ -0,0 +1,51 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Follows this specification: https://pytorch.org/docs/stable/generated/torch.unsqueeze.html + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_utils import tosa_shape +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class UnsqueezeVisitor(NodeVisitor): + target = "aten.unsqueeze_copy.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + dim = inputs[1].number + shape = inputs[0].shape + rank = len(shape) + + assert -rank - 1 <= dim < rank + 1 + if dim < 0: + dim = dim + rank + 1 + + new_shape = list(shape) + new_shape.insert(dim, 1) + new_shape = tosa_shape(new_shape, output.dim_order) + + attr = ts.TosaSerializerAttribute() + attr.ReshapeAttribute(new_shape) + tosa_graph.addOperator( + TosaOp.Op().RESHAPE, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py new file mode 100644 index 00000000000..6da6a196c07 --- /dev/null +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -0,0 +1,103 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# +# Tests the unsqueeze op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from parameterized import parameterized + + +class TestSimpleUnsqueeze(unittest.TestCase): + class Unsqueeze(torch.nn.Module): + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 5), (5, 5, 5)] + test_parameters: list[tuple[torch.Tensor]] = [(torch.ones(n),) for n in shapes] + + def forward(self, x: torch.Tensor, dim): + return x.unsqueeze(dim) + + def _test_unsqueeze_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_unsqueeze_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_unsqueeze_tosa_u55_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_MI(self, test_tensor: torch.Tensor): + for i in range(-test_tensor.dim() - 1, test_tensor.dim() + 1): + self._test_unsqueeze_tosa_MI_pipeline(self.Unsqueeze(), (test_tensor, i)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_tosa_BI_pipeline(self.Unsqueeze(), (test_tensor, 0)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_tosa_u55_pipeline(self.Unsqueeze(), (test_tensor, 0)) From 29797d41e1d5fba91618e520fb38f3a5a2fadc8f Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Mon, 26 Aug 2024 17:44:53 +0200 Subject: [PATCH 044/531] Add support for lifted tensors in ArmPartitioner Differential Revision: D61542388 Pull Request resolved: https://github.com/pytorch/executorch/pull/4788 --- backends/arm/operators/op_placeholder.py | 25 +++++++++++- backends/arm/test/misc/test_lifted_tensor.py | 42 ++++++++++++++++++++ 
2 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 backends/arm/test/misc/test_lifted_tensor.py diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index 0b2e65f45d0..918a270bb00 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -5,7 +5,7 @@ import numpy as np import serializer.tosa_serializer as ts -import torch +import torch.fx from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_quant_utils import ( get_quant_arg_dtype, @@ -130,6 +130,21 @@ def process_inputs_to_buffers( ) +def process_inputs_to_lifted_tensor_constants( + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + edge_program: ExportedProgram, +): + arg = TosaArg(node) + tensor_name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[ + arg.name + ] + tensor = edge_program.tensor_constants[tensor_name] + tensor_data = tensor.detach().numpy() + + tosa_graph.addConst(tensor_data.shape, arg.dtype, tensor_data, name=arg.name) + + def process_placeholder( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, @@ -145,5 +160,11 @@ def process_placeholder( process_inputs_to_parameters(node, tosa_graph, edge_program) elif node.name in edge_program.graph_signature.inputs_to_buffers: process_inputs_to_buffers(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants: + process_inputs_to_lifted_tensor_constants(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_custom_objs: + raise NotImplementedError( + "Placeholder is of type 'lifted custom object' which is not supported." + ) else: - raise RuntimeError(f"Unknown placeholder {node.name}") + raise RuntimeError(f"Placeholder '{node.name}' is of unknown type.") diff --git a/backends/arm/test/misc/test_lifted_tensor.py b/backends/arm/test/misc/test_lifted_tensor.py new file mode 100644 index 00000000000..90aa7e2950c --- /dev/null +++ b/backends/arm/test/misc/test_lifted_tensor.py @@ -0,0 +1,42 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
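The new process_inputs_to_lifted_tensor_constants branch above keys off the exported program's graph signature. A minimal sketch of how such a placeholder arises during export follows; AddConst is a hypothetical module, and the exact contents of the signature can vary between torch releases.

    import torch
    from torch.export import export

    class AddConst(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # A plain tensor attribute (neither Parameter nor registered buffer)
            # is lifted into the graph signature as a tensor constant.
            self.const = torch.tensor([[1.0, 2.0], [3.0, 4.0]])

        def forward(self, x):
            return x + self.const

    ep = export(AddConst(), (torch.ones(2, 2),))
    # Maps placeholder names to lifted constant names; process_placeholder uses
    # this mapping to emit the constant data into the TOSA graph.
    print(ep.graph_signature.inputs_to_lifted_tensor_constants)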
+ +import unittest + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + + +class LiftedTensor(torch.nn.Module): + + def __init__(self): + super().__init__() + self.lifted_tensor = torch.Tensor([[1, 2], [3, 4]]) + + def forward(self, x: torch.Tensor, length) -> torch.Tensor: + sliced = self.lifted_tensor[:, :length] + return sliced + x + + +class TestLiftedTensor(unittest.TestCase): + """Tests the ArmPartitioner with a placeholder of type lifted tensor.""" + + def test_partition_lifted_tensor(self): + tester = ( + ArmTester( + LiftedTensor(), + example_inputs=(torch.ones(2, 2), 2), + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .to_edge() + .dump_artifact() + ) + signature = tester.get_artifact().exported_program().graph_signature + assert len(signature.lifted_tensor_constants) > 0 + tester.partition() + tester.to_executorch() + tester.run_method_and_compare_outputs((torch.ones(2, 2), 2)) From 260cf6fc1a04cc7aca51da77dfd0fe759f2b0cf0 Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Mon, 26 Aug 2024 11:00:18 -0700 Subject: [PATCH 045/531] Revert D61290864 Differential Revision: D61799833 Pull Request resolved: https://github.com/pytorch/executorch/pull/4906 --- extension/llm/custom_ops/targets.bzl | 81 ++++++++++++------------- kernels/optimized/lib_defs.bzl | 91 ++++++++++++---------------- 2 files changed, 80 insertions(+), 92 deletions(-) diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index 0976202d452..fe93f6a422d 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -6,48 +6,47 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. 
""" - for mkl_dep in ["", "_mkl_lp64_omp"]: - runtime.cxx_library( - name = "custom_ops" + mkl_dep, - srcs = ["op_sdpa.cpp"], - exported_headers = ["op_sdpa.h"], - exported_deps = [ - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/kernels/optimized:libblas{}".format(mkl_dep), - "//executorch/kernels/optimized:libvec", - "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", - "//executorch/backends/xnnpack/threadpool:threadpool", - ], - compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], - visibility = [ - "//executorch/...", - "//executorch/extension/llm/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - # @lint-ignore BUCKLINT link_whole - link_whole = True, - force_static = True, - ) + runtime.cxx_library( + name = "custom_ops", + srcs = ["op_sdpa.cpp"], + exported_headers = ["op_sdpa.h"], + exported_deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/kernels/optimized:libblas", + "//executorch/kernels/optimized:libvec", + "//executorch/extension/kernel_util:kernel_util", + "//executorch/extension/parallel:thread_parallel", + "//executorch/backends/xnnpack/threadpool:threadpool", + ], + compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], + visibility = [ + "//executorch/...", + "//executorch/extension/llm/custom_ops/...", + "@EXECUTORCH_CLIENTS", + ], + # @lint-ignore BUCKLINT link_whole + link_whole = True, + force_static = True, + ) - runtime.cxx_library( - name = "custom_ops_aot_lib" + mkl_dep, - srcs = [ - "op_sdpa_aot.cpp", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - external_deps = [ - "libtorch", - ], - deps = [ - ":custom_ops" + mkl_dep, - "//executorch/extension/aten_util:aten_bridge", - ], - ) + runtime.cxx_library( + name = "custom_ops_aot_lib", + srcs = [ + "op_sdpa_aot.cpp", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + external_deps = [ + "libtorch", + ], + deps = [ + ":custom_ops", + "//executorch/extension/aten_util:aten_bridge", + ], + ) runtime.python_library( name = "custom_ops_aot_py", diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 045e8684217..5af9b423ad0 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -99,55 +99,44 @@ def define_libs(): ], ) - for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl"), ("libblas_mkl_lp64_omp", "fbsource//third-party/mkl:mkl_lp64_omp")]: - runtime.cxx_library( - name = libblas_name, - srcs = native.glob([ - "blas/**/*.cpp", - ]), - exported_headers = native.glob([ - "blas/**/*.h", - ]), - header_namespace = "executorch/kernels/optimized", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - preprocessor_flags = select({ - "DEFAULT": [], - "ovr_config//os:linux-x86_64": [ + runtime.cxx_library( + name = "libblas", + srcs = native.glob([ + "blas/**/*.cpp", + ]), + exported_headers = native.glob([ + "blas/**/*.h", + ]), + header_namespace = "executorch/kernels/optimized", + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + fbandroid_platform_preprocessor_flags = [ + ( + "^android-arm64.*$", + [ "-DET_BUILD_WITH_BLAS", - ] if not runtime.is_oss else [], - }), - fbandroid_platform_preprocessor_flags = [ - ( - "^android-arm64.*$", - [ - "-DET_BUILD_WITH_BLAS", - ], - ), - ], - fbandroid_platform_deps = [ - ( - 
"^android-arm64.*$", - [ - "fbsource//third-party/openblas:openblas", - ], - ), - ], - fbobjc_exported_preprocessor_flags = [ - "-DET_BUILD_WITH_BLAS", - "-DET_BUILD_FOR_APPLE", - ], - fbobjc_frameworks = [ - "Accelerate", - ], - deps = select({ - "DEFAULT": [], - "ovr_config//os:linux-x86_64": [mkl_dep] if not runtime.is_oss else [], - }), - exported_deps = [ - "//executorch/kernels/optimized:libutils", - "//executorch/runtime/core/exec_aten:lib", - ], - ) + ], + ), + ], + fbandroid_platform_deps = [ + ( + "^android-arm64.*$", + [ + "fbsource//third-party/openblas:openblas", + ], + ), + ], + fbobjc_exported_preprocessor_flags = [ + "-DET_BUILD_WITH_BLAS", + "-DET_BUILD_FOR_APPLE", + ], + fbobjc_frameworks = [ + "Accelerate", + ], + exported_deps = [ + "//executorch/kernels/optimized:libutils", + "//executorch/runtime/core/exec_aten:lib", + ], + ) From dc66414c70dbec763fa37aba7908d50373299435 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Mon, 26 Aug 2024 13:45:54 -0500 Subject: [PATCH 046/531] WhyNoPartition Differential Revision: D61418620 Pull Request resolved: https://github.com/pytorch/executorch/pull/4878 --- .../xnnpack/partition/config/gemm_configs.py | 9 ++ .../partition/config/generic_node_configs.py | 90 +++++++++++++++++-- .../xnnpack/partition/config/node_configs.py | 22 ++++- .../partition/config/xnnpack_config.py | 8 +- .../xnnpack/partition/xnnpack_partitioner.py | 14 +++ backends/xnnpack/test/ops/mean_dim.py | 13 +++ exir/backend/utils.py | 31 ++++++- 7 files changed, 172 insertions(+), 15 deletions(-) diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index a20285483b2..54c07ad5abc 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import logging from itertools import chain from typing import cast, List, Optional, Tuple @@ -31,12 +32,16 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram from torch.fx.passes.utils.source_matcher_utils import ( get_source_partitions, SourcePartition, ) +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class GEMMConfig(XNNPartitionerConfig): """ @@ -60,6 +65,8 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_valid, _ = self.get_deps(node, ep) + if not is_valid: + why(node, "Failed to get valid dependent nodes.") return is_valid def get_node_and_deps( @@ -282,10 +289,12 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: conv_stride = cast(List[int], node.args[3]) if len(conv_stride) > 2: + why(node, "Only support 1D + 2D Conv") return False # Only support 1D + 2D Conv transposed = cast(bool, node.args[6]) if transposed: + why(node, "Transposed Conv is not supported") return False # Currently don't support transposed conv return True diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index e309a3bd038..69defae0213 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging from typing import cast, List, Optional import torch @@ -16,8 +17,12 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class GenericNodePartitionerConfig(XNNPartitionerConfig): def __init__(self, fused_act: Optional[List[str]] = None): @@ -141,9 +146,22 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: if len(args) >= 7: divisor_override = cast(int, args[6]) - return ( - not (ceil_mode or count_include_pad) and divisor_override == pooling_region - ) + if ceil_mode: + why(node, reason="ceil mode is not supported") + return False + + if count_include_pad: + why( + node, + reason="zero-padding in the averaging calculation is not supported", + ) + return False + + if divisor_override != pooling_region: + why(node, reason="divisor override is not supported") + return False + + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -160,7 +178,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False num_tensors = len(node.all_input_nodes) - return num_tensors >= 2 and num_tensors <= 4 + + if not (num_tensors >= 2 and num_tensors <= 4): + why( + node, + reason=f"only support concatenation of 2 - 4 tensors, got {num_tensors} tensors", + ) + return False + + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -210,7 +236,14 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: dim = cast(int, node.args[1]) node_input = node.all_input_nodes[0] tensor_dims = 
node_input.meta["val"].dim() - return dim == -1 or dim == tensor_dims - 1 + + if not (dim == -1 or dim == tensor_dims - 1): + why( + node, + reason=f"dim must be the last dim, got dim = {dim} for tensor of rank {tensor_dims}", + ) + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -255,7 +288,10 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_ceil_mode = len(node.args) >= 6 and cast(bool, node.args[5]) - return not is_ceil_mode + if is_ceil_mode: + why(node, reason="ceil mode is not supported") + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -309,7 +345,20 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: dims = node.args[1] output_dims = node.meta["val"].dim() - return dims in ([-2, -1], [-1, -2]) and output_dims == 4 + if dims not in ([-2, -1], [-1, -2]): + why( + node, + reason="mean.dim only supports averaging 4D tensors across the innermost dimensions", + ) + return False + + if output_dims != 4: + why( + node, + reason=f"mean.dim only supports averaging 4D tensors, got tensor of rank {output_dims}", + ) + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -340,7 +389,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False power = node.args[1] - return isinstance(power, int) and power == 2 + + if not isinstance(power, int): + why(node, reason=f"only support int powers, got {power}") + return False + + if power != 2: + why(node, reason=f"only support power == 2, got {power}") + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -372,10 +429,18 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: for dim in input_shape: if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"input tensor has invalid shape, dim: {dim} of type {type(dim)}. Expecting non-zero, int values.", + ) return False for dim in output_shape: if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"output tensor has invalid shape, dim: {dim} of type {type(dim)}. Expecting non-zero, int values.", + ) return False return True @@ -431,7 +496,14 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False mask_node = node.all_input_nodes[3] mask_rank = mask_node.meta["val"].dim() - return mask_rank == 2 + if mask_rank != 2: + why( + node, + reason=f"mask must have rank 2, got mask of rank {mask_rank}", + ) + return False + + return True def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.scaled_dot_product_attention.default diff --git a/backends/xnnpack/partition/config/node_configs.py b/backends/xnnpack/partition/config/node_configs.py index 501216eaae3..1e4d1f05fe4 100644 --- a/backends/xnnpack/partition/config/node_configs.py +++ b/backends/xnnpack/partition/config/node_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
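The rejection reasons recorded through why(...) above are emitted at DEBUG level via each config module's logger. A hedged sketch of how a caller might surface them is shown below; it assumes default logger propagation to the root logger, and the verbose flag it uses is the one introduced later in this patch in xnnpack_partitioner.py.

    import logging

    # Configure logging before importing the partitioner so the module-level
    # basicConfig(level=WARNING) call added in xnnpack_partitioner.py becomes a
    # no-op. With the root logger at DEBUG, messages such as
    #   "WhyNoPartition: Node <node> was not partitioned because mean.dim only
    #    supports averaging 4D tensors, got tensor of rank 3."
    # become visible.
    logging.basicConfig(level=logging.DEBUG)

    from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
        XnnpackPartitioner,
    )

    # verbose=True additionally raises the partitioner's own logger to DEBUG.
    partitioner = XnnpackPartitioner(verbose=True)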
+import logging import operator from typing import List, Optional @@ -19,8 +20,12 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class BatchNormConfig(XNNPartitionerConfig): target_name = "_native_batch_norm_legit_no_training.default" @@ -38,9 +43,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: conv_name = format_target_name(conv.target.__name__) # pyre-ignore if conv_name not in ["convolution.default"]: + why(node, f"Invalid conv target {conv_name}") + return False + + can_fuse = FuseBatchNormWithConvPass.can_fuse(conv, bn, ep) + if not can_fuse: + why(node, "BatchNorm cannot be fused with Convolution") return False - return FuseBatchNormWithConvPass.can_fuse(conv, bn, ep) + return True def get_node_and_deps( self, node: torch.fx.Node, ep: ExportedProgram @@ -76,15 +87,18 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: output_0 = node_val[0] # Don't check indicies dtype if output_0.dtype not in supported_dtypes: + why(node, f"Unsupported output dtype {output_0.dtype}") return False max_input = node.all_input_nodes[0] if max_input.meta.get("val").dtype not in supported_dtypes: + why(node, f"Unsupported input dtype {max_input.meta.get('val').dtype}") return False # Make sure that all users are getitems of the first output for user in node.users: if not (user.target == operator.getitem and user.args[1] == 0): + why(node, "Unsupported user of max.dim") return False return True @@ -111,7 +125,11 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False weight = node.all_input_nodes[1] - return is_param_node(ep, weight) + is_param = is_param_node(ep, weight) + if not is_param: + why(node, "Prelu weight must be a parameter") + return False + return True def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.prelu.default diff --git a/backends/xnnpack/partition/config/xnnpack_config.py b/backends/xnnpack/partition/config/xnnpack_config.py index 840ffbd43b4..f39a651e198 100644 --- a/backends/xnnpack/partition/config/xnnpack_config.py +++ b/backends/xnnpack/partition/config/xnnpack_config.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging from abc import abstractmethod from enum import Enum from typing import List, Optional @@ -13,8 +14,12 @@ format_target_name, PartitionerConfig, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class ConfigPrecisionType(Enum): FP32 = 1 @@ -22,7 +27,6 @@ class ConfigPrecisionType(Enum): DYNAMIC_QUANT = 3 -# TODO: add WhyNotPartition to XNNPartitionerConfig class XNNPartitionerConfig(PartitionerConfig): """ Base partitioner config for XNNPACK Partitioner Configs. 
Base wrapper class @@ -125,10 +129,12 @@ def check_common_constraints( ) if len(self.enabled_precision_types) == 0: + why(node, reason="not enabled precision types") return False has_valid_dtypes = self._check_node_has_valid_dtype(node) if not has_valid_dtypes: + why(node, reason="invalid dtype") return False return True diff --git a/backends/xnnpack/partition/xnnpack_partitioner.py b/backends/xnnpack/partition/xnnpack_partitioner.py index f582ea753f4..9afbefebce3 100644 --- a/backends/xnnpack/partition/xnnpack_partitioner.py +++ b/backends/xnnpack/partition/xnnpack_partitioner.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import itertools + +import logging from typing import List, Optional, Type, Union from executorch.backends.xnnpack.partition.config import ALL_PARTITIONER_CONFIGS @@ -21,6 +23,9 @@ from executorch.exir.backend.partitioner import DelegationSpec from torch.fx.passes.infra.partitioner import Partition +logging.basicConfig(level=logging.WARNING) +logger = logging.getLogger(__name__) + class XnnpackPartitioner(ConfigerationBasedPartitioner): def __init__( @@ -30,7 +35,16 @@ def __init__( Union[ConfigPrecisionType, List[ConfigPrecisionType]] ] = None, per_op_mode=False, + verbose: bool = False, ): + """ + @verbose: if True, print out more information about the partitioner. + Default level is WARNING. If verbose is True, level is set to DEBUG. + """ + if verbose: + logger.setLevel(logging.DEBUG) + logger.debug("Verbose logging enabled for XNNPACK partitioner.") + delegation_spec = DelegationSpec(XnnpackBackend.__name__, []) configs_to_use = configs or ALL_PARTITIONER_CONFIGS # Can do logic and have extra args to filter/delete/select diff --git a/backends/xnnpack/test/ops/mean_dim.py b/backends/xnnpack/test/ops/mean_dim.py index e39d3aee080..3bac5f3239c 100644 --- a/backends/xnnpack/test/ops/mean_dim.py +++ b/backends/xnnpack/test/ops/mean_dim.py @@ -56,6 +56,19 @@ def test_fp32_mean_dim_unsupported(self): .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) ) + def test_fp32_mean_dim_unsupported_3d(self): + """ + XNNPack mean.dim implementation only supports 4D tensors. + """ + inputs = (torch.randn(1, 5, 4),) + ( + Tester(self.MeanDim((-1, -2)), inputs) + .export() + .check_count({"torch.ops.aten.mean.dim": 1}) + .to_edge_transform_and_lower() + .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) + ) + def test_qs8_mean_dim(self): inputs = (torch.randn(1, 5, 4, 4),) ( diff --git a/exir/backend/utils.py b/exir/backend/utils.py index b5072604d2d..2b768fe7c23 100644 --- a/exir/backend/utils.py +++ b/exir/backend/utils.py @@ -28,9 +28,6 @@ T_DQuantPerTensor = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default -log: logging.Logger = logging.getLogger(__name__) - - # NB: Set this to None to handle validation from MobileBert @lru_cache(maxsize=None) def is_same_node( @@ -499,3 +496,31 @@ def insert_delegate_mapping_entry( # pyre-ignore Warning from Union[int, st] keys self._debug_handle_map[identifier] = filtered_debug_handles return identifier + + +class WhyNoPartition: + """ + Simple helper class for partitioners to log why a node was not lowered. 
+ + Example usage: + + # In your backend partitioner file(s) + why = WhyNoPartition(logger=your_backend_logger) + + # hypothetical function that checks if a node can be lowered + if not can_be_lowered(node): + why(node, "This node was not lowered because ...") + """ + + def __init__(self, logger: logging.Logger): + self.logger = logger + self.node: Optional[torch.fx.Node] = None + self.reason: str = "" + + def __call__(self, node: torch.fx.Node, reason: str) -> None: + self.node = node + self.reason = reason + self.logger.debug(self) + + def __str__(self) -> str: + return f"WhyNoPartition: Node {self.node} was not partitioned because {self.reason}." From 0ae82f9344092ad21264a53e2df4a220d4ae2642 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Mon, 26 Aug 2024 12:30:12 -0700 Subject: [PATCH 047/531] Vulkan logger fixup Differential Revision: D61806616 Pull Request resolved: https://github.com/pytorch/executorch/pull/4883 --- backends/vulkan/partitioner/vulkan_partitioner.py | 2 +- backends/vulkan/serialization/vulkan_graph_builder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index c4fbaabdbc5..103297bc758 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -38,7 +38,7 @@ torch.ops.aten.upsample_nearest2d.vec, ] -logger: logging.Logger = logging.getLogger(__name__) +logger: logging.Logger = logging.getLogger("") logger.setLevel(logging.INFO) diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index fcbf3edb7e5..20d09f1df5c 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -24,7 +24,7 @@ Node, NoneType, _ScalarType, TensorSpec, List[_ScalarType], List[Node], str ] -logger: logging.Logger = logging.getLogger(__name__) +logger: logging.Logger = logging.getLogger("") logger.setLevel(logging.INFO) From 1253ed5e7f6870814443465ae1d086e05ae8b9fb Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:11:25 -0700 Subject: [PATCH 048/531] Remove unnecessary INativePeer Differential Revision: D61728431 Pull Request resolved: https://github.com/pytorch/executorch/pull/4875 --- .../org/pytorch/executorch/INativePeer.java | 28 ------------------- .../java/org/pytorch/executorch/Module.java | 6 ++-- .../org/pytorch/executorch/NativePeer.java | 2 +- 3 files changed, 4 insertions(+), 32 deletions(-) delete mode 100644 extension/android/src/main/java/org/pytorch/executorch/INativePeer.java diff --git a/extension/android/src/main/java/org/pytorch/executorch/INativePeer.java b/extension/android/src/main/java/org/pytorch/executorch/INativePeer.java deleted file mode 100644 index 0878f151437..00000000000 --- a/extension/android/src/main/java/org/pytorch/executorch/INativePeer.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package org.pytorch.executorch; - -/** Interface for the native peer object for entry points to the Module */ -interface INativePeer { - /** Clean up the native resources associated with this instance */ - void resetNative(); - - /** Run a "forward" call with the given inputs */ - EValue[] forward(EValue... inputs); - - /** Run an arbitrary method on the module */ - EValue[] execute(String methodName, EValue... inputs); - - /** - * Load a method on this module. - * - * @return the Error code if there was an error loading the method - */ - int loadMethod(String methodName); -} diff --git a/extension/android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/src/main/java/org/pytorch/executorch/Module.java index 3d2d2dd86ee..5e57174114d 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Module.java +++ b/extension/android/src/main/java/org/pytorch/executorch/Module.java @@ -27,8 +27,8 @@ public class Module { /** Load mode for the module. Use memory locking and ignore errors. */ public static final int LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS = 3; - /** Reference to the INativePeer object of this module. */ - private INativePeer mNativePeer; + /** Reference to the NativePeer object of this module. */ + private NativePeer mNativePeer; /** * Loads a serialized ExecuTorch module from the specified path on the disk. Uses default load @@ -68,7 +68,7 @@ public static Module load(final String modelPath) { return load(modelPath, null); } - Module(INativePeer nativePeer) { + Module(NativePeer nativePeer) { this.mNativePeer = nativePeer; } diff --git a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java index a9116dcd842..865c503765d 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java +++ b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java @@ -13,7 +13,7 @@ import com.facebook.soloader.nativeloader.NativeLoader; import java.util.Map; -class NativePeer implements INativePeer { +class NativePeer { static { // Loads libexecutorch.so from jniLibs NativeLoader.loadLibrary("executorch"); From a532d9c68faab9365fab49fdfc9a82285c246b62 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Mon, 26 Aug 2024 13:56:26 -0700 Subject: [PATCH 049/531] Delete multi_runner and relocatable_runner Differential Revision: D61747495 Pull Request resolved: https://github.com/pytorch/executorch/pull/4885 --- test/multi_runner.cpp | 378 ------------------------------------ test/relocatable_runner.cpp | 332 ------------------------------- test/targets.bzl | 39 ---- 3 files changed, 749 deletions(-) delete mode 100644 test/multi_runner.cpp delete mode 100644 test/relocatable_runner.cpp diff --git a/test/multi_runner.cpp b/test/multi_runner.cpp deleted file mode 100644 index 8ea6b413953..00000000000 --- a/test/multi_runner.cpp +++ /dev/null @@ -1,378 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/** - * @file - * - * Creates multiple Executor instances at the same time, demonstrating that the - * same process can handle multiple runtimes at once. 
- * - * Usage: - * multi_runner --models=[,[,...]] [--num_instances=] - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -DEFINE_string( - models, - "", - "Comma-separated list of paths to serialized ExecuTorch model files"); -DEFINE_int32( - num_instances, - 10, - "Number of Executor instances to create in parallel, for each model"); - -static bool validate_path_list( - const char* flagname, - const std::string& path_list); -DEFINE_validator(models, &validate_path_list); - -static bool validate_positive_int32(const char* flagname, int32_t val); -DEFINE_validator(num_instances, &validate_positive_int32); - -namespace { -using torch::executor::DataLoader; -using torch::executor::Error; -using torch::executor::FreeableBuffer; -using torch::executor::MemoryAllocator; -using torch::executor::MemoryManager; -using torch::executor::Method; -using torch::executor::Program; -using torch::executor::Result; -using torch::executor::testing::ManagedMemoryManager; -using torch::executor::util::BufferDataLoader; - -/** - * A model that has been loaded and has had its execution plan and inputs - * prepared. Can be run once. - * - * Creates and owns the underyling state, making things easier to manage. - */ -class PreparedModel final { - public: - PreparedModel( - const std::string& name, - const void* model_data, - size_t model_data_size, - size_t non_const_mem_bytes, - size_t runtime_mem_bytes) - : name_(name), - loader_(model_data, model_data_size), - program_(load_program_or_die(loader_)), - memory_manager_(non_const_mem_bytes, runtime_mem_bytes), - method_(load_method_or_die(program_, &memory_manager_.get())), - has_run_(false) { - inputs_ = torch::executor::util::PrepareInputTensors(method_); - } - - void run() { - ET_CHECK_MSG(!has_run_, "A PreparedModel may only be run once"); - has_run_ = true; - - Error status = method_.execute(); - ET_CHECK_MSG( - status == Error::Ok, - "plan.execute() failed with status 0x%" PRIx32, - status); - - // TODO(T131578656): Do something with the outputs. - } - - const std::string& name() const { - return name_; - } - - ~PreparedModel() { - torch::executor::util::FreeInputs(inputs_); - } - - private: - static Program load_program_or_die(DataLoader& loader) { - Result program = Program::load(&loader); - ET_CHECK(program.ok()); - return std::move(program.get()); - } - - static Method load_method_or_die( - const Program& program, - MemoryManager* memory_manager) { - Result method = program.load_method("forward", memory_manager); - ET_CHECK(method.ok()); - return std::move(method.get()); - } - - const std::string name_; - BufferDataLoader loader_; // Needs to outlive program_ - Program program_; // Needs to outlive executor_ - ManagedMemoryManager memory_manager_; // Needs to outlive executor_ - Method method_; - exec_aten::ArrayRef inputs_; - - bool has_run_; -}; - -/** - * Creates PreparedModels based on the provided serialized data and memory - * parameters. 
- */ -class ModelFactory { - public: - ModelFactory( - const std::string& name, // For debugging - std::shared_ptr model_data, - size_t model_data_size, - size_t non_const_mem_bytes = 40 * 1024U * 1024U, // 40 MB - size_t runtime_mem_bytes = 2 * 1024U * 1024U) // 2 MB - : name_(name), - model_data_(model_data), - model_data_size_(model_data_size), - non_const_mem_bytes_(non_const_mem_bytes), - runtime_mem_bytes_(runtime_mem_bytes) {} - - std::unique_ptr prepare( - std::string_view name_affix = "") const { - return std::make_unique( - name_affix.empty() ? name_ : std::string(name_affix) + ":" + name_, - model_data_.get(), - model_data_size_, - non_const_mem_bytes_, - runtime_mem_bytes_); - } - - const std::string& name() const { - return name_; - } - - private: - const std::string name_; - std::shared_ptr model_data_; - - const size_t model_data_size_; - const size_t non_const_mem_bytes_; - const size_t runtime_mem_bytes_; -}; - -/// Synchronizes a set of model threads as they walk through prepare/run states. -class Synchronizer { - public: - explicit Synchronizer(size_t total_threads) - : total_threads_(total_threads), state_(State::INIT_THREAD) {} - - /// The states for threads to move through. Must advance in order. - enum class State { - /// Initial state. - INIT_THREAD, - - /// Thread is ready to prepare its model instance. - PREPARE_MODEL, - - /// Thread is ready to run its model instance. - RUN_MODEL, - }; - - /// Wait until all threads have requested to advance to this state, then - /// advance all of them. - void advance_to(State new_state) { - std::unique_lock lock(lock_); - - // Enforce valid state machine transitions. - assert( - (new_state == State::PREPARE_MODEL && state_ == State::INIT_THREAD) || - (new_state == State::RUN_MODEL && state_ == State::PREPARE_MODEL)); - - // Indicate that this thread is ready to move to the new state. - num_ready_++; - if (num_ready_ == total_threads_) { - // We were the last thread to become ready. Tell all threads to - // move to the next state. - state_ = new_state; - num_ready_ = 0; - cv_.notify_all(); - } else { - // Wait until all other threads are ready. - cv_.wait(lock, [=] { return this->state_ == new_state; }); - } - } - - private: - /// The total number of threads to wait for. - const size_t total_threads_; - - /// Locks all mutable fields in this class. - std::mutex lock_; - - /// The number of threads that are ready to move to the next state. - size_t num_ready_ = 0; - - /// The state that all threads should be in. - State state_; - - /// Signals threads to check for state updates. - std::condition_variable cv_; -}; - -/** - * Waits for all threads to begin running; prepares a model and waits for all - * threads to finish preparation; runs the model and exits. - */ -void model_thread(ModelFactory& factory, Synchronizer& sync, size_t thread_id) { - ET_LOG( - Info, - "[%zu] Thread has started for %s.", - thread_id, - factory.name().c_str()); - - sync.advance_to(Synchronizer::State::PREPARE_MODEL); - - // Create and prepare our model instance. - ET_LOG(Info, "[%zu] Preparing %s...", thread_id, factory.name().c_str()); - std::unique_ptr model = - factory.prepare(/*name_affix=*/std::to_string(thread_id)); - ET_LOG(Info, "[%zu] Prepared %s.", thread_id, model->name().c_str()); - - sync.advance_to(Synchronizer::State::RUN_MODEL); - - // Run our model. 
- ET_LOG(Info, "[%zu] Running %s...", thread_id, model->name().c_str()); - model->run(); - ET_LOG( - Info, "[%zu] Finished running %s...", thread_id, model->name().c_str()); - - // TODO(T131578656): Check the model output. -} - -/** - * Splits the provided string on `,` and returns a vector of the non-empty - * elements. Does not string whitespace. - */ -std::vector split_string_list(const std::string& list) { - std::vector items; - std::stringstream sstream(list); - while (sstream.good()) { - std::string item; - getline(sstream, item, ','); - if (!item.empty()) { - items.push_back(item); - } - } - return items; -} - -} // namespace - -int main(int argc, char** argv) { - torch::executor::runtime_init(); - - // Parse and extract flags. - gflags::SetUsageMessage( - "Creates multiple Executor instances at the same time, demonstrating " - "that the same process can handle multiple runtimes at once."); - gflags::ParseCommandLineFlags(&argc, &argv, true); - std::vector model_paths = split_string_list(FLAGS_models); - size_t num_instances = FLAGS_num_instances; - - // Create a factory for each model provided on the commandline. - std::vector> factories; - for (const auto& model_path : model_paths) { - std::shared_ptr file_data; - size_t file_size; - Error err = torch::executor::util::read_file_content( - model_path.c_str(), &file_data, &file_size); - ET_CHECK(err == Error::Ok); - factories.push_back(std::make_unique( - /*name=*/model_path, file_data, file_size)); - } - - // Spawn threads to prepare and run separate instances of the models in - // parallel. - const size_t num_threads = factories.size() * num_instances; - Synchronizer state(num_threads); - std::vector threads; - size_t thread_id = 0; // Unique ID for every thread. - ET_LOG(Info, "Creating %zu threads...", num_threads); - for (const auto& factory : factories) { - for (size_t i = 0; i < num_instances; ++i) { - threads.push_back(std::thread( - model_thread, std::ref(*factory), std::ref(state), thread_id++)); - } - } - - // Wait for all threads to finish. - ET_LOG(Info, "Waiting for %zu threads to exit...", threads.size()); - for (auto& thread : threads) { - thread.join(); - } - ET_LOG(Info, "All %zu threads exited.", threads.size()); -} - -// -// Flag validation -// - -/// Returns true if the specified path exists in the filesystem. -static bool path_exists(const std::string& path) { - struct stat st; - return stat(path.c_str(), &st) == 0; -} - -/// Returns true if `path_list` contains a comma-separated list of at least one -/// path that exists in the filesystem. -static bool validate_path_list( - const char* flagname, - const std::string& path_list) { - const std::vector paths = split_string_list(path_list); - if (paths.empty()) { - fprintf( - stderr, "Must specify at least one valid path with --%s\n", flagname); - return false; - } - for (const auto& path : split_string_list(path_list)) { - if (!path_exists(path)) { - fprintf( - stderr, - "Path '%s' does not exist in --%s='%s'\n", - path.c_str(), - flagname, - path_list.c_str()); - return false; - } - } - return true; -} - -/// Returns true if `val` is positive. 
-static bool validate_positive_int32(const char* flagname, int32_t val) { - if (val <= 0) { - fprintf( - stderr, "Value must be positive for --%s=%" PRId32 "\n", flagname, val); - return false; - } - return true; -} diff --git a/test/relocatable_runner.cpp b/test/relocatable_runner.cpp deleted file mode 100644 index 47616dfa698..00000000000 --- a/test/relocatable_runner.cpp +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace torch::executor; - -/** - * @file - * - * In some hardware environments, the same model may run on different cores for - * different inference requests. The same core may also see a power-cycle (i.e., - * power down and then back up) in between two inference requests. - * - * For ExecuTorch to work efficiently in these environments, we want to - * initialize the Method once once for the model and avoid re-initializing it - * for every inference. This can be achieved by restricting the runtime contexts - * (torch::executor::Program and torch::executor::Method) to live in a - * pre-allocated, shared, and persistent memory. - * - * This tool demonstrates that the memory can be managed this way. - */ - -static uint8_t method_allocator_pool[2 * 1024U * 1024U]; // 4 MB - -#define MAX_INPUTS_PER_MODEL 16 -#define MAX_OUTPUTS_PER_MODEL 8 - -DEFINE_string( - model_path, - "model.pte", - "Model serialized in flatbuffer format."); - -// These functions represent the work done on a worker core. -namespace worker { - -Program* load_program( - const void* file_data, - size_t file_data_len, - MemoryAllocator& allocator) { - // Wrap the data in a DataLoader. The Program will take a pointer to it, so it - // must live for at least as long as the Program instance. - auto loader = allocator.allocateInstance(); - ET_CHECK(loader != nullptr); - new (loader) util::BufferDataLoader(file_data, file_data_len); - - // Load the program. - Result program_result = Program::load(loader); - ET_CHECK(program_result.ok()); - - // Move the Program into worker memory. - auto program = allocator.allocateInstance(); - ET_CHECK(program != nullptr); - new (program) Program(std::move(program_result.get())); - - return program; -} - -MemoryManager* create_memory_manager( - MethodMeta* method_meta, - MemoryAllocator& worker_allocator) { - // Create the runtime allocator. - auto* method_allocator = worker_allocator.allocateInstance(); - ET_CHECK(method_allocator != nullptr); - new (method_allocator) - MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool); - - // Create the memory planned buffers. 
- size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); - Span* memory_planned_buffers = - worker_allocator.allocateList>(num_memory_planned_buffers); - ET_CHECK(memory_planned_buffers != nullptr); - for (size_t id = 0; id < num_memory_planned_buffers; ++id) { - const size_t buffer_size = - method_meta->memory_planned_buffer_size(id).get(); - ET_LOG( - Info, "Setting up planned buffer id %zu, size %zu.", id, buffer_size); - void* buffer = worker_allocator.allocate(buffer_size); - ET_CHECK(buffer != nullptr); - memory_planned_buffers[id] = {(uint8_t*)buffer, buffer_size}; - ET_LOG( - Info, - "Created memory_planned_buffers with size %zu and addr %p", - buffer_size, - buffer); - } - auto* planned_memory = - worker_allocator.allocateInstance(); - ET_CHECK(planned_memory != nullptr); - new (planned_memory) HierarchicalAllocator( - {memory_planned_buffers, num_memory_planned_buffers}); - - // The constant allocator is not currently used, but must be provided. - auto* const_allocator = worker_allocator.allocateInstance(); - ET_CHECK(const_allocator != nullptr); - new (const_allocator) MemoryAllocator(0, nullptr); - - // Assemble all of the allocators into the MemoryManager that the Method - // will use. - auto* memory_manager = worker_allocator.allocateInstance(); - ET_CHECK(memory_manager != nullptr); - new (memory_manager) MemoryManager(method_allocator, planned_memory); - - return memory_manager; -} - -Method* init_method( - Program* program, - const char* method_name, - MemoryAllocator& worker_allocator, - std::vector& input_sizes, - std::vector& output_sizes) { - Result method_meta = program->method_meta(method_name); - ET_CHECK(method_meta.ok()); - - MemoryManager* memory_manager = - create_memory_manager(&method_meta.get(), worker_allocator); - - // - // Create and load a method from the program, using the provided - // allocators. The Method is what actually runs the model. It is - // mutable, so should only be used by a single thread at at time, but it can - // be reused. - // - - auto* method = worker_allocator.allocateInstance(); - ET_CHECK(method != nullptr); - auto method_res = program->load_method(method_name, memory_manager); - ET_CHECK_MSG( - method_res.error() == Error::Ok, - "loading method('%s') failed with status 0x%" PRIx32, - method_name, - method_res.error()); - new (method) Method(std::move(method_res.get())); - - ET_LOG(Info, "Model method '%s' initialized.", method_name); - - // Gather the byte size of each input/output tensor. - const size_t input_size = method->inputs_size(); - for (size_t i = 0; i < input_size; i++) { - if (!method->get_input(i).isTensor()) { - ET_LOG(Info, "input %zu is not a tensor, skipping", i); - continue; - } - const auto& t = method->get_input(i).toTensor(); - input_sizes.push_back(t.nbytes()); - } - - const size_t output_size = method->outputs_size(); - for (size_t i = 0; i < output_size; i++) { - const auto& t = method->get_output(i).toTensor(); - output_sizes.push_back(t.nbytes()); - } - - return method; -} - -void inference_loop( - Method* method, - const std::vector& input_buffers, - const std::vector& output_buffers) { - ET_LOG( - Info, - "Assigning input pointers, receiving %lu inputs", - input_buffers.size()); - - // Prepare the inputs. 
- { - size_t bufi = 0; - for (size_t i = 0; i < method->inputs_size(); i++) { - if (!method->get_input(i).isTensor()) { - ET_LOG(Info, "input %zu is not a tensor, skipping", i); - continue; - } - const auto& t = method->get_input(i).toTensor(); - ET_CHECK_MSG( - bufi < input_buffers.size(), "Not enough input buffers for model"); - t.set_data(input_buffers[bufi++]); - } - } - ET_LOG(Info, "Inputs prepared."); - - // Prepare the outputs. - { - size_t bufi = 0; - for (size_t i = 0; i < method->outputs_size(); i++) { - if (!method->get_output(i).isTensor()) { - ET_LOG(Info, "output %zu is not a tensor, skipping", i); - continue; - } - const auto& t = method->get_output(i).toTensor(); - ET_CHECK_MSG( - bufi < output_buffers.size(), "Not enough output buffers for model"); - t.set_data(output_buffers[bufi++]); - } - } - ET_LOG(Info, "Outputs prepared."); - - // Run the model. - Error status = method->execute(); - ET_CHECK_MSG( - status == Error::Ok, - "method->execute() failed with status 0x%" PRIx32, - status); - ET_LOG(Info, "Model executed successfully."); -} - -} // namespace worker - -/* - * This is an example of how ExecuTorch stack should run on multiple - * processors setup where there is a control core for memory - * management and a worker core that runs the actual inference. - */ - -int main(int argc, char** argv) { - torch::executor::runtime_init(); - gflags::ParseCommandLineFlags(&argc, &argv, true); - - /* - * Step 1: The model gets loaded from file to memory on the control core - */ - std::shared_ptr file_data; - size_t file_size; - Error err = torch::executor::util::read_file_content( - FLAGS_model_path.c_str(), &file_data, &file_size); - ET_CHECK_MSG(err == Error::Ok, "read_file_content failed: %d", int(err)); - - /* - * Step 2: Prepare the memory space required for worker core - */ - // The actual allocation size can be backend/model specific and smaller - constexpr size_t kWorkerBufferSize = 1 * 1024U * 1024U; // 1 MB - auto worker_buffer = std::make_unique(kWorkerBufferSize); - MemoryAllocator worker_allocator(kWorkerBufferSize, worker_buffer.get()); - - /* - * Step 3: The worker core sets up the corresponding data structures for the - * program - */ - Program* program = - worker::load_program(file_data.get(), file_size, worker_allocator); - ET_LOG( - Info, - "Loaded %s and constructed program at %p", - FLAGS_model_path.c_str(), - program); - ET_CHECK(program != nullptr); - - /* - * Step 4: The worker core sets up the Method. Here we let the control - * core read out the I/O info from the Method. This can also be done on - * the control core from the program flatbuffer, though there is no - * direct API at the moment. - */ - - // Get the method name to execute. - const char* method_name = nullptr; - { - // Use the first method in the program. 
- const auto method_name_result = program->get_method_name(0); - ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); - method_name = *method_name_result; - } - ET_LOG(Info, "Using method %s", method_name); - - std::vector input_sizes; - std::vector output_sizes; - - Method* method = worker::init_method( - program, method_name, worker_allocator, input_sizes, output_sizes); - - ET_LOG( - Info, - "Number of inputs is %lu and number of outputs is %lu", - input_sizes.size(), - output_sizes.size()); - - /* - * Step 5: The control core or the applicaton code prepares the I/O - */ - - // Allocate and initialize input/output tensor buffers for the inference - std::vector input_buffers; - for (size_t buffer_size : input_sizes) { - void* buffer = malloc(buffer_size); - memset(static_cast(buffer), 0, buffer_size); - input_buffers.push_back(buffer); - } - ET_LOG(Info, "Allocated the inputs"); - - std::vector output_buffers; - for (size_t buffer_size : output_sizes) { - void* buffer = malloc(buffer_size); - memset(static_cast(buffer), 0, buffer_size); - output_buffers.push_back(buffer); - } - ET_LOG(Info, "Allocated the outputs"); - - /* - * Step 6: The control core forwards the inference request and the worker - * core runs the program. - */ - - // Run the inference on the inputs. CHECK-fails on error. - worker::inference_loop(method, input_buffers, output_buffers); - - for (void* buffer : input_buffers) { - free(buffer); - } - for (void* buffer : output_buffers) { - free(buffer); - } - - return 0; -} diff --git a/test/targets.bzl b/test/targets.bzl index 5c2a28cad15..3c2a69f592b 100644 --- a/test/targets.bzl +++ b/test/targets.bzl @@ -49,42 +49,3 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) - - # Test binary that can create multiple Executor instances in the same - # process. - runtime.cxx_binary( - name = "multi_runner", - srcs = ["multi_runner.cpp"], - deps = [ - "//executorch/runtime/core:core", - "//executorch/kernels/portable:generated_lib", - "//executorch/runtime/executor:program", - "//executorch/runtime/executor/test:managed_memory_manager", - "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/util:read_file", - "//executorch/util:util", - ], - external_deps = [ - "gflags", - ], - **get_oss_build_kwargs() - ) - - # Test binary that can create relocatable Executor instances. 
- runtime.cxx_binary( - name = "relocatable_runner", - srcs = ["relocatable_runner.cpp"], - deps = [ - "//executorch/kernels/portable:generated_lib", - "//executorch/runtime/executor:program", - "//executorch/configurations:executor_cpu_optimized", - "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/util:read_file", - "//executorch/util:util", - ], - external_deps = [ - "gflags", - ], - preprocessor_flags = [], - define_static_target = True, - ) From 6feb6399c8f55f84bf8a1ccfd0d19c48b0c347a4 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Mon, 26 Aug 2024 13:56:29 -0700 Subject: [PATCH 050/531] Remove all uses of PrepareInputTensors Differential Revision: D61750837 Pull Request resolved: https://github.com/pytorch/executorch/pull/4910 --- .../test/allocation_failure_stress_test.cpp | 15 +++-- .../test/backend_integration_test.cpp | 7 +-- .../executor/test/kernel_integration_test.cpp | 13 +++-- runtime/executor/test/method_test.cpp | 57 +++++++++---------- runtime/executor/test/targets.bzl | 8 +-- 5 files changed, 49 insertions(+), 51 deletions(-) diff --git a/runtime/executor/test/allocation_failure_stress_test.cpp b/runtime/executor/test/allocation_failure_stress_test.cpp index 750ecd0a1b8..9e0c857b933 100644 --- a/runtime/executor/test/allocation_failure_stress_test.cpp +++ b/runtime/executor/test/allocation_failure_stress_test.cpp @@ -11,12 +11,12 @@ #include #include +#include #include #include #include #include #include -#include #include @@ -24,6 +24,8 @@ using namespace ::testing; using exec_aten::ArrayRef; using exec_aten::Scalar; using exec_aten::Tensor; +using executorch::extension::FileDataLoader; +using executorch::extension::prepare_input_tensors; using executorch::runtime::Error; using executorch::runtime::MemoryAllocator; using executorch::runtime::MemoryManager; @@ -31,7 +33,6 @@ using executorch::runtime::Method; using executorch::runtime::Program; using executorch::runtime::Result; using executorch::runtime::testing::ManagedMemoryManager; -using torch::executor::util::FileDataLoader; constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U; constexpr size_t kDefaultRuntimeMemBytes = 32 * 1024U; @@ -85,10 +86,9 @@ TEST_F(AllocationFailureStressTest, End2EndIncreaseRuntimeMemUntilSuccess) { // Execution does not use the runtime allocator, so it should always succeed // once load was successful. - exec_aten::ArrayRef inputs = - torch::executor::util::PrepareInputTensors(*method); + auto input_cleanup = prepare_input_tensors(*method); + ASSERT_EQ(input_cleanup.error(), Error::Ok); err = method->execute(); - torch::executor::util::FreeInputs(inputs); ASSERT_EQ(err, Error::Ok); } EXPECT_GT(num_load_failures, 0) << "Expected at least some failures"; @@ -121,10 +121,9 @@ TEST_F(AllocationFailureStressTest, End2EndNonConstantMemUntilSuccess) { // Execution does not use the runtime allocator, so it should always succeed // once load was successful. 
- exec_aten::ArrayRef inputs = - torch::executor::util::PrepareInputTensors(*method); + auto input_cleanup = prepare_input_tensors(*method); + ASSERT_EQ(input_cleanup.error(), Error::Ok); err = method->execute(); - torch::executor::util::FreeInputs(inputs); ASSERT_EQ(err, Error::Ok); } EXPECT_GT(num_load_failures, 0) << "Expected at least some failures"; diff --git a/runtime/executor/test/backend_integration_test.cpp b/runtime/executor/test/backend_integration_test.cpp index 2445b659431..e3902bb9bc4 100644 --- a/runtime/executor/test/backend_integration_test.cpp +++ b/runtime/executor/test/backend_integration_test.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #include @@ -454,10 +454,9 @@ TEST_P(BackendIntegrationTest, EndToEndTestWithProcessedAsHandle) { EXPECT_FALSE(spy_loader.WasFreed(init_processed->data())); auto method(std::move(method_res.get())); // Execute the model. - exec_aten::ArrayRef inputs = - torch::executor::util::PrepareInputTensors(method); + auto input_cleanup = executorch::extension::prepare_input_tensors(method); + ASSERT_EQ(input_cleanup.error(), Error::Ok); auto err = method.execute(); - torch::executor::util::FreeInputs(inputs); EXPECT_EQ(err, Error::Ok); // Check that the processed buffer was passed to execute() as the handle. diff --git a/runtime/executor/test/kernel_integration_test.cpp b/runtime/executor/test/kernel_integration_test.cpp index 83ade9f7ff4..3e7da810933 100644 --- a/runtime/executor/test/kernel_integration_test.cpp +++ b/runtime/executor/test/kernel_integration_test.cpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include #include @@ -158,12 +158,15 @@ class KernelIntegrationTest : public ::testing::Test { method_ = std::make_unique(std::move(method.get())); // Set up its inputs. - inputs_ = torch::executor::util::PrepareInputTensors(*method_); + auto inputs_cleanup = + executorch::extension::prepare_input_tensors(*method_); + ASSERT_EQ(inputs_cleanup.error(), Error::Ok); + inputs_cleanup_ = std::make_unique( + std::move(*inputs_cleanup)); } void TearDown() override { - torch::executor::util::FreeInputs(inputs_); - inputs_ = {}; + inputs_cleanup_.reset(); } private: @@ -173,7 +176,7 @@ class KernelIntegrationTest : public ::testing::Test { // Must outlive method_ std::unique_ptr program_; std::unique_ptr mmm_; - ArrayRef inputs_; + std::unique_ptr inputs_cleanup_; protected: // An executable method that will call the kernel associated with control_. diff --git a/runtime/executor/test/method_test.cpp b/runtime/executor/test/method_test.cpp index 827bb2daa11..06b84d338e1 100644 --- a/runtime/executor/test/method_test.cpp +++ b/runtime/executor/test/method_test.cpp @@ -10,17 +10,18 @@ #include #include +#include #include #include #include #include #include #include -#include #include using namespace ::testing; using exec_aten::ArrayRef; +using executorch::extension::prepare_input_tensors; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::Method; @@ -80,8 +81,8 @@ TEST_F(MethodTest, MoveTest) { ASSERT_EQ(method.error(), Error::Ok); // Can execute the method. 
- exec_aten::ArrayRef inputs = - torch::executor::util::PrepareInputTensors(*method); + auto input_cleanup = prepare_input_tensors(*method); + ASSERT_EQ(input_cleanup.error(), Error::Ok); Error err = method->execute(); ASSERT_EQ(err, Error::Ok); @@ -95,8 +96,6 @@ TEST_F(MethodTest, MoveTest) { // Can execute the new method. err = new_method.execute(); ASSERT_EQ(err, Error::Ok); - - torch::executor::util::FreeInputs(inputs); } TEST_F(MethodTest, GetInputTests) { @@ -173,8 +172,8 @@ TEST_F(MethodTest, SetPrimInputTest) { ASSERT_EQ(method.error(), Error::Ok); // Can execute the method. - exec_aten::ArrayRef inputs = - torch::executor::util::PrepareInputTensors(*method); + auto input_cleanup = prepare_input_tensors(*method); + ASSERT_EQ(input_cleanup.error(), Error::Ok); // The args to the method are x, y, alpha. x and y are tensors handled above // alpha is a prim. @@ -189,8 +188,6 @@ TEST_F(MethodTest, SetPrimInputTest) { Error err = method->execute(); EXPECT_EQ(err, Error::Ok); - - torch::executor::util::FreeInputs(inputs); } TEST_F(MethodTest, MethodMetaTest) { @@ -297,28 +294,28 @@ TEST_F(MethodTest, ConstantBufferTest) { ASSERT_EQ(err, Error::Ok); } -// TODO(T161163608): Test is disabled due to a resize bug in tensor_index_out of -// the portable op lib - -// TEST_F(MethodTest, OptionalTensorListDeserialization) { -// ManagedMemoryManager mmm(kDefaultNonConstMemBytes, -// kDefaultRuntimeMemBytes); Result method = -// index_program_->load_method("forward", &mmm.get()); -// ASSERT_EQ(method.error(), Error::Ok); +/* + * TODO(T161163608): Test is disabled due to a resize bug in tensor_index_out of + * the portable op lib -// // Can execute the method. -// exec_aten::ArrayRef inputs = -// executorch::runtime::util::PrepareInputTensors(*method); -// Error err = method->execute(); -// ASSERT_EQ(err, Error::Ok); +TEST_F(MethodTest, OptionalTensorListDeserialization) { + ManagedMemoryManager mmm(kDefaultNonConstMemBytes, + kDefaultRuntimeMemBytes); Result method = + index_program_->load_method("forward", &mmm.get()); + ASSERT_EQ(method.error(), Error::Ok); -// EXPECT_EQ(method->inputs_size(), 1); + // Can execute the method. 
+ auto input_cleanup = prepare_input_tensors(*method); + ASSERT_EQ(input_cleanup.error(), Error::Ok); + Error err = method->execute(); + ASSERT_EQ(err, Error::Ok); -// auto outputs = method->get_output(0); -// EXPECT_EQ(outputs.toTensor().dim(), 3); -// EXPECT_EQ(outputs.toTensor().size(0), 5); -// EXPECT_EQ(outputs.toTensor().size(1), 2); -// EXPECT_EQ(outputs.toTensor().size(2), 10); + EXPECT_EQ(method->inputs_size(), 1); -// executorch::runtime::util::FreeInputs(inputs); -// } + auto outputs = method->get_output(0); + EXPECT_EQ(outputs.toTensor().dim(), 3); + EXPECT_EQ(outputs.toTensor().size(0), 5); + EXPECT_EQ(outputs.toTensor().size(1), 2); + EXPECT_EQ(outputs.toTensor().size(2), 10); +} +*/ diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index 5ba989ef86a..a329a4884ef 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -120,7 +120,7 @@ def define_common_targets(is_fbcode = False): "//executorch/runtime/executor:program", "//executorch/kernels/portable:generated_lib", "//executorch/extension/data_loader:file_data_loader", - "//executorch/util:util", + "//executorch/extension/runner_util:inputs", ], env = modules_env, ) @@ -133,8 +133,8 @@ def define_common_targets(is_fbcode = False): deps = [ ":managed_memory_manager", "//executorch/runtime/executor:program", - "//executorch/util:util", "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/runner_util:inputs", "//executorch/kernels/portable:generated_lib", ], env = modules_env, @@ -189,12 +189,12 @@ def define_common_targets(is_fbcode = False): deps = [ ":managed_memory_manager", "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/runner_util:inputs", "//executorch/runtime/core:core", "//executorch/runtime/executor:program", "//executorch/runtime/kernel:kernel_runtime_context", "//executorch/runtime/kernel:operator_registry", "//executorch/runtime/platform:platform", - "//executorch/util:util", ], env = modules_env, ) @@ -210,7 +210,7 @@ def define_common_targets(is_fbcode = False): "//executorch/runtime/executor:program", "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:file_data_loader", - "//executorch/util:util", + "//executorch/extension/runner_util:inputs", ], env = { # The tests use these vars to find the program files to load. From f560682667c2b42bc5fe9b6d5bc5f8df2a9bd6e1 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 26 Aug 2024 14:11:40 -0700 Subject: [PATCH 051/531] Allow single EValue to be passed to Module execute. Differential Revision: D61799496 Pull Request resolved: https://github.com/pytorch/executorch/pull/4907 --- docs/source/extension-module.md | 6 +- .../Sources/MobileNet/MobileNetClassifier.mm | 2 +- .../llava/runner/llava_image_prefiller.h | 2 +- .../llava/runner/llava_text_decoder_runner.h | 2 +- extension/llm/runner/text_decoder_runner.cpp | 2 +- extension/module/module.h | 57 ++++++++++++++++++- extension/module/test/module_test.cpp | 26 ++++----- 7 files changed, 74 insertions(+), 23 deletions(-) diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 97528c95405..7516184d1cc 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -22,7 +22,7 @@ Tensor::SizesType sizes[] = {1, 3, 256, 256}; TensorImpl tensor(ScalarType::Float, std::size(sizes), sizes, input); // Perform an inference. 
-const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); // Check for success or failure. if (result.ok()) { @@ -105,13 +105,13 @@ Note: `method_meta()` will try to force-load the `Method` when called for the fi Assuming that the `Program`'s method names and their input format is known ahead of time, we rarely need to query for those and can run the methods directly by name using the `execute()` function: ```cpp -const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); +const auto result = module.execute("forward", Tensor(&tensor)); ``` Which can also be simplified for the standard `forward()` method name as: ```cpp -const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); ``` Note: `execute()` or `forward()` will try to force load the `Program` and the `Method` when called for the first time. Therefore, the first inference will take more time than subsequent ones as it loads the model lazily and prepares it for execution unless the `Program` or `Method` was loaded explicitly earlier using the corresponding functions. diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm index cc5a5c81394..733dcdc8b35 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm @@ -35,7 +35,7 @@ - (BOOL)classifyWithInput:(float*)input error:(NSError**)error { int32_t sizes[] = {1, kChannels, kSize, kSize}; TensorImpl inputTensor(ScalarType::Float, std::size(sizes), sizes, input); - const auto result = _module->forward({EValue(Tensor(&inputTensor))}); + const auto result = _module->forward(Tensor(&inputTensor)); if (!result.ok()) { if (error) { diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index 4d0a07b9a66..50c981026a6 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -30,7 +30,7 @@ class LlavaImagePrefiller : public ImagePrefiller { image.data.data(), {3, image.height, image.width}, ScalarType::Byte); // Run image encoder std::vector image_encoder_outputs = ET_UNWRAP(module_->execute( - kImageEncoderMethod, {managed_images.get_aliasing_tensor()})); + kImageEncoderMethod, managed_images.get_aliasing_tensor())); // inputs:[start_pos, embeds] ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long); diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index 8303b295f53..e70ba59d513 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -27,7 +27,7 @@ class LlavaTextDecoderRunner : public TextDecoderRunner { // run token embedding std::vector token_embedding_outputs = - ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, {tokens})); + ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, tokens)); // run text model std::vector outputs_res = ET_UNWRAP(module_->execute( diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index a0963769eab..5b77c69825f 100644 --- 
a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -60,7 +60,7 @@ ::executorch::runtime::Result TextDecoderRunner::step( (void)managed_start_pos; // unused ::executorch::runtime::Result> - outputs_res = module_->forward({tokens}); + outputs_res = module_->forward(tokens); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, diff --git a/extension/module/module.h b/extension/module/module.h index a0b575d5bf6..4289a3c9c76 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -181,6 +181,25 @@ class Module final { const std::string& method_name, const std::vector<::executorch::runtime::EValue>& input); + /** + * Execute a specific method with a single input value. + * Loads the program and method before executing if needed. + * + * @param[in] method_name The name of the method to execute. + * @param[in] input A value to be passed to the method. + * + * @returns A Result object containing either a vector of output values + * from the method or an error to indicate failure. + */ + ET_NODISCARD + ::executorch::runtime::Result> + execute( + const std::string& method_name, + const ::executorch::runtime::EValue& input) { + return execute( + method_name, std::vector<::executorch::runtime::EValue>{input}); + } + /** * Execute a specific method without any input values. * Loads the program and method before executing if needed. @@ -193,7 +212,7 @@ class Module final { ET_NODISCARD ::executorch::runtime::Result> execute(const std::string& method_name) { - return execute(method_name, {}); + return execute(method_name, std::vector<::executorch::runtime::EValue>{}); } /** @@ -217,6 +236,23 @@ class Module final { return result[0]; } + /** + * Retrieve the output value of a specific method with a single input value. + * Loads the program and method before execution if needed. + * + * @param[in] method_name The name of the method to execute. + * @param[in] input A value to be passed to the method. + * + * @returns A Result object containing either the first output value from the + * method or an error to indicate failure. + */ + ET_NODISCARD + ::executorch::runtime::Result<::executorch::runtime::EValue> get( + const std::string& method_name, + const ::executorch::runtime::EValue& input) { + return get(method_name, std::vector<::executorch::runtime::EValue>{input}); + } + /** * Retrieve the output value of a specific method without any input values. * Loads the program and method before execution if needed. @@ -229,7 +265,7 @@ class Module final { ET_NODISCARD ::executorch::runtime::Result<::executorch::runtime::EValue> get( const std::string& method_name) { - return get(method_name, {}); + return get(method_name, std::vector<::executorch::runtime::EValue>{}); } /** @@ -247,6 +283,21 @@ class Module final { return execute("forward", input); } + /** + * Execute the 'forward' method with a single value. + * Loads the program and method before executing if needed. + * + * @param[in] input A value for the 'forward' method. + * + * @returns A Result object containing either a vector of output values + * from the 'forward' method or an error to indicate failure. + */ + ET_NODISCARD + ::executorch::runtime::Result> + forward(const ::executorch::runtime::EValue& input) { + return forward(std::vector<::executorch::runtime::EValue>{input}); + } + /** * Execute the 'forward' method without any input values. * Loads the program and method before executing if needed. 
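The call-site effect of these overloads matches the extension-module.md update earlier in this patch. A condensed sketch, reusing that example's tensor setup and assuming `module` is an already-constructed Module:

```cpp
float input[1 * 3 * 256 * 256];
Tensor::SizesType sizes[] = {1, 3, 256, 256};
TensorImpl tensor(ScalarType::Float, std::size(sizes), sizes, input);

// Previously the single input had to be boxed into a one-element vector:
//   module.forward({EValue(Tensor(&tensor))});
// With the new overloads, forward(), execute() and get() accept one value:
const auto result = module.forward(Tensor(&tensor));
if (result.ok()) {
  const auto output = result->at(0).toTensor();
}
```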
@@ -257,7 +308,7 @@ class Module final { ET_NODISCARD ::executorch::runtime::Result> forward() { - return forward({}); + return forward(std::vector<::executorch::runtime::EValue>{}); } /** diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index d5bf1f52dfb..4ef454e1c75 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -129,7 +129,7 @@ TEST_F(ModuleTest, TestExecute) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); + const auto result = module.execute("forward", Tensor(&tensor)); EXPECT_TRUE(result.ok()); EXPECT_TRUE(module.is_loaded()); EXPECT_TRUE(module.is_method_loaded("forward")); @@ -150,7 +150,7 @@ TEST_F(ModuleTest, TestExecutePreload) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); + const auto result = module.execute("forward", Tensor(&tensor)); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -169,7 +169,7 @@ TEST_F(ModuleTest, TestExecutePreload_method) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); + const auto result = module.execute("forward", Tensor(&tensor)); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -191,7 +191,7 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); + const auto result = module.execute("forward", Tensor(&tensor)); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -223,7 +223,7 @@ TEST_F(ModuleTest, TestGet) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.get("forward", {EValue(Tensor(&tensor))}); + const auto result = module.get("forward", Tensor(&tensor)); EXPECT_TRUE(result.ok()); const auto data = result->toTensor().const_data_ptr(); @@ -237,7 +237,7 @@ TEST_F(ModuleTest, TestForward) { std::array sizes{1, 2}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module->forward({EValue(Tensor(&tensor))}); + const auto result = module->forward(Tensor(&tensor)); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -247,7 +247,7 @@ TEST_F(ModuleTest, TestForward) { std::array input2{2, 3}; TensorImpl tensor2( ScalarType::Float, sizes.size(), sizes.data(), input2.data()); - const auto result2 = module->forward({EValue(Tensor(&tensor2))}); + const auto result2 = module->forward(Tensor(&tensor2)); EXPECT_TRUE(result2.ok()); const auto data2 = result->at(0).toTensor().const_data_ptr(); @@ -258,7 +258,7 @@ TEST_F(ModuleTest, TestForward) { TEST_F(ModuleTest, TestForwardWithInvalidInputs) { Module module(model_path_); - const auto result = module.forward({EValue()}); + const auto result = module.forward(EValue()); EXPECT_FALSE(result.ok()); } @@ -308,18 +308,18 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result1 = module1->execute("forward", {EValue(Tensor(&tensor))}); + auto result1 = 
module1->execute("forward", Tensor(&tensor)); EXPECT_TRUE(result1.ok()); auto module2 = std::make_unique(module1->program()); - auto result2 = module2->execute("forward", {EValue(Tensor(&tensor))}); + auto result2 = module2->execute("forward", Tensor(&tensor)); EXPECT_TRUE(result2.ok()); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = module2->execute("forward", {EValue(Tensor(&tensor))}); + auto result3 = module2->execute("forward", Tensor(&tensor)); EXPECT_TRUE(result3.ok()); } @@ -356,7 +356,7 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result = module.execute("forward", {EValue(Tensor(&tensor))}); + auto result = module.execute("forward", Tensor(&tensor)); EXPECT_TRUE(result.ok()); auto data = result->at(0).toTensor().const_data_ptr(); @@ -385,7 +385,7 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), (void*)input.data()); - const auto result = module.forward({EValue(Tensor(&tensor))}); + const auto result = module.forward(Tensor(&tensor)); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); From 2b2911b50f4a9da31de31833b9f1d3d455b45266 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 26 Aug 2024 14:48:46 -0700 Subject: [PATCH 052/531] Add a script to install Apple certificate for CI iOS jobs Differential Revision: D61707676 Pull Request resolved: https://github.com/pytorch/executorch/pull/4703 --- .ci/scripts/setup-ios.sh | 33 +++++++++++++ .github/workflows/apple.yml | 14 +++++- build/test_ios_ci.sh | 49 +++++++++++++++++++ .../ExecuTorchDemo.xcodeproj/project.pbxproj | 19 ++++++- 4 files changed, 112 insertions(+), 3 deletions(-) create mode 100755 .ci/scripts/setup-ios.sh diff --git a/.ci/scripts/setup-ios.sh b/.ci/scripts/setup-ios.sh new file mode 100755 index 00000000000..519cd2581eb --- /dev/null +++ b/.ci/scripts/setup-ios.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -exu + +# This script follows the instructions from GitHub to install an Apple certificate +# https://docs.github.com/en/actions/use-cases-and-examples/deploying/installing-an-apple-certificate-on-macos-runners-for-xcode-development + +CERTIFICATE_PATH="${RUNNER_TEMP}"/build_certificate.p12 +PP_PATH="${RUNNER_TEMP}"/build_pp.mobileprovision +KEYCHAIN_PATH="${RUNNER_TEMP}"/app-signing.keychain-db + +# Import certificate and provisioning profile from secrets +echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH +echo -n "$BUILD_PROVISION_PROFILE_BASE64" | base64 --decode -o $PP_PATH + +# Create a temporary keychain +security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security set-keychain-settings -lut 21600 $KEYCHAIN_PATH +security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH + +# Import certificate to the keychain +security import $CERTIFICATE_PATH -P "" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH +security set-key-partition-list -S apple-tool:,apple: -k "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security list-keychain -d user -s $KEYCHAIN_PATH + +# Apply provisioning profile +mkdir -p ~/Library/MobileDevice/Provisioning\ Profiles +cp $PP_PATH ~/Library/MobileDevice/Provisioning\ Profiles diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 60022b81f9e..5f19831250c 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -8,6 +8,7 @@ on: pull_request: paths: - .ci/docker/** + - .ci/scripts/setup-ios.sh - .github/workflows/apple.yml - install_requirements.sh - backends/apple/** @@ -27,24 +28,35 @@ jobs: test-demo-ios: name: test-demo-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + secrets: inherit with: runner: macos-latest-xlarge python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + upload-artifact: ios-apps script: | BUILD_TOOL=cmake .ci/scripts/setup-conda.sh + # Setup Apple certificate for iOS development + BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + # Build and test iOS Demo App PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/test_ios_ci.sh + build/test_ios_ci.sh ${ARTIFACTS_DIR_NAME} build-frameworks-ios: name: build-frameworks-ios diff --git a/build/test_ios_ci.sh b/build/test_ios_ci.sh index 5fa6ef7d246..50c6448d4b2 100755 --- a/build/test_ios_ci.sh +++ b/build/test_ios_ci.sh @@ -11,6 +11,9 @@ APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" +# If this is set, copy the build artifacts to this directory +ARTIFACTS_DIR_NAME="$1" + finish() { EXIT_STATUS=$? 
if xcrun simctl list | grep -q "$SIMULATOR_NAME"; then @@ -64,3 +67,49 @@ xcodebuild test \ -project "$APP_PATH.xcodeproj" \ -scheme MobileNetClassifierTest \ -destination name="$SIMULATOR_NAME" + +# NB: https://docs.aws.amazon.com/devicefarm/latest/developerguide/test-types-ios-xctest-ui.html +say "Package The Test Suite" + +xcodebuild build-for-testing \ + -project "$APP_PATH.xcodeproj" \ + -scheme MobileNetClassifierTest \ + -destination platform="iOS" \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER=ExecuTorchDemo \ + CODE_SIGN_IDENTITY="iPhone Distribution" + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app +MODE="Debug" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +MOCK_APP_NAME=ExecuTorchDemo + +ls -lah +cp -r "${MOCK_APP_NAME}.app" Payload && zip -vr "${MOCK_APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${MOCK_APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${MOCK_APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${MOCK_APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj index cbcb03a3b72..857c5252845 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj @@ -59,6 +59,13 @@ remoteGlobalIDString = 03C818302AC79FCD0084CC29; remoteInfo = ImageClassification; }; + 84EF1FE92C7850B6005922B4 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 032C01672AC228E5002955E1 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 032C016E2AC228E6002955E1; + remoteInfo = App; + }; /* End PBXContainerItemProxy section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -330,6 +337,7 @@ buildRules = ( ); dependencies = ( + 84EF1FEA2C7850B6005922B4 /* PBXTargetDependency */, ); name = MobileNetClassifierTest; packageProductDependencies = ( @@ -489,6 +497,11 @@ target = 03C818302AC79FCD0084CC29 /* ImageClassification */; targetProxy = 03C818452AC7A0DB0084CC29 /* PBXContainerItemProxy */; }; + 84EF1FEA2C7850B6005922B4 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 032C016E2AC228E6002955E1 /* App */; + targetProxy = 84EF1FE92C7850B6005922B4 /* PBXContainerItemProxy */; + }; /* End PBXTargetDependency section */ /* Begin XCBuildConfiguration section */ @@ -633,7 +646,7 @@ INFOPLIST_KEY_UIRequiresFullScreen = YES; INFOPLIST_KEY_UISupportedInterfaceOrientations = UIInterfaceOrientationPortrait; MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo.test; PRODUCT_NAME = "$(PROJECT_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; @@ -661,7 +674,7 @@ INFOPLIST_KEY_UIRequiresFullScreen = YES; INFOPLIST_KEY_UISupportedInterfaceOrientations = UIInterfaceOrientationPortrait; 
MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.demo.test; PRODUCT_NAME = "$(PROJECT_NAME)"; PROVISIONING_PROFILE_SPECIFIER = ""; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; @@ -703,6 +716,7 @@ SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/ExecuTorchDemo.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/ExecuTorchDemo"; }; name = Debug; }; @@ -717,6 +731,7 @@ PRODUCT_NAME = "$(TARGET_NAME)"; SUPPORTS_MACCATALYST = NO; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/ExecuTorchDemo.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/ExecuTorchDemo"; }; name = Release; }; From 1f0487d65875efd54eb05bf76a649a3b4378785b Mon Sep 17 00:00:00 2001 From: Lunwen He Date: Mon, 26 Aug 2024 15:05:39 -0700 Subject: [PATCH 053/531] Update llama special tokens Differential Revision: D61730848 Pull Request resolved: https://github.com/pytorch/executorch/pull/4876 --- examples/models/llama2/tokenizer/llama_tiktoken.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/models/llama2/tokenizer/llama_tiktoken.cpp b/examples/models/llama2/tokenizer/llama_tiktoken.cpp index 60620a9b333..0a1dddcc22e 100644 --- a/examples/models/llama2/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama2/tokenizer/llama_tiktoken.cpp @@ -23,15 +23,15 @@ _get_default_special_tokens() { "<|end_of_text|>", "<|reserved_special_token_0|>", "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", + "<|finetune_right_pad_id|>", + "<|step_id|>", "<|start_header_id|>", "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>"}); - + "<|eom_id|>", + "<|eot_id|>", + "<|python_tag|>"}); // pad the rest of the special tokens with reserved tokens - ssize_t reserved_special_token_num = 5; + ssize_t reserved_special_token_num = 2; while (special_tokens->size() < kSpecialTokensSize) { special_tokens->emplace_back( "<|reserved_special_token_" + From b284866b5f5faa2965b11328c275fd8e5ba98e5c Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Mon, 26 Aug 2024 15:26:26 -0700 Subject: [PATCH 054/531] Remove torch:: references from examples/portable Differential Revision: D61742292 Pull Request resolved: https://github.com/pytorch/executorch/pull/4882 --- .../portable/custom_ops/custom_ops_1_out.cpp | 12 ++++++++---- .../portable/custom_ops/custom_ops_2_out.cpp | 12 +++++++++--- .../executor_runner/executor_runner.cpp | 19 ++++++++++++++----- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/examples/portable/custom_ops/custom_ops_1_out.cpp b/examples/portable/custom_ops/custom_ops_1_out.cpp index 14c327071cf..c1f1dee0ceb 100644 --- a/examples/portable/custom_ops/custom_ops_1_out.cpp +++ b/examples/portable/custom_ops/custom_ops_1_out.cpp @@ -13,7 +13,7 @@ namespace native { using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace { void check_preconditions(const Tensor& in, Tensor& out) { @@ -35,10 +35,13 @@ void check_preconditions(const Tensor& in, Tensor& out) { ssize_t(in.numel())); } } // namespace -// mul3.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!) -Tensor& mul3_out_impl(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - (void)ctx; +// mul3.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!) 
+// ExecuTorch-compatible function signature, with a KernelRuntimeContext. +Tensor& mul3_out_impl( + ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { check_preconditions(in, out); float* out_data = out.mutable_data_ptr(); const float* in_data = in.const_data_ptr(); @@ -47,5 +50,6 @@ Tensor& mul3_out_impl(RuntimeContext& ctx, const Tensor& in, Tensor& out) { } return out; } + } // namespace native } // namespace custom diff --git a/examples/portable/custom_ops/custom_ops_2_out.cpp b/examples/portable/custom_ops/custom_ops_2_out.cpp index ffb3a8be443..f792d06f29e 100644 --- a/examples/portable/custom_ops/custom_ops_2_out.cpp +++ b/examples/portable/custom_ops/custom_ops_2_out.cpp @@ -13,7 +13,7 @@ namespace native { using exec_aten::ScalarType; using exec_aten::Tensor; -using torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace { void check_preconditions(const Tensor& in, Tensor& out) { @@ -35,7 +35,9 @@ void check_preconditions(const Tensor& in, Tensor& out) { ssize_t(in.numel())); } } // namespace + // mul4.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!) +// ATen-compatible function signature, without a KernelRuntimeContext. Tensor& mul4_out_impl(const Tensor& in, Tensor& out) { check_preconditions(in, out); float* out_data = out.mutable_data_ptr(); @@ -46,8 +48,12 @@ Tensor& mul4_out_impl(const Tensor& in, Tensor& out) { return out; } -Tensor& mul4_out_impl(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - (void)ctx; +// mul4.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!) +// ExecuTorch-compatible function signature, with a KernelRuntimeContext. +Tensor& mul4_out_impl( + ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { mul4_out_impl(in, out); return out; } diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index a0644487d23..93c150c0b90 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -37,11 +37,20 @@ DEFINE_string( "model.pte", "Model serialized in flatbuffer format."); -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::extension::FileDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; int main(int argc, char** argv) { - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -154,7 +163,7 @@ int main(int argc, char** argv) { // Allocate input tensors and set all of their elements to 1. The `inputs` // variable owns the allocated memory and must live past the last call to // `execute()`. - auto inputs = util::prepare_input_tensors(*method); + auto inputs = executorch::extension::prepare_input_tensors(*method); ET_CHECK_MSG( inputs.ok(), "Could not prepare inputs: 0x%" PRIx32, @@ -176,7 +185,7 @@ int main(int argc, char** argv) { status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); // Print the first and last 100 elements of long lists of scalars. 
- std::cout << torch::executor::util::evalue_edge_items(100); + std::cout << executorch::extension::evalue_edge_items(100); for (int i = 0; i < outputs.size(); ++i) { std::cout << "Output " << i << ": " << outputs[i] << std::endl; } From 2c7b7e8388c82aeabf75e1f2dd1752b86f994b64 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 26 Aug 2024 15:41:30 -0700 Subject: [PATCH 055/531] [llava] Enable dynamic shape for image preprocessor Differential Revision: D61818152 Pull Request resolved: https://github.com/pytorch/executorch/pull/4821 --- .ci/scripts/test_llava.sh | 15 ++++- examples/models/llava/export_llava.py | 11 +--- examples/models/llava/image_util.py | 78 ++++++++++++++++++++++++++ examples/models/llava/model.py | 57 ++++++++++++------- examples/models/llava/test/test_pte.py | 13 ++++- 5 files changed, 142 insertions(+), 32 deletions(-) create mode 100644 examples/models/llava/image_util.py diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 60589c96d47..4e167b6d895 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -54,6 +54,13 @@ export_llava() { $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts } +# Download a new image with different size, to test if the model can handle different image sizes +prepare_image_tensor() { + echo "Downloading image" + curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg + $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt +} + run_and_verify() { NOW=$(date +"%H:%M:%S") echo "Starting to run llava runner at ${NOW}" @@ -79,7 +86,12 @@ run_and_verify() { # verify result.txt RESULT=$(cat result.txt) # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. - EXPECTED_PREFIX="ASSISTANT:" + if [[ "$(uname)" == "Darwin" ]]; then + EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress on a basketball court. There are several players on the court, with one player in the foreground holding a basketball, and" + else + # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. 
+ EXPECTED_PREFIX="ASSISTANT:" + fi if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then echo "Expected result prefix: ${EXPECTED_PREFIX}" echo "Actual result: ${RESULT}" @@ -96,4 +108,5 @@ run_and_verify() { cmake_install_executorch_libraries cmake_build_llava_runner export_llava +prepare_image_tensor run_and_verify diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 4f2aa6576b9..1df7c242dca 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -22,6 +22,7 @@ from executorch.examples.models.llama2.source_transformation.sdpa import ( replace_sdpa_with_custom_op, ) +from executorch.examples.models.llava.image_util import serialize_image from executorch.examples.models.llava.model import LlavaModel from executorch.exir import ( EdgeCompileConfig, @@ -35,7 +36,6 @@ from executorch.extension.llm.export.builder import DType, LLMEdgeManager from executorch.extension.llm.tokenizer.tokenizer import Tokenizer -from torch import nn from torch.ao.quantization.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, XNNPACKQuantizer, @@ -231,14 +231,7 @@ def get_image_tensor_for_llava_runner(llava_model): # llava runner doesn't have image reader so an image tensor is needed. (resized,) = llava_model.get_example_inputs() - copy = torch.tensor(resized) - m = nn.Module() - par = nn.Parameter(copy, requires_grad=False) - m.register_parameter("0", par) - tensors = torch.jit.script(m) - tensors.save("image.pt") - - logging.info("Saved image tensor to image.pt") + serialize_image(resized, "image.pt") def get_tokenizer_for_llava_runner(llava_model): diff --git a/examples/models/llava/image_util.py b/examples/models/llava/image_util.py new file mode 100644 index 00000000000..bf5e331d61c --- /dev/null +++ b/examples/models/llava/image_util.py @@ -0,0 +1,78 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Utility functions for image processing. Run it with your image: + +# python image_util.py --image-path + +import logging +from argparse import ArgumentParser + +import torch +import torchvision +from PIL import Image +from torch import nn + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +def prepare_image(image: Image, target_h: int, target_w: int) -> torch.Tensor: + """Read image into a tensor and resize the image so that it fits in + a target_h x target_w canvas. + + Args: + image (Image): An Image object. + target_h (int): Target height. + target_w (int): Target width. + + Returns: + torch.Tensor: resized image tensor. 
+ """ + img = torchvision.transforms.functional.pil_to_tensor(image) + # height ratio + ratio_h = img.shape[1] / target_h + # width ratio + ratio_w = img.shape[2] / target_w + # resize the image so that it fits in a target_h x target_w canvas + ratio = max(ratio_h, ratio_w) + output_size = (int(img.shape[1] / ratio), int(img.shape[2] / ratio)) + img = torchvision.transforms.Resize(size=output_size)(img) + return img + + +def serialize_image(image: torch.Tensor, path: str) -> None: + copy = torch.tensor(image) + m = nn.Module() + par = nn.Parameter(copy, requires_grad=False) + m.register_parameter("0", par) + tensors = torch.jit.script(m) + tensors.save(path) + + logging.info(f"Saved image tensor to {path}") + + +def main(): + parser = ArgumentParser() + parser.add_argument( + "--image-path", + required=True, + help="Path to the image.", + ) + parser.add_argument( + "--output-path", + default="image.pt", + ) + args = parser.parse_args() + + image = Image.open(args.image_path) + image_tensor = prepare_image(image, target_h=336, target_w=336) + serialize_image(image_tensor, args.output_path) + + +if __name__ == "__main__": + main() diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 4f975e2ed4b..b4a203d7419 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -6,20 +6,18 @@ # An ExecuTorch friendly implementation of Llava-1.5. -import math - import re from typing import Any, Dict, Optional import requests import torch -import torchvision from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer from executorch.examples.models.llama2.source_transformation.sdpa import ( replace_sdpa_with_custom_op, ) +from executorch.examples.models.llava.image_util import prepare_image from executorch.examples.models.model_base import EagerModelBase from PIL import Image @@ -156,19 +154,32 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor: return image_features def image_preprocess(self, img: torch.Tensor) -> torch.Tensor: - w = max(img.shape[1], img.shape[2]) + target_h = self.image_processor.crop_size["height"] + target_w = self.image_processor.crop_size["width"] # pad the image with median rgb value, to make a square - v_padding = (w - img.shape[1]) / 2 - h_padding = (w - img.shape[2]) / 2 - l_pad = int(math.ceil(h_padding)) - t_pad = int(math.ceil(v_padding)) - r_pad = int(math.floor(h_padding)) - b_pad = int(math.floor(v_padding)) - resized = F.pad( + l_pad = (target_w - img.shape[2]) // 2 + t_pad = (target_h - img.shape[1]) // 2 + # ceil division + r_pad = -((target_w - img.shape[2]) // -2) + b_pad = -((target_h - img.shape[1]) // -2) + + torch._check(l_pad >= 0) + torch._check(t_pad >= 0) + torch._check(r_pad >= 0) + torch._check(b_pad >= 0) + + # This is different from the original implementation, due to export limitations. + resized = torch.nn.functional.pad( img, - padding=(l_pad, t_pad, r_pad, b_pad), - fill=tuple(int(x * 255) for x in self.image_processor.image_mean), + (l_pad, r_pad, t_pad, b_pad), ) + # originally: + # resized = F.pad( + # img, + # padding=(l_pad, t_pad, r_pad, b_pad), + # fill=tuple(int(x * 255) for x in self.image_mean), + # ) + # TODO: implement _upsample_bicubic_aa.out in portable kernel library. 
# here padded shape should be max(h, w) x max(h, w) # skipping resize for now due to missing _upsample_bicubic_aa kernel in portable @@ -287,13 +298,12 @@ def get_example_inputs(self): """Returns a resized image as input to model.forward().""" if self.resized_image: return self.resized_image - imagr = torchvision.transforms.functional.pil_to_tensor(self.image) - ratio = ( - max(imagr.shape[1], imagr.shape[2]) - / self.image_processor.crop_size["height"] + resized = prepare_image( + self.image, + self.image_processor.crop_size["height"], + self.image_processor.crop_size["width"], ) - output_size = (int(imagr.shape[1] / ratio), int(imagr.shape[2] / ratio)) - self.resized_image = (torchvision.transforms.Resize(size=output_size)(imagr),) + self.resized_image = (resized,) return self.resized_image def get_inputs_for_prefill(self): @@ -317,8 +327,13 @@ def get_dynamic_shapes(self): return self._get_image_dynamic_shapes() def _get_image_dynamic_shapes(self): - height = Dim("height", min=8, max=336) - width = Dim("width", min=28, max=336) + # only support even number of height and width for now + _height = Dim( + "_height", min=1, max=self.image_processor.crop_size["height"] // 2 + ) + _width = Dim("_width", min=1, max=self.image_processor.crop_size["width"] // 2) + height = 2 * _height + width = 2 * _width dynamic_shapes = [{1: height, 2: width}] return dynamic_shapes diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py index d793b2ae221..85c47cc1de5 100644 --- a/examples/models/llava/test/test_pte.py +++ b/examples/models/llava/test/test_pte.py @@ -8,9 +8,10 @@ import sys import torch - +from executorch.examples.models.llava.image_util import prepare_image from executorch.examples.models.llava.model import LlavaModel from executorch.extension.pybindings.portable_lib import _load_for_executorch +from PIL import Image # Custom ops has to be loaded after portable_lib. # I don't know how to stop UFMT so I'm just using if True: to avoid lint error @@ -24,6 +25,12 @@ def main(): args = sys.argv[1:] + if len(args) == 0: + print( + "Usage: python test_pte.py . If no image, will use default image." 
+ ) + sys.exit(1) + llava_module = _load_for_executorch(args[0]) llava_model = LlavaModel() @@ -31,6 +38,10 @@ def main(): prompt_before_image, resized, prompt_after_image = ( llava_model.get_inputs_for_prefill() ) + if len(args) == 2: + image_path = args[1] + image = Image.open(image_path) + resized = prepare_image(image, target_h=336, target_w=336) start_pos = 0 # pte prefill prompt before img From 4689c911637ecce0a57a8f284b1b4ce2bff7917a Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 26 Aug 2024 15:55:12 -0700 Subject: [PATCH 056/531] Swap --gc-sections to -dead_strip for clang compilation Differential Revision: D61799233 Pull Request resolved: https://github.com/pytorch/executorch/pull/4888 --- .ci/scripts/test_llava.sh | 12 ++++++++---- .github/workflows/trunk.yml | 28 ++++++++++++++++++++++++++++ CMakeLists.txt | 8 ++++++-- examples/models/llava/CMakeLists.txt | 6 +++++- 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 4e167b6d895..ec4a6b37d8a 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -8,6 +8,10 @@ set -exu # shellcheck source=/dev/null +BUILD_TYPE=${1:-Debug} + +echo "Building with BUILD_TYPE: $BUILD_TYPE" + if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then PYTHON_EXECUTABLE=python3 fi @@ -15,7 +19,7 @@ fi cmake_install_executorch_libraries() { cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ @@ -27,7 +31,7 @@ cmake_install_executorch_libraries() { -Bcmake-out . - cmake --build cmake-out -j9 --target install --config Debug + cmake --build cmake-out -j9 --target install --config ${BUILD_TYPE} } cmake_build_llava_runner() { @@ -36,7 +40,7 @@ cmake_build_llava_runner() { cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ @@ -45,7 +49,7 @@ cmake_build_llava_runner() { ${dir} - cmake --build cmake-out/${dir} -j9 --config Debug + cmake --build cmake-out/${dir} -j9 --config ${BUILD_TYPE} } # only export the one without custom op for now since it's diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 9c45406fa80..7a6aad15505 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -270,6 +270,34 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + test-llava-runner-macos: + name: test-llava-runner-macos + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + fail-fast: false + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "cmake" + + # install Llava requirements + bash examples/models/llama2/install_requirements.sh + bash examples/models/llava/install_requirements.sh + + # run python unittest + python -m unittest examples.models.llava.test.test_llava + + # run e2e (export, tokenizer and runner) 
+ PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh Release + test-qnn-model: name: test-qnn-model uses: pytorch/test-infra/.github/workflows/linux_job.yml@main diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f4ad1ddf8d..d25113a03ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -786,8 +786,12 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) endif() add_executable(executor_runner ${_executor_runner__srcs}) - if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE) - target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") + if(CMAKE_BUILD_TYPE STREQUAL "Release") + if(APPLE) + target_link_options(executor_runner PRIVATE "LINKER:-dead_strip") + else() + target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") + endif() endif() target_link_libraries(executor_runner ${_executor_runner_libs}) target_compile_options(executor_runner PUBLIC ${_common_compile_options}) diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 9bb9d9cf9d6..a1a6fc8c939 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -203,7 +203,11 @@ endif() add_executable(llava_main ${_srcs}) if(CMAKE_BUILD_TYPE STREQUAL "Release") - target_link_options(llava_main PRIVATE "LINKER:--gc-sections,-s") + if(APPLE) + target_link_options(llava_main PRIVATE "LINKER:-dead_strip,-s") + else() + target_link_options(llava_main PRIVATE "LINKER:--gc-sections,-s") + endif() endif() target_include_directories(llava_main PUBLIC ${_common_include_directories}) From 7efdfc05148179794f13105a44064e6f9d4d3109 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 26 Aug 2024 17:17:29 -0700 Subject: [PATCH 057/531] Inline some of the Module methods. Differential Revision: D61801122 Pull Request resolved: https://github.com/pytorch/executorch/pull/4913 --- extension/module/module.cpp | 8 ------ extension/module/module.h | 50 ++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/extension/module/module.cpp b/extension/module/module.cpp index e59d4b45dbc..235cb86ce80 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -130,10 +130,6 @@ Error Module::load(const Program::Verification verification) { return Error::Ok; } -bool Module::is_loaded() const { - return program_ != nullptr; -} - Result> Module::method_names() { ET_CHECK_OK_OR_RETURN_ERROR(load()); const auto method_count = program_->num_methods(); @@ -181,10 +177,6 @@ Error Module::load_method(const std::string& method_name) { return Error::Ok; } -bool Module::is_method_loaded(const std::string& method_name) const { - return methods_.count(method_name); -} - Result Module::method_meta(const std::string& method_name) { ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); return methods_.at(method_name).method->method_meta(); diff --git a/extension/module/module.h b/extension/module/module.h index 4289a3c9c76..e4fd3aa1068 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -111,7 +111,9 @@ class Module final { * * @returns true if the program is loaded, false otherwise. */ - bool is_loaded() const; + inline bool is_loaded() const { + return program_ != nullptr; + } /** * Get the program. The data loader used by the program is guaranteed to be @@ -119,7 +121,7 @@ class Module final { * * @returns Shared pointer to the program or nullptr if it's not yet loaded. 
*/ - std::shared_ptr<::executorch::runtime::Program> program() const { + inline std::shared_ptr<::executorch::runtime::Program> program() const { return program_; } @@ -151,7 +153,9 @@ class Module final { * @returns true if the method specified by method_name is loaded, false * otherwise. */ - bool is_method_loaded(const std::string& method_name) const; + inline bool is_method_loaded(const std::string& method_name) const { + return methods_.count(method_name); + } /** * Get a method metadata struct by method name. @@ -191,8 +195,8 @@ class Module final { * @returns A Result object containing either a vector of output values * from the method or an error to indicate failure. */ - ET_NODISCARD - ::executorch::runtime::Result> + ET_NODISCARD inline ::executorch::runtime::Result< + std::vector<::executorch::runtime::EValue>> execute( const std::string& method_name, const ::executorch::runtime::EValue& input) { @@ -209,8 +213,8 @@ class Module final { * @returns A Result object containing either a vector of output values * from the method or an error to indicate failure. */ - ET_NODISCARD - ::executorch::runtime::Result> + ET_NODISCARD inline ::executorch::runtime::Result< + std::vector<::executorch::runtime::EValue>> execute(const std::string& method_name) { return execute(method_name, std::vector<::executorch::runtime::EValue>{}); } @@ -225,9 +229,9 @@ class Module final { * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ - ET_NODISCARD - ::executorch::runtime::Result<::executorch::runtime::EValue> get( - const std::string& method_name, + ET_NODISCARD inline ::executorch::runtime::Result< + ::executorch::runtime::EValue> + get(const std::string& method_name, const std::vector<::executorch::runtime::EValue>& input) { auto result = ET_UNWRAP(execute(method_name, input)); if (result.empty()) { @@ -246,9 +250,9 @@ class Module final { * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ - ET_NODISCARD - ::executorch::runtime::Result<::executorch::runtime::EValue> get( - const std::string& method_name, + ET_NODISCARD inline ::executorch::runtime::Result< + ::executorch::runtime::EValue> + get(const std::string& method_name, const ::executorch::runtime::EValue& input) { return get(method_name, std::vector<::executorch::runtime::EValue>{input}); } @@ -262,9 +266,9 @@ class Module final { * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ - ET_NODISCARD - ::executorch::runtime::Result<::executorch::runtime::EValue> get( - const std::string& method_name) { + ET_NODISCARD inline ::executorch::runtime::Result< + ::executorch::runtime::EValue> + get(const std::string& method_name) { return get(method_name, std::vector<::executorch::runtime::EValue>{}); } @@ -277,8 +281,8 @@ class Module final { * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. */ - ET_NODISCARD - ::executorch::runtime::Result> + ET_NODISCARD inline ::executorch::runtime::Result< + std::vector<::executorch::runtime::EValue>> forward(const std::vector<::executorch::runtime::EValue>& input) { return execute("forward", input); } @@ -292,8 +296,8 @@ class Module final { * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. 
*/ - ET_NODISCARD - ::executorch::runtime::Result> + ET_NODISCARD inline ::executorch::runtime::Result< + std::vector<::executorch::runtime::EValue>> forward(const ::executorch::runtime::EValue& input) { return forward(std::vector<::executorch::runtime::EValue>{input}); } @@ -305,8 +309,8 @@ class Module final { * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. */ - ET_NODISCARD - ::executorch::runtime::Result> + ET_NODISCARD inline ::executorch::runtime::Result< + std::vector<::executorch::runtime::EValue>> forward() { return forward(std::vector<::executorch::runtime::EValue>{}); } @@ -319,7 +323,7 @@ class Module final { * @returns A pointer to the EventTracer instance. Returns nullptr if no * EventTracer is set. */ - ::executorch::runtime::EventTracer* event_tracer() const { + inline ::executorch::runtime::EventTracer* event_tracer() const { return event_tracer_.get(); } From da2142bba805b50a1515cbd276abc145c89cc0ca Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 26 Aug 2024 17:48:40 -0700 Subject: [PATCH 058/531] Fix module forward calls after api changes. Differential Revision: D61827911 Pull Request resolved: https://github.com/pytorch/executorch/pull/4922 --- .../qaihub_scripts/stable_diffusion/runner/runner.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp index a997397855b..3d3d99d7074 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -377,11 +377,11 @@ Error Runner::generate(std::string prompt) { Tensor uncond_emb_tensor = managed_uncond_emb.get_aliasing_tensor(); modules_[0]->set_output_data_ptr(cond_emb_tensor, 0); long encoder_start = util::time_in_ms(); - auto cond_res = modules_[0]->forward({cond_tokens_tensor}); + auto cond_res = modules_[0]->forward(cond_tokens_tensor); stats_.text_encoder_execution_time += (util::time_in_ms() - encoder_start); modules_[0]->set_output_data_ptr(uncond_emb_tensor, 0); encoder_start = util::time_in_ms(); - auto uncond_res = modules_[0]->forward({uncond_tokens_tensor}); + auto uncond_res = modules_[0]->forward(uncond_tokens_tensor); stats_.text_encoder_execution_time += (util::time_in_ms() - encoder_start); // Initialize unet parameters @@ -533,7 +533,7 @@ Error Runner::generate(std::string prompt) { modules_[2]->set_output_data_ptr(output_tensor, 0); long start_vae_execution = util::time_in_ms(); - auto vae_res = modules_[2]->forward({vae_input_tensor}); + auto vae_res = modules_[2]->forward(vae_input_tensor); stats_.vae_execution_time = (util::time_in_ms() - start_vae_execution); stats_.generate_end_ms = util::time_in_ms(); From 5942e4adedc1f817555de727f924af5e9e91c413 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 26 Aug 2024 19:13:15 -0700 Subject: [PATCH 059/531] Allow EValue to be constructed with a smart pointer implicitly. 
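In practice, any wrapper whose operator*() yields a type already convertible to EValue can now be passed where an EValue is expected, while raw pointers stay a compile-time error. A rough usage sketch (names and setup are illustrative, not taken from the tests below):

```cpp
#include <executorch/runtime/core/evalue.h>

#include <memory>

using executorch::runtime::EValue;

void wrap_tensor(const exec_aten::Tensor& tensor) {
  // A smart pointer owning a Tensor dereferences to Tensor&, which EValue
  // can already be constructed from, so the new template constructor applies.
  auto owned = std::make_unique<exec_aten::Tensor>(tensor);
  EValue value(owned); // dereferences the pointer and copies the Tensor
  // EValue bad(owned.get()); // rejected: the raw-pointer overload is deleted
  // A null smart pointer trips the "Pointer is null." check at runtime.
}
```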
Differential Revision: D61783902 Pull Request resolved: https://github.com/pytorch/executorch/pull/4902 --- runtime/core/evalue.h | 17 ++++ runtime/core/test/evalue_test.cpp | 132 +++++++++++++++++++++++++----- 2 files changed, 130 insertions(+), 19 deletions(-) diff --git a/runtime/core/evalue.h b/runtime/core/evalue.h index 8aee5f399df..c0c534e0692 100644 --- a/runtime/core/evalue.h +++ b/runtime/core/evalue.h @@ -238,6 +238,23 @@ struct EValue { new (&payload.as_tensor) exec_aten::Tensor(t); } + // Template constructor that allows construction from types that can be + // dereferenced to produce a type that EValue can be implicitly constructed + // from. + template + /*implicit*/ EValue( + T&& value, + typename std::enable_if(value)), + EValue>::value>::type* = 0) { + ET_CHECK_MSG(value != nullptr, "Pointer is null."); + *this = EValue(*std::forward(value)); + } + + // Delete constructor for raw pointers to ensure they cannot be used. + template + explicit EValue(T* value) = delete; + bool isTensor() const { return tag == Tag::Tensor; } diff --git a/runtime/core/test/evalue_test.cpp b/runtime/core/test/evalue_test.cpp index bc3e3a7913b..4c08695dc4b 100644 --- a/runtime/core/test/evalue_test.cpp +++ b/runtime/core/test/evalue_test.cpp @@ -6,21 +6,67 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include -#include -#include #include +#include #include using namespace ::testing; + +namespace torch { +namespace executor { + using exec_aten::ScalarType; using executorch::runtime::BoxedEvalueList; using executorch::runtime::EValue; using executorch::runtime::Tag; using executorch::runtime::testing::TensorFactory; -TEST(TestEValue, CopyTrivialType) { +class EValueTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + runtime_init(); + } +}; + +// An utility class used in tests to simulate objects that manage Tensors. +// The overloaded operator*() is used to return the underlying Tensor, mimicking +// behavior of smart pointers. 
+class TensorWrapper { + public: + explicit TensorWrapper(exec_aten::Tensor tensor) + : tensor_(std::make_unique(std::move(tensor))) {} + + exec_aten::Tensor& operator*() { + return *tensor_; + } + + const exec_aten::Tensor& operator*() const { + return *tensor_; + } + + operator bool() const { + return static_cast(tensor_); + } + + bool operator==(std::nullptr_t) const { + return tensor_ == nullptr; + } + + bool operator!=(std::nullptr_t) const { + return tensor_ != nullptr; + } + + private: + std::unique_ptr tensor_; +}; + +TEST_F(EValueTest, CopyTrivialType) { EValue a; EValue b(true); EXPECT_TRUE(a.isNone()); @@ -30,7 +76,7 @@ TEST(TestEValue, CopyTrivialType) { EXPECT_EQ(b.to(), true); } -TEST(TestEValue, CopyTensor) { +TEST_F(EValueTest, CopyTensor) { TensorFactory tf; EValue a(tf.ones({3, 2})); EValue b(tf.ones({1})); @@ -39,7 +85,7 @@ TEST(TestEValue, CopyTensor) { EXPECT_EQ(a.toTensor().dim(), 1); } -TEST(TestEValue, TypeMismatchFatals) { +TEST_F(EValueTest, TypeMismatchFatals) { ET_EXPECT_DEATH( { auto e = EValue(true); @@ -48,12 +94,12 @@ TEST(TestEValue, TypeMismatchFatals) { ""); } -TEST(TestEValue, NoneByDefault) { +TEST_F(EValueTest, NoneByDefault) { EValue e; EXPECT_TRUE(e.isNone()); } -TEST(TestEValue, ToOptionalInt) { +TEST_F(EValueTest, ToOptionalInt) { EValue e((int64_t)5); EXPECT_TRUE(e.isInt()); EXPECT_FALSE(e.isNone()); @@ -63,7 +109,7 @@ TEST(TestEValue, ToOptionalInt) { EXPECT_EQ(o.value(), 5); } -TEST(TestEValue, NoneToOptionalInt) { +TEST_F(EValueTest, NoneToOptionalInt) { EValue e; EXPECT_TRUE(e.isNone()); @@ -71,7 +117,7 @@ TEST(TestEValue, NoneToOptionalInt) { EXPECT_FALSE(o.has_value()); } -TEST(TestEValue, ToOptionalScalar) { +TEST_F(EValueTest, ToOptionalScalar) { exec_aten::Scalar s((double)3.141); EValue e(s); EXPECT_TRUE(e.isScalar()); @@ -83,7 +129,7 @@ TEST(TestEValue, ToOptionalScalar) { EXPECT_EQ(o.value().to(), 3.141); } -TEST(TESTEValue, ScalarToType) { +TEST_F(EValueTest, ScalarToType) { exec_aten::Scalar s_d((double)3.141); EXPECT_EQ(s_d.to(), 3.141); exec_aten::Scalar s_i((int64_t)3); @@ -92,7 +138,7 @@ TEST(TESTEValue, ScalarToType) { EXPECT_EQ(s_b.to(), true); } -TEST(TestEValue, NoneToOptionalScalar) { +TEST_F(EValueTest, NoneToOptionalScalar) { EValue e; EXPECT_TRUE(e.isNone()); @@ -100,7 +146,7 @@ TEST(TestEValue, NoneToOptionalScalar) { EXPECT_FALSE(o.has_value()); } -TEST(TestEValue, NoneToOptionalTensor) { +TEST_F(EValueTest, NoneToOptionalTensor) { EValue e; EXPECT_TRUE(e.isNone()); @@ -108,7 +154,7 @@ TEST(TestEValue, NoneToOptionalTensor) { EXPECT_FALSE(o.has_value()); } -TEST(TestEValue, ToScalarType) { +TEST_F(EValueTest, ToScalarType) { EValue e((int64_t)4); auto o = e.toScalarType(); EXPECT_EQ(o, exec_aten::ScalarType::Long); @@ -118,7 +164,7 @@ TEST(TestEValue, ToScalarType) { EXPECT_EQ(o2.value(), exec_aten::ScalarType::Long); } -TEST(TestEValue, toString) { +TEST_F(EValueTest, toString) { const EValue e("foo", 3); EXPECT_TRUE(e.isString()); EXPECT_FALSE(e.isNone()); @@ -127,28 +173,28 @@ TEST(TestEValue, toString) { EXPECT_EQ(x, "foo"); } -TEST(TestEValue, MemoryFormat) { +TEST_F(EValueTest, MemoryFormat) { const EValue e((int64_t)0); EXPECT_TRUE(e.isInt()); const exec_aten::MemoryFormat m = e.to(); EXPECT_EQ(m, exec_aten::MemoryFormat::Contiguous); } -TEST(TestEValue, Layout) { +TEST_F(EValueTest, Layout) { const EValue e((int64_t)0); EXPECT_TRUE(e.isInt()); const exec_aten::Layout l = e.to(); EXPECT_EQ(l, exec_aten::Layout::Strided); } -TEST(TestEValue, Device) { +TEST_F(EValueTest, Device) { const EValue e((int64_t)0); 
EXPECT_TRUE(e.isInt()); const exec_aten::Device d = e.to(); EXPECT_TRUE(d.is_cpu()); } -TEST(TestEValue, BoxedEvalueList) { +TEST_F(EValueTest, BoxedEvalueList) { // create fake values table to point to EValue values[3] = { EValue((int64_t)1), EValue((int64_t)2), EValue((int64_t)3)}; @@ -164,7 +210,7 @@ TEST(TestEValue, BoxedEvalueList) { EXPECT_EQ(unwrapped[2], 3); } -TEST(TestEValue, toOptionalTensorList) { +TEST_F(EValueTest, toOptionalTensorList) { // create list, empty evalue ctor gets tag::None EValue values[2] = {EValue(), EValue()}; EValue* values_p[2] = {&values[0], &values[1]}; @@ -185,3 +231,51 @@ TEST(TestEValue, toOptionalTensorList) { EXPECT_FALSE(x[0].has_value()); EXPECT_FALSE(x[1].has_value()); } + +TEST_F(EValueTest, ConstructFromUniquePtr) { + TensorFactory tf; + auto tensor_ptr = std::make_unique(tf.ones({2, 3})); + + EValue evalue(std::move(tensor_ptr)); + + EXPECT_TRUE(evalue.isTensor()); + EXPECT_EQ(evalue.toTensor().dim(), 2); + EXPECT_EQ(evalue.toTensor().numel(), 6); + + EValue evalue2(std::make_unique(tf.ones({4, 5}))); + + EXPECT_TRUE(evalue2.isTensor()); + EXPECT_EQ(evalue2.toTensor().dim(), 2); + EXPECT_EQ(evalue2.toTensor().numel(), 20); +} + +TEST_F(EValueTest, ConstructFromSharedPtr) { + TensorFactory tf; + auto tensor_ptr = std::make_shared(tf.ones({4, 5})); + + EValue evalue(tensor_ptr); + + EXPECT_TRUE(evalue.isTensor()); + EXPECT_EQ(evalue.toTensor().dim(), 2); + EXPECT_EQ(evalue.toTensor().numel(), 20); +} + +TEST_F(EValueTest, ConstructFromTensorWrapper) { + TensorFactory tf; + TensorWrapper tensor_wrapper(tf.ones({4, 5})); + + EValue evalue(tensor_wrapper); + + EXPECT_TRUE(evalue.isTensor()); + EXPECT_EQ(evalue.toTensor().dim(), 2); + EXPECT_EQ(evalue.toTensor().numel(), 20); +} + +TEST_F(EValueTest, ConstructFromNullPtrAborts) { + std::unique_ptr null_ptr; + + ET_EXPECT_DEATH({ EValue evalue(null_ptr); }, ""); +} + +} // namespace executor +} // namespace torch From 801e1c947ca88112973f915dabaf0326f3962324 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 26 Aug 2024 19:17:24 -0700 Subject: [PATCH 060/531] Forbid having TensorImpl with zero number of elements and null data. Differential Revision: D61810277 Pull Request resolved: https://github.com/pytorch/executorch/pull/4909 --- .../core/exec_aten/testing_util/targets.bzl | 1 + runtime/core/portable_type/tensor_impl.cpp | 9 ++ .../portable_type/test/tensor_impl_test.cpp | 85 ++++++++++++++++++- 3 files changed, 91 insertions(+), 4 deletions(-) diff --git a/runtime/core/exec_aten/testing_util/targets.bzl b/runtime/core/exec_aten/testing_util/targets.bzl index 3b0e78f03bc..9b5249e8371 100644 --- a/runtime/core/exec_aten/testing_util/targets.bzl +++ b/runtime/core/exec_aten/testing_util/targets.bzl @@ -23,6 +23,7 @@ def define_common_targets(): # list. "//executorch/runtime/core/exec_aten/util/test/...", "//executorch/runtime/core/exec_aten/testing_util/test/...", + "//executorch/runtime/core/portable_type/test/...", "//executorch/kernels/prim_ops/test/...", "//executorch/kernels/portable/test/...", "//executorch/kernels/portable/cpu/util/test/...", diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index fe6b57ea350..1ef0ed435bb 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -25,8 +25,16 @@ namespace { * Compute the number of elements based on the sizes of a tensor. 
*/ ssize_t compute_numel(const TensorImpl::SizesType* sizes, ssize_t dim) { + ET_CHECK_MSG( + dim == 0 || sizes != nullptr, + "Sizes must be provided for non-scalar tensors"); ssize_t numel = 1; // Zero-dimensional tensors (scalars) have numel == 1. for (ssize_t i = 0; i < dim; ++i) { + ET_CHECK_MSG( + sizes[i] >= 0, + "Size must be non-negative, got %d at dimension %zd", + sizes[i], + i); numel *= sizes[i]; } return numel; @@ -52,6 +60,7 @@ TensorImpl::TensorImpl( shape_dynamism_(dynamism) { ET_CHECK_MSG( isValid(type_), "Invalid type %" PRId8, static_cast(type_)); + ET_CHECK_MSG(dim_ >= 0, "Dimension must be non-negative, got %zd", dim_); } size_t TensorImpl::nbytes() const { diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp index 9e8e9d2a433..77dd01ea23f 100644 --- a/runtime/core/portable_type/test/tensor_impl_test.cpp +++ b/runtime/core/portable_type/test/tensor_impl_test.cpp @@ -8,12 +8,13 @@ #include -#include -#include - #include #include +#include +#include +#include + using namespace ::testing; namespace torch { @@ -29,7 +30,7 @@ class TensorImplTest : public ::testing::Test { void SetUp() override { // Since these tests cause ET_LOG to be called, the PAL must be initialized // first. - torch::executor::runtime_init(); + runtime_init(); } }; @@ -370,5 +371,81 @@ TEST_F(TensorImplTest, TestWriteRead) { EXPECT_EQ(y[0], 22.0); } +TEST_F(TensorImplTest, TestInvalidScalarType) { + SizesType sizes[2] = {3, 2}; + ET_EXPECT_DEATH(TensorImpl t(static_cast(-1), 2, sizes), ""); +} + +TEST_F(TensorImplTest, TestNegativeDimension) { + SizesType sizes[2] = {3, 2}; + ET_EXPECT_DEATH(TensorImpl t(ScalarType::Float, -1, sizes), ""); +} + +TEST_F(TensorImplTest, TestNullSizesNonZeroDim) { + ET_EXPECT_DEATH(TensorImpl t(ScalarType::Float, 2, nullptr), ""); +} + +TEST_F(TensorImplTest, TestNonNegativeSizes) { + SizesType sizes[2] = {3, -2}; + ET_EXPECT_DEATH(TensorImpl t(ScalarType::Float, 2, sizes), ""); +} + +TEST_F(TensorImplTest, TestEmptyTensor) { + SizesType sizes[2] = {0, 0}; + TensorImpl t(ScalarType::Float, 2, sizes); + EXPECT_EQ(t.numel(), 0); + EXPECT_EQ(t.data(), nullptr); +} + +TEST_F(TensorImplTest, TestTensorWithNoElementsButAllocatedMemory) { + SizesType sizes[2] = {0, 0}; + float data[1] = {1.0}; + TensorImpl t(ScalarType::Float, 2, sizes, data); + EXPECT_EQ(t.numel(), 0); + EXPECT_EQ(t.data(), data); +} + +TEST_F(TensorImplTest, TestTensorWithShapeButNoMemory) { + SizesType sizes[2] = {3, 2}; + TensorImpl t(ScalarType::Float, 2, sizes); + EXPECT_GT(t.numel(), 0); + EXPECT_EQ(t.data(), nullptr); +} + +TEST_F(TensorImplTest, TestNormalTensor) { + SizesType sizes[2] = {3, 2}; + float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + TensorImpl t(ScalarType::Float, 2, sizes, data); + EXPECT_GT(t.numel(), 0); + EXPECT_EQ(t.data(), data); +} + +TEST_F(TensorImplTest, TestResizingTensorToZeroAndBack) { + SizesType sizes[2] = {3, 2}; + TensorImpl t( + ScalarType::Float, + 2, + sizes, + nullptr, + nullptr, + nullptr, + TensorShapeDynamism::DYNAMIC_BOUND); + + float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + t.set_data(data); + EXPECT_GT(t.numel(), 0); + EXPECT_EQ(t.data(), data); + + SizesType zero_sizes[2] = {0, 0}; + t.set_sizes_contiguous({zero_sizes, 2}); + EXPECT_EQ(t.numel(), 0); + EXPECT_EQ(t.data(), data); + + SizesType new_sizes[2] = {3, 2}; + t.set_sizes_contiguous({new_sizes, 2}); + EXPECT_GT(t.numel(), 0); + EXPECT_EQ(t.data(), data); +} + } // namespace executor } // namespace torch From 
3fb03dcd26977e524ec2e2dc352d40d60660d192 Mon Sep 17 00:00:00 2001 From: neuropilot-captain <76544501+neuropilot-captain@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:19:05 +0800 Subject: [PATCH 061/531] Fix llama runner build (#4817) * Fix llama runner build * Fix linter error --- examples/mediatek/CMakeLists.txt | 10 +++++----- .../executor_runner/mtk_llama_executor_runner.cpp | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt index 1018477ec85..966fecb0664 100644 --- a/examples/mediatek/CMakeLists.txt +++ b/examples/mediatek/CMakeLists.txt @@ -105,11 +105,10 @@ if(${ANDROID}) _mtk_llama_executor_runner__srcs ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_llama_executor_runner.cpp ) - # Build ABSL and RE2 - set(LLAMA2_EXAMPLE_MODEL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2) - set(THIRD_PARTY_ABSL_DIR ${LLAMA2_EXAMPLE_MODEL_DIR}/third-party/abseil-cpp) - set(THIRD_PARTY_RE2_DIR ${LLAMA2_EXAMPLE_MODEL_DIR}/third-party/re2) + set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm) + set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/third-party/abseil-cpp) + set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/third-party/re2) set(ABSL_ENABLE_INSTALL ON) set(ABSL_PROPAGATE_CXX_STD ON) set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) @@ -119,7 +118,7 @@ if(${ANDROID}) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) # Build tokenizers - set(LLAMA2_TOKENIZER_DIR ${LLAMA2_EXAMPLE_MODEL_DIR}/tokenizer) + set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizer) add_library(tokenizer STATIC) target_include_directories(tokenizer PUBLIC @@ -135,6 +134,7 @@ if(${ANDROID}) PRIVATE ${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp ${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama2/tokenizer/llama_tiktoken.cpp ) # Include directory for neuron headers diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index 59b7a39e1cc..b605dd13bec 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -68,8 +68,9 @@ #include "llama_runner/Utils.h" #include "llama_runner/llm_helper/include/llm_types.h" -#include -#include +#include +#include +#include // Llama model options DEFINE_uint64( @@ -316,7 +317,7 @@ std::unique_ptr load_tokenizer() { if (FLAGS_tokenizer_type == "bpe") { tokenizer = std::make_unique(); } else if (FLAGS_tokenizer_type == "tiktoken") { - tokenizer = std::make_unique(); + tokenizer = torch::executor::get_tiktoken_for_llama(); } ET_CHECK_MSG( tokenizer, "Invalid tokenizer type: %s", FLAGS_tokenizer_type.c_str()); From f65c28e2a43cf7f0c62509dd07711812cf38e96e Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 26 Aug 2024 23:05:44 -0700 Subject: [PATCH 062/531] Handle null data edge case in data_is_close testing util. 
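A rough sketch of the new contract of the internal data_is_close helper (values and
tolerances here are illustrative only):

    const float* null_data = nullptr;
    data_is_close(null_data, null_data, /*numel=*/0, 1e-5, 1e-8);  // true: nothing to compare
    data_is_close(null_data, null_data, /*numel=*/4, 1e-5, 1e-8);  // aborts via ET_CHECK_MSG instead of dereferencing null
    float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    data_is_close(buf, buf, /*numel=*/4, 1e-5, 1e-8);              // true via the new a == b fast path
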
Differential Revision: D61783890 Pull Request resolved: https://github.com/pytorch/executorch/pull/4901 --- .../exec_aten/testing_util/tensor_util.cpp | 9 +++++++++ .../testing_util/test/tensor_util_test.cpp | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index f0340d34ca2..03dffd208f0 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -41,6 +41,15 @@ bool data_is_close( size_t numel, double rtol, double atol) { + ET_CHECK_MSG( + numel == 0 || (a != nullptr && b != nullptr), + "Pointers must not be null when numel > 0: numel %zu, a 0x%p, b 0x%p", + numel, + a, + b); + if (a == b) { + return true; + } for (size_t i = 0; i < numel; i++) { const auto ai = a[i]; const auto bi = b[i]; diff --git a/runtime/core/exec_aten/testing_util/test/tensor_util_test.cpp b/runtime/core/exec_aten/testing_util/test/tensor_util_test.cpp index 6d4ce5a8532..948f6bc78f0 100644 --- a/runtime/core/exec_aten/testing_util/test/tensor_util_test.cpp +++ b/runtime/core/exec_aten/testing_util/test/tensor_util_test.cpp @@ -23,6 +23,7 @@ using namespace ::testing; using exec_aten::ScalarType; using exec_aten::Tensor; +using exec_aten::TensorImpl; using exec_aten::TensorList; using executorch::runtime::testing::IsCloseTo; using executorch::runtime::testing::IsDataCloseTo; @@ -826,4 +827,22 @@ TEST(TensorUtilTest, TensorStreamBool) { "ETensor(sizes={2, 2}, dtype=Bool, data={1, 0, 1, 0})"); } +TEST(TensorTest, TestZeroShapeTensorEquality) { + TensorImpl::SizesType sizes[2] = {2, 2}; + TensorImpl::StridesType strides[2] = {2, 1}; + TensorImpl::DimOrderType dim_order[2] = {0, 1}; + + TensorImpl t1(ScalarType::Float, 2, sizes, nullptr, dim_order, strides); + TensorImpl t2(ScalarType::Float, 2, sizes, nullptr, dim_order, strides); + + ET_EXPECT_DEATH({ EXPECT_TENSOR_EQ(Tensor(&t1), Tensor(&t2)); }, ""); + + float data[] = {1.0, 2.0, 3.0, 4.0}; + + t1.set_data(data); + t2.set_data(data); + + EXPECT_TENSOR_EQ(Tensor(&t1), Tensor(&t2)); +} + #endif // !USE_ATEN_LIB From f92139f7888f1bce3a92a51668946fb7e3e1f8d7 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 27 Aug 2024 04:09:17 -0700 Subject: [PATCH 063/531] [module] Change the deleter of Program in Module to make android happy Differential Revision: D61837043 Pull Request resolved: https://github.com/pytorch/executorch/pull/4924 --- extension/module/module.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 235cb86ce80..2b42ce18dd1 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -122,10 +122,7 @@ Error Module::load(const Program::Verification verification) { auto program = ET_UNWRAP_UNIQUE(Program::load(data_loader_.get(), verification)); program_ = std::shared_ptr( - program.release(), - [data_loader = std::move(data_loader_)](Program* pointer) { - delete pointer; - }); + program.release(), [](Program* pointer) { delete pointer; }); } return Error::Ok; } From 395d3f5399a9cfe621e291a550e695055b521c0a Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Tue, 27 Aug 2024 08:29:34 -0700 Subject: [PATCH 064/531] enable parallel prefill again Differential Revision: D61751873 Pull Request resolved: https://github.com/pytorch/executorch/pull/4893 --- examples/models/llama2/runner/runner.cpp | 2 +- examples/models/llama2/runner/runner.h | 1 - 2 files changed, 1 
insertion(+), 2 deletions(-) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index dd0a305a371..7a2fa676628 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -126,7 +126,7 @@ Error Runner::load() { tokenizer_.get(), text_decoder_runner_.get(), metadata_.at(kUseKVCache), - enable_parallel_prefill_); + metadata_.at(kEnableDynamicShape)); text_token_generator_ = std::make_unique( tokenizer_.get(), diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index 12fb63c6f34..4e3c1daef7b 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -45,7 +45,6 @@ class Runner { private: float temperature_; - bool enable_parallel_prefill_; bool shouldStop_{false}; // model From d9ae7c3a3036b75edd17a37e297a954ab8dc8917 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 27 Aug 2024 10:03:21 -0700 Subject: [PATCH 065/531] Rename tensor test and use gtest class fixture. (#4928) --- .../core/portable_type/test/CMakeLists.txt | 2 +- runtime/core/portable_type/test/targets.bzl | 4 ++-- ...ecutor_tensor_test.cpp => tensor_test.cpp} | 22 ++++++++++++++----- test/utils/OSSTestConfig.json | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) rename runtime/core/portable_type/test/{executor_tensor_test.cpp => tensor_test.cpp} (84%) diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt index 73d877d68bb..f89381f5120 100644 --- a/runtime/core/portable_type/test/CMakeLists.txt +++ b/runtime/core/portable_type/test/CMakeLists.txt @@ -23,7 +23,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs optional_test.cpp executor_tensor_test.cpp half_test.cpp +set(_test_srcs optional_test.cpp tensor_test.cpp half_test.cpp scalar_test.cpp tensor_impl_test.cpp ) diff --git a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl index 15e002d451c..af55f95e45e 100644 --- a/runtime/core/portable_type/test/targets.bzl +++ b/runtime/core/portable_type/test/targets.bzl @@ -15,8 +15,8 @@ def define_common_targets(): ) runtime.cxx_test( - name = "executor_tensor_test", - srcs = ["executor_tensor_test.cpp"], + name = "tensor_test", + srcs = ["tensor_test.cpp"], deps = [ "//executorch/runtime/core/portable_type:portable_type", ], diff --git a/runtime/core/portable_type/test/executor_tensor_test.cpp b/runtime/core/portable_type/test/tensor_test.cpp similarity index 84% rename from runtime/core/portable_type/test/executor_tensor_test.cpp rename to runtime/core/portable_type/test/tensor_test.cpp index 944850ca8a0..7a772cd0769 100644 --- a/runtime/core/portable_type/test/executor_tensor_test.cpp +++ b/runtime/core/portable_type/test/tensor_test.cpp @@ -7,13 +7,25 @@ */ #include -#include + #include +#include +#include + namespace torch { namespace executor { -TEST(TensorTest, InvalidScalarType) { +class TensorTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. 
+ runtime_init(); + } +}; + +TEST_F(TensorTest, InvalidScalarType) { TensorImpl::SizesType sizes[1] = {1}; // Undefined, which is sort of a special case since it's not part of the @@ -28,7 +40,7 @@ TEST(TensorTest, InvalidScalarType) { ET_EXPECT_DEATH({ TensorImpl y(static_cast(-1), 1, sizes); }, ""); } -TEST(TensorTest, SetData) { +TEST_F(TensorTest, SetData) { TensorImpl::SizesType sizes[1] = {5}; TensorImpl::DimOrderType dim_order[1] = {0}; int32_t data[5] = {0, 0, 1, 0, 0}; @@ -39,7 +51,7 @@ TEST(TensorTest, SetData) { EXPECT_EQ(a.const_data_ptr(), nullptr); } -TEST(TensorTest, Strides) { +TEST_F(TensorTest, Strides) { TensorImpl::SizesType sizes[2] = {2, 2}; TensorImpl::DimOrderType dim_order[2] = {0, 1}; int32_t data[4] = {0, 0, 1, 1}; @@ -53,7 +65,7 @@ TEST(TensorTest, Strides) { EXPECT_EQ(a.const_data_ptr()[0 + a.strides()[0]], 1); } -TEST(TensorTest, ModifyDataOfConstTensor) { +TEST_F(TensorTest, ModifyDataOfConstTensor) { TensorImpl::SizesType sizes[1] = {1}; TensorImpl::DimOrderType dim_order[2] = {0}; int32_t data[1] = {1}; diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index 38fa31cd6fb..b7d7f1700b4 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -77,7 +77,7 @@ "directory": "runtime/core/portable_type/test", "sources": [ "optional_test.cpp", - "executor_tensor_test.cpp", + "tensor_test.cpp", "half_test.cpp", "scalar_test.cpp", "tensor_impl_test.cpp" From 4890748f53a205fb1fd464a8ff9d6bfd7c8c61ac Mon Sep 17 00:00:00 2001 From: neuropilot-captain <76544501+neuropilot-captain@users.noreply.github.com> Date: Wed, 28 Aug 2024 02:38:14 +0800 Subject: [PATCH 066/531] [mediatek] Link portable kernel lib Pull Request resolved: https://github.com/pytorch/executorch/pull/4868 --- backends/mediatek/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index c7de8bb1f04..7e36746bca2 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -29,6 +29,7 @@ add_library(neuron_backend SHARED) target_link_libraries(neuron_backend PRIVATE executorch_no_prim_ops + portable_ops_lib android log ${NEURON_BUFFER_ALLOCATOR_LIB} From 49156d0f7583331d4c7137f040c13b3dfc48b0e0 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 27 Aug 2024 12:16:53 -0700 Subject: [PATCH 067/531] [llava] Expose max_seq_len as a parameter to export_llava Differential Revision: D61863296 Pull Request resolved: https://github.com/pytorch/executorch/pull/4930 --- examples/models/llava/export_llava.py | 12 ++++++++++-- examples/models/llava/model.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 1df7c242dca..c3145d44e9a 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -249,6 +249,11 @@ def main(): action=BooleanOptionalAction, help="Use sdpa_with_kv_cache custom op in LLava text model.", ) + parser.add_argument( + "--max-seq-len", + default=768, + help="Maximum sequence length for the text model.", + ) parser.add_argument( "--pte-name", default="llava_combined_xnnpack.pte", @@ -262,9 +267,12 @@ def main(): ) args = parser.parse_args() logging.info( - f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {args.use_sdpa_with_kv_cache}" + f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {args.use_sdpa_with_kv_cache}, max_seq_len: {args.max_seq_len}" + ) + llava_model = 
LlavaModel( + use_sdpa_with_kv_cache_op=args.use_sdpa_with_kv_cache, + max_seq_len=args.max_seq_len, ) - llava_model = LlavaModel(use_sdpa_with_kv_cache_op=args.use_sdpa_with_kv_cache) executorch_program = export_all(llava_model) diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index b4a203d7419..9ad185a5eee 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -8,7 +8,7 @@ import re -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple import requests import torch @@ -39,6 +39,7 @@ def __init__( llava_model: LlavaForConditionalGeneration, image_processor: CLIPImageProcessor, use_sdpa_with_kv_cache_op: bool = True, + max_seq_len: int = 768, ): super().__init__() self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op @@ -57,6 +58,7 @@ def __init__( enable_dynamic_shape=True, # allow parallel prefill use_sdpa_with_kv_cache_op=use_sdpa_with_kv_cache_op, # use sdpa_with_kv_cache op use_hf_rope=True, + max_seq_len=max_seq_len, ) self.embed_tokens = nn.Embedding( self.model_.config.text_config.vocab_size, @@ -233,7 +235,7 @@ def prefill( prompt_before_image: torch.Tensor, images: torch.Tensor, prompt_after_image: torch.Tensor, - ) -> (int, torch.Tensor): + ) -> Tuple[int, torch.Tensor]: """Avoiding the torch.where() call to find placeholder and insert image embedding. Taking 3 inputs instead.""" embeds = self.prefill_embedding(prompt_before_image, images, prompt_after_image) # returns the prefilled token length too, because the text model generates one logits in each forward call. @@ -264,8 +266,9 @@ def forward( class LlavaModel(EagerModelBase): - def __init__(self, use_sdpa_with_kv_cache_op=True): + def __init__(self, use_sdpa_with_kv_cache_op=True, max_seq_len=768): self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op + self.max_seq_len = max_seq_len self.processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") self.tokenizer = self.processor.tokenizer self.image_processor = self.processor.image_processor @@ -290,6 +293,7 @@ def get_eager_model(self): self.model, self.image_processor, self.use_sdpa_with_kv_cache_op, + self.max_seq_len, ) model.to(dtype=torch.float32) return model @@ -338,6 +342,6 @@ def _get_image_dynamic_shapes(self): return dynamic_shapes def _get_prompt_dynamic_shapes(self): - dim = torch.export.Dim("token_dim", min=2, max=2048) + dim = torch.export.Dim("token_dim", min=2, max=self.max_seq_len) text_model_dynamic_shapes = ({0: 1}, {1: dim}) return text_model_dynamic_shapes From d2e54b6225a73de4d78dc6f8e3b9e97d106fbc08 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 27 Aug 2024 13:07:09 -0700 Subject: [PATCH 068/531] Simplify setting inputs in Module execute. 
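In short, Module::execute() stops looping over Method::set_input() per index and hands
the whole input vector over in one call; roughly (a sketch, assuming the ArrayRef
element type is EValue):

    ET_CHECK_OK_OR_RETURN_ERROR(method->set_inputs(
        exec_aten::ArrayRef<::executorch::runtime::EValue>(input.data(), input.size())));
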
Differential Revision: D61499723 Pull Request resolved: https://github.com/pytorch/executorch/pull/4782 --- extension/module/module.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 2b42ce18dd1..6d5aedd8007 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -185,9 +185,8 @@ Result> Module::execute( ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); auto& method = methods_.at(method_name).method; - for (auto index = 0; index < input.size(); ++index) { - ET_CHECK_OK_OR_RETURN_ERROR(method->set_input(input[index], index)); - } + ET_CHECK_OK_OR_RETURN_ERROR(method->set_inputs( + exec_aten::ArrayRef(input.data(), input.size()))); ET_CHECK_OK_OR_RETURN_ERROR(method->execute()); const auto outputs_size = method->outputs_size(); From 37db39a50c2c0d5220d6ff193f7a0e376bdf10e6 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Tue, 27 Aug 2024 13:07:12 -0700 Subject: [PATCH 069/531] Remove read_file.h Differential Revision: D61749168 Pull Request resolved: https://github.com/pytorch/executorch/pull/4912 --- CMakeLists.txt | 1 - docs/source/sdk-bundled-io.md | 22 +++--- .../website/docs/tutorials/bundled_program.md | 25 +++---- extension/pybindings/pybindings.cpp | 1 - .../extension/pybindings/pybindings.bzl | 2 - util/read_file.cpp | 70 ------------------- util/read_file.h | 56 --------------- util/targets.bzl | 14 ---- 8 files changed, 20 insertions(+), 171 deletions(-) delete mode 100644 util/read_file.cpp delete mode 100644 util/read_file.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d25113a03ab..99c8b7f69fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -693,7 +693,6 @@ if(EXECUTORCH_BUILD_PYBIND) util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/util/read_file.cpp ) target_include_directories( util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS} diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index 288fce93df6..c399bf1e27c 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -211,21 +211,19 @@ We need the pointer to ExecuTorch program to do the execution. To unify the proc Here's an example of how to use the `GetProgramData` API: ```c++ -std::shared_ptr buff_ptr; -size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. -Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); -ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - +// Assume that the user has read the contents of the file into file_data using +// whatever method works best for their application. The file could contain +// either BundledProgram data or Program data. +void* file_data = ...; +size_t file_data_len = ...; + +// If file_data contains a BundledProgram, GetProgramData() will return a +// pointer to the Program data embedded inside it. Otherwise it will return +// file_data, which already pointed to Program data. 
const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( - buff_ptr.get(), buff_len, &program_ptr, &program_len); + file_data, file_data_len, &program_ptr, &program_len); ET_CHECK_MSG( status == Error::Ok, "GetProgramData() failed with status 0x%" PRIx32, diff --git a/docs/website/docs/tutorials/bundled_program.md b/docs/website/docs/tutorials/bundled_program.md index fb119df7310..e477d8e6a61 100644 --- a/docs/website/docs/tutorials/bundled_program.md +++ b/docs/website/docs/tutorials/bundled_program.md @@ -49,19 +49,15 @@ Error GetProgramData( Here's an example of how to use the GetProgramData API: ```c++ - std::shared_ptr buff_ptr; - size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. - Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); - ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - - uint32_t prof_tok = EXECUTORCH_BEGIN_PROF("de-serialize model"); - + // Assume that the user has read the contents of the file into file_data using + // whatever method works best for their application. The file could contain + // either BundledProgram data or Program data. + void* file_data = ...; + size_t file_data_len = ...; + + // If file_data contains a BundledProgram, GetProgramData() will return a + // pointer to the Program data embedded inside it. Otherwise it will return + // file_data, which already pointed to Program data. const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( @@ -122,14 +118,13 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( ### Example -Here we provide an example about how to run the bundled program step by step. Most of the code are borrowed from "fbcode/executorch/devtools/fb/runners/executor_runner.cpp" and please review that file if you need more info and context: +Here we provide an example about how to run the bundled program step by step. ```c++ // method_name is the name for the method we want to test // memory_manager is the executor::MemoryManager variable for executor memory allocation. // program is the executorch program. 
Result method = program->load_method(method_name, &memory_manager); - EXECUTORCH_END_PROF(prof_tok); ET_CHECK_MSG( method.ok(), "load_method() failed with status 0x%" PRIx32, diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 7c98ee4aa06..c605c48c582 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include diff --git a/shim/xplat/executorch/extension/pybindings/pybindings.bzl b/shim/xplat/executorch/extension/pybindings/pybindings.bzl index 813b420dbaa..ac5f126706c 100644 --- a/shim/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim/xplat/executorch/extension/pybindings/pybindings.bzl @@ -29,7 +29,6 @@ ATEN_MODULE_DEPS = [ "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", - "//executorch/util:read_file", "//executorch/devtools/bundled_program:runtime_aten", "//executorch/runtime/executor/test:test_backend_compiler_lib_aten", "//executorch/devtools/etdump:etdump_flatcc", @@ -55,7 +54,6 @@ def executorch_pybindings(python_module_name, srcs = [], cppdeps = [], visibilit ], deps = [ "//executorch/runtime/core:core", - "//executorch/util:read_file", ] + cppdeps, external_deps = [ "pybind11", diff --git a/util/read_file.cpp b/util/read_file.cpp deleted file mode 100644 index 28713455144..00000000000 --- a/util/read_file.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -namespace torch { -namespace executor { -namespace util { - -ET_NODISCARD Error read_file_content( - const char* file_name, - std::shared_ptr* file_data, - size_t* file_length) { - FILE* file; - unsigned long fileLen; - - // Open file - file = fopen(file_name, "rb"); - if (!file) { - ET_LOG(Error, "Unable to open file %s\n", file_name); - return Error::NotSupported; - } - - // Get file length - fseek(file, 0, SEEK_END); - fileLen = ftell(file); - fseek(file, 0, SEEK_SET); - - // Allocate memory - auto ptr = std::shared_ptr( - new char[fileLen + 1], std::default_delete()); - if (!ptr) { - ET_LOG(Error, "Unable to allocate memory to read file %s\n", file_name); - fclose(file); - return Error::NotSupported; - } - - // Read file contents into buffer - fread(ptr.get(), fileLen, 1, file); - fclose(file); - - *file_data = ptr; - *file_length = fileLen; - return Error::Ok; -} - -ET_DEPRECATED std::shared_ptr read_file_content(const char* name) { - std::shared_ptr file_data; - size_t file_length; - Error status = read_file_content(name, &file_data, &file_length); - if (status == Error::Ok) { - return file_data; - } else { - return nullptr; - } -} - -} // namespace util -} // namespace executor -} // namespace torch diff --git a/util/read_file.h b/util/read_file.h deleted file mode 100644 index 50abcbdec22..00000000000 --- a/util/read_file.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace torch { -namespace executor { -namespace util { - -/** - * Read the data from the file given name. - * - * The returned pointer pointing to the memory address containing the data, and - * the file length is the length of data. - * - * @param[in] file_name The name of the file to be read. - * @param[out] file_data The file data, if read successfully. - * @param[out] file_length The length of file_data, in bytes, if read - * successfully. - * - * @returns Error::Ok if the file is read successfully, file_data point to the - * data and file_length is the correct length of file_data. Other values on - * failure. - */ -ET_NODISCARD Error read_file_content( - const char* file_name, - std::shared_ptr* file_data, - size_t* file_length); - -/** - * Read the data from the file given name. - * - * The returned pointer pointing to the memory address containing the data. - * - * This function is deprecated, and should use the above function instead to - * read file content. - * - * @param[in] name The name of the file to be read. - * - * @returns The pointer to file data, if read successfully. Otherwise null_ptr. - */ -ET_DEPRECATED std::shared_ptr read_file_content(const char* name); - -} // namespace util -} // namespace executor -} // namespace torch diff --git a/util/targets.bzl b/util/targets.bzl index c8b70f5c818..6797462e189 100644 --- a/util/targets.bzl +++ b/util/targets.bzl @@ -7,20 +7,6 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - runtime.cxx_library( - name = "read_file", - srcs = ["read_file.cpp"], - exported_headers = ["read_file.h"], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/runtime/core:core", - "//executorch/runtime/platform:compiler", - ], - ) - for aten_mode in (True, False): aten_suffix = ("_aten" if aten_mode else "") From 1cea0eeeac2c7c79b77313b53164b5f77acbb987 Mon Sep 17 00:00:00 2001 From: lucylq Date: Tue, 27 Aug 2024 13:54:52 -0700 Subject: [PATCH 070/531] preprocess e2e test Differential Revision: D61780713 Pull Request resolved: https://github.com/pytorch/executorch/pull/4887 --- .../models/flamingo/export_preprocess_lib.py | 7 +++- examples/models/flamingo/test_preprocess.py | 40 +++++++++++++++---- extension/llm/custom_ops/CMakeLists.txt | 1 + .../llm/custom_ops/preprocess_custom_ops.py | 14 +++++-- 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/examples/models/flamingo/export_preprocess_lib.py b/examples/models/flamingo/export_preprocess_lib.py index 082c306ea38..358b1f2149a 100644 --- a/examples/models/flamingo/export_preprocess_lib.py +++ b/examples/models/flamingo/export_preprocess_lib.py @@ -8,6 +8,7 @@ import torch from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.program._program import ExecutorchProgramManager from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa @@ -76,5 +77,9 @@ def lower_to_executorch_preprocess( exported_program, compile_config=EdgeCompileConfig(_check_ir_validity=False) ) - et_program = edge_program.to_executorch(ExecutorchBackendConfig()) + et_program = edge_program.to_executorch( + ExecutorchBackendConfig( + sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), + ) + ) return et_program diff --git a/examples/models/flamingo/test_preprocess.py b/examples/models/flamingo/test_preprocess.py 
index 896a01655e5..34ad0ab8ed1 100644 --- a/examples/models/flamingo/test_preprocess.py +++ b/examples/models/flamingo/test_preprocess.py @@ -13,6 +13,12 @@ import PIL import torch +from executorch.extension.pybindings import portable_lib # noqa # usort: skip +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +from executorch.extension.pybindings.portable_lib import ( + _load_for_executorch_from_buffer, +) + from parameterized import parameterized from PIL import Image @@ -21,14 +27,17 @@ CLIPImageTransform, ) -from torchtune.modules.transforms import ( +from torchtune.modules.transforms.vision_utils.get_canvas_best_fit import ( find_supported_resolutions, get_canvas_best_fit, +) + +from torchtune.modules.transforms.vision_utils.get_inscribed_size import ( get_inscribed_size, ) from torchvision.transforms.v2 import functional as F -from .export_preprocess_lib import export_preprocess +from .export_preprocess_lib import export_preprocess, lower_to_executorch_preprocess @dataclass @@ -74,6 +83,13 @@ def prepare_inputs( F.grayscale_to_rgb_image(F.to_image(image)), scale=True ) + # The above converts the PIL image into a torchvision tv_tensor. + # Convert the tv_tensor into a torch.Tensor. + image_tensor = image_tensor + 0 + + # Ensure tensor is contiguous for executorch. + image_tensor = image_tensor.contiguous() + # Calculate possible resolutions. possible_resolutions = config.possible_resolutions if possible_resolutions is None: @@ -187,6 +203,9 @@ def test_preprocess( max_num_tiles=config.max_num_tiles, ) + executorch_model = lower_to_executorch_preprocess(exported_model) + executorch_module = _load_for_executorch_from_buffer(executorch_model.buffer) + # Prepare image input. image = ( np.random.randint(0, 256, np.prod(image_size)) @@ -225,20 +244,25 @@ def test_preprocess( image=image, config=config ) - # Run eager and exported models. + # Run eager model and check it matches reference model. eager_image, eager_ar = eager_model( image_tensor, inscribed_size, best_resolution ) eager_ar = eager_ar.tolist() + self.assertTrue(torch.allclose(reference_image, eager_image)) + self.assertEqual(reference_ar, eager_ar) + # Run exported model and check it matches reference model. exported_image, exported_ar = exported_model.module()( image_tensor, inscribed_size, best_resolution ) exported_ar = exported_ar.tolist() - - # Check eager and exported models match reference model. - self.assertTrue(torch.allclose(reference_image, eager_image)) self.assertTrue(torch.allclose(reference_image, exported_image)) + self.assertEqual(reference_ar, exported_ar) - self.assertTrue(reference_ar, eager_ar) - self.assertTrue(reference_ar, exported_ar) + # Run executorch model and check it matches reference model. 
+ et_image, et_ar = executorch_module.forward( + (image_tensor, inscribed_size, best_resolution) + ) + self.assertTrue(torch.allclose(reference_image, et_image)) + self.assertEqual(reference_ar, et_ar.tolist()) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index f057825ec80..8edfbfc85b2 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -82,6 +82,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) add_library( custom_ops_aot_lib SHARED ${_custom_ops__srcs} ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop.cpp ) target_include_directories( custom_ops_aot_lib PUBLIC "${_common_include_directories}" diff --git a/extension/llm/custom_ops/preprocess_custom_ops.py b/extension/llm/custom_ops/preprocess_custom_ops.py index e49721ffd35..f1e05697a41 100644 --- a/extension/llm/custom_ops/preprocess_custom_ops.py +++ b/extension/llm/custom_ops/preprocess_custom_ops.py @@ -16,6 +16,9 @@ # Register and define tile_crop and out variant. preprocess_op_lib.define("tile_crop(Tensor input, int tile_size) -> Tensor") +# Keep this in sync with model config. +MAX_NUM_TILES = 4 + @impl(preprocess_op_lib, "tile_crop", dispatch_key="CompositeExplicitAutograd") def tile_crop_impl(input: torch.Tensor, tile_size: int) -> torch.Tensor: @@ -56,6 +59,11 @@ def tile_crop_out_impl( # Register meta kernel to prevent export tracing into the tile_crop impl. @torch.library.register_fake("preprocess::tile_crop") def tile_crop(output: torch.Tensor, tile_size: int) -> torch.Tensor: - # Returned tensor is of size [n, 3, 224, 224], where n is the number of tiles. - # We should export with n = max_num_tiles. Set 50 for now. - return torch.empty([50, output.size(0), 224, 224]) + # Returned tensor is of size [n, 3, 224, 224], where n = number of tiles. + # Use an unbacked symint to create an upper-bounded dynamic shape output. + # Otherwise, output is set to a static shape, and we can only output + # tensors of shape [MAX_NUM_TILES, 3, 224, 224]. + ctx = torch._custom_ops.get_ctx() + s0 = ctx.create_unbacked_symint() + torch._constrain_as_size(s0, 0, MAX_NUM_TILES) + return torch.empty([s0, output.size(0), tile_size, tile_size]) From 3f7707891199d2f96529df55f67cccf83f53cb75 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 27 Aug 2024 14:02:50 -0700 Subject: [PATCH 071/531] serialize fqns. 
Differential Revision: D61864979 Pull Request resolved: https://github.com/pytorch/executorch/pull/4931 --- exir/emit/_emit_program.py | 12 +++++++++--- exir/tests/test_joint_graph.py | 11 +++++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index bf40a78bb6e..9c8c9dfd067 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -84,20 +84,26 @@ def _remove_non_user_outputs(exported_program: ExportedProgram) -> torch.fx.Grap def _get_training_metadata(methods: Dict[str, ExportedProgram]) -> Dict[str, int]: gradients_method_prefix = "__et_training_gradients_index_" parameters_method_prefix = "__et_training_parameters_index_" + fqn_method_prefix = "__et_training_fqn_" training_metadata = {} for name, method in methods.items(): found_grad = False found_param = False + fqns = [] i = 0 for output_spec in method.graph_signature.output_specs: - if output_spec.kind == OutputKind.GRADIENT_TO_PARAMETER and not found_grad: - training_metadata[gradients_method_prefix + name] = i - found_grad = True + if output_spec.kind == OutputKind.GRADIENT_TO_PARAMETER: + if not found_grad: + training_metadata[gradients_method_prefix + name] = i + found_grad = True + fqns.append(output_spec.target) elif output_spec.kind == OutputKind.TOKEN and not found_param: assert found_grad # Params must come after gradients training_metadata[parameters_method_prefix + name] = i found_param = True i += 1 + if len(fqns) > 0: + training_metadata[fqn_method_prefix + name] = fqns return training_metadata diff --git a/exir/tests/test_joint_graph.py b/exir/tests/test_joint_graph.py index 0e5a322397d..fb09d995716 100644 --- a/exir/tests/test_joint_graph.py +++ b/exir/tests/test_joint_graph.py @@ -110,7 +110,7 @@ def forward(self, x, y): self.assertTrue(torch.allclose(m.linear.bias, et_outputs[4])) self.assertEqual( - len(et.executorch_program.execution_plan), 3 + len(et.executorch_program.execution_plan), 4 ) # forward + 2 training metadata functions # gradient outputs start at index 1 @@ -121,10 +121,17 @@ def forward(self, x, y): 1, ) - # parameter outputs start at index 3 self.assertEqual( et.executorch_program.execution_plan[2] # pyre-ignore .values[0] + .val.string_val, + "linear.weight", + ) + + # parameter outputs start at index 3 + self.assertEqual( + et.executorch_program.execution_plan[3] # pyre-ignore + .values[0] .val.int_val, 3, ) From 7600f21b346984cb0f57cf33d87515e6258ceb51 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 27 Aug 2024 14:26:49 -0700 Subject: [PATCH 072/531] Run ExecuTorchDemo test suite on AWS Device Farm Differential Revision: D61826982 Pull Request resolved: https://github.com/pytorch/executorch/pull/4918 --- .github/workflows/apple.yml | 53 ++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 5f19831250c..628d56bb037 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -25,7 +25,7 @@ concurrency: cancel-in-progress: true jobs: - test-demo-ios: + build-demo-ios: name: test-demo-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit @@ -58,6 +58,57 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ build/test_ios_ci.sh ${ARTIFACTS_DIR_NAME} + # Upload the test demo app to S3 + upload-demo-ios: + needs: build-demo-ios + runs-on: linux.2xlarge + steps: + - name: Download the artifacts from GitHub + uses: actions/download-artifact@v3 
+ with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the artifacts + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the artifacts to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + test-demo-ios: + # Only PR from ExecuTorch itself has permission to access AWS, forked PRs will fail to + # authenticate with the cloud service. So, this job will be skipped on the latter + if: ${{ !github.event.pull_request.head.repo.fork }} + needs: upload-demo-ios + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + with: + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + # This is the custom device pool that only includes iOS devices + device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.xctestrun.zip + test-spec: https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml + build-frameworks-ios: name: build-frameworks-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main From d91f6121f5341e4727efa7434a7929de803414de Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:39:06 -0700 Subject: [PATCH 073/531] Add support for int_oo in exir serialization Differential Revision: D61865297 Pull Request resolved: https://github.com/pytorch/executorch/pull/4932 --- exir/serde/export_serialize.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index 9b2315c1aae..f0549ba4160 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -51,6 +51,7 @@ from torch.fx.experimental import symbolic_shapes from torch.utils import _pytree as pytree from torch.utils._pytree import treespec_dumps, treespec_loads +from torch.utils._sympy.numbers import int_oo from torch.utils._sympy.value_ranges import ValueRanges # pyre-ignore @@ -332,9 +333,9 @@ def deserialize_torch_artifact( def _sympy_int_to_int(val: sympy.Expr, adjust: str): # Convert simple sympy Integers into concrete int - if val == sympy.oo: + if val in (sympy.oo, int_oo): return math.inf - if val == -sympy.oo: + if val in (-sympy.oo, -int_oo): return -math.inf if isinstance(val, sympy.Integer): return int(val) @@ -360,9 +361,9 @@ def _sympy_int_to_int(val: sympy.Expr, adjust: str): def _int_to_sympy_int(val) -> sympy.Expr: # Convert concrete int into simple sympy Integers if val == math.inf: - return sympy.oo + return int_oo if val == -math.inf: - return -sympy.oo + return -int_oo return 
sympy.Integer(val) From 9fafdb0530ad8529a257ad266a0ef67572d12db8 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:43:36 -0400 Subject: [PATCH 074/531] Add Half support: full.out Differential Revision: D61864622 Pull Request resolved: https://github.com/pytorch/executorch/pull/4934 --- kernels/portable/cpu/op_full.cpp | 6 ++- kernels/test/op_full_test.cpp | 66 +++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/kernels/portable/cpu/op_full.cpp b/kernels/portable/cpu/op_full.cpp index cfa88f4d958..b4071662a8d 100644 --- a/kernels/portable/cpu/op_full.cpp +++ b/kernels/portable/cpu/op_full.cpp @@ -34,11 +34,13 @@ Tensor& full_out( out, "Failed to resize output tensor."); - ET_SWITCH_REAL_TYPES_AND(Bool, val_type, ctx, "full.out", CTYPE_VAL, [&] { + constexpr auto name = "full.out"; + + ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, name, CTYPE_VAL, [&] { CTYPE_VAL val; utils::extract_scalar(fill_value, &val); - ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "full.out", CTYPE_OUT, [&] { + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT val_casted = static_cast(val); auto data_out = out.mutable_data_ptr(); for (size_t i = 0; i < out.numel(); ++i) { diff --git a/kernels/test/op_full_test.cpp b/kernels/test/op_full_test.cpp index 82aaea03337..09885ddd991 100644 --- a/kernels/test/op_full_test.cpp +++ b/kernels/test/op_full_test.cpp @@ -38,12 +38,25 @@ class OpFullOutTest : public OperatorTest { std::vector size_int64_t(size_int32_t.begin(), size_int32_t.end()); auto aref = IntArrayRef(size_int64_t.data(), size_int64_t.size()); + // Boolean Scalar // Before: `out` consists of 0s. Tensor out = tf.zeros(size_int32_t); + // After: `out` consists of 1s. + op_full_out(aref, true, out); + EXPECT_TENSOR_EQ(out, tf.ones(size_int32_t)); + // Integral Scalar + // Before: `out` consists of 0s. + out = tf.zeros(size_int32_t); // After: `out` consists of 1s. op_full_out(aref, 1, out); + EXPECT_TENSOR_EQ(out, tf.ones(size_int32_t)); + // Floating Point Scalar + // Before: `out` consists of 0s. + out = tf.zeros(size_int32_t); + // After: `out` consists of 1s. 
+ op_full_out(aref, 1.0, out); EXPECT_TENSOR_EQ(out, tf.ones(size_int32_t)); } }; @@ -57,4 +70,55 @@ class OpFullOutTest : public OperatorTest { test_ones_out({2, 3, 4}); \ } -ET_FORALL_REAL_TYPES(GENERATE_TEST) +ET_FORALL_REALH_TYPES(GENERATE_TEST) + +TEST_F(OpFullOutTest, ValueOverflow) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen kernel doesn't handle overflow"; + } + TensorFactory tf; + + std::vector sizes_int64_t_vec = {2, 3}; + std::vector sizes_in32_t_vec = {2, 3}; + auto sizes = IntArrayRef(sizes_int64_t_vec.data(), sizes_int64_t_vec.size()); + + Tensor out = tf.zeros(sizes_in32_t_vec); + + op_full_out(sizes, 1000, out); +} + +TEST_F(OpFullOutTest, HalfSupport) { + TensorFactory tf; + + std::vector sizes_int64_t_vec = {2, 3}; + std::vector sizes_in32_t_vec = {2, 3}; + auto sizes = IntArrayRef(sizes_int64_t_vec.data(), sizes_int64_t_vec.size()); + + // Boolean Scalar + Tensor out = tf.zeros(sizes_in32_t_vec); + op_full_out(sizes, true, out); + EXPECT_TENSOR_EQ(out, tf.ones(sizes_in32_t_vec)); + + // Integral Scalar + out = tf.zeros(sizes_in32_t_vec); + op_full_out(sizes, 1, out); + EXPECT_TENSOR_EQ(out, tf.ones(sizes_in32_t_vec)); + + // Floating Point Scalar + out = tf.zeros(sizes_in32_t_vec); + op_full_out(sizes, 3.1415926535, out); + EXPECT_TENSOR_EQ(out, tf.full(sizes_in32_t_vec, 3.1415926535)); +} + +TEST_F(OpFullOutTest, ZeroDim) { + TensorFactory tf; + + std::vector sizes_int64_t_vec = {}; + std::vector sizes_in32_t_vec = {}; + auto sizes = IntArrayRef(sizes_int64_t_vec.data(), sizes_int64_t_vec.size()); + + // Boolean Scalar + Tensor out = tf.zeros(sizes_in32_t_vec); + op_full_out(sizes, true, out); + EXPECT_TENSOR_EQ(out, tf.ones(sizes_in32_t_vec)); +} From b578d6d88b5ff54dc74cb192815d2ae9d284a33f Mon Sep 17 00:00:00 2001 From: hsharma35 Date: Tue, 27 Aug 2024 15:57:36 -0700 Subject: [PATCH 075/531] Release ethosu driver after use. 
Differential Revision: D61733840 Pull Request resolved: https://github.com/pytorch/executorch/pull/4905 --- backends/arm/runtime/ArmBackendEthosU.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 7420874d8f4..d6e61e0a0d9 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -11,6 +11,7 @@ */ #include +#include #include #include @@ -164,8 +165,10 @@ class ArmBackend final : public PyTorchBackendInterface { } // Allocate driver handle and synchronously invoke driver - ethosu_driver* drv = ethosu_reserve_driver(); - if (drv == NULL) { + auto driver = + std::unique_ptr( + ethosu_reserve_driver(), ethosu_release_driver); + if (driver == NULL) { ET_LOG(Error, "ArmBackend::execute: ethosu_reserve_driver failed"); return Error::InvalidState; } @@ -178,7 +181,7 @@ class ArmBackend final : public PyTorchBackendInterface { size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size}; int result = ethosu_invoke_v3( - drv, + driver.get(), (void*)handles.cmd_data, handles.cmd_data_size, bases, From 11434975a15bf0d1d05e2bed0014c5d60f849aa0 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Tue, 27 Aug 2024 16:53:11 -0700 Subject: [PATCH 076/531] [llm_manual] Use the new namespace, and clean up headers Differential Revision: D61746261 Pull Request resolved: https://github.com/pytorch/executorch/pull/4884 --- docs/source/llm/getting-started.md | 20 ++++----- examples/llm_manual/basic_sampler.h | 6 ++- examples/llm_manual/basic_tokenizer.h | 11 ++--- examples/llm_manual/main.cpp | 23 ++++------ examples/llm_manual/managed_tensor.h | 60 +++++++++------------------ 5 files changed, 46 insertions(+), 74 deletions(-) diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 6d79e1e0fd4..a0865811462 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -198,25 +198,21 @@ Create a file called main.cpp with the following contents: // main.cpp #include -#include -#include -#include -#include "basic_tokenizer.h" #include "basic_sampler.h" +#include "basic_tokenizer.h" #include "managed_tensor.h" #include -#include +#include #include -#include -#include - -using namespace torch::executor; +#include -using SizesType = exec_aten::SizesType; -using DimOrderType = exec_aten::DimOrderType; -using StridesType = exec_aten::StridesType; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::extension::Module; +using executorch::runtime::EValue; +using executorch::runtime::Result; ``` The model inputs and outputs take the form of tensors. A tensor can be thought of as an multi-dimensional array. diff --git a/examples/llm_manual/basic_sampler.h b/examples/llm_manual/basic_sampler.h index a95b823de8d..b4fc8fe248c 100644 --- a/examples/llm_manual/basic_sampler.h +++ b/examples/llm_manual/basic_sampler.h @@ -6,12 +6,14 @@ * LICENSE file in the root directory of this source tree. */ +#pragma once + #include #include + class BasicSampler { public: - BasicSampler() {} - int64_t sample(std::vector logits) { + int64_t sample(const std::vector& logits) { // Find the token with the highest log probability. 
int64_t max_index = std::max_element(logits.begin(), logits.end()) - logits.begin(); diff --git a/examples/llm_manual/basic_tokenizer.h b/examples/llm_manual/basic_tokenizer.h index eb51d15fc50..385bfaaca48 100644 --- a/examples/llm_manual/basic_tokenizer.h +++ b/examples/llm_manual/basic_tokenizer.h @@ -6,21 +6,22 @@ * LICENSE file in the root directory of this source tree. */ +#pragma once + #include #include -#include #include #include #include class BasicTokenizer { public: - BasicTokenizer(const std::string& filePath) { - std::ifstream file(filePath); + explicit BasicTokenizer(const std::string& file_path) { + std::ifstream file(file_path); if (!file) { - std::cerr << "Unable to open file"; - exit(9); // return with error code + std::cerr << "Unable to open file " << file_path << "\n"; + exit(9); } std::string str( (std::istreambuf_iterator(file)), diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp index 992272a416b..c0fc482542e 100644 --- a/examples/llm_manual/main.cpp +++ b/examples/llm_manual/main.cpp @@ -6,31 +6,24 @@ * LICENSE file in the root directory of this source tree. */ -// main.cpp - #include -#include -#include -#include #include "basic_sampler.h" #include "basic_tokenizer.h" #include "managed_tensor.h" -#include #include +#include #include -#include -#include - -using namespace torch::executor; +#include -using SizesType = exec_aten::SizesType; -using DimOrderType = exec_aten::DimOrderType; -using StridesType = exec_aten::StridesType; - -// main.cpp +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::extension::Module; +using executorch::runtime::EValue; +using executorch::runtime::Result; +// The value of the gpt2 `<|endoftext|>` token. #define ENDOFTEXT 50256 std::string generate( diff --git a/examples/llm_manual/managed_tensor.h b/examples/llm_manual/managed_tensor.h index d870f4861e6..204b38aa4e9 100644 --- a/examples/llm_manual/managed_tensor.h +++ b/examples/llm_manual/managed_tensor.h @@ -6,59 +6,39 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include -#include -#include - -#include - #pragma once -namespace torch { -namespace executor { +#include /** - * A tensor wrapper takes ownership of all the memory of the necessary metadata - * for torch::executor::Tensor. Note that it doesn't own the data memory. + * Creates and owns the necessary metadata for a Tensor instance. Does not own + * the data pointer. */ class ManagedTensor { public: - /// The type used for elements of `sizes()`. - using SizesType = exec_aten::SizesType; - /// The type used for elements of `dim_order()`. - using DimOrderType = exec_aten::DimOrderType; - /// The type used for elements of `strides()`. - using StridesType = exec_aten::StridesType; - - ManagedTensor() = delete; - - explicit ManagedTensor( + ManagedTensor( void* data, - const std::vector& sizes, - ScalarType dtype) - : sizes_(sizes) { - tensor_impl_ = std::make_unique( - dtype, - sizes_.size(), - sizes_.data(), - data, - nullptr, - nullptr, - TensorShapeDynamism::DYNAMIC_BOUND); - } + const std::vector& sizes, + exec_aten::ScalarType dtype) + : sizes_(sizes), + tensor_impl_( + /*type=*/dtype, + /*dim=*/sizes_.size(), + /*sizes=*/sizes_.data(), + /*data=*/data, + /*dim_order=*/nullptr, + /*strides=*/nullptr, + /*dynamism=*/ + executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND) {} /** * Get the Tensor object managed by this class. 
*/ - Tensor get_tensor() { - return Tensor(tensor_impl_.get()); + exec_aten::Tensor get_tensor() { + return exec_aten::Tensor(&tensor_impl_); } private: - std::unique_ptr tensor_impl_; - std::vector sizes_; + std::vector sizes_; + exec_aten::TensorImpl tensor_impl_; }; - -} // namespace executor -} // namespace torch From 7bb8e9c21196151407e1ab68ccd1483149ab5acb Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 27 Aug 2024 20:06:12 -0400 Subject: [PATCH 077/531] Clean up index utils Differential Revision: D61822107 Pull Request resolved: https://github.com/pytorch/executorch/pull/4915 --- kernels/portable/cpu/util/index_util.cpp | 111 ++++++++++++++++++ kernels/portable/cpu/util/index_util.h | 16 +-- kernels/portable/cpu/util/kernel_ops_util.cpp | 111 ------------------ kernels/portable/cpu/util/kernel_ops_util.h | 23 ---- .../kernels/portable/op_registration_util.bzl | 6 +- 5 files changed, 120 insertions(+), 147 deletions(-) diff --git a/kernels/portable/cpu/util/index_util.cpp b/kernels/portable/cpu/util/index_util.cpp index 109983c8122..1baf103665d 100644 --- a/kernels/portable/cpu/util/index_util.cpp +++ b/kernels/portable/cpu/util/index_util.cpp @@ -78,6 +78,22 @@ void get_index_select_out_target_size( } } +bool check_nonzero_args(const Tensor& in, const Tensor& out) { + (void)in; + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + out.scalar_type() == ScalarType::Long, + "Expected out to be a Long tensor but received %" PRId8, + static_cast(out.scalar_type())); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + out.dim() == 2, + "Expected out to be a 2d tensor received %zd", + ssize_t(out.dim())); + + return true; +} + bool check_scatter_add_args( const Tensor& self, int64_t dim, @@ -130,6 +146,101 @@ bool check_scatter_add_args( return true; } +bool check_select_scatter_args( + const Tensor& in, + const Tensor& src, + int64_t dim, + int64_t index, + Tensor& output) { + /** + * Assumptions for inputs: + * 1. output size is the same as input size + * 2. src size is the same as the selected slice from the input + * 3. 
dim and index values are valid given the input tensor + */ + + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, output)); + + // The dim planed to be selected on shall exist in input + ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, in.dim())); + + // The index shall be valid in the given dimenson + ET_LOG_MSG_AND_RETURN_IF_FALSE( + index >= 0 && index < in.size(dim), + "index %" PRId64 " out of range [-%zd,%zd) at in.size( %" PRId64 ")", + index, + in.size(dim), + in.size(dim), + dim); + + // The src.dim() shall be one lower than in.dim() since src needs to fit + // into the selected data on one dim of input + // https://pytorch.org/docs/stable/generated/torch.select_scatter.html + ET_LOG_MSG_AND_RETURN_IF_FALSE( + in.dim() == src.dim() + 1, + "in.dim() %zd != src.dim() + 1 %zd", + in.dim(), + src.dim() + 1); + + // The size of src tensor should follow these rules: + // - src.size(i) shall equal to in.size(i) if i < dim, + // - src.size(i) shall equal to in.size(i+1) if i >= dim + + for (ssize_t d = 0; d < in.dim() - 1; d++) { + if (d < dim) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_size_at_dims(in, d, src, d)); + } else { + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_size_at_dims(in, d + 1, src, d)); + } + } + + return true; +} + +bool check_slice_scatter_args( + const Tensor& input, + const Tensor& src, + int64_t dim, + int64_t num_values, + int64_t step, + Tensor output) { + ET_LOG_AND_RETURN_IF_FALSE(input.dim() > 0); + + // Check dim. The dim planed to be selected on shall exist in input + ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, input.dim())); + + // Input and output tensors should be the same shape and dtype + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_shape_and_dtype(input, output)); + + // The input.dim() shall equal to src.dim() + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_rank(input, src)); + + // Check step. 
Step must be greater than zero + ET_LOG_MSG_AND_RETURN_IF_FALSE( + step > 0, "slice step must be greater than zero"); + + // The size of src tensor should follow these rules: + // - src.size(i) shall equal to input.size(i) if i != dim, + // - src.size(dim) shall equal to num_values + for (size_t d = 0; d < input.dim() - 1; d++) { + if (d != dim) { + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_size_at_dims(input, d, src, d)); + } else { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + src.size(d) == num_values, + "input.size(%zu) %zd != num_values %" PRId64 " | dim = %" PRId64 ")", + d, + input.size(d), + num_values, + dim); + } + } + + return true; +} + int64_t adjust_slice_indices( int64_t dim_length, int64_t* start, diff --git a/kernels/portable/cpu/util/index_util.h b/kernels/portable/cpu/util/index_util.h index 9677ac90bc6..2575fbeeb55 100644 --- a/kernels/portable/cpu/util/index_util.h +++ b/kernels/portable/cpu/util/index_util.h @@ -27,6 +27,8 @@ void get_index_select_out_target_size( exec_aten::SizesType* out_sizes, size_t* out_ndim); +bool check_nonzero_args(const Tensor& in, const Tensor& out); + bool check_scatter_add_args( const Tensor& self, int64_t dim, @@ -34,7 +36,12 @@ bool check_scatter_add_args( const Tensor& src, Tensor& out); -bool check_nonzero_args(const Tensor& in, const Tensor& out); +bool check_select_scatter_args( + const Tensor& in, + const Tensor& src, + int64_t dim, + int64_t index, + Tensor& output); bool check_slice_scatter_args( const Tensor& input, @@ -50,12 +57,5 @@ int64_t adjust_slice_indices( int64_t* end, int64_t step); -bool check_select_scatter_args( - const Tensor& in, - const Tensor& src, - int64_t dim, - int64_t index, - Tensor& output); - } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/kernel_ops_util.cpp b/kernels/portable/cpu/util/kernel_ops_util.cpp index d34b41ab797..6ac8e83d2d9 100644 --- a/kernels/portable/cpu/util/kernel_ops_util.cpp +++ b/kernels/portable/cpu/util/kernel_ops_util.cpp @@ -519,117 +519,6 @@ void get_max_pool2d_with_indices_out_target_size( in, 2, kernel_size, stride, padding, dilation, out_sizes, ceil_mode); } -bool check_slice_scatter_args( - const Tensor& input, - const Tensor& src, - int64_t dim, - int64_t num_values, - int64_t step, - Tensor output) { - ET_LOG_AND_RETURN_IF_FALSE(input.dim() > 0); - - // Check dim. The dim planed to be selected on shall exist in input - ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, input.dim())); - - // Input and output tensors should be the same shape and dtype - ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_shape_and_dtype(input, output)); - - // The input.dim() shall equal to src.dim() - ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_rank(input, src)); - - // Check step. 
Step must be greater than zero - ET_LOG_MSG_AND_RETURN_IF_FALSE( - step > 0, "slice step must be greater than zero"); - - // The size of src tensor should follow these rules: - // - src.size(i) shall equal to input.size(i) if i != dim, - // - src.size(dim) shall equal to num_values - for (size_t d = 0; d < input.dim() - 1; d++) { - if (d != dim) { - ET_LOG_AND_RETURN_IF_FALSE( - tensors_have_same_size_at_dims(input, d, src, d)); - } else { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - src.size(d) == num_values, - "input.size(%zu) %zd != num_values %" PRId64 " | dim = %" PRId64 ")", - d, - input.size(d), - num_values, - dim); - } - } - - return true; -} - -bool check_select_scatter_args( - const Tensor& in, - const Tensor& src, - int64_t dim, - int64_t index, - Tensor& output) { - /** - * Assumptions for inputs: - * 1. output size is the same as input size - * 2. src size is the same as the selected slice from the input - * 3. dim and index values are valid given the input tensor - */ - - ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, output)); - - // The dim planed to be selected on shall exist in input - ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, in.dim())); - - // The index shall be valid in the given dimenson - ET_LOG_MSG_AND_RETURN_IF_FALSE( - index >= 0 && index < in.size(dim), - "index %" PRId64 " out of range [-%zd,%zd) at in.size( %" PRId64 ")", - index, - in.size(dim), - in.size(dim), - dim); - - // The src.dim() shall be one lower than in.dim() since src needs to fit - // into the selected data on one dim of input - // https://pytorch.org/docs/stable/generated/torch.select_scatter.html - ET_LOG_MSG_AND_RETURN_IF_FALSE( - in.dim() == src.dim() + 1, - "in.dim() %zd != src.dim() + 1 %zd", - in.dim(), - src.dim() + 1); - - // The size of src tensor should follow these rules: - // - src.size(i) shall equal to in.size(i) if i < dim, - // - src.size(i) shall equal to in.size(i+1) if i >= dim - - for (ssize_t d = 0; d < in.dim() - 1; d++) { - if (d < dim) { - ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_size_at_dims(in, d, src, d)); - } else { - ET_LOG_AND_RETURN_IF_FALSE( - tensors_have_same_size_at_dims(in, d + 1, src, d)); - } - } - - return true; -} - -bool check_nonzero_args(const Tensor& in, const Tensor& out) { - (void)in; - - ET_LOG_MSG_AND_RETURN_IF_FALSE( - out.scalar_type() == ScalarType::Long, - "Expected out to be a Long tensor but received %" PRId8, - static_cast(out.scalar_type())); - - ET_LOG_MSG_AND_RETURN_IF_FALSE( - out.dim() == 2, - "Expected out to be a 2d tensor received %zd", - ssize_t(out.dim())); - - return true; -} - bool check_masked_fill_args( const Tensor& in, const Tensor& mask, diff --git a/kernels/portable/cpu/util/kernel_ops_util.h b/kernels/portable/cpu/util/kernel_ops_util.h index c18269490b7..22a09ef33d5 100644 --- a/kernels/portable/cpu/util/kernel_ops_util.h +++ b/kernels/portable/cpu/util/kernel_ops_util.h @@ -451,29 +451,6 @@ void get_max_pool2d_with_indices_out_target_size( exec_aten::SizesType* out_sizes, size_t* out_ndim); -bool check_nonzero_args(const Tensor& in, const Tensor& out); - -bool check_slice_scatter_args( - const Tensor& input, - const Tensor& src, - int64_t dim, - int64_t num_values, - int64_t step, - Tensor output); - -int64_t adjust_slice_indices( - int64_t dim_length, - int64_t* start, - int64_t* end, - int64_t step); - -bool check_select_scatter_args( - const Tensor& in, - const Tensor& src, - int64_t dim, - int64_t index, - Tensor& output); - bool check_masked_fill_args( const Tensor& in, const Tensor& mask, diff --git 
a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index 347094435c6..820312a54da 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -835,7 +835,6 @@ ATEN_OPS = ( deps = [ ":scalar_utils", "//executorch/kernels/portable/cpu/util:index_util", - "//executorch/kernels/portable/cpu/util:kernel_ops_util", ], ), op_target( @@ -972,7 +971,6 @@ ATEN_OPS = ( name = "op_scatter_add", deps = [ "//executorch/kernels/portable/cpu/util:index_util", - "//executorch/kernels/portable/cpu/util:kernel_ops_util", "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", ], @@ -987,10 +985,9 @@ ATEN_OPS = ( op_target( name = "op_select_scatter", deps = [ - "//executorch/kernels/portable/cpu/util:kernel_ops_util", + "//executorch/kernels/portable/cpu/util:index_util", "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", - "//executorch/kernels/portable/cpu/util:index_util", ], ), op_target( @@ -1028,7 +1025,6 @@ ATEN_OPS = ( name = "op_slice_scatter", deps = [ "//executorch/kernels/portable/cpu/util:index_util", - "//executorch/kernels/portable/cpu/util:kernel_ops_util", ], ), op_target( From 69472e5c43481324ad923ceb29392ab72830acee Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 27 Aug 2024 17:18:08 -0700 Subject: [PATCH 078/531] [llava] Enable memory profiling Differential Revision: D61878836 Pull Request resolved: https://github.com/pytorch/executorch/pull/4936 --- examples/models/llava/export_llava.py | 16 ++++++++++++++++ util/activation_memory_profiler.py | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index c3145d44e9a..903f8c17605 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -36,6 +36,7 @@ from executorch.extension.llm.export.builder import DType, LLMEdgeManager from executorch.extension.llm.tokenizer.tokenizer import Tokenizer +from executorch.util.activation_memory_profiler import generate_memory_trace from torch.ao.quantization.quantizer.xnnpack_quantizer import ( get_symmetric_quantization_config, XNNPACKQuantizer, @@ -265,6 +266,12 @@ def main(): action=BooleanOptionalAction, help="Generate artifacts for llava runner.", ) + parser.add_argument( + "--profile_memory", + required=False, + action="store_true", + help="Generate chrome trace of activation memory for intermediate tensors.", + ) args = parser.parse_args() logging.info( f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {args.use_sdpa_with_kv_cache}, max_seq_len: {args.max_seq_len}" @@ -276,6 +283,15 @@ def main(): executorch_program = export_all(llava_model) + # memory profiling + if args.profile_memory: + for method_name in executorch_program.methods: + generate_memory_trace( + executorch_program, + f"{args.pte_name}_{method_name}.json", + method_name=method_name, + ) + with open(args.pte_name, "wb") as f: executorch_program.write_to_file(f) logging.info(f"Exported ExecuTorch program to {args.pte_name}") diff --git a/util/activation_memory_profiler.py b/util/activation_memory_profiler.py index 5f5185dd7b4..f459dfafaf0 100644 --- a/util/activation_memory_profiler.py +++ b/util/activation_memory_profiler.py @@ -106,6 +106,7 @@ def generate_memory_trace( 
executorch_program_manager: ExecutorchProgramManager, chrome_trace_filename: str, enable_memory_offsets: bool = False, + method_name: str = "forward", ): """ Generate the memory timeline from the given ExecuTorch program. @@ -122,7 +123,7 @@ def generate_memory_trace( f"generate_memory_trace expects ExecutorchProgramManager instance but got {type(executorch_program_manager)}" ) - exported_program = executorch_program_manager.exported_program() + exported_program = executorch_program_manager.exported_program(method_name) if not _validate_memory_planning_is_done(exported_program): raise ValueError("Executorch program does not have memory planning.") From 5395ae6964037da4004b3a0c58b76927bd42c99f Mon Sep 17 00:00:00 2001 From: Max Ren <40742183+mcr229@users.noreply.github.com> Date: Tue, 27 Aug 2024 18:32:22 -0700 Subject: [PATCH 079/531] fold quantize in convert Differential Revision: D61814397 Pull Request resolved: https://github.com/pytorch/executorch/pull/4889 --- examples/models/phi-3-mini/export_phi-3-mini.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py index 553fded67fb..c2e97a21b1e 100644 --- a/examples/models/phi-3-mini/export_phi-3-mini.py +++ b/examples/models/phi-3-mini/export_phi-3-mini.py @@ -69,7 +69,7 @@ def export(args) -> None: ) model = prepare_pt2e(model, xnnpack_quantizer) # pyre-fixme[6] model(*example_inputs) - model = convert_pt2e(model, fold_quantize=False) + model = convert_pt2e(model) DuplicateDynamicQuantChainPass()(model) # TODO(lunwenh): update it to use export once # https://github.com/pytorch/pytorch/issues/128394 is resolved. From a79b1a6a4e2984c250c91b3951aad56e604bcecd Mon Sep 17 00:00:00 2001 From: winskuo-quic <143469905+winskuo-quic@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:22:03 +0800 Subject: [PATCH 080/531] Support regnet_x_400mf and regnet_y_400mf (#4925) --- backends/qualcomm/tests/test_qnn_delegate.py | 40 ++++ examples/qualcomm/oss_scripts/dino_v2.py | 6 - examples/qualcomm/oss_scripts/esrgan.py | 6 - .../oss_scripts/gMLP_image_classification.py | 6 - examples/qualcomm/oss_scripts/regnet.py | 181 ++++++++++++++++++ examples/qualcomm/oss_scripts/squeezenet.py | 6 - examples/qualcomm/oss_scripts/ssd300_vgg16.py | 6 - examples/qualcomm/scripts/deeplab_v3.py | 6 - examples/qualcomm/scripts/edsr.py | 6 - examples/qualcomm/scripts/inception_v3.py | 6 - examples/qualcomm/scripts/inception_v4.py | 6 - .../qualcomm/scripts/mobilebert_fine_tune.py | 6 - examples/qualcomm/scripts/mobilenet_v2.py | 6 - examples/qualcomm/scripts/mobilenet_v3.py | 6 - examples/qualcomm/scripts/torchvision_vit.py | 7 +- 15 files changed, 222 insertions(+), 78 deletions(-) create mode 100644 examples/qualcomm/oss_scripts/regnet.py diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 08fd907c40a..79b8443dc71 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -1668,6 +1668,46 @@ def test_gMLP(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 90) + def test_regnet(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + + weights = ["regnet_y_400mf", "regnet_x_400mf"] + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/regnet.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + 
self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + for weight in weights: + p = subprocess.Popen( + cmds + ["--weights", weight], stdout=subprocess.DEVNULL + ) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 85) + def test_ssd300_vgg16(self): if not self.required_envs([self.pretrained_weight, self.oss_repo]): self.skipTest("missing required envs") diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py index b3fecfbbe63..a8241e34a73 100644 --- a/examples/qualcomm/oss_scripts/dino_v2.py +++ b/examples/qualcomm/oss_scripts/dino_v2.py @@ -105,12 +105,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/oss_scripts/esrgan.py b/examples/qualcomm/oss_scripts/esrgan.py index 56871db7646..df02374e4bb 100644 --- a/examples/qualcomm/oss_scripts/esrgan.py +++ b/examples/qualcomm/oss_scripts/esrgan.py @@ -74,12 +74,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/oss_scripts/gMLP_image_classification.py b/examples/qualcomm/oss_scripts/gMLP_image_classification.py index 864a9b919fc..cbcd6d88cbf 100644 --- a/examples/qualcomm/oss_scripts/gMLP_image_classification.py +++ b/examples/qualcomm/oss_scripts/gMLP_image_classification.py @@ -96,12 +96,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where artifacts were built - # pte_path : path where QNN delegate executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/oss_scripts/regnet.py b/examples/qualcomm/oss_scripts/regnet.py new file mode 100644 index 00000000000..0dc70608daf --- /dev/null +++ b/examples/qualcomm/oss_scripts/regnet.py @@ -0,0 +1,181 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import json +import os +import sys +from multiprocessing.connection import Client + +import numpy as np +import torch +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, + topk_accuracy, +) + +from torchvision.models import ( + regnet_x_400mf, + RegNet_X_400MF_Weights, + regnet_y_400mf, + RegNet_Y_400MF_Weights, +) + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + inputs.append((feature,)) + for element in target: + targets.append(element) + input_list += f"input_{index}_0.raw\n" + + return inputs, targets, input_list + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + # ensure the working directory exist. + os.makedirs(args.artifact, exist_ok=True) + + if not args.compile_only and args.device is None: + raise RuntimeError( + "device serial is required if not compile only. " + "Please specify a device serial by -s/--device argument." + ) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + if args.weights == "regnet_y_400mf": + weights = RegNet_Y_400MF_Weights.DEFAULT + model = regnet_y_400mf(weights=weights).eval() + pte_filename = "regnet_y_400mf" + else: + weights = RegNet_X_400MF_Weights.DEFAULT + model = regnet_x_400mf(weights=weights).eval() + pte_filename = "regnet_x_400mf" + + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=QuantDtype.use_8a8w, + ) + + if args.compile_only: + sys.exit(0) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + # top-k analysis + predictions = [] + for i in range(data_num): + predictions.append( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + ) + + k_val = [1, 5] + topk = [topk_accuracy(predictions, targets, k).item() for k in k_val] + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({f"top_{k}": topk[i] for i, k in enumerate(k_val)})) + else: + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. 
Default ./regnet", + default="./regnet", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "--weights", + type=str, + choices=["regnet_y_400mf", "regnet_x_400mf"], + help="Specify which regent weights/model to execute", + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/squeezenet.py b/examples/qualcomm/oss_scripts/squeezenet.py index 820f23d1193..64b317068ce 100644 --- a/examples/qualcomm/oss_scripts/squeezenet.py +++ b/examples/qualcomm/oss_scripts/squeezenet.py @@ -92,12 +92,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/oss_scripts/ssd300_vgg16.py b/examples/qualcomm/oss_scripts/ssd300_vgg16.py index 45e3073baeb..a5db138233e 100644 --- a/examples/qualcomm/oss_scripts/ssd300_vgg16.py +++ b/examples/qualcomm/oss_scripts/ssd300_vgg16.py @@ -155,12 +155,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index 8d1aa376e7c..34a94c8a76a 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -95,12 +95,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py index c5e3f8b0105..8852cf0e4c7 100755 --- a/examples/qualcomm/scripts/edsr.py +++ b/examples/qualcomm/scripts/edsr.py @@ -126,12 +126,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff 
--git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 50361938e85..82b290d253d 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -92,12 +92,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index cd4dcb7cd99..e7f2fea1cd7 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -91,12 +91,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 94f528dbc35..278ab8e8c02 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -268,12 +268,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index 8a3032df02f..7cc0226e250 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -92,12 +92,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/scripts/mobilenet_v3.py b/examples/qualcomm/scripts/mobilenet_v3.py index d0cd7bb4df0..08c65904631 100644 --- a/examples/qualcomm/scripts/mobilenet_v3.py +++ b/examples/qualcomm/scripts/mobilenet_v3.py @@ -90,12 +90,6 @@ def main(args): if args.compile_only: sys.exit(0) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), 
build_path=f"{args.build_folder}", diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py index 85852ebb2fe..dc9459bb13c 100755 --- a/examples/qualcomm/scripts/torchvision_vit.py +++ b/examples/qualcomm/scripts/torchvision_vit.py @@ -76,12 +76,7 @@ def main(args): quant_dtype=QuantDtype.use_8a8w, shared_buffer=args.shared_buffer, ) - # setup required paths accordingly - # qnn_sdk : QNN SDK path setup in environment variable - # build_path : path where QNN delegate artifacts were built - # pte_path : path where executorch binary was stored - # device_id : serial number of android device - # workspace : folder for storing artifacts on android device + adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", From ff46dd56b1bf3a0c576d7c3667be939e68f24bb0 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 27 Aug 2024 22:23:17 -0700 Subject: [PATCH 081/531] [executorch] Rename phi-3-mini-lora directory to match phi-3-mini-lora Differential Revision: D61820324 Pull Request resolved: https://github.com/pytorch/executorch/pull/4914 --- .../models/{phi3-mini-lora => phi-3-mini-lora}/README.md | 5 +++-- .../{phi3-mini-lora => phi-3-mini-lora}/export_model.py | 0 .../install_requirements.sh | 0 3 files changed, 3 insertions(+), 2 deletions(-) rename examples/models/{phi3-mini-lora => phi-3-mini-lora}/README.md (87%) rename examples/models/{phi3-mini-lora => phi-3-mini-lora}/export_model.py (100%) rename examples/models/{phi3-mini-lora => phi-3-mini-lora}/install_requirements.sh (100%) diff --git a/examples/models/phi3-mini-lora/README.md b/examples/models/phi-3-mini-lora/README.md similarity index 87% rename from examples/models/phi3-mini-lora/README.md rename to examples/models/phi-3-mini-lora/README.md index 3811fc5438c..69564581af3 100644 --- a/examples/models/phi3-mini-lora/README.md +++ b/examples/models/phi-3-mini-lora/README.md @@ -6,7 +6,7 @@ In this example, we export to ExecuTorch a model ([phi-3-mini](https://github.co `./install_requirements.sh` in ExecuTorch root directory. ### Step 2: Install Requirements -- `./examples/models/phi3-mini-lora/install_requirements.sh` +- `./examples/models/phi-3-mini-lora/install_requirements.sh` ### Step 3: Export and run the model 1. Export the inferenace and training models to ExecuTorch. @@ -22,5 +22,6 @@ python export_model.py # Build the executor_runner target cmake --build cmake-out --target executor_runner -j9 -./cmake-out/executor_runner --model_path mini_phi3_lora.pte +# Run the model for inference. 
+./cmake-out/executor_runner --model_path phi3_mini_lora.pte ``` diff --git a/examples/models/phi3-mini-lora/export_model.py b/examples/models/phi-3-mini-lora/export_model.py similarity index 100% rename from examples/models/phi3-mini-lora/export_model.py rename to examples/models/phi-3-mini-lora/export_model.py diff --git a/examples/models/phi3-mini-lora/install_requirements.sh b/examples/models/phi-3-mini-lora/install_requirements.sh similarity index 100% rename from examples/models/phi3-mini-lora/install_requirements.sh rename to examples/models/phi-3-mini-lora/install_requirements.sh From 35e2302cdf4a1eb81119c8cf2c3ae56e5bd25fdc Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 27 Aug 2024 23:24:06 -0700 Subject: [PATCH 082/531] Add a workflow to validate and upload iOS demo app test spec Differential Revision: D61882788 Pull Request resolved: https://github.com/pytorch/executorch/pull/4937 --- .github/workflows/apple.yml | 11 ++- ...pecs.yml => upload-android-test-specs.yml} | 6 +- .github/workflows/upload-apple-test-specs.yml | 91 +++++++++++++++++++ ...fault-ios-device-farm-appium-test-spec.yml | 31 +++++++ 4 files changed, 135 insertions(+), 4 deletions(-) rename .github/workflows/{upload-test-specs.yml => upload-android-test-specs.yml} (94%) create mode 100644 .github/workflows/upload-apple-test-specs.yml create mode 100644 examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 628d56bb037..2c4c172ac1c 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -19,6 +19,15 @@ on: - extension/apple/** - extension/module/** workflow_dispatch: + # TODO (huydhn): This is used to validate the test spec. Eventually, we need a proper + # perf benchmark workflow like android-perf. 
This can be cleaned up once that workflow + # is ready + workflow_call: + inputs: + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -107,7 +116,7 @@ jobs: # Uploaded to S3 from the previous job ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.ipa ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.xctestrun.zip - test-spec: https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml + test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} build-frameworks-ios: name: build-frameworks-ios diff --git a/.github/workflows/upload-test-specs.yml b/.github/workflows/upload-android-test-specs.yml similarity index 94% rename from .github/workflows/upload-test-specs.yml rename to .github/workflows/upload-android-test-specs.yml index 24119b64566..5a468da44f1 100644 --- a/.github/workflows/upload-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -1,15 +1,15 @@ -name: Upload AWS Device Farm test specs +name: Upload AWS Device Farm Android test specs on: pull_request: paths: - - .github/workflows/upload-test-specs.yml + - .github/workflows/upload-android-test-specs.yml - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml push: branches: - main paths: - - .github/workflows/upload-test-specs.yml + - .github/workflows/upload-android-test-specs.yml - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml concurrency: diff --git a/.github/workflows/upload-apple-test-specs.yml b/.github/workflows/upload-apple-test-specs.yml new file mode 100644 index 00000000000..f5db9a04a60 --- /dev/null +++ b/.github/workflows/upload-apple-test-specs.yml @@ -0,0 +1,91 @@ +name: Upload AWS Device Farm Apple iOS test specs + +on: + pull_request: + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + push: + branches: + - main + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + upload-apple-test-spec-for-validation: + runs-on: linux.2xlarge + steps: + - uses: actions/checkout@v3 + + - name: Upload the spec as a GitHub artifact for validation + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 1 + if-no-files-found: error + path: examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + + # TODO (huydhn): An example on how to validate the test spec using the iOS demo app, but we need a proper + # perf benchmark workflow like android-perf + validate-apple-test-spec: + needs: upload-apple-test-spec-for-validation + uses: ./.github/workflows/apple.yml + secrets: inherit + permissions: + id-token: 
write + contents: read + with: + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/default-ios-device-farm-appium-test-spec.yml + + upload-apple-test-spec: + needs: validate-apple-test-spec + runs-on: ubuntu-22.04 + timeout-minutes: 15 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios + aws-region: us-east-1 + + - name: Only push to S3 when running the workflow manually from main branch + if: ${{ github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> "${GITHUB_ENV}" + + - name: Upload the spec to S3 ossci-ios bucket + shell: bash + working-directory: examples/demo-apps/apple_ios + env: + SPEC_FILE: default-ios-device-farm-appium-test-spec.yml + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + shasum -a 256 "${SPEC_FILE}" + ${AWS_CMD} "${SPEC_FILE}" s3://ossci-ios/executorch/ --acl public-read diff --git a/examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml b/examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml new file mode 100644 index 00000000000..5b66e165c4e --- /dev/null +++ b/examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml @@ -0,0 +1,31 @@ +version: 0.1 + +# Phases are collection of commands that get executed on Device Farm. +phases: + # The install phase includes commands that install dependencies that your tests use. + # Default dependencies for testing frameworks supported on Device Farm are already installed. + install: + commands: + + # The pre-test phase includes commands that setup your test environment. + pre_test: + commands: + - mkdir $DEVICEFARM_TEST_PACKAGE_PATH/Debug-iphoneos + - unzip $DEVICEFARM_APP_PATH -d /tmp + - mv /tmp/Payload/*.app $DEVICEFARM_TEST_PACKAGE_PATH/Debug-iphoneos/ + + # The test phase includes commands that run your test suite execution. + test: + commands: + - xcodebuild test-without-building -destination id=$DEVICEFARM_DEVICE_UDID -xctestrun $DEVICEFARM_TEST_PACKAGE_PATH/*.xctestrun -derivedDataPath $DEVICEFARM_LOG_DIR + + # The post test phase includes are commands that are run after your tests are executed. + post_test: + commands: + +# The artifacts phase lets you specify the location where your tests logs, device logs will be stored. +# And also let you specify the location of your test logs and artifacts which you want to be collected by Device Farm. +# These logs and artifacts will be available through ListArtifacts API in Device Farm. +artifacts: + # By default, Device Farm will collect your artifacts from following directories + - $DEVICEFARM_LOG_DIR From 4116cb24dbf811d820ad7c6024dea738162f6c4a Mon Sep 17 00:00:00 2001 From: shewu-quic <138087975+shewu-quic@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:16:10 +0800 Subject: [PATCH 083/531] Qualcomm AI Engine Direct - Model sharding for LLM (#4923) For LLM, model size is too large to fit in device memory for inference. Therefore, we need to divide the model into a few parts in order to avoid inference time out-of-memory errors. 
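The split is driven by a new --num_sharding export flag: the exported graph is cut at evenly spaced layer boundaries by inserting a custom llama.fallback op, which the QNN partitioner is configured to skip, so each segment is lowered into its own QNN context. A rough sketch of how the pieces introduced below compose (the helper name, the llm_manager/qnn_partitioner variables, and the layer/share counts are illustrative, not part of this patch):

```python
from executorch.backends.qualcomm.utils.utils import canonicalize_program
from executorch.extension.llm.custom_ops import model_sharding


def lower_sharded(llm_manager, qnn_partitioner, num_layers, num_sharding):
    # Insert the llama.fallback custom op at every num_layers / num_sharding
    # layer boundary; the QNN partitioner skips "llama.fallback.default", so
    # the graph breaks into one lowered partition per shard.
    model_sharding.split_graph(
        llm_manager.edge_manager.exported_program(), num_layers, shares=num_sharding
    )
    builder = llm_manager.to_backend([qnn_partitioner])
    # Needed when the program has been split across several QNN contexts.
    canonicalize_program(builder.edge_manager.exported_program())
    return builder.to_executorch()
```
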
Summary: - Use custom fallback op to split graph - Add splill fill feature - Add model sharding argument for qnn --- .../qualcomm/partition/qnn_partitioner.py | 24 +--- backends/qualcomm/quantizer/quantizer.py | 2 +- examples/models/llama2/export_llama_lib.py | 39 ++++++- extension/llm/custom_ops/model_sharding.py | 104 ++++++++++++++++++ extension/llm/custom_ops/op_fallback.cpp | 48 ++++++++ extension/llm/custom_ops/op_fallback.h | 20 ++++ extension/llm/custom_ops/targets.bzl | 4 +- extension/llm/export/partitioner_lib.py | 11 +- extension/llm/export/quantizer_lib.py | 6 + 9 files changed, 228 insertions(+), 30 deletions(-) create mode 100644 extension/llm/custom_ops/model_sharding.py create mode 100644 extension/llm/custom_ops/op_fallback.cpp create mode 100644 extension/llm/custom_ops/op_fallback.h diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index 73dbede8ff6..659bda517f0 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ b/backends/qualcomm/partition/qnn_partitioner.py @@ -44,16 +44,7 @@ def __init__( ): self.node_visitors = node_visitor.get_node_visitors(edge_program) - self.skip_node_op_builder_set = set() - if skip_node_op_set is not None: - self.skip_node_op_builder_set = set( - [ - self.node_visitors[val] - for val in skip_node_op_set - if val in self.node_visitors - ] - ) - + self.skip_node_op_set = skip_node_op_set self.skip_node_id_set = skip_node_id_set self.nodes_to_wrappers = defaultdict(dict) self.qnn_manager = PyQnnManager.QnnManager( @@ -75,14 +66,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: if node.target in allow_list_operator: return True - if self.skip_node_id_set is not None and node.name in self.skip_node_id_set: - print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") - return False - if ( - self.skip_node_op_builder_set is not None - and self.node_visitors[node.target.__name__] - in self.skip_node_op_builder_set + node.name in self.skip_node_id_set + or node.target.__name__ in self.skip_node_op_set ): print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") return False @@ -124,8 +110,8 @@ def __init__( QnnBackend.__name__, self.compiler_specs_snapshot ) self.partition_tags: Dict[str, DelegationSpec] = {} - self.skip_node_id_set = skip_node_id_set - self.skip_node_op_set = skip_node_op_set + self.skip_node_id_set = set() if skip_node_id_set is None else skip_node_id_set + self.skip_node_op_set = set() if skip_node_op_set is None else skip_node_op_set def generate_partitions( self, edge_program: torch.export.ExportedProgram diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index d51e016473f..e27edf939c8 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -116,7 +116,7 @@ def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: boo if enable: self.use_per_channel_weight_quant_ops.update(ops) else: - self.use_per_channel_weight_quant_ops.difference(ops) + self.use_per_channel_weight_quant_ops.difference_update(ops) def add_16bit_quant_ops(self, ops: Set[OpOverload]) -> None: for op in ops: diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 172a1d72fd7..1dac12cc853 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -193,6 +193,12 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", 
help="Whether or not to export a model using kv cache", ) + parser.add_argument( + "--num_sharding", + type=int, + default=0, + help="Specify the number of splits by inserting the fallback custom op. The graph will be split evenly by layers.", + ) parser.add_argument( "--use_sdpa_with_kv_cache", default=False, @@ -455,6 +461,9 @@ def _validate_args(args): " Please use --disable_dynamic_shape." ) + if args.num_sharding > 0 and not args.qnn: + raise ValueError("Model shard is only supported with qnn backend now.") + def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 _validate_args(args) @@ -501,11 +510,11 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 modelname = f"coreml_{modelname}" if args.qnn: + from executorch.extension.llm.custom_ops import model_sharding + partitioners.append( get_qnn_partitioner( - quant_dtype, - args.use_kv_cache, - args.pt2e_quantize, + args.use_kv_cache, args.pt2e_quantize, args.num_sharding ) ) # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` @@ -514,6 +523,13 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`, Optional type has no attribute `exported_program` _transform(builder_exported_to_edge.edge_manager.exported_program()) + if args.num_sharding > 0: + model_sharding.split_graph( + builder_exported_to_edge.edge_manager.exported_program(), + builder_exported_to_edge.metadata["get_n_layers"], + shares=args.num_sharding, + ) + if args.generate_etrecord: if not builder_exported_to_edge.edge_manager: raise ValueError("Unable to generate etrecord due to missing edge manager.") @@ -521,7 +537,13 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 logging.info("Generating etrecord") # Copy the edge manager which will be serialized into etrecord. This is memory-wise expensive. 
edge_manager_copy = copy.deepcopy(builder_exported_to_edge.edge_manager) - builder = builder_exported_to_edge.to_backend(partitioners).to_executorch() + builder = builder_exported_to_edge.to_backend(partitioners) + if args.num_sharding > 0 and args.qnn: + from executorch.backends.qualcomm.utils.utils import canonicalize_program + + canonicalize_program(builder.edge_manager.exported_program()) + + builder = builder.to_executorch() # Generate ETRecord if edge_manager_copy: @@ -532,7 +554,13 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 ) logging.info("Generated etrecord.bin") else: - builder = builder_exported_to_edge.to_backend(partitioners).to_executorch() + builder = builder_exported_to_edge.to_backend(partitioners) + if args.num_sharding > 0 and args.qnn: + from executorch.backends.qualcomm.utils.utils import canonicalize_program + + canonicalize_program(builder.edge_manager.exported_program()) + + builder = builder.to_executorch() if args.profile_memory: generate_memory_trace(builder.export_program, "memory_profile.json") @@ -575,6 +603,7 @@ def _load_llama_model_metadata( "get_max_seq_len": model_args.max_seq_len, "get_n_bos": 1, "get_n_eos": 2 if is_fairseq2 else 1, + "get_n_layers": model_args.n_layers, "get_vocab_size": model_args.vocab_size, "use_kv_cache": use_kv_cache, "use_sdpa_with_kv_cache": use_sdpa_with_kv_cache, diff --git a/extension/llm/custom_ops/model_sharding.py b/extension/llm/custom_ops/model_sharding.py new file mode 100644 index 00000000000..75d6fd25740 --- /dev/null +++ b/extension/llm/custom_ops/model_sharding.py @@ -0,0 +1,104 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import re +from typing import List + +import torch + +from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.export.exported_program import ExportedProgram +from torch.library import impl, Library + + +fallback_op_lib = Library("llama", "DEF") +# registering an operator. +fallback_op_lib.define("fallback(Tensor input) -> Tensor") + + +@impl(fallback_op_lib, "fallback") +def fallback_impl(a: torch.Tensor) -> torch.Tensor: + return a + + +# registering the out variant. +fallback_op_lib.define("fallback.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)") + + +@impl(fallback_op_lib, "fallback.out") +def fallback_out_impl(a: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor: + out.copy_(a) + return out + + +class SplitGraph(ExportPass): + """ + Class to split the model to multiple partitions. + Because there is limited memory on the device, it could + not load all llama model in one pte. + """ + + def __init__(self, shard_layers: List[int]): + super().__init__() + self.shard_layers = shard_layers + + def _insert_fallback_op( + self, graph_module: torch.fx.GraphModule + ) -> torch.fx.GraphModule: + """ + Insert fallback op before layer that needs to be shard. + Example: + There is 12 layers llama model and num_sharding is 3. + The first partition will contain layers [0, 4) and embedding. + The second partition will contain layers [4, 8). + The third partition will contain layers [8, 12) and output. 
+ """ + pattern = r"layers.(\d+)" + prev_node = None + prev_layer = None + for node in graph_module.graph.nodes: + if node.op != "call_function" or "nn_module_stack" not in node.meta: + continue + + module_values_list = list(node.meta["nn_module_stack"].values()) + full_qualified_name = module_values_list[-1][0] + # Search which layer this node belongs to + match = re.search(pattern, full_qualified_name) + if match is None: + continue + + cur_layer = int(match.group(1)) + # Check the current node which is the last node of the layer + if cur_layer in self.shard_layers and prev_layer == cur_layer - 1: + with graph_module.graph.inserting_after(prev_node): + users = list(prev_node.users.keys()) + inserted_node = graph_module.graph.create_node( + "call_function", + exir_ops.edge.llama.fallback.default, + (prev_node,), + ) + inserted_node.meta["val"] = prev_node.meta["val"] + if prev_node.meta.get(QCOM_QUANT_ATTRS, None): + inserted_node.meta[QCOM_QUANT_ATTRS] = prev_node.meta[ + QCOM_QUANT_ATTRS + ] + for user in users: + user.replace_input_with(prev_node, inserted_node) + + prev_layer = cur_layer + prev_node = node + + def call(self, graph_module: torch.fx.GraphModule): + self._insert_fallback_op(graph_module) + graph_module.recompile() + return PassResult(graph_module, True) + + +def split_graph(edge_program: ExportedProgram, num_layers: int, shares: int): + graph_module = edge_program.graph_module + shard_layers = list(range(0, num_layers, int(num_layers / shares))) + return SplitGraph(shard_layers)(graph_module) diff --git a/extension/llm/custom_ops/op_fallback.cpp b/extension/llm/custom_ops/op_fallback.cpp new file mode 100644 index 00000000000..4eb10642f37 --- /dev/null +++ b/extension/llm/custom_ops/op_fallback.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include + +namespace torch { +namespace executor { + +namespace native { + +// Copy from op_clone.cpp +Tensor& fallback_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, in.sizes()) == torch::executor::Error::Ok, + InvalidArgument, + out); + + // The input and out shall share same dtype and size + ET_KERNEL_CHECK( + ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); + + if (in.nbytes() > 0) { + // Note that this check is important. It's valid for a tensor with numel 0 + // to have a null data pointer, but in some environments it's invalid to + // pass a null pointer to memcpy() even when the size is zero. + memcpy(out.mutable_data_ptr(), in.const_data_ptr(), in.nbytes()); + } + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch + +EXECUTORCH_LIBRARY( + llama, + "fallback.out", + torch::executor::native::fallback_out); diff --git a/extension/llm/custom_ops/op_fallback.h b/extension/llm/custom_ops/op_fallback.h new file mode 100644 index 00000000000..62a2c0d53eb --- /dev/null +++ b/extension/llm/custom_ops/op_fallback.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace torch { +namespace executor { + +namespace native { +Tensor& fallback_out(RuntimeContext& ctx, const Tensor& in, Tensor& out); +} // namespace native +} // namespace executor +} // namespace torch diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index fe93f6a422d..55273c1f4ff 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -8,8 +8,8 @@ def define_common_targets(): """ runtime.cxx_library( name = "custom_ops", - srcs = ["op_sdpa.cpp"], - exported_headers = ["op_sdpa.h"], + srcs = ["op_sdpa.cpp", "op_fallback.cpp"], + exported_headers = ["op_sdpa.h", "op_fallback.h"], exported_deps = [ "//executorch/runtime/kernel:kernel_includes", "//executorch/kernels/portable/cpu:scalar_utils", diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index ab98f2543f7..e75d5bef3fb 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -105,7 +105,9 @@ def get_coreml_partitioner( def get_qnn_partitioner( - quant_dtype, use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None + use_kv_cache: bool = False, + pt2e_quantize: Optional[str] = None, + num_sharding: int = 0, ): assert ( use_kv_cache is True @@ -132,7 +134,7 @@ def get_qnn_partitioner( ) use_fp16 = True - skip_node_op_set = {} + skip_node_op_set = {"llama.fallback.default"} if pt2e_quantize is not None: use_fp16 = False @@ -140,7 +142,10 @@ def get_qnn_partitioner( generate_qnn_executorch_compiler_spec( # pyre-fixme[16] soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] # pyre-fixme[16] - backend_options=generate_htp_compiler_spec(use_fp16=use_fp16), + backend_options=generate_htp_compiler_spec( + use_fp16=use_fp16, + use_multi_contexts=num_sharding > 0, + ), debug=False, saver=False, ), diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 36d2f630b03..76a2bc97d3e 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -177,6 +177,12 @@ def get_qnn_quantizer( quant_dtype = QuantDtype.use_8a8w # pyre-fixme[16] elif quant_config == "16a16w": quant_dtype = QuantDtype.use_16a16w # pyre-fixme[16] + # Due to the error with 16a16w in Qnn Htp, we need to disable per channel linear quantization when use 16a16w + # TODO: enable it after the issue is fixed + logging.warn( + "Disable per channel quantization for linear due to the error with QNN HTP 16a16w." + ) + qnn_quantizer.set_per_channel_linear_quant(enable=False) qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS) qnn_quantizer.set_bit16_op_quant_config( # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. From 5205cb2e0b28ae07fe7f6ee9ba6dff15604d19e3 Mon Sep 17 00:00:00 2001 From: Chester Hu Date: Wed, 28 Aug 2024 10:05:09 -0700 Subject: [PATCH 084/531] Update setup-with-qnn.sh (#4943) Add flags for: 1. Executorch to use tiktoken 2. Quantized kernels 3. 
Custom kernels Fix the build error --- examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index c6ada79553a..86a9e051c65 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -8,6 +8,7 @@ set -eu CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" +EXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN:-OFF}" # Note: Set up ANDROID_NDK and ANDROID_ABI cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ @@ -15,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ @@ -32,6 +34,7 @@ cmake examples/models/llama2 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="$ANDROID_ABI" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 From e636ef66244199589313b690b155f4eba3b4a86a Mon Sep 17 00:00:00 2001 From: lucylq Date: Wed, 28 Aug 2024 10:12:12 -0700 Subject: [PATCH 085/531] Move preprocess into subdir Differential Revision: D61833185 Pull Request resolved: https://github.com/pytorch/executorch/pull/4927 --- examples/models/flamingo/preprocess/__init__.py | 0 examples/models/flamingo/{ => preprocess}/export_preprocess.py | 0 .../models/flamingo/{ => preprocess}/export_preprocess_lib.py | 0 examples/models/flamingo/{ => preprocess}/test_preprocess.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/models/flamingo/preprocess/__init__.py rename examples/models/flamingo/{ => preprocess}/export_preprocess.py (100%) rename examples/models/flamingo/{ => preprocess}/export_preprocess_lib.py (100%) rename examples/models/flamingo/{ => preprocess}/test_preprocess.py (100%) diff --git a/examples/models/flamingo/preprocess/__init__.py b/examples/models/flamingo/preprocess/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/models/flamingo/export_preprocess.py b/examples/models/flamingo/preprocess/export_preprocess.py similarity index 100% rename from examples/models/flamingo/export_preprocess.py rename to examples/models/flamingo/preprocess/export_preprocess.py diff --git a/examples/models/flamingo/export_preprocess_lib.py b/examples/models/flamingo/preprocess/export_preprocess_lib.py similarity index 100% rename from examples/models/flamingo/export_preprocess_lib.py rename to examples/models/flamingo/preprocess/export_preprocess_lib.py diff --git a/examples/models/flamingo/test_preprocess.py b/examples/models/flamingo/preprocess/test_preprocess.py similarity index 100% rename from examples/models/flamingo/test_preprocess.py rename to examples/models/flamingo/preprocess/test_preprocess.py From 65d552f5cc524093851b44b3a250e1ee13d90b56 Mon Sep 17 00:00:00 2001 From: Lunwen He Date: Wed, 28 Aug 2024 11:28:35 -0700 Subject: [PATCH 086/531] fix typo in doc Differential Revision: D61921827 Pull Request resolved: https://github.com/pytorch/executorch/pull/4944 --- docs/source/kernel-library-custom-aten-kernel.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 8fb4ed96cd5..2cf87ca4588 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -10,7 +10,7 @@ Portable kernel library is the in-house default kernel library, it’s easy to u **What do we support?** On the operator coverage side, the kernel registration APIs allow users to register kernels for all core ATen ops as well as custom ops, as long as the custom ops schemas are specified. -Notice that we also support _partial kernels, _for example the kernel only supports a subset of tensor dtypes and/or dim orders. +Notice that we also support partial kernels, for example the kernel only supports a subset of tensor dtypes and/or dim orders. **Kernel contract**: kernels need to comply with the following requirements: From 8c4427c72daffc85dd043ec13e30c74d1e43a411 Mon Sep 17 00:00:00 2001 From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:34:43 +0200 Subject: [PATCH 087/531] Enable permute_memory_to_nhwc for corstone300 unittests Differential Revision: D61480408 Pull Request resolved: https://github.com/pytorch/executorch/pull/4773 --- backends/arm/runtime/ArmBackendEthosU.cpp | 94 ++++++++++++------- backends/arm/test/ops/test_add.py | 6 +- backends/arm/test/ops/test_conv.py | 4 - backends/arm/test/ops/test_linear.py | 12 +-- backends/arm/test/runner_utils.py | 5 +- backends/arm/test/tester/arm_tester.py | 10 +- .../executor_runner/arm_executor_runner.cpp | 3 +- 7 files changed, 85 insertions(+), 49 deletions(-) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index d6e61e0a0d9..74042935515 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -148,8 +148,9 @@ class ArmBackend final : public PyTorchBackendInterface { if (both_char and permuted_input_shape) { // permuted byte copy CHW to HWC permute_CHW_to_HWC( - scratch_addr, tensor_in.mutable_data_ptr(), + scratch_addr, + tensor_in.size(1), tensor_in.size(2), tensor_in.size(3)); } else if (both_char or both_int) { @@ -204,13 +205,31 @@ class ArmBackend final : public PyTorchBackendInterface { // Process input EValue into scratch // Outputs are in the index immediately after inputs auto tensor_out = args[handles.inputs->count + i]->toTensor(); - for (int j = 0; j < tensor_out.numel(); j++) { - if (tensor_out.scalar_type() == ScalarType::Char) { - char* output_address = (char*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; - } else { - int* output_address = (int*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; + bool permuted_output_shape; + ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( + i, + tensor_out, + &handles.outputs->io[i], + execution_handle->permuted_io_flag, + &permuted_output_shape)); + if (tensor_out.scalar_type() == ScalarType::Char and + permuted_output_shape) { + char* output_address = (char*)output_addr; + permute_HWC_to_CHW( + output_address, + tensor_out.mutable_data_ptr(), + tensor_out.size(1), + tensor_out.size(2), + tensor_out.size(3)); + } else { + for (int j = 0; j < tensor_out.numel(); j++) { + if (tensor_out.scalar_type() == ScalarType::Char) { + char* output_address = (char*)output_addr; + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } else { + int* output_address = (int*)output_addr; + 
tensor_out.mutable_data_ptr()[j] = output_address[j]; + } } } } @@ -225,51 +244,62 @@ class ArmBackend final : public PyTorchBackendInterface { private: Error check_requires_permute( int index, - const exec_aten::Tensor tensor_in, - VelaIO* input, + const exec_aten::Tensor tensor, + VelaIO* io, bool permuted_io_flag, bool* is_permuted) const { - bool permuted_input_shape = false; - if (tensor_in.dim() == 4) { + bool permuted_shape = false; + if (tensor.dim() == 4) { // special case for NHWC workaround in AOT; as the compilation has // permuted to channel last in an undetectable way, we assume here - // that the application has similarly permuted any input tensors. - permuted_input_shape = tensor_in.size(0) == input->shape[0] && - tensor_in.size(1) == input->shape[3] && - tensor_in.size(2) == input->shape[1] && - tensor_in.size(3) == input->shape[2]; - if (permuted_input_shape) { - ET_LOG(Info, "Tensor input %d will be permuted", index); + // that the application has similarly permuted any input/output tensors. + permuted_shape = tensor.size(0) == io->shape[0] && + tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] && + tensor.size(3) == io->shape[2]; + if (permuted_shape) { + ET_LOG(Info, "Tensor input/output %d will be permuted", index); } - if (permuted_io_flag != permuted_input_shape) { - ET_LOG(Error, "Permute compile flag and permuted input don't agree"); + if (permuted_io_flag != permuted_shape) { + ET_LOG( + Error, + "Permute compile flag and permuted input/output don't agree"); return Error::InvalidProgram; } } - if (!permuted_input_shape) { + if (!permuted_shape) { // Error check matching shapes in the general case - for (int i = 0; i < tensor_in.dim(); i++) { - if (tensor_in.size(i) != input->shape[i]) { - ET_LOG(Error, "Tensor input %d mismatched shape", index); + for (int i = 0; i < tensor.dim(); i++) { + if (tensor.size(i) != io->shape[i]) { + ET_LOG(Error, "Tensor input/output %d mismatched shape", index); ET_LOG( Error, "dimension %d mismatch, %zd != %d", index, - tensor_in.size(i), - input->shape[i]); + tensor.size(i), + io->shape[i]); return Error::InvalidProgram; } } } - *is_permuted = permuted_input_shape; + *is_permuted = permuted_shape; return Error::Ok; } - void permute_CHW_to_HWC(char* input, char* output, int H, int W) const { + void permute_CHW_to_HWC(char* input, char* output, int C, int H, int W) + const { for (int i = 0; i != H * W; ++i) { - output[i * 3 + 0] = input[i + 0 * W * H]; - output[i * 3 + 1] = input[i + 1 * W * H]; - output[i * 3 + 2] = input[i + 2 * W * H]; + for (int j = 0; j < C; ++j) { + output[i * C + j] = input[i + j * W * H]; + } + } + } + + void permute_HWC_to_CHW(char* input, char* output, int C, int H, int W) + const { + for (int i = 0; i != H * W; ++i) { + for (int j = 0; j < C; ++j) { + output[i + j * W * H] = input[i * C + j]; + } } } }; diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 3bd2b2605c4..63023327f79 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -37,9 +37,9 @@ class Add2(torch.nn.Module): torch.FloatTensor([1, 2, 3, 5, 7]), (torch.FloatTensor([2, 1, 2, 1, 10])), ), - (torch.ones(1, 1, 4, 4), torch.ones(1, 1, 4, 4)), + (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)), (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), - (torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), + (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), ] @@ -101,7 +101,7 @@ def 
_test_add_u55_BI_pipeline( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 9ebfe77da2c..ae1c5a65a83 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import List, Tuple, Union @@ -15,9 +14,6 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - class Conv2d(torch.nn.Module): """ diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 33f62955ecd..6fdbb2127e0 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -26,17 +26,17 @@ ( "model_linear_rank1_zeros", torch.zeros(10), - 10, + 15, ), ( "model_linear_rank1_ones", torch.ones(10), - 10, + 15, ), ( "model_linear_rank1_negative_ones", torch.ones(10) * (-1), - 10, + 20, ), ( "model_linear_rank1_rand", @@ -46,12 +46,12 @@ ( "model_linear_rank1_negative_large_rand", torch.rand(10) * (-100), - 10, + 30, ), ( "model_linear_rank1_large_randn", - torch.randn(10) * 100, - 10, + torch.randn(15) * 100, + 20, ), ] diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 930fc0adf10..c8259c38d1e 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -265,9 +265,12 @@ def run_corstone300( raise RuntimeError( f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}" ) + elif "E [" in result_stdout: + logger.error(result_stdout) tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32) - tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(inputs[0].shape) + output_shape = self.output_node.args[0][0].meta["val"].shape + tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(output_shape) return [tosa_ref_output] def run_tosa_ref_model( diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 8a02c63d7a6..98fac29144c 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -252,7 +252,10 @@ def run_method_and_compare_outputs( if isinstance(arg, tuple) and isinstance(arg[0], torch.Tensor): test_input.extend(list(arg)) - if is_nhwc: + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): test_input = self.transpose_data_format(test_input, "NHWC") input_shapes = [ @@ -263,7 +266,10 @@ def run_method_and_compare_outputs( reference_output = reference_stage.run_artifact(reference_input) test_output = tuple(test_stage.run_artifact(test_input)) - if is_nhwc: + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): test_output = self.transpose_data_format(test_output, "NCHW") self._compare_outputs( diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 6256ff47cf6..8605038936a 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -71,7 +71,8 @@ void et_pal_emit_log_message( size_t line, const char* 
message, ET_UNUSED size_t length) { - fprintf(stderr, "%c executorch:%s:%zu] %s\n", level, filename, line, message); + fprintf( + stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message); } namespace { From 89c499e86cb4e29e75d9345ea069203625def4a3 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Wed, 28 Aug 2024 11:41:38 -0700 Subject: [PATCH 088/531] Remove util/util.h Differential Revision: D61823549 Pull Request resolved: https://github.com/pytorch/executorch/pull/4919 --- backends/cadence/cadence_runner/targets.bzl | 1 - docs/source/Doxyfile | 3 +- examples/apple/coreml/executor_runner/main.mm | 1 - .../llama_runner/LlamaModelChunk.cpp | 3 +- .../llama_runner/LlamaModelChunk.h | 3 +- .../llama_runner/ModelChunk.cpp | 3 +- .../mtk_llama_executor_runner.cpp | 1 - .../executor_runner/qnn_executor_runner.cpp | 1 - .../sdk_example_runner/sdk_example_runner.cpp | 1 - examples/sdk/sdk_example_runner/targets.bzl | 1 - .../test/demos/rpc/ExecutorBackend.cpp | 1 - exir/backend/test/demos/rpc/targets.bzl | 1 - extension/training/test/targets.bzl | 1 - .../core/exec_aten/testing_util/targets.bzl | 1 - runtime/executor/test/targets.bzl | 1 - .../extension/pybindings/pybindings.bzl | 1 - util/targets.bzl | 20 ------- util/util.h | 58 ------------------- 18 files changed, 4 insertions(+), 98 deletions(-) delete mode 100644 util/util.h diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl index 361fe9712ee..b59a98cd75a 100644 --- a/backends/cadence/cadence_runner/targets.bzl +++ b/backends/cadence/cadence_runner/targets.bzl @@ -25,6 +25,5 @@ def define_common_targets(): "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader", "fbsource//xplat/executorch/kernels/portable:generated_lib", "fbsource//xplat/executorch/runtime/executor:program", - "fbsource//xplat/executorch/util:util", ], ) diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index b741509197d..e662105b83f 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -964,8 +964,7 @@ INPUT = ../runtime/executor/memory_manager.h \ ../runtime/core/tensor_shape_dynamism.h \ ../runtime/platform/compiler.h \ ../runtime/executor/ \ - ../runtime/platform/ \ - ../util/ + ../runtime/platform/ diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm index 2475d68fa9b..c83287fb44d 100644 --- a/examples/apple/coreml/executor_runner/main.mm +++ b/examples/apple/coreml/executor_runner/main.mm @@ -14,7 +14,6 @@ #import #import #import -#import #import #import #import diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp index 2096bdea62e..c2d75fd30ec 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include "LlamaConfig.h" #include "LlamaModelChunk.h" @@ -344,4 +343,4 @@ void LlamaModelChunk::InitCache() { } } -} // namespace torch::executor \ No newline at end of file +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h index 8cba55b8468..c8955378cbf 100644 --- a/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h +++ b/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h @@ -19,7 +19,6 @@ #include #include #include -#include 
#include "LlamaConfig.h" #include "ModelChunk.h" @@ -136,4 +135,4 @@ class LlamaModelChunk : public ModelChunk { size_t mCurrentTokenIndex = 0; }; -} // namespace torch::executor \ No newline at end of file +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp index aebb6b9c0b7..b09e2c58767 100644 --- a/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp +++ b/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #define ENSURE_INIT \ ET_CHECK_MSG(Initialized(), "Error: Model chunk not initialized."); @@ -573,4 +572,4 @@ void ModelChunk::ReleaseModelInstance(void* modelInstance) { } } -} // namespace torch::executor \ No newline at end of file +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index b605dd13bec..370695cb773 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -60,7 +60,6 @@ #include #include #include -#include #include "llama_runner/LlamaConfig.h" #include "llama_runner/LlamaRuntime.h" diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index c2a6c2c46c6..f1c84bc6650 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include diff --git a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp b/examples/sdk/sdk_example_runner/sdk_example_runner.cpp index 7e979937d1b..fc47d17f42b 100644 --- a/examples/sdk/sdk_example_runner/sdk_example_runner.cpp +++ b/examples/sdk/sdk_example_runner/sdk_example_runner.cpp @@ -29,7 +29,6 @@ #include #include #include -#include static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB static constexpr size_t kBundledAllocatorPoolSize = 16 * 1024U; diff --git a/examples/sdk/sdk_example_runner/targets.bzl b/examples/sdk/sdk_example_runner/targets.bzl index 025d42fee1a..680bdacc40c 100644 --- a/examples/sdk/sdk_example_runner/targets.bzl +++ b/examples/sdk/sdk_example_runner/targets.bzl @@ -19,7 +19,6 @@ def define_common_targets(): "//executorch/runtime/executor:program", "//executorch/extension/data_loader:file_data_loader", "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/util:util", "//executorch/devtools/etdump:etdump_flatcc", "//executorch/devtools/bundled_program:runtime", ], diff --git a/exir/backend/test/demos/rpc/ExecutorBackend.cpp b/exir/backend/test/demos/rpc/ExecutorBackend.cpp index ae5bf1d733b..0bc85a685e9 100644 --- a/exir/backend/test/demos/rpc/ExecutorBackend.cpp +++ b/exir/backend/test/demos/rpc/ExecutorBackend.cpp @@ -20,7 +20,6 @@ #include #include #include -#include namespace torch { namespace executor { diff --git a/exir/backend/test/demos/rpc/targets.bzl b/exir/backend/test/demos/rpc/targets.bzl index 71df2d8176e..67935e0e373 100644 --- a/exir/backend/test/demos/rpc/targets.bzl +++ b/exir/backend/test/demos/rpc/targets.bzl @@ -27,7 +27,6 @@ def define_common_targets(): "//executorch/kernels/portable:generated_lib", "//executorch/runtime/backend:interface", "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/util:util", ] + 
MODELS_ATEN_OPS_LEAN_MODE_GENERATED_LIB, exported_deps = [ "//executorch/runtime/core:core", diff --git a/extension/training/test/targets.bzl b/extension/training/test/targets.bzl index a4ec07a37f9..22107409c2a 100644 --- a/extension/training/test/targets.bzl +++ b/extension/training/test/targets.bzl @@ -25,7 +25,6 @@ def define_common_targets(is_fbcode = False): ], deps = [ "//executorch/runtime/executor:program", - "//executorch/util:util", "//executorch/extension/data_loader:file_data_loader", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", "//executorch/extension/evalue_util:print_evalue", diff --git a/runtime/core/exec_aten/testing_util/targets.bzl b/runtime/core/exec_aten/testing_util/targets.bzl index 9b5249e8371..57764771afb 100644 --- a/runtime/core/exec_aten/testing_util/targets.bzl +++ b/runtime/core/exec_aten/testing_util/targets.bzl @@ -32,7 +32,6 @@ def define_common_targets(): "//executorch/kernels/test/...", "//executorch/runtime/core/test/...", "//executorch/test/...", - "//executorch/util/...", "//executorch/backends/fb/qnnpack/test/...", "//executorch/extension/kernel_util/test/...", "@EXECUTORCH_CLIENTS", diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index a329a4884ef..d6e3bc3d89d 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -175,7 +175,6 @@ def define_common_targets(is_fbcode = False): ":managed_memory_manager", "//executorch/runtime/executor:program", "//executorch/runtime/kernel:operator_registry", - "//executorch/util:util", "//executorch/extension/data_loader:file_data_loader", ], env = modules_env, diff --git a/shim/xplat/executorch/extension/pybindings/pybindings.bzl b/shim/xplat/executorch/extension/pybindings/pybindings.bzl index ac5f126706c..5ef9fe59266 100644 --- a/shim/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim/xplat/executorch/extension/pybindings/pybindings.bzl @@ -16,7 +16,6 @@ PORTABLE_MODULE_DEPS = [ "//executorch/extension/data_loader:buffer_data_loader", "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", - "//executorch/util:util", "//executorch/runtime/executor/test:test_backend_compiler_lib", "//executorch/devtools/etdump:etdump_flatcc", ] + get_all_cpu_backend_targets() diff --git a/util/targets.bzl b/util/targets.bzl index 6797462e189..5c5d7401d51 100644 --- a/util/targets.bzl +++ b/util/targets.bzl @@ -7,26 +7,6 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): - aten_suffix = ("_aten" if aten_mode else "") - - # DEPRECATED: Remove this once all users have migrated to - # extension/runner_util:inputs. - runtime.cxx_library( - name = "util" + aten_suffix, - srcs = [], - exported_headers = ["util.h"], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/extension/runner_util:inputs" + aten_suffix, - "//executorch/runtime/core:core", - "//executorch/runtime/executor:program" + aten_suffix, - ], - ) - if not runtime.is_oss: runtime.python_library( name = "python_profiler", diff --git a/util/util.h b/util/util.h deleted file mode 100644 index 4974afdf167..00000000000 --- a/util/util.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -/** - * @file - * DEPRECATED: Do not use this file or add new functions to it. - */ - -#pragma once - -#include -#include -#include - -namespace torch { -namespace executor { -namespace util { - -/** - * DEPRECATED: Use prepare_input_tensors() instead. - * - * Allocates input tensors for the provided Method, filling them with ones. - * - * @param[in] method The Method that owns the inputs to prepare. - * @returns An array of pointers that must be passed to `FreeInputs()` after - * the Method is no longer needed. - */ -ET_DEPRECATED -inline exec_aten::ArrayRef PrepareInputTensors(Method& method) { - Result inputs = prepare_input_tensors(method); - ET_CHECK(inputs.ok()); - // A hack to work with the deprecated signature. Return an ArrayRef that - // points to a single BufferCleanup. - return { - reinterpret_cast(new BufferCleanup(std::move(inputs.get()))), 1}; -} - -/** - * DEPRECATED: Use prepare_input_tensors() instead, which does not need this. - * - * Frees memory that was allocated by `PrepareInputTensors()`. - */ -ET_DEPRECATED -inline void FreeInputs(exec_aten::ArrayRef inputs) { - ET_CHECK(inputs.size() == 1); - // A hack to work with the deprecated signature. The ArrayRef points to a - // single BufferCleanup for us to delete. - delete reinterpret_cast(const_cast(inputs.data())); -} - -} // namespace util -} // namespace executor -} // namespace torch From 89a24e01a9a957f115d79fdbd5ae73f8abe33729 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 28 Aug 2024 11:53:15 -0700 Subject: [PATCH 089/531] dont emit non mutable weights Differential Revision: D61888453 Pull Request resolved: https://github.com/pytorch/executorch/pull/4938 --- exir/passes/weights_to_outputs_pass.py | 6 ++++-- exir/tests/test_joint_graph.py | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/exir/passes/weights_to_outputs_pass.py b/exir/passes/weights_to_outputs_pass.py index 216830c2e6c..aaf0c0eb5dc 100644 --- a/exir/passes/weights_to_outputs_pass.py +++ b/exir/passes/weights_to_outputs_pass.py @@ -53,11 +53,13 @@ def weights_to_outputs_pass( break assert output_node is not None - # Get place holder nodes with gradients + # Get input nodes that are weights with an associated gradient placeholder_nodes = [ node for node in gm.graph.nodes - if node.op == "placeholder" and node.target in inputs_to_params.keys() + if node.op == "placeholder" + and node.target in inputs_to_params.keys() + and inputs_to_params[node.target] in grad_targets ] # Flag these placeholder nodes as having a gradient attached so that memory planning will operate on them. 
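With this change, a placeholder is appended as an output only if its parameter
actually receives a gradient, so weights frozen before export are left out. A
minimal sketch of the user-facing effect (illustrative only; it mirrors the
updated test below):

    import torch

    class Module(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(3, 3)   # trainable: gradient output emitted
            self.frozen = torch.nn.Linear(3, 3)   # frozen: no gradient output
            for p in self.frozen.parameters():
                p.requires_grad = False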
diff --git a/exir/tests/test_joint_graph.py b/exir/tests/test_joint_graph.py index fb09d995716..2413e2b4980 100644 --- a/exir/tests/test_joint_graph.py +++ b/exir/tests/test_joint_graph.py @@ -26,10 +26,13 @@ class Module(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(3, 3) + self.linear_no_train = torch.nn.Linear(3, 3) + for param in self.linear_no_train.parameters(): + param.requires_grad = False self.loss = torch.nn.CrossEntropyLoss() def forward(self, x, y): - return self.loss(self.linear(x).softmax(dim=0), y) + return self.loss(self.linear_no_train(self.linear(x)).softmax(dim=0), y) m = Module() example_inputs = (torch.ones(3), torch.tensor([1.0, 0.0, 0.0])) From 88edab839e3018f468360fe33f19b8478a09fb12 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:58:34 -0400 Subject: [PATCH 090/531] Add op: pixel_unshuffle Differential Revision: D60978345 Pull Request resolved: https://github.com/pytorch/executorch/pull/4631 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_pixel_shuffle.cpp | 92 +++++++------ kernels/portable/cpu/op_pixel_unshuffle.cpp | 104 ++++++++++++++ kernels/portable/cpu/util/copy_ops_util.cpp | 36 +++++ kernels/portable/cpu/util/copy_ops_util.h | 11 ++ kernels/portable/functions.yaml | 5 + kernels/test/op_pixel_shuffle_test.cpp | 7 +- kernels/test/op_pixel_unshuffle_test.cpp | 130 ++++++++++++++++++ kernels/test/targets.bzl | 3 +- .../kernels/portable/op_registration_util.bzl | 6 + 10 files changed, 346 insertions(+), 50 deletions(-) create mode 100644 kernels/portable/cpu/op_pixel_unshuffle.cpp create mode 100644 kernels/test/op_pixel_unshuffle_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index f95169a068e..7c0a0516dd6 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -275,6 +275,8 @@ - op: pixel_shuffle.out +- op: pixel_unshuffle.out + - op: pow.Scalar_out - op: pow.Tensor_Tensor_out diff --git a/kernels/portable/cpu/op_pixel_shuffle.cpp b/kernels/portable/cpu/op_pixel_shuffle.cpp index 87217cbb9e1..104348f3fed 100644 --- a/kernels/portable/cpu/op_pixel_shuffle.cpp +++ b/kernels/portable/cpu/op_pixel_shuffle.cpp @@ -12,6 +12,49 @@ namespace torch { namespace executor { namespace native { +namespace { + +template +void pixel_shuffle_impl(const Tensor& in, int64_t upscale_factor, Tensor& out) { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + const auto leading_dims = getLeadingDims(in, in.dim() - 3); + const auto channels = in.size(in.dim() - 3); + const auto height = in.size(in.dim() - 2); + const auto width = in.size(in.dim() - 1); + + const auto sub_channels = channels / (upscale_factor * upscale_factor); + const auto S = upscale_factor; + + // input strides + const auto stride_n = channels * height * width; + const auto stride_c = S * S * height * width; + const auto stride_s1 = S * height * width; + const auto stride_s2 = height * width; + const auto stride_h = width; + + // input tensor shape of [n, c, s1, s2, h, w] + // output tensor shape of [n, c, h, s1, w, s2] + size_t i = 0; + for (size_t n = 0; n < leading_dims; n++) { + for (size_t c = 0; c < sub_channels; c++) { + for (size_t h = 0; h < height; h++) { + for (size_t s1 = 0; s1 < S; s1++) { + for (size_t w = 0; w < width; w++) { + for (size_t s2 = 0; s2 < S; s2++) { + size_t input_offset = n * stride_n + c * stride_c + + s1 * stride_s1 + s2 * stride_s2 + h * stride_h 
+ w; + out_data[i++] = in_data[input_offset]; + } + } + } + } + } + } +} + +} // namespace using SizesType = exec_aten::SizesType; using Tensor = exec_aten::Tensor; @@ -29,11 +72,6 @@ Tensor& pixel_shuffle_out( InvalidArgument, out); - const Tensor::SizesType leading_dims = getLeadingDims(in, in.dim() - 3); - const Tensor::SizesType channels = in.size(in.dim() - 3); - const Tensor::SizesType height = in.size(in.dim() - 2); - const Tensor::SizesType width = in.size(in.dim() - 1); - Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_pixel_shuffle_out_target_size( @@ -46,47 +84,13 @@ Tensor& pixel_shuffle_out( InvalidArgument, out); + constexpr auto name = "pixel_shuffle.out"; + const auto in_type = out.scalar_type(); // in and out must be the same dtype - ET_SWITCH_ALL_TYPES( - in_type, - ctx, - "pixel_shuffle.out", - CTYPE, - [leading_dims, channels, height, width, upscale_factor, &in, &out] { - const CTYPE* const in_data = in.const_data_ptr(); - CTYPE* const out_data = out.mutable_data_ptr(); - - const int64_t sub_channels = - channels / (upscale_factor * upscale_factor); - const int64_t S = upscale_factor; - - // input strides - int64_t stride_n = channels * height * width; - int64_t stride_c = S * S * height * width; - int64_t stride_s1 = S * height * width; - int64_t stride_s2 = height * width; - int64_t stride_h = width; - - // input tensor shape of [n, c, s1, s2, h, w] - // output tensor shape of [n, c, h, s1, w, s2] - size_t i = 0; - for (size_t n = 0; n < leading_dims; n++) { - for (size_t c = 0; c < sub_channels; c++) { - for (size_t h = 0; h < height; h++) { - for (size_t s1 = 0; s1 < S; s1++) { - for (size_t w = 0; w < width; w++) { - for (size_t s2 = 0; s2 < S; s2++) { - int64_t input_offset = n * stride_n + c * stride_c + - s1 * stride_s1 + s2 * stride_s2 + h * stride_h + w; - out_data[i++] = in_data[input_offset]; - } - } - } - } - } - } - }); + ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&]() { + pixel_shuffle_impl(in, upscale_factor, out); + }); return out; } diff --git a/kernels/portable/cpu/op_pixel_unshuffle.cpp b/kernels/portable/cpu/op_pixel_unshuffle.cpp new file mode 100644 index 00000000000..a0f86f9050f --- /dev/null +++ b/kernels/portable/cpu/op_pixel_unshuffle.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace { + +template +void pixel_unshuffle_impl( + const Tensor& in, + int64_t downscale_factor, + Tensor& out) { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + const auto leading_dims = getLeadingDims(in, in.dim() - 3); + const auto channels = out.size(in.dim() - 3); + const auto height = out.size(in.dim() - 2); + const auto width = out.size(in.dim() - 1); + + const auto S = downscale_factor; + const auto sub_channels = channels / (S * S); + + // output strides + const auto stride_n = channels * height * width; + const auto stride_c = S * S * height * width; + const auto stride_s1 = S * height * width; + const auto stride_s2 = height * width; + const auto stride_h = width; + + // input tensor shape of [n, c, h, s1, w, s2] + // output tensor shape of [n, c, s1, s2, h, w] + size_t i = 0; + for (size_t n = 0; n < leading_dims; n++) { + for (size_t c = 0; c < sub_channels; c++) { + for (size_t h = 0; h < height; h++) { + for (size_t s1 = 0; s1 < S; s1++) { + for (size_t w = 0; w < width; w++) { + for (size_t s2 = 0; s2 < S; s2++) { + size_t output_offset = n * stride_n + c * stride_c + + s1 * stride_s1 + s2 * stride_s2 + h * stride_h + w; + out_data[output_offset] = in_data[i++]; + } + } + } + } + } + } +} + +} // namespace + +using SizesType = exec_aten::SizesType; +using Tensor = exec_aten::Tensor; + +Tensor& pixel_unshuffle_out( + RuntimeContext& ctx, + const Tensor& in, + int64_t downscale_factor, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + check_pixel_unshuffle_args(in, downscale_factor, out), + InvalidArgument, + out); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_pixel_unshuffle_out_target_size( + in, downscale_factor, expected_out_size, &expected_out_dim); + + // Make sure the output tensor is the right size. 
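+  // For a 4-D input of shape [N, C, H, W], the expected size computed above is
+  // [N, C * S * S, H / S, W / S], where S is the downscale factor.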
+ ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "pixel_unshuffle.out"; + + const auto in_type = out.scalar_type(); + // in and out must be the same dtype + ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&]() { + pixel_unshuffle_impl(in, downscale_factor, out); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index 314e38c2b53..bcd72d96a3b 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -325,6 +325,19 @@ bool check_pixel_shuffle_args( return true; } +bool check_pixel_unshuffle_args( + const Tensor& in, + int64_t downscale_factor, + Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_greater_or_equal_to(in, 3)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_greater_or_equal_to(out, 3)); + ET_LOG_AND_RETURN_IF_FALSE(downscale_factor > 0); + ET_LOG_AND_RETURN_IF_FALSE(in.size(in.dim() - 1) % downscale_factor == 0); + ET_LOG_AND_RETURN_IF_FALSE(in.size(in.dim() - 2) % downscale_factor == 0); + return true; +} + void get_pixel_shuffle_out_target_size( const Tensor& in, int64_t upscale_factor, @@ -347,6 +360,29 @@ void get_pixel_shuffle_out_target_size( out_sizes[i] = in.size(i) * casted_upscale_factor; } +void get_pixel_unshuffle_out_target_size( + const Tensor& in, + int64_t downscale_factor, + exec_aten::SizesType* out_sizes, + size_t* out_ndim) { + *out_ndim = in.dim(); + const exec_aten::SizesType casted_factor = downscale_factor; + + size_t i = 0; + for (; i < in.dim() - 3; ++i) { + // Copy all leading dimensions in. + out_sizes[i] = in.size(i); + } + // The last 3 dimensions are (channel, height, width). Multiply channel by + // the downscale factor squared and divide the height and width by that + // factor. 
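+  // For example, an input of shape [1, 4, 6, 6] with downscale_factor 2 maps
+  // to an output of shape [1, 16, 3, 3].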
+ out_sizes[i] = in.size(i) * (casted_factor * casted_factor); + i++; + out_sizes[i] = in.size(i) / casted_factor; + i++; + out_sizes[i] = in.size(i) / casted_factor; +} + bool check_select_copy_out_args( const Tensor& in, int64_t dim, diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index d5362ae373a..ef0fc9579bd 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -113,6 +113,17 @@ void get_pixel_shuffle_out_target_size( exec_aten::SizesType* out_sizes, size_t* out_ndim); +bool check_pixel_unshuffle_args( + const Tensor& in, + int64_t upscale_factor, + Tensor& out); + +void get_pixel_unshuffle_out_target_size( + const Tensor& in, + int64_t upscale_factor, + exec_aten::SizesType* out_sizes, + size_t* out_ndim); + bool check_select_copy_out_args( const Tensor& in, int64_t dim, diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 93256b2a05f..a1a1b35ddda 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -622,6 +622,11 @@ - arg_meta: null kernel_name: torch::executor::pixel_shuffle_out +- op: pixel_unshuffle.out + kernels: + - arg_meta: null + kernel_name: torch::executor::pixel_unshuffle_out + - op: pow.Scalar_out kernels: - arg_meta: null diff --git a/kernels/test/op_pixel_shuffle_test.cpp b/kernels/test/op_pixel_shuffle_test.cpp index 01c2d878b2e..a255f94b2ea 100644 --- a/kernels/test/op_pixel_shuffle_test.cpp +++ b/kernels/test/op_pixel_shuffle_test.cpp @@ -74,7 +74,7 @@ TEST_F(OpPixelShuffleOutTest, AllRealDtypesSupported) { TEST_F(OpPixelShuffleOutTest, LargerInputRank) { TensorFactory tf; - // Pixel shuffle allows a 4D (or higher) input tensor, make sure the extra + // Pixel shuffle allows a 3D (or higher) input tensor, make sure the extra // dimensions don't cause issues. Tensor a = tf.ones(/*sizes=*/{1, 4, 1, 4, 2, 2}); @@ -102,11 +102,8 @@ TEST_F(OpPixelShuffleOutTest, InvalidInputChannelsDies) { TEST_F(OpPixelShuffleOutTest, WrongInputRankDies) { TensorFactory tf; - // Pixel shuffle requires a 4D input tensor. + // Pixel shuffle requires a 3D or higher input tensor. Tensor a = tf.ones(/*sizes=*/{1, 2}); - - // NOTE: The wrong output rank dies for the portable kernel, but not the aten - // kernel. Tensor out = tf.zeros(/*sizes=*/{1, 2}); // Using the wrong input shape should exit with an error code. diff --git a/kernels/test/op_pixel_unshuffle_test.cpp b/kernels/test/op_pixel_unshuffle_test.cpp new file mode 100644 index 00000000000..838aa4c6946 --- /dev/null +++ b/kernels/test/op_pixel_unshuffle_test.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::SupportedFeatures; +using torch::executor::testing::TensorFactory; + +class OpPixelUnshuffleOutTest : public OperatorTest { + protected: + Tensor& op_pixel_unshuffle_out( + const Tensor& self, + int64_t upscale_factor, + Tensor& out) { + return torch::executor::aten::pixel_unshuffle_outf( + context_, self, upscale_factor, out); + } + + template + void test_pixel_unshuffle() { + TensorFactory tf_in; + + const std::vector sizes = {1, 1, 4, 4}; + const std::vector out_sizes = {1, 4, 2, 2}; + + // Destination for the pixel_unshuffle. + Tensor out = tf_in.zeros(out_sizes); + + op_pixel_unshuffle_out( + tf_in.make( + sizes, {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15}), + 2, + out); + EXPECT_TENSOR_EQ( + out, + tf_in.make( + out_sizes, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15})); + } +}; + +// +// Correctness Tests +// + +/** + * Uses the function templates above to test all input dtypes. + */ +TEST_F(OpPixelUnshuffleOutTest, AllRealDtypesSupported) { +#define ENUMERATE_TEST_ENTRY(ctype, dtype) \ + test_pixel_unshuffle(); + + ET_FORALL_REAL_TYPES(ENUMERATE_TEST_ENTRY) + +#undef ENUMERATE_TEST_ENTRY +} + +TEST_F(OpPixelUnshuffleOutTest, LargerInputRank) { + TensorFactory tf; + + // Pixel unshuffle allows a 3D (or higher) input tensor, make sure the extra + // dimensions don't cause issues. + Tensor a = tf.ones(/*sizes=*/{1, 4, 1, 1, 4, 4}); + + const std::vector out_sizes = {1, 4, 1, 4, 2, 2}; + Tensor out = tf.zeros(out_sizes); + + op_pixel_unshuffle_out(a, 2, out); + EXPECT_TENSOR_EQ(out, tf.ones(out_sizes)); +} + +// Mismatched shape tests. +TEST_F(OpPixelUnshuffleOutTest, InvalidInputShapeDies) { + TensorFactory tf; + + // Input tensors with invalid shapes. 7 is not divisible by downsample_factor + Tensor a = tf.ones(/*sizes=*/{1, 1, 7, 8}); + + Tensor out = tf.zeros(/*sizes=*/{1, 4, 4, 4}); + + // Using the wrong input shape should exit with an error code. + ET_EXPECT_KERNEL_FAILURE(context_, op_pixel_unshuffle_out(a, 2, out)); +} + +TEST_F(OpPixelUnshuffleOutTest, WrongInputRankDies) { + TensorFactory tf; + + // Pixel unshuffle requires a 3D or higher input tensor. + Tensor a = tf.ones(/*sizes=*/{1, 2}); + Tensor out = tf.zeros(/*sizes=*/{1, 2}); + + // Using the wrong input rank should exit with an error code. + ET_EXPECT_KERNEL_FAILURE(context_, op_pixel_unshuffle_out(a, 2, out)); +} + +TEST_F(OpPixelUnshuffleOutTest, DifferentDtypeDies) { + TensorFactory tf; + TensorFactory tf_float; + + Tensor a = tf.ones(/*sizes=*/{1, 2, 12, 12}); + + // Pixel unshuffle requires two tensors with the same dtype. + Tensor out = tf_float.zeros(/*sizes=*/{1, 18, 4, 4}); + + // Using the wrong output dtype should exit with an error code. + ET_EXPECT_KERNEL_FAILURE(context_, op_pixel_unshuffle_out(a, 3, out)); +} + +TEST_F(OpPixelUnshuffleOutTest, NegativeUpscaleFactorDies) { + TensorFactory tf; + Tensor a = tf.ones(/*sizes=*/{1, 2, 12, 12}); + Tensor out = tf.zeros(/*sizes=*/{1, 18, 4, 4}); + // Using a negative upscale factor should exit with an error code. 
+ ET_EXPECT_KERNEL_FAILURE(context_, op_pixel_unshuffle_out(a, -3, out)); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 83abeeb4ab3..7ff7dad0bf9 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -244,7 +244,6 @@ def define_common_targets(): _common_op_test("op_minimum_test", ["aten", "portable"]) _common_op_test("op_mm_test", ["aten", "portable"]) _common_op_test("op_mul_test", ["aten", "portable", "optimized"]) - _common_op_test("op_pow_test", ["aten", "portable"]) _common_op_test("op_native_batch_norm_test", ["aten", "portable"]) _common_op_test("op_native_group_norm_test", ["aten", "portable"]) _common_op_test("op_native_layer_norm_test", ["aten", "portable", "optimized"]) @@ -255,6 +254,8 @@ def define_common_targets(): _common_op_test("op_pdist_forward_test", ["aten", "portable"]) _common_op_test("op_permute_copy_test", ["aten", "portable"]) _common_op_test("op_pixel_shuffle_test", ["aten", "portable"]) + _common_op_test("op_pixel_unshuffle_test", ["aten", "portable"]) + _common_op_test("op_pow_test", ["aten", "portable"]) _common_op_test("op_prod_test", ["aten", "portable"]) _common_op_test("op_reciprocal_test", ["aten", "portable"]) _common_op_test("op_relu_test", ["aten", "portable"]) diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index 820312a54da..75642b81fec 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -861,6 +861,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op_pixel_unshuffle", + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + ], + ), op_target( name = "op_pow", deps = [ From 20024900fa3248d10f92703a890a404edc8d6ff8 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:43:27 -0400 Subject: [PATCH 091/531] Add op: gather.out Differential Revision: D61822105 Pull Request resolved: https://github.com/pytorch/executorch/pull/4939 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_gather.cpp | 98 +++++ kernels/portable/cpu/util/index_util.cpp | 45 +++ kernels/portable/cpu/util/index_util.h | 7 + kernels/portable/functions.yaml | 5 + kernels/test/op_gather_test.cpp | 379 ++++++++++++++++++ kernels/test/targets.bzl | 1 + .../kernels/portable/op_registration_util.bzl | 6 + 8 files changed, 543 insertions(+) create mode 100644 kernels/portable/cpu/op_gather.cpp create mode 100644 kernels/test/op_gather_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 7c0a0516dd6..e06830acabd 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -173,6 +173,8 @@ - op: full.out +- op: gather.out + - op: ge.Scalar_out - op: ge.Tensor_out diff --git a/kernels/portable/cpu/op_gather.cpp b/kernels/portable/cpu/op_gather.cpp new file mode 100644 index 00000000000..0f509f21aa4 --- /dev/null +++ b/kernels/portable/cpu/op_gather.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +namespace { + +template +void gather_helper( + const Tensor& in, + const Tensor& index, + Tensor& out, + int64_t dim) { + const CTYPE* in_data = in.const_data_ptr(); + const long* index_data = index.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + if (index.dim() == 0) { + out_data[0] = in_data[index_data[0]]; + return; + } + + for (size_t ix = 0; ix < index.numel(); ++ix) { + size_t ix_coord[kTensorDimensionLimit]; + indexToCoordinate(index, ix, ix_coord); + + size_t in_coord[kTensorDimensionLimit]; + for (size_t i = 0; i < out.dim(); ++i) { + if (i == dim) { + in_coord[i] = index_data[ix]; + } else { + in_coord[i] = ix_coord[i]; + } + } + + size_t in_ix = coordinateToIndex(in, in_coord); + size_t out_ix = coordinateToIndex(out, ix_coord); + + out_data[out_ix] = in_data[in_ix]; + } +} + +} // namespace + +Tensor& gather_out( + RuntimeContext& ctx, + const Tensor& in, + int64_t dim, + const Tensor& index, + bool sparse_grad, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + check_gather_args(in, dim, index, sparse_grad, out), + InvalidArgument, + out); + + if (dim < 0) { + dim += nonzero_dim(in); + } + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, index.sizes()) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "gather.out"; + + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { + gather_helper(in, index, out, dim); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/index_util.cpp b/kernels/portable/cpu/util/index_util.cpp index 1baf103665d..c8b89788109 100644 --- a/kernels/portable/cpu/util/index_util.cpp +++ b/kernels/portable/cpu/util/index_util.cpp @@ -12,6 +12,51 @@ namespace torch { namespace executor { +bool check_gather_args( + const Tensor& in, + int64_t dim, + const Tensor& index, + bool sparse_grad, + Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + index.scalar_type() == ScalarType::Long, + "Expected dypte int64 for index"); + if (index.numel() != 0) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + nonzero_dim(in) == nonzero_dim(index), + "self and index should have the same dimensionality when index is not empty " + "except for the case when one has dimension 0 and the other has dimension 1"); + } + + // Normalize dim to non-negative value + if (dim < 0) { + dim += nonzero_dim(in); + } + + for (size_t d = 0; d < nonzero_dim(in); ++d) { + if (d != dim) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + nonempty_size(index, d) <= nonempty_size(in, d), + "size of dimension %zd of index should be smaller than the size of that dimension of input if dimension %zd != dim %zd", + d, + d, + (size_t)dim); + } + } + const long* index_data = index.const_data_ptr(); + for (size_t i = 0; i < index.numel(); ++i) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + index_data[i] >= 0 && index_data[i] < nonempty_size(in, dim), + "Index is out of bounds for dimension %zd with size %zd", + (size_t)dim, + nonempty_size(index, dim)); + } + + return true; +} + bool check_index_select_args( const Tensor& in, int64_t dim, diff --git a/kernels/portable/cpu/util/index_util.h b/kernels/portable/cpu/util/index_util.h index 2575fbeeb55..7c296832924 100644 --- 
a/kernels/portable/cpu/util/index_util.h +++ b/kernels/portable/cpu/util/index_util.h @@ -14,6 +14,13 @@ namespace torch { namespace executor { +bool check_gather_args( + const Tensor& in, + int64_t dim, + const Tensor& index, + bool sparse_grad, + Tensor& output); + bool check_index_select_args( const Tensor& in, int64_t dim, diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index a1a1b35ddda..bdf3cea671a 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -392,6 +392,11 @@ - arg_meta: null kernel_name: torch::executor::full_like_out +- op: gather.out + kernels: + - arg_meta: null + kernel_name: torch::executor::gather_out + - op: ge.Scalar_out kernels: - arg_meta: null diff --git a/kernels/test/op_gather_test.cpp b/kernels/test/op_gather_test.cpp new file mode 100644 index 00000000000..9d637560eda --- /dev/null +++ b/kernels/test/op_gather_test.cpp @@ -0,0 +1,379 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include +#include + +using namespace ::testing; +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpGatherOutTest : public OperatorTest { + protected: + Tensor& op_gather_out( + const Tensor& self, + int64_t dim, + const Tensor& index, + bool sparse_grad, + Tensor& out) { + return torch::executor::aten::gather_outf( + context_, self, dim, index, sparse_grad, out); + } + + // Common testing for the operator + template + void test_gather_out() { + TensorFactory tf_index; + TensorFactory tf_data; + const std::vector sizes = {2, 3}; + // clang-format off + Tensor self = tf_data.make( + /*sizes=*/{2, 5}, + { + 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10 + }); + // clang-format on + Tensor out = tf_data.zeros(sizes); + // clang-format off + bool sparse_grad = false; + Tensor index = tf_index.make(sizes, + { + 0, 1, 0, + 1, 0, 1, + }); + // clang-format on + + // Valid input should give the expected output + op_gather_out(self, 0, index, sparse_grad, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, tf_data.make( + sizes, + { + 1, 7, 3, + 6, 2, 8, + })); + // clang-format on + + // Valid input should give the expected output + op_gather_out(self, 1, index, sparse_grad, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, tf_data.make(sizes, + { + 1, 2, 1, + 7, 6, 7, + })); + + self = tf_data.make( + /*sizes=*/{2, 3, 3}, + { + // [0, :, :] + 1, 2, 3, + 4, 5, 6, + 7, 8, 9, + + // [1, :, :] + 10, 11, 12, + 13, 14, 15, + 16, 17, 18 + }); + index = tf_index.make( + /*sizes=*/{1, 3, 2}, + { + 0, 1, + 1, 2, + 0, 2 + }); + // clang-format on + out = tf_data.zeros(/*sizes=*/{1, 3, 2}); + + op_gather_out(self, 1, index, sparse_grad, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, + tf_data.make( + /*sizes=*/{1, 3, 2}, + { + 1, 5, + 4, 8, + 1, 8, + })); + // clang-format on + + out = tf_data.zeros(/*sizes=*/{1, 3, 2}); + op_gather_out(self, 2, index, sparse_grad, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, + tf_data.make( + /*sizes=*/{1, 3, 2}, + { + 1, 2, + 5, 6, + 7, 9, + })); + // clang-format on + } + + // Invalid dimensions + template + void test_gather_out_invalid_dim() { + TensorFactory tf_index; + TensorFactory tf_data; + // clang-format off + Tensor self = 
tf_data.make(/*sizes=*/{2, 5},
+      {
+        1, 2, 3, 4, 5,
+        6, 7, 8, 9, 10
+      });
+    const std::vector<int32_t> sizes = {2, 3};
+    Tensor index = tf_index.make(sizes,
+      {
+        0, 1, 0,
+        1, 0, 1,
+      });
+    // clang-format on
+    bool sparse_grad = false;
+    Tensor out = tf_data.zeros(sizes);
+
+    // Invalid dim should die
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_gather_out(self, -3, index, sparse_grad, out));
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_gather_out(self, 2, index, sparse_grad, out));
+
+    // Self and index should have same number of dimensions
+    index = tf_index.zeros(/*sizes=*/{2, 2, 2});
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_gather_out(self, 0, index, sparse_grad, out));
+
+    // Size of dimension of index should be smaller than the size of that
+    // dimension of self if dimension != dim
+    index = tf_index.zeros(/*sizes=*/{3, 5});
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_gather_out(self, 1, index, sparse_grad, out));
+
+    // Index out of bounds for self in dim
+    index = tf_index.make(/*sizes=*/{2, 3}, {0, 1, 2, 0, 1, 2});
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_gather_out(self, 0, index, sparse_grad, out));
+  }
+
+  void test_dynamic_shape(
+      const std::vector<int32_t>& out_shape,
+      enum torch::executor::TensorShapeDynamism dynamism) {
+    TensorFactory<ScalarType::Float> tf;
+    TensorFactory<ScalarType::Long> tf_index;
+
+    Tensor input = tf.ones({2, 3, 4});
+    Tensor index = tf_index.zeros({2, 3, 4});
+    bool sparse_grad = false;
+    Tensor expected = tf.ones({2, 3, 4});
+    Tensor out = tf.zeros(out_shape, dynamism);
+
+    op_gather_out(input, 2, index, sparse_grad, out);
+    EXPECT_TENSOR_EQ(out, expected);
+  }
+};
+
+TEST_F(OpGatherOutTest, AllValidInputOutputSupport) {
+#define TEST_ENTRY(CTYPE, DTYPE) test_gather_out<CTYPE, ScalarType::DTYPE>();
+  ET_FORALL_REAL_TYPES(TEST_ENTRY);
+#undef TEST_ENTRY
+}
+
+TEST_F(OpGatherOutTest, InfinityAndNANTest) {
+  TensorFactory<ScalarType::Long> tf_index;
+  TensorFactory<ScalarType::Float> tf_data;
+  // clang-format off
+  Tensor self = tf_data.make(
+      /*sizes=*/{2, 5},
+      {
+        INFINITY, -INFINITY, NAN, 2.33, 3.14,
+        NAN, INFINITY, -INFINITY, 3.14, 2.33
+      });
+  // clang-format on
+  const std::vector<int32_t> sizes = {2, 3};
+  Tensor index = tf_index.make(sizes, {0, 1, 0, 1, 0, 1});
+  bool sparse_grad = false;
+  Tensor out = tf_data.zeros(sizes);
+
+  // Valid input should give the expected output
+  op_gather_out(self, 0, index, sparse_grad, out);
+  // clang-format off
+  EXPECT_TENSOR_CLOSE(
+      out,
+      tf_data.make(sizes,
+      {
+        INFINITY, INFINITY, NAN,
+        NAN, -INFINITY, -INFINITY,
+      }));
+  // clang-format on
+}
+
+TEST_F(OpGatherOutTest, InvalidDimensionsDies) {
+#define TEST_ENTRY(CTYPE, DTYPE) \
+  test_gather_out_invalid_dim<CTYPE, ScalarType::DTYPE>();
+  ET_FORALL_REAL_TYPES(TEST_ENTRY);
+#undef TEST_ENTRY
+}
+
+TEST_F(OpGatherOutTest, MismatchedInputDtypesDies) {
+  TensorFactory<ScalarType::Byte> tf_byte;
+  TensorFactory<ScalarType::Char> tf_char;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  Tensor self = tf_char.make({2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  const std::vector<int32_t> sizes = {2, 3};
+  Tensor index = tf_byte.make(sizes, {0, 1, 0, 0, 1, 0});
+  bool sparse_grad = false;
+  Tensor out = tf_char.zeros(sizes);
+
+  // Types other than long for index should die
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_gather_out(self, 0, index, sparse_grad, out));
+
+  // Mismatched dtype of self and out should die
+  self = tf_byte.make(/*sizes=*/{2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  index = tf_long.make(sizes, {0, 1, 0, 1, 0, 1});
+  out = tf_char.zeros(sizes);
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_gather_out(self, 0, index, sparse_grad, out));
+}
+
+TEST_F(OpGatherOutTest, DynamicShapeUpperBoundSameAsExpected) {
+  test_dynamic_shape(
+      {2, 3, 4},
torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpGatherOutTest, DynamicShapeUpperBoundLargerThanExpected) { + test_dynamic_shape( + {10, 10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpGatherOutTest, DynamicShapeUnbound) { + if (!torch::executor::testing::SupportedFeatures::get()->output_resize) { + GTEST_SKIP() << "Dynamic shape not supported"; + } + test_dynamic_shape( + {1, 1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); +} + +TEST_F(OpGatherOutTest, EmptyIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.ones({2, 5}); + const std::vector sizes = {2, 0, 3}; + Tensor index = tf_index.zeros(sizes); + bool sparse_grad = false; + Tensor out = tf_data.zeros(sizes); + op_gather_out(self, 0, index, sparse_grad, out); + EXPECT_TENSOR_CLOSE(out, tf_data.zeros(sizes)); +} + +TEST_F(OpGatherOutTest, ValidZeroDim) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + Tensor index = tf_index.zeros({}); + bool sparse_grad = false; + Tensor out = tf_data.zeros({}); + op_gather_out(self, 0, index, sparse_grad, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({}, {3.14})); +} + +TEST_F(OpGatherOutTest, InvalidZeroDimInput) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.ones({}); + const std::vector sizes = {2, 3}; + Tensor index = tf_index.make(sizes, {0, 0, 0, 0, 0, 0}); + bool sparse_grad = false; + Tensor out = tf_data.zeros(sizes); + ET_EXPECT_KERNEL_FAILURE( + context_, op_gather_out(self, 0, index, sparse_grad, out)); +} + +TEST_F(OpGatherOutTest, InvalidZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({2, 3}, {1, 2, 3, 4, 5, 6}); + const std::vector sizes = {}; + Tensor index = tf_index.make(sizes, {2}); + bool sparse_grad = false; + Tensor out = tf_data.zeros(sizes); + ET_EXPECT_KERNEL_FAILURE( + context_, op_gather_out(self, 1, index, sparse_grad, out)); +} + +TEST_F(OpGatherOutTest, ValidZeroDimInputAndOneDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + const std::vector sizes = {3}; + Tensor index = tf_index.make(sizes, {0, 0, 0}); + bool sparse_grad = false; + Tensor out = tf_data.make({3}, {2.71, 2.71, 2.71}); + op_gather_out(self, 0, index, sparse_grad, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({3}, {3.14, 3.14, 3.14})); +} + +TEST_F(OpGatherOutTest, ValidOneDimInputAndZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({3}, {10, 20, 30}); + const std::vector sizes = {}; + Tensor index = tf_index.make(sizes, {2}); + bool sparse_grad = false; + Tensor out = tf_data.make(sizes, {1729}); + op_gather_out(self, 0, index, sparse_grad, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({}, {30})); +} + +TEST_F(OpGatherOutTest, InvalidZeroDimInputAndOneDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + const std::vector sizes = {3}; + Tensor index = tf_index.make(sizes, {10, 100, 1000}); + bool sparse_grad = false; + Tensor out = tf_data.make({3}, {2.71, 2.71, 2.71}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_gather_out(self, 0, index, sparse_grad, out)); +} + +TEST_F(OpGatherOutTest, InvalidOneDimInputAndZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({3}, {10, 20, 30}); + const std::vector sizes = {}; + Tensor index = tf_index.make(sizes, {100}); + bool sparse_grad = 
false; + Tensor out = tf_data.make(sizes, {1729}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_gather_out(self, 0, index, sparse_grad, out)); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 7ff7dad0bf9..e44769841b2 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -211,6 +211,7 @@ def define_common_targets(): _common_op_test("op_fmod_test", ["aten", "portable"]) _common_op_test("op_full_like_test", ["aten", "portable"]) _common_op_test("op_full_test", ["aten", "portable"]) + _common_op_test("op_gather_test", ["aten", "portable"]) _common_op_test("op_ge_test", ["aten", "portable"]) _common_op_test("op_gelu_test", ["aten", "portable", "optimized"]) _common_op_test("op_glu_test", ["aten", "portable"]) diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index 75642b81fec..a0200cb1a6f 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -559,6 +559,12 @@ ATEN_OPS = ( ":scalar_utils", ], ), + op_target( + name = "op_gather", + deps = [ + "//executorch/kernels/portable/cpu/util:index_util", + ], + ), op_target( name = "op_ge", deps = [ From cc9fb50c1cedcadd70eb3c563083e23243ba5106 Mon Sep 17 00:00:00 2001 From: Esteb37 <35089867+Esteb37@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:32:53 -0400 Subject: [PATCH 092/531] Add buffer_to_buffer prepacking Differential Revision: D61665694 Pull Request resolved: https://github.com/pytorch/executorch/pull/4840 --- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 48 +++++++++++++++++++ .../vulkan/runtime/graph/ops/impl/Staging.h | 7 +++ 2 files changed, 55 insertions(+) diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index b02613c2083..c40d57c8b52 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -132,6 +132,33 @@ ValueRef prepack( return v; } +ValueRef prepack_buffer( + ComputeGraph& graph, + const ValueRef vref, + const utils::GPUMemoryLayout layout) { + ValueRef v = graph.add_tensor_like(vref, layout); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR("buffer_to_buffer"); + + vkapi::ParamsBindList ubos; + ubos.append({graph.numel_ubo(v)}); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + shader, + graph.create_global_wg_size(v), + graph.create_local_wg_size(v), + // Input and Outputs + vref, + v, + // Parameter Buffers + ubos, + // Specialization Constants + {})); + + return v; +} + ValueRef prepack_if_tensor_ref( ComputeGraph& graph, const ValueRef v, @@ -143,6 +170,17 @@ ValueRef prepack_if_tensor_ref( } } +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout) { + if (graph.val_is_tref(v)) { + return prepack_buffer(graph, v, layout); + } else { + return v; + } +} + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { if (graph.val_is_tref(v)) { utils::GPUMemoryLayout layout = @@ -153,4 +191,14 @@ ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { } } +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { + if (graph.val_is_tref(v)) { + utils::GPUMemoryLayout layout = + graph.suggested_memory_layout(graph.get_tref(v)->sizes); + return prepack_buffer(graph, v, layout); + } else { + return v; + } +} + } // namespace vkcompute diff 
--git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index fc875de80dd..88a9630239a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -29,6 +29,13 @@ ValueRef prepack_if_tensor_ref( const ValueRef v, const utils::GPUMemoryLayout layout); +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout); + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v); +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v); + } // namespace vkcompute From a5157de760f332b79d5f931b6f65c95200a1c87b Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Wed, 28 Aug 2024 13:55:22 -0700 Subject: [PATCH 093/531] Allow delegate to consume buffer mutations Differential Revision: D60838243 Pull Request resolved: https://github.com/pytorch/executorch/pull/4830 --- backends/apple/mps/test/test_mps_utils.py | 2 +- exir/backend/test/TARGETS | 3 + exir/backend/test/op_partitioner_demo.py | 50 +++++++++ exir/backend/test/test_partitioner.py | 112 +++++++++++++++++++ exir/lowered_backend_module.py | 128 +++++++++++++++++++++- extension/export_util/utils.py | 2 +- 6 files changed, 289 insertions(+), 8 deletions(-) diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index 77c02f533be..199a7fe1782 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -229,7 +229,7 @@ def lower_module_and_test_output( compile_specs = [CompileSpec("use_fp16", bytes([use_fp16]))] if use_partitioner: - logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}") + logging.info(f"Edge IR graph:\n{edge_program.exported_program()}") delegated_program = edge_program delegated_program = edge_program.to_backend( MPSPartitioner(compile_specs=compile_specs) diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index ed58b06b3dd..b99f374d83c 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -88,6 +88,8 @@ python_library( "//executorch/exir/backend:compile_spec_schema", "//executorch/exir/backend:partitioner", "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + "//executorch/exir/backend/test/demos/rpc:executor_backend_partitioner", + "//executorch/exir/backend/test/demos/rpc:executor_backend_preprocess", "//executorch/exir/dialects:lib", ], ) @@ -290,6 +292,7 @@ python_unittest( "//executorch/exir/backend/test/demos/rpc:executor_backend_register", ], deps = [ + ":op_partitioner_demo", "//caffe2:torch", "//executorch/exir:lib", "//executorch/exir/backend:backend_details", diff --git a/exir/backend/test/op_partitioner_demo.py b/exir/backend/test/op_partitioner_demo.py index dc20c03e68b..62a0aeb782c 100644 --- a/exir/backend/test/op_partitioner_demo.py +++ b/exir/backend/test/op_partitioner_demo.py @@ -21,6 +21,9 @@ from executorch.exir.backend.test.backend_with_compiler_demo import ( BackendWithCompilerDemo, ) +from executorch.exir.backend.test.demos.rpc.executor_backend_preprocess import ( + ExecutorBackend, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.graph_module import get_control_flow_submodules from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param @@ -29,6 +32,11 @@ from torch.fx.passes.operator_support import any_chain, OperatorSupportBase +class AllOperatorSupport(OperatorSupportBase): + def 
is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + return node.op == "call_function" + + class AddOperatorSupport(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: return node.op == "call_function" and node.target in [ @@ -126,6 +134,48 @@ def partition(self, edge_exported_program: ExportedProgram) -> PartitionResult: ) +@final +class AllNodesPartitionerDemo(Partitioner): + """ + Partitions all nodes + """ + + def __init__(self) -> None: + self.op_support = AllOperatorSupport() + self.delegation_spec = DelegationSpec(ExecutorBackend.__name__, []) + + def partition(self, edge_exported_program: ExportedProgram) -> PartitionResult: + partition_tags = {} + partition_list = generate_pattern_op_partitions( + edge_exported_program.graph_module, op_support=self.op_support + ) + for partition in partition_list: + for node in partition.nodes: + delegation_tag = f"tag{partition.id}" + partition_tags[delegation_tag] = self.delegation_spec + + # Tag the add nodes + node.meta["delegation_tag"] = delegation_tag + + for arg_node in node.args: + if not isinstance(arg_node, torch.fx.Node): + continue + + is_get_attr = arg_node.op == "get_attr" + is_param_buffer = arg_node.op == "placeholder" and ( + is_param(edge_exported_program, arg_node) + or is_buffer(edge_exported_program, arg_node) + or is_lifted_tensor_constant(edge_exported_program, arg_node) + ) + if is_get_attr or is_param_buffer: + arg_node.meta["delegation_tag"] = delegation_tag + # Add to the list of partitioned nodes. + + return PartitionResult( + tagged_exported_program=edge_exported_program, partition_tags=partition_tags + ) + + ops_not_to_decompose = [ torch.ops.aten.linear.default, torch.ops.aten.scaled_dot_product_attention.default, diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 3ee6202ae8e..3973011a269 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -26,6 +26,10 @@ from executorch.exir.backend.test.demos.rpc.executor_backend_preprocess import ( ExecutorBackend, ) +from executorch.exir.backend.test.op_partitioner_demo import ( + AddAttributePartitionerDemo, + AllNodesPartitionerDemo, +) from executorch.exir.backend.utils import get_delegates, tag_constant_data from executorch.exir.dialects._ops import ops as exir_ops @@ -619,3 +623,111 @@ def partition( and node.target == torch.ops.aten.copy_.default ] self.assertEqual(len(copy_node), 1) + + def test_buffer_mutation1(self): + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("b", torch.ones(3, 3)) + + def forward(self, x): + self.b.add_(x) + return x + self.b + + model_inputs = (torch.ones(3, 3),) + orig_res = TestModule()(*model_inputs) + edge_program = exir.to_edge(torch.export.export(TestModule(), model_inputs)) + lowered = edge_program.to_backend(AddAttributePartitionerDemo()) + + self.assertTrue( + torch.allclose(lowered.exported_program().module()(*model_inputs), orig_res) + ) + + self.assertEqual( + len(lowered.exported_program().graph_signature.buffers_to_mutate), + 0, + ) + lowered_module_nodes = get_delegates(lowered.exported_program().graph) + self.assertEqual(len(lowered_module_nodes), 1) + lowered_module_node = lowered_module_nodes[0] + + # get call delegate node + call_delegate_node = list(lowered_module_node.users.keys())[0] + self.assertEqual(len(call_delegate_node.args), 2) + + lower_module = getattr( + lowered.exported_program().graph_module, 
lowered_module_node.name + ) + delegated_ep = lower_module.original_module + + self.assertEqual(len(delegated_ep.state_dict), 1) + self.assertEqual(len(delegated_ep.graph_signature.buffers_to_mutate), 1) + self.assertEqual(len(delegated_ep.graph_signature.buffers), 1) + + def test_buffer_mutation_llama_repro(self): + SHAPE = (2, 3) + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("cache", torch.zeros(SHAPE, dtype=torch.float32)) + + def forward(self, q, k_val, input_pos): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + q = torch.rand(1, 3) + k = torch.rand(1, 3) + example_inputs = (q, k, torch.tensor([1, 1])) + + model = Model() + model.eval() + + exir_program_aten = torch.export.export(model, example_inputs) + exir_program_aten.module()(*example_inputs) + edge_program_manager = exir.to_edge(exir_program_aten) + lowered = edge_program_manager.to_backend(AllNodesPartitionerDemo()) + + self.assertEqual( + len(lowered.exported_program().graph_signature.buffers_to_mutate), + 0, + ) + lowered_module_nodes = get_delegates(lowered.exported_program().graph) + self.assertEqual(len(lowered_module_nodes), 1) + lowered_module_node = lowered_module_nodes[0] + + # get call delegate node + call_delegate_node = list(lowered_module_node.users.keys())[0] + self.assertEqual(len(call_delegate_node.args), 4) + + lower_module = getattr( + lowered.exported_program().graph_module, lowered_module_node.name + ) + delegated_ep = lower_module.original_module + + self.assertEqual(len(delegated_ep.state_dict), 1) + self.assertEqual(len(delegated_ep.graph_signature.buffers_to_mutate), 1) + self.assertEqual(len(delegated_ep.graph_signature.buffers), 1) + + def test_buffer_mutation_unsupported(self): + SHAPE = (2, 3) + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("state_1", torch.zeros(SHAPE, dtype=torch.float32)) + + def forward(self, x): + add = self.state_1.add_(x) + return add + + model = Model() + model.eval() + + example_inputs = (torch.randn(SHAPE),) + exir_program_aten = torch.export.export(model, example_inputs) + edge_program_manager = exir.to_edge(exir_program_aten) + with self.assertRaises(AssertionError): + edge_program_manager.to_backend(AddAttributePartitionerDemo()) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 4d07fdcdf06..d93905a2bd0 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -8,6 +8,7 @@ import copy import operator +from collections import defaultdict from typing import Any, Dict, List, Optional, Set, Tuple, Union import torch @@ -488,8 +489,12 @@ def _get_new_signature( # noqa: C901 else {} ) + toplevel_output_node_to_sig: Dict[str, List[OutputSpec]] = defaultdict(list) + if not is_submodule: + for output_spec in old_signature.output_specs: + toplevel_output_node_to_sig[output_spec.arg.name].append(output_spec) + for node in gm.graph.nodes: - is_tagged = tag is None or node.meta.get("delegation_tag", None) == tag if node.op == "placeholder": if node.name not in input_node_to_sig: @@ -507,7 +512,7 @@ def _get_new_signature( # noqa: C901 if not isinstance(orig_input_spec.arg, TensorArgument): input_specs.append(orig_input_spec) - elif is_tagged: + elif node.meta.get("delegation_tag", None) == tag: input_specs.append(orig_input_spec) if orig_input_spec.kind == InputKind.USER_INPUT: @@ -551,11 +556,72 @@ def _get_new_signature( # noqa: C901 ) 
if node.op == "output": - output_nodes = pytree.tree_leaves((node.args, node.kwargs)) + buffer_mutation_idxs: Dict[int, List[OutputSpec]] = defaultdict(list) + for user in call_module_node.users.keys(): + if user.name in toplevel_output_node_to_sig: + assert ( + user.op == "call_function" and user.target == operator.getitem + ), f"Invalid user {user}, node.op is {user.op} and node.target is {user.target}" + getitem_idx = user.args[1] + assert isinstance( + getitem_idx, int + ), f"Invalid getitem type: {type(getitem_idx)}" + buffer_mutation_idxs[getitem_idx].extend( + toplevel_output_node_to_sig[user.name] + ) - for output_node in output_nodes: + for i, output_node in enumerate(node.args[0]): + if i in buffer_mutation_idxs: + assert isinstance(output_node, torch.fx.Node) + orig_output_specs = buffer_mutation_idxs[i] + + if any( + orig_output_spec.kind == OutputKind.BUFFER_MUTATION + and orig_output_spec.target in new_state_dict + for orig_output_spec in orig_output_specs + ): + # If the delegate wants to consume the buffer, then the + # delegate should also consume the buffer mutation + # (output spec would be a BUFFER_MUTATION). Otherwise + # the delegate will just return the result of the + # mutation as a USER_OUTPUT. + + orig_output_spec = [ + orig_output_spec + for orig_output_spec in orig_output_specs + if orig_output_spec.kind == OutputKind.BUFFER_MUTATION + and orig_output_spec.target in new_state_dict + ][0] + + assert len(orig_output_specs) == 1, ( + f"Constant {orig_output_spec.target} was tagged to be " + "consumed by the buffer, and was found to also contain " + "a buffer mutation. However this buffer mutation node " + "was found to also be used as other types of outputs " + "which is currently not supported. Please file an " + "issue on Github. 
\n\n" + f"The toplevel program: {original_program}\n" + ) + output_specs.append( + OutputSpec( + kind=OutputKind.BUFFER_MUTATION, + arg=TensorArgument(name=output_node.name), + target=orig_output_spec.target, + ) + ) + output_specs_to_delete[orig_output_spec.arg.name] = ( + orig_output_spec + ) + else: + output_specs.append( + OutputSpec( + kind=OutputKind.USER_OUTPUT, + arg=TensorArgument(name=output_node.name), + target=None, + ) + ) - if not isinstance(output_node, torch.fx.Node): + elif not isinstance(output_node, torch.fx.Node): output_specs.append( OutputSpec( kind=OutputKind.USER_OUTPUT, @@ -630,6 +696,9 @@ def create_exported_program_from_submodule( in_spec = pytree.tree_flatten((tuple(subgraph_signature.user_inputs), {}))[1] out_spec = pytree.tree_flatten(subgraph_signature.user_outputs)[1] + print(submodule.graph) + print(subgraph_signature) + return ( ExportedProgram( root=submodule, @@ -774,7 +843,7 @@ def get_lowered_backend_modules( return lowered_programs -def _unsafe_adjust_original_program( +def _unsafe_adjust_original_program( # noqa: C901 original_program: ExportedProgram, call_delegate_node: torch.fx.Node, input_specs_to_delete: Dict[str, InputSpec], @@ -830,3 +899,50 @@ def _unsafe_adjust_original_program( del original_program._constants[input_spec.target] else: raise RuntimeError(f"Invalid input spec {input_spec} received") + + # Delete buffer mutations from the output which were consumed by the delegate + toplevel_output_node = None + for node in reversed(original_program.graph.nodes): + if node.op == "output": + toplevel_output_node = node + break + + assert toplevel_output_node is not None + assert ( + len(toplevel_output_node.args) == 1 + ), f"Invalid output node: {toplevel_output_node} with args {toplevel_output_node.args}" + + new_output_args = [ + arg + for arg in toplevel_output_node.args[0] + if not isinstance(arg, torch.fx.Node) or arg.name not in output_specs_to_delete + ] + toplevel_output_node.args = (tuple(new_output_args),) + + # Delete the buffer mutation getitem nodes + getitem_idxs: List[int] = [] + user_nodes = list(call_delegate_node.users.keys()) + for user in user_nodes: + if user.name in output_specs_to_delete: + assert ( + user.op == "call_function" and user.target == operator.getitem + ), f"Invalid user {user}, node.op is {node.op} and node.target is {node.target}" + user_idx = user.args[1] + assert isinstance(user_idx, int), f"Invalid getitem type: {type(user_idx)}" + getitem_idxs.append(user_idx) + original_program.graph.erase_node(user) + + getitem_idxs.sort(reverse=True) + + # Adjust all the getitem indices after the deleted getitems + user_nodes = list(call_delegate_node.users.keys()) + for user in user_nodes: + assert user.op == "call_function" and user.target == operator.getitem + user_idx = user.args[1] + assert isinstance(user_idx, int) + for i, idx in enumerate(getitem_idxs): + if user_idx > idx: + user.args = (user.args[0], user_idx - (len(getitem_idxs) - i)) + break + + original_program._validate() diff --git a/extension/export_util/utils.py b/extension/export_util/utils.py index 5c2700e6f5e..37e09babbb9 100644 --- a/extension/export_util/utils.py +++ b/extension/export_util/utils.py @@ -63,7 +63,7 @@ def _core_aten_to_edge( compile_config=edge_compile_config, ) if verbose: - logging.info(f"Exported graph:\n{edge_manager.exported_program().graph}") + logging.info(f"Exported graph:\n{edge_manager.exported_program()}") return edge_manager From 52c9f30209a26df564c05adbd0ee4c4661364753 Mon Sep 17 00:00:00 2001 From: lucylq Date: Wed, 
28 Aug 2024 16:00:44 -0700 Subject: [PATCH 094/531] [executorch] Sync torchao version Differential Revision: D61925882 Pull Request resolved: https://github.com/pytorch/executorch/pull/4945 --- .ci/docker/ci_commit_pins/torchao.txt | 1 + examples/models/flamingo/install_requirements.sh | 4 ++++ examples/models/llama2/install_requirements.sh | 5 ++++- examples/models/phi-3-mini-lora/install_requirements.sh | 4 ++++ 4 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 .ci/docker/ci_commit_pins/torchao.txt diff --git a/.ci/docker/ci_commit_pins/torchao.txt b/.ci/docker/ci_commit_pins/torchao.txt new file mode 100644 index 00000000000..768110b82ff --- /dev/null +++ b/.ci/docker/ci_commit_pins/torchao.txt @@ -0,0 +1 @@ +0916b5b29b092afcbf2b898caae49abe80662bac diff --git a/examples/models/flamingo/install_requirements.sh b/examples/models/flamingo/install_requirements.sh index 0bcf302ca9e..8e1ebbd4576 100644 --- a/examples/models/flamingo/install_requirements.sh +++ b/examples/models/flamingo/install_requirements.sh @@ -7,3 +7,7 @@ # Install torchtune nightly for model definitions. pip install --pre torchtune --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir + +# Install torchao. +TORCHAO_VERSION=$(cat "$(dirname "$0")"/../../../.ci/docker/ci_commit_pins/torchao.txt) +pip install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${TORCHAO_VERSION}" diff --git a/examples/models/llama2/install_requirements.sh b/examples/models/llama2/install_requirements.sh index 6b106c1c214..ddd29ad3f6f 100755 --- a/examples/models/llama2/install_requirements.sh +++ b/examples/models/llama2/install_requirements.sh @@ -8,7 +8,10 @@ # Install snakeviz for cProfile flamegraph # Install sentencepiece for llama tokenizer pip install snakeviz sentencepiece -pip install torchao==0.1 + +# Install torchao. +TORCHAO_VERSION=$(cat "$(dirname "$0")"/../../../.ci/docker/ci_commit_pins/torchao.txt) +pip install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${TORCHAO_VERSION}" # Install lm-eval for Model Evaluation with lm-evalution-harness # Install tiktoken for tokenizer diff --git a/examples/models/phi-3-mini-lora/install_requirements.sh b/examples/models/phi-3-mini-lora/install_requirements.sh index ab73d8dac40..c8aa428fe38 100755 --- a/examples/models/phi-3-mini-lora/install_requirements.sh +++ b/examples/models/phi-3-mini-lora/install_requirements.sh @@ -8,3 +8,7 @@ pip install torchvision pip install torchtune pip install tiktoken + +# Install torchao. 
+TORCHAO_VERSION=$(cat "$(dirname "$0")"/../../../.ci/docker/ci_commit_pins/torchao.txt)
+pip install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${TORCHAO_VERSION}"

From 230511ed88476d4641b8a9b28965bc1c3a927ad2 Mon Sep 17 00:00:00 2001
From: Hansong <107070759+kirklandsign@users.noreply.github.com>
Date: Wed, 28 Aug 2024 16:04:49 -0700
Subject: [PATCH 095/531] Fix linter in quantizer_lib.py

Pull Request resolved: https://github.com/pytorch/executorch/pull/4952
---
 extension/llm/export/quantizer_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 76a2bc97d3e..7fc53358c50 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -179,7 +179,7 @@ def get_qnn_quantizer(
         quant_dtype = QuantDtype.use_16a16w  # pyre-fixme[16]
         # Due to the error with 16a16w in Qnn Htp, we need to disable per channel linear quantization when use 16a16w
         # TODO: enable it after the issue is fixed
-        logging.warn(
+        logging.warning(
            "Disable per channel quantization for linear due to the error with QNN HTP 16a16w."
        )
        qnn_quantizer.set_per_channel_linear_quant(enable=False)

From 1899d15c8aab658a5fdf45acdace5ce4c8d7fea7 Mon Sep 17 00:00:00 2001
From: cccclai
Date: Thu, 29 Aug 2024 07:30:32 +0800
Subject: [PATCH 096/531] update doc to disable dynamic shape (#4941)

---
 examples/models/llama2/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md
index 980a1831b0e..b8a260865b5 100644
--- a/examples/models/llama2/README.md
+++ b/examples/models/llama2/README.md
@@ -316,9 +316,9 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
 Currently we support lowering the stories model to other backends, including CoreML, MPS and QNN. Please refer to the instructions for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is
-- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --coreml -c stories110M.pt -p params.json`
-- MPS: `python -m examples.models.llama2.export_llama -kv --mps -c stories110M.pt -p params.json`
-- QNN: `python -m examples.models.llama2.export_llama -kv --qnn -c stories110M.pt -p params.json`
+- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json `
+- MPS: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json `
+- QNN: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json `
 The iOS LLAMA app supports the CoreML and MPS models and the Android LLAMA app supports the QNN model. On Android, it is also possible to cross-compile the llama runner binary, push it to the device, and run it.
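The README hunk above mentions cross-compiling the llama runner for Android without showing the steps. The sketch below is illustrative only and is not part of any patch in this series: the binary name `llama_main`, its flags, the build directory, and the file names are assumptions based on the surrounding examples and may not match the actual tree.

```bash
# Hypothetical Android cross-compile-and-run flow; all paths and flags are assumptions.
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
      -DANDROID_ABI=arm64-v8a -Bcmake-out-android .
cmake --build cmake-out-android -j8 --target llama_main

# Push the runner, the lowered .pte file, and the tokenizer, then run on device.
adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/
adb push stories110M.pte tokenizer.bin /data/local/tmp/
adb shell /data/local/tmp/llama_main \
    --model_path=/data/local/tmp/stories110M.pte \
    --tokenizer_path=/data/local/tmp/tokenizer.bin \
    --prompt="Once upon a time"
```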
From 4a8b8eee96ce5c514bfcb37140455348c717381a Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Wed, 28 Aug 2024 17:52:29 -0700 Subject: [PATCH 097/531] [executorch] Increase mac capacity / fix conda for llava test on trunk Differential Revision: D61878157 Pull Request resolved: https://github.com/pytorch/executorch/pull/4935 --- .github/workflows/trunk.yml | 55 +++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 7a6aad15505..31887da855b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -270,33 +270,34 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" - test-llava-runner-macos: - name: test-llava-runner-macos - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - strategy: - fail-fast: false - with: - runner: macos-m1-stable - python-version: '3.11' - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 900 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "cmake" - - # install Llava requirements - bash examples/models/llama2/install_requirements.sh - bash examples/models/llava/install_requirements.sh - - # run python unittest - python -m unittest examples.models.llava.test.test_llava - - # run e2e (export, tokenizer and runner) - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh Release + # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. 
+ # test-llava-runner-macos: + # name: test-llava-runner-macos + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: macos-14-xlarge + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 900 + # script: | + # BUILD_TOOL=cmake + + # bash .ci/scripts/setup-conda.sh + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # # install Llava requirements + # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh + + # # run python unittest + # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava + + # # run e2e (export, tokenizer and runner) + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release test-qnn-model: name: test-qnn-model From 1774638c68b71cfeeebca924799bacc1c2baf07e Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Wed, 28 Aug 2024 18:02:37 -0700 Subject: [PATCH 098/531] [llava] Quantize embedding Differential Revision: D61939945 Pull Request resolved: https://github.com/pytorch/executorch/pull/4955 --- .ci/scripts/test_llava.sh | 2 +- .../llama2/source_transformation/quantize.py | 1 + examples/models/llava/export_llava.py | 15 ++++++++++++--- examples/models/llava/model.py | 14 +++----------- examples/models/llava/test/test_llava.py | 11 +++++------ examples/models/llava/test/test_pte.py | 10 ++++------ 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index ec4a6b37d8a..3543ea3fa57 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -91,7 +91,7 @@ run_and_verify() { RESULT=$(cat result.txt) # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. if [[ "$(uname)" == "Darwin" ]]; then - EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress on a basketball court. There are several players on the court, with one player in the foreground holding a basketball, and" + EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various" else # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. 
EXPECTED_PREFIX="ASSISTANT:" diff --git a/examples/models/llama2/source_transformation/quantize.py b/examples/models/llama2/source_transformation/quantize.py index 4f3eaf1125b..da832f8285a 100644 --- a/examples/models/llama2/source_transformation/quantize.py +++ b/examples/models/llama2/source_transformation/quantize.py @@ -399,6 +399,7 @@ def replace_embedding_weight_only_grouped_int8_per_channel( vocab_size=child.weight.shape[0], embedding_dim=child.weight.shape[1], group_size=group_size, + dtype=child.weight.dtype, packed=packed, ), ) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 903f8c17605..2823ca726e0 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -17,6 +17,7 @@ get_quantizer_and_quant_params, ) from executorch.examples.models.llama2.source_transformation.quantize import ( + EmbeddingQuantHandler, get_quant_weight_transform, ) from executorch.examples.models.llama2.source_transformation.sdpa import ( @@ -157,12 +158,20 @@ def forward(self, images): def export_token_embedding(llava, prompt): - embed = llava.embed_tokens - token_dim_1 = Dim("token_dim_1", min=2, max=3518) + def quant_embedding(model): + return EmbeddingQuantHandler( + model, + bitwidth=8, + group_size=32, + packed=False, + ).quantized_model() + + quantized_token_embed = quant_embedding(llava.model_.language_model.model) + token_dim_1 = Dim("token_dim_1", min=2, max=llava.text_model_args.max_seq_len) dynamic_shapes = [{1: token_dim_1}] with torch.no_grad(): token_embedding_ep = torch.export.export( - embed, (prompt,), dynamic_shapes=dynamic_shapes + quantized_token_embed.embed_tokens, (prompt,), dynamic_shapes=dynamic_shapes ) return token_embedding_ep diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 9ad185a5eee..8dcf286727b 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -21,7 +21,6 @@ from executorch.examples.models.model_base import EagerModelBase from PIL import Image -from torch import nn from torch.export import Dim from torchvision.transforms.v2 import functional as F @@ -60,11 +59,6 @@ def __init__( use_hf_rope=True, max_seq_len=max_seq_len, ) - self.embed_tokens = nn.Embedding( - self.model_.config.text_config.vocab_size, - self.model_.config.text_config.hidden_size, - self.model_.config.pad_token_id, - ) self.text_model = Transformer(self.text_model_args) # use custom op for SDPA. if use_sdpa_with_kv_cache_op: @@ -75,11 +69,6 @@ def __init__( strict=False, assign=True, ) - self.embed_tokens.load_state_dict( - state_dict=self.model_.language_model.model.embed_tokens.state_dict(), - strict=True, - assign=True, - ) def _translate_state_dict_for_text_model(self) -> Dict[str, Any]: state_dict = self.model_.language_model.state_dict() @@ -133,6 +122,9 @@ def _feature_select(self, image_outputs): def get_model(self): return self.model_.get_model() + def embed_tokens(self, tokens: torch.Tensor) -> torch.Tensor: + return self.model_.language_model.model.embed_tokens(tokens) + def encode_images(self, images: torch.Tensor) -> torch.Tensor: images = images.to(dtype=self.model_.dtype) if type(images) is list: diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index f464a580a87..2e50bcecf49 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -15,12 +15,11 @@ # import order matters. 
We need to import portable_lib first since it contains the static op registry # which will be used in the import of custom ops. Otherwise, the registration of custom ops will be skipped. # I don't know how to mute UFMT so I'm just using if True: to avoid the error -if True: - from executorch.extension.pybindings.portable_lib import ( - _load_for_executorch_from_buffer, - ) -from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa: F401 - +from executorch.extension.pybindings.portable_lib import ( + _load_for_executorch_from_buffer, +) +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +from executorch.kernels import quantized # noqa # usort: skip logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/examples/models/llava/test/test_pte.py b/examples/models/llava/test/test_pte.py index 85c47cc1de5..003b2b56755 100644 --- a/examples/models/llava/test/test_pte.py +++ b/examples/models/llava/test/test_pte.py @@ -14,10 +14,8 @@ from PIL import Image # Custom ops has to be loaded after portable_lib. -# I don't know how to stop UFMT so I'm just using if True: to avoid lint error -if True: - from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa - +from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +from executorch.kernels import quantized # noqa # usort: skip FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.DEBUG, format=FORMAT) @@ -54,7 +52,7 @@ def main(): )[0] print(pte_prefill_before_img) - start_pos += pte_prefill_before_img.shape[1] + start_pos += prompt_before_image.shape[1] # pte prefill image logging.warning("Image encoder started") @@ -71,7 +69,7 @@ def main(): logging.warning("Image token prefill finished") print(pte_prefill_img) - start_pos += pte_prefill_img.shape[1] + start_pos += pte_embeds_img.shape[1] # pte prefill prompt after img logging.warning("Text token prefill started") From 1f59accf54055de319b05c993262560708de1edc Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 28 Aug 2024 18:08:51 -0700 Subject: [PATCH 099/531] Format cmake files. 
Differential Revision: D61944289 Pull Request resolved: https://github.com/pytorch/executorch/pull/4958 --- CMakeLists.txt | 76 +++---- backends/apple/coreml/CMakeLists.txt | 6 +- backends/cadence/CMakeLists.txt | 6 +- backends/cadence/hifi/kernels/CMakeLists.txt | 11 +- .../cadence/hifi/operators/CMakeLists.txt | 32 +-- .../hifi/third-party/nnlib/CMakeLists.txt | 31 +-- .../cadence/reference/kernels/CMakeLists.txt | 10 +- .../reference/operators/CMakeLists.txt | 39 ++-- backends/mediatek/CMakeLists.txt | 50 ++--- backends/qualcomm/CMakeLists.txt | 11 +- backends/vulkan/cmake/ShaderLibrary.cmake | 4 +- backends/xnnpack/CMakeLists.txt | 8 +- build/Codegen.cmake | 3 +- build/Test.cmake | 71 +++--- devtools/CMakeLists.txt | 10 +- examples/arm/executor_runner/CMakeLists.txt | 77 ++++--- examples/mediatek/CMakeLists.txt | 212 ++++++++---------- .../llama_runner/CMakeLists.txt | 79 +++---- examples/models/llama2/runner/CMakeLists.txt | 5 +- examples/models/llava/CMakeLists.txt | 1 - examples/qualcomm/CMakeLists.txt | 19 +- .../qualcomm/executor_runner/CMakeLists.txt | 4 +- .../oss_scripts/llama2/CMakeLists.txt | 18 +- .../qaihub_scripts/llama/CMakeLists.txt | 51 +++-- .../stable_diffusion/CMakeLists.txt | 27 +-- examples/sdk/CMakeLists.txt | 43 ++-- exir/backend/test/demos/rpc/CMakeLists.txt | 9 +- extension/llm/custom_ops/CMakeLists.txt | 13 +- kernels/prim_ops/test/CMakeLists.txt | 4 +- kernels/quantized/CMakeLists.txt | 21 +- kernels/test/CMakeLists.txt | 17 +- .../core/portable_type/test/CMakeLists.txt | 4 +- schema/CMakeLists.txt | 3 +- test/CMakeLists.txt | 5 +- 34 files changed, 462 insertions(+), 518 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 99c8b7f69fc..20bb1bb122a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -505,7 +505,8 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE) ) target_link_libraries(executorch_no_prim_ops_shared PRIVATE program_schema) if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) # For dladdr() + # For dladdr() + target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) endif() target_include_directories( executorch_no_prim_ops_shared PUBLIC ${_common_include_directories} @@ -541,7 +542,7 @@ target_link_options_shared_lib(executorch) # operators necessary for the models that will run. 
# if(BUILD_EXECUTORCH_PORTABLE_OPS) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) endif() if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) @@ -584,56 +585,56 @@ if(EXECUTORCH_BUILD_GTESTS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest) endif() -if(EXECUTORCH_BUILD_SDK) - set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER - ON - CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE - ) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) endif() -if(EXECUTORCH_BUILD_EXTENSION_APPLE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) +if(EXECUTORCH_BUILD_CADENCE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) endif() -if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) +if(EXECUTORCH_BUILD_COREML) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) endif() -if(EXECUTORCH_BUILD_EXTENSION_MODULE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) +if(EXECUTORCH_BUILD_MPS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) endif() if(EXECUTORCH_BUILD_NEURON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek) endif() -if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) +if(EXECUTORCH_BUILD_QNN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) endif() if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() -if(EXECUTORCH_BUILD_QNN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) +if(EXECUTORCH_BUILD_SDK) + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER + ON + CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE + ) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() -if(EXECUTORCH_BUILD_ARM_BAREMETAL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +if(EXECUTORCH_BUILD_EXTENSION_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) endif() -if(EXECUTORCH_BUILD_MPS) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) +if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() -if(EXECUTORCH_BUILD_COREML) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) +if(EXECUTORCH_BUILD_EXTENSION_MODULE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) endif() -if(EXECUTORCH_BUILD_CADENCE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) +if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) endif() if(EXECUTORCH_BUILD_PYBIND) @@ -690,9 +691,8 @@ if(EXECUTORCH_BUILD_PYBIND) ) # util lib add_library( - util - ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp + util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp ) target_include_directories( util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS} @@ -741,12 +741,14 @@ if(EXECUTORCH_BUILD_PYBIND) else() set_target_properties( portable_lib - PROPERTIES # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib - # Need to add 
/kernels/quantized for - # libquantized_ops_aot_lib - BUILD_RPATH "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" + PROPERTIES + # Assume is the root `site-packages/executorch` + # Need to add /extension/llm/custom_ops for + # libcustom_ops_aot_lib + # Need to add /kernels/quantized for + # libquantized_ops_aot_lib + BUILD_RPATH + "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" ) endif() @@ -757,9 +759,7 @@ endif() if(EXECUTORCH_BUILD_KERNELS_CUSTOM) # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops - ) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) endif() if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index 113b21bd690..7f927284cdd 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -14,10 +14,10 @@ if(NOT EXECUTORCH_ROOT) endif() if(EXECUTORCH_BUILD_SDK) -# protobuf requires frtti -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti" ) + # protobuf requires frtti + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti") endif() - + option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF) # inmemoryfs sources diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index f725655e0d6..d786142f085 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -27,8 +27,8 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(TARGET_DIR reference) if(EXECUTORCH_NNLIB_OPT) -set(TARGET_DIR hifi) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + set(TARGET_DIR hifi) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -68,7 +68,7 @@ target_include_directories( target_include_directories( cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} - ${_common_include_directories} + ${_common_include_directories} ) target_link_libraries( diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 8cbeb3e1806..15d1a4ddd52 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -13,11 +13,12 @@ add_library( target_include_directories( cadence_kernels - PUBLIC . - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ + PUBLIC + . 
+ ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ ) target_link_libraries(cadence_kernels PRIVATE xa_nnlib) diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 996d109db48..8da6169cda1 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -44,7 +44,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" +) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -52,18 +53,20 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. add_library( - custom_ops "quantized_linear_out.cpp" - "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -75,12 +78,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions_hifi.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml ) message("Generated files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt index e93e0759d2c..90eca6b47e1 100644 --- a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt +++ b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt @@ -1,30 +1,19 @@ - cmake_minimum_required(VERSION 3.10.0) project(cadence_nnlib) - -add_custom_target( nnlib_target ALL COMMAND - make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build - OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj - LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib - -j8 ) +add_custom_target( + nnlib_target ALL + COMMAND + make install_nnlib -f makefile -C + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build + OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj + LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib -j8 +) add_library(xa_nnlib STATIC IMPORTED GLOBAL) add_dependencies(xa_nnlib nnlib_target) set_property( - TARGET xa_nnlib - PROPERTY - IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" + TARGET xa_nnlib PROPERTY IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" ) - - - - - - - - - - - diff --git a/backends/cadence/reference/kernels/CMakeLists.txt b/backends/cadence/reference/kernels/CMakeLists.txt index eadb01f54d5..fba66e9b27a 100644 --- a/backends/cadence/reference/kernels/CMakeLists.txt +++ b/backends/cadence/reference/kernels/CMakeLists.txt @@ -5,12 +5,6 @@ # LICENSE file in the root directory of this source tree. # lint_cmake: -linelength -add_library( - cadence_kernels - kernels.cpp -) +add_library(cadence_kernels kernels.cpp) -target_include_directories( - cadence_kernels - PUBLIC . -) +target_include_directories(cadence_kernels PUBLIC .) diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index 71b0304c997..605c43ef715 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -50,7 +50,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" +) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -58,19 +59,26 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. 
- ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. add_library( - custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp" - "quantized_relu_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" - "quantized_matmul_out.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops + "quantized_linear_out.cpp" + "quantized_conv_out.cpp" + "quantized_relu_out.cpp" + "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" + "dequantize_per_tensor.cpp" + "quantized_matmul_out.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -82,12 +90,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml ) message("Generated cadence x86 files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 7e36746bca2..4b233d94f04 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -10,41 +10,35 @@ # Let include directory as "executorch/..." set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
-set(NEURON_BUFFER_ALLOCATOR_LIB "" CACHE PATH "Path to Neuron Buffer Allocator library") -message(STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}") - -include_directories( - BEFORE - ${_common_include_directories} +set(NEURON_BUFFER_ALLOCATOR_LIB + "" + CACHE PATH "Path to Neuron Buffer Allocator library" +) +message( + STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}" ) +include_directories(BEFORE ${_common_include_directories}) + # shortcut include directory for neuron headers -include_directories( - BEFORE - ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include -) +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) -target_link_libraries(neuron_backend - PRIVATE - executorch_no_prim_ops - portable_ops_lib - android - log - ${NEURON_BUFFER_ALLOCATOR_LIB} +target_link_libraries( + neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log + ${NEURON_BUFFER_ALLOCATOR_LIB} ) -target_sources(neuron_backend - INTERFACE - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h - PRIVATE - ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp - ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp +target_sources( + neuron_backend + INTERFACE ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp + ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp ) target_link_options_shared_lib(neuron_backend) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index babdb96d8bc..8c62b025bcd 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -66,9 +66,7 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release") add_link_options("-s") # --gc-sections is added by torch. 
- add_compile_options( - "-O3" "-ffunction-sections" "-fdata-sections" "-frtti" - ) + add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti") endif() include_directories( @@ -261,11 +259,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") if(CMAKE_BUILD_TYPE STREQUAL "Release") # need to allow exceptions in pybind - set(_pybind_compile_options - -Wno-deprecated-declarations - -fPIC - -frtti - -fexceptions + set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti + -fexceptions ) target_compile_options( PyQnnManagerAdaptor PUBLIC ${_pybind_compile_options} diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 49dc27056a0..b44736d20dd 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -50,8 +50,8 @@ function(gen_vulkan_shader_lib_cpp shaders_path) execute_process( COMMAND "${PYTHON_EXECUTABLE}" - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py - --glsl-path ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} + ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path + ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH} --env ${VULKAN_GEN_ARG_ENV} RESULT_VARIABLE error_code diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index a5b12d65799..98870bf0e11 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -32,9 +32,11 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -# NB: Enabling this will serialize execution of delegate instances -# Keeping this OFF by default to maintain existing behavior, to be revisited. -option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE "Enable workspace sharing across different delegate instances" OFF) +# NB: Enabling this will serialize execution of delegate instances Keeping this +# OFF by default to maintain existing behavior, to be revisited. +option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE + "Enable workspace sharing across different delegate instances" OFF +) if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() diff --git a/build/Codegen.cmake b/build/Codegen.cmake index 818deb17581..381cd0958fd 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -78,7 +78,8 @@ function(generate_bindings_for_kernels) # Executorch runtime. execute_process( COMMAND - "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib;print(get_python_lib())" + "${PYTHON_EXECUTABLE}" -c + "from distutils.sysconfig import get_python_lib;print(get_python_lib())" OUTPUT_VARIABLE site-packages-out ERROR_VARIABLE site-packages-out-error RESULT_VARIABLE site-packages-result diff --git a/build/Test.cmake b/build/Test.cmake index b2b23cb03ad..20d5cc58f84 100644 --- a/build/Test.cmake +++ b/build/Test.cmake @@ -5,8 +5,8 @@ # LICENSE file in the root directory of this source tree. # -# This file is intended to have helper functions for test-related -# CMakeLists.txt files. +# This file is intended to have helper functions for test-related CMakeLists.txt +# files. 
# # ### Editing this file ### # @@ -33,53 +33,56 @@ target_link_options_shared_lib(quantized_ops_lib) # Add code coverage flags to supported compilers if(EXECUTORCH_USE_CPP_CODE_COVERAGE) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") - string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") - string(APPEND CMAKE_CXX_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_CXX_FLAGS + " -fprofile-instr-generate -fcoverage-mapping" + ) else() - message(ERROR "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported") + message(ERROR + "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported" + ) endif() endif() -# A helper function to generate a gtest cxx executable target -# @param target_name: name for the executable -# @param SOURCES : test sources to be compiled. Sometimes -# util sources are used as well -# @param EXTRA LIBS : additional libraries to be linked against -# the target. gtest, gmock, executorch are linked by default, but Sometimes -# user may need additional libraries like kernels. -# We use CMake package executorch in this helper, so user can easily add -# installed libraries. +# A helper function to generate a gtest cxx executable target @param +# target_name: name for the executable @param SOURCES : test +# sources to be compiled. Sometimes util sources are used as well @param EXTRA +# LIBS : additional libraries to be linked against the target. +# gtest, gmock, executorch are linked by default, but Sometimes user may need +# additional libraries like kernels. We use CMake package executorch in this +# helper, so user can easily add installed libraries. # -# Example: -# et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) +# Example: et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) # # This defines a gtest executable my_test, compiling my_test.cpp, and linking # against libportable_kernels.a. # function(et_cxx_test target_name) -set(multi_arg_names SOURCES EXTRA_LIBS) -cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) + set(multi_arg_names SOURCES EXTRA_LIBS) + cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) -# Let files say "include ". -target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) + # Let files say "include ". + target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) 
-set(ET_TEST_UTIL_SOURCES ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp) + set(ET_TEST_UTIL_SOURCES + ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp + ) -add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) -# Includes gtest, gmock, executorch by default -target_link_libraries( - ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch - ${ET_CXX_TEST_EXTRA_LIBS} -) + add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) + # Includes gtest, gmock, executorch by default + target_link_libraries( + ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch + ${ET_CXX_TEST_EXTRA_LIBS} + ) -# add_test adds a test target to be used by ctest. -# We use `ExecuTorchTest` as the ctest target name for the test executable -# Usage: cd cmake-out/path/to/test/; ctest -# Note: currently we directly invoke the test target, without using ctest -add_test(ExecuTorchTest ${target_name}) + # add_test adds a test target to be used by ctest. We use `ExecuTorchTest` as + # the ctest target name for the test executable Usage: cd + # cmake-out/path/to/test/; ctest Note: currently we directly invoke the test + # target, without using ctest + add_test(ExecuTorchTest ${target_name}) endfunction() diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt index 4c4d15fd733..776d421a8d3 100644 --- a/devtools/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -92,11 +92,11 @@ if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT) # headers on the host during the build, even if we're cross-compiling the # flatcc runtime to a different architecture. execute_process( - COMMAND ${CMAKE_COMMAND} ${_flatcc_source_dir} - -DFLATCC_TEST=OFF -DFLATCC_REFLECTION=OFF - # See above comment about POSITION_INDEPENDENT_CODE. - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -B${CMAKE_BINARY_DIR}/_host_build + COMMAND + ${CMAKE_COMMAND} ${_flatcc_source_dir} -DFLATCC_TEST=OFF + -DFLATCC_REFLECTION=OFF + # See above comment about POSITION_INDEPENDENT_CODE. + -DCMAKE_POSITION_INDEPENDENT_CODE=ON -B${CMAKE_BINARY_DIR}/_host_build ) execute_process( COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/_host_build diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 1f42eda9fbc..136f72ee756 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -9,25 +9,38 @@ project(arm_executor_runner) option(SEMIHOSTING "Enable semihosting" OFF) if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) - message(FATAL_ERROR - "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " - "model is built into the binary.") + message( + FATAL_ERROR + "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " + "model is built into the binary." + ) endif() # Example ExecuTorch demo for bare metal Cortex-M based systems -set(ET_DIR_PATH "../../.." CACHE PATH - "Path to ExecuTorch dir") -set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH - "Path to ExecuTorch build dir") -set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH - "Path to ExecuTorch headers") -set(ET_PTE_FILE_PATH "" CACHE PATH - "Path to ExecuTorch model pte") -set(ETHOS_SDK_PATH "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" CACHE PATH - "Path to Ethos-U bare metal driver/env") -set(PYTHON_EXECUTABLE "python" CACHE PATH - "Define to override python executable used") - +set(ET_DIR_PATH + "../../.." 
+ CACHE PATH "Path to ExecuTorch dir" +) +set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out" + CACHE PATH "Path to ExecuTorch build dir" +) +set(ET_INCLUDE_PATH + "${ET_DIR_PATH}/.." + CACHE PATH "Path to ExecuTorch headers" +) +set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model pte" +) +set(ETHOS_SDK_PATH + "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" + CACHE PATH "Path to Ethos-U bare metal driver/env" +) +set(PYTHON_EXECUTABLE + "python" + CACHE PATH "Define to override python executable used" +) get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) @@ -104,16 +117,16 @@ set_property( # Convert pte to header if(NOT ${SEMIHOSTING}) - add_custom_target(gen_model_header - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h) + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py - --pte ${ET_PTE_FILE_PATH} - --outdir ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${ET_PTE_FILE_PATH} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py --pte + ${ET_PTE_FILE_PATH} --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) endif() @@ -146,19 +159,17 @@ target_include_directories( arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR} ) - - if(SEMIHOSTING) -target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) + target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) else() -add_dependencies(arm_executor_runner gen_model_header) + add_dependencies(arm_executor_runner gen_model_header) endif() # Fixup compilation of retarget.c if(SEMIHOSTING) -# Remove this when MLBEDSW-8910 is closed. -set_source_files_properties( - ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c - PROPERTIES HEADER_FILE_ONLY TRUE -) + # Remove this when MLBEDSW-8910 is closed. + set_source_files_properties( + ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c + PROPERTIES HEADER_FILE_ONLY TRUE + ) endif() diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt index 966fecb0664..2abee59759f 100644 --- a/examples/mediatek/CMakeLists.txt +++ b/examples/mediatek/CMakeLists.txt @@ -1,8 +1,8 @@ - # Copyright (c) 2024 MediaTek Inc. - # - # Licensed under the BSD License (the "License"); you may not use this file - # except in compliance with the License. See the license file in the root - # directory of this source tree for more details. +# Copyright (c) 2024 MediaTek Inc. +# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file in the root +# directory of this source tree for more details. cmake_minimum_required(VERSION 3.19) project(mediatek_example) @@ -49,117 +49,93 @@ find_package(gflags REQUIRED) link_directories(${EXECUTORCH_ROOT}/cmake-android-out/lib) if(${ANDROID}) - message("Build MTK Android Examples") - - get_filename_component(EXECUTORCH_SOURCE_DIR - "${CMAKE_CURRENT_LIST_DIR}/../.." 
- ABSOLUTE - ) - set(_mtk_executor_runner__srcs ${_executor_runner__srcs}) - list( - TRANSFORM - _mtk_executor_runner__srcs - PREPEND - "${EXECUTORCH_SOURCE_DIR}/" - ) - list( - FILTER - _mtk_executor_runner__srcs - EXCLUDE REGEX - ".*executor_runner.cpp$" - ) - list( - PREPEND - _mtk_executor_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_executor_runner.cpp - ) - - add_executable(mtk_executor_runner ${_mtk_executor_runner__srcs}) - - target_include_directories(mtk_executor_runner - PUBLIC - ${_common_include_directories} - ${EXECUTORCH_ROOT}/cmake-android-out/third-party/gflags/include - ) - - target_link_libraries(mtk_executor_runner - ${_executor_runner_libs} - executorch - neuron_backend - gflags - ) - target_compile_options(mtk_executor_runner - PUBLIC - ${_common_compile_options} - ) - - set(_mtk_llama_executor_runner__srcs ${_mtk_executor_runner__srcs}) - list( - FILTER - _mtk_llama_executor_runner__srcs - EXCLUDE REGEX - ".*executor_runner.cpp$" - ) - list( - PREPEND - _mtk_llama_executor_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_llama_executor_runner.cpp - ) - # Build ABSL and RE2 - set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm) - set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/third-party/abseil-cpp) - set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/third-party/re2) - set(ABSL_ENABLE_INSTALL ON) - set(ABSL_PROPAGATE_CXX_STD ON) - set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) - add_subdirectory(${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/abseil) - add_subdirectory(${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/re2) - set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - - # Build tokenizers - set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizer) - add_library(tokenizer STATIC) - target_include_directories(tokenizer - PUBLIC - ${_common_include_directories} - ${THIRD_PARTY_ABSL_DIR} - ${THIRD_PARTY_RE2_DIR} - ) - target_link_libraries(tokenizer - PRIVATE - re2::re2 - ) - target_sources(tokenizer - PRIVATE - ${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp - ${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama2/tokenizer/llama_tiktoken.cpp - ) - - # Include directory for neuron headers - include_directories( - BEFORE - ${_common_include_directories} - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/mediatek/runtime/include - ) - - # Build Llama Executor static library - add_subdirectory(executor_runner/llama_runner) - - # Build Llama Executor Runner - add_executable(mtk_llama_executor_runner ${_mtk_llama_executor_runner__srcs}) - - target_link_libraries(mtk_llama_executor_runner - ${_executor_runner_libs} - ${NEURON_BUFFER_ALLOCATOR_LIB} - neuron_backend - gflags - mtk_llama_executor_lib - tokenizer - ) - target_compile_options(mtk_llama_executor_runner - PUBLIC - ${_common_compile_options} - ) + message("Build MTK Android Examples") + + get_filename_component( + EXECUTORCH_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../.." 
ABSOLUTE + ) + set(_mtk_executor_runner__srcs ${_executor_runner__srcs}) + list(TRANSFORM _mtk_executor_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") + list(FILTER _mtk_executor_runner__srcs EXCLUDE REGEX ".*executor_runner.cpp$") + list(PREPEND _mtk_executor_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_executor_runner.cpp + ) + + add_executable(mtk_executor_runner ${_mtk_executor_runner__srcs}) + + target_include_directories( + mtk_executor_runner + PUBLIC ${_common_include_directories} + ${EXECUTORCH_ROOT}/cmake-android-out/third-party/gflags/include + ) + + target_link_libraries( + mtk_executor_runner ${_executor_runner_libs} executorch neuron_backend + gflags + ) + target_compile_options(mtk_executor_runner PUBLIC ${_common_compile_options}) + + set(_mtk_llama_executor_runner__srcs ${_mtk_executor_runner__srcs}) + list(FILTER _mtk_llama_executor_runner__srcs EXCLUDE REGEX + ".*executor_runner.cpp$" + ) + list(PREPEND _mtk_llama_executor_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_llama_executor_runner.cpp + ) + # Build ABSL and RE2 + set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm) + set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/third-party/abseil-cpp) + set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/third-party/re2) + set(ABSL_ENABLE_INSTALL ON) + set(ABSL_PROPAGATE_CXX_STD ON) + set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + add_subdirectory( + ${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/abseil + ) + add_subdirectory( + ${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/re2 + ) + set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) + + # Build tokenizers + set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizer) + add_library(tokenizer STATIC) + target_include_directories( + tokenizer PUBLIC ${_common_include_directories} ${THIRD_PARTY_ABSL_DIR} + ${THIRD_PARTY_RE2_DIR} + ) + target_link_libraries(tokenizer PRIVATE re2::re2) + target_sources( + tokenizer + PRIVATE + ${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp + ${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama2/tokenizer/llama_tiktoken.cpp + ) + + # Include directory for neuron headers + include_directories( + BEFORE ${_common_include_directories} + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/mediatek/runtime/include + ) + + # Build Llama Executor static library + add_subdirectory(executor_runner/llama_runner) + + # Build Llama Executor Runner + add_executable(mtk_llama_executor_runner ${_mtk_llama_executor_runner__srcs}) + + target_link_libraries( + mtk_llama_executor_runner + ${_executor_runner_libs} + ${NEURON_BUFFER_ALLOCATOR_LIB} + neuron_backend + gflags + mtk_llama_executor_lib + tokenizer + ) + target_compile_options( + mtk_llama_executor_runner PUBLIC ${_common_compile_options} + ) endif() diff --git a/examples/mediatek/executor_runner/llama_runner/CMakeLists.txt b/examples/mediatek/executor_runner/llama_runner/CMakeLists.txt index fe809ef1337..9d27e685f3a 100644 --- a/examples/mediatek/executor_runner/llama_runner/CMakeLists.txt +++ b/examples/mediatek/executor_runner/llama_runner/CMakeLists.txt @@ -1,66 +1,41 @@ - # Copyright (c) 2024 MediaTek Inc. - # - # Licensed under the BSD License (the "License"); you may not use this file - # except in compliance with the License. See the license file in the root - # directory of this source tree for more details. +# Copyright (c) 2024 MediaTek Inc. 
+# +# Licensed under the BSD License (the "License"); you may not use this file +# except in compliance with the License. See the license file in the root +# directory of this source tree for more details. # Let include directory as "executorch/..." set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) -include_directories( - BEFORE - ${_common_include_directories} -) +include_directories(BEFORE ${_common_include_directories}) # shortcut include directory for neuron headers include_directories( - BEFORE - ${_common_include_directories}/backends/mediatek/runtime/include + BEFORE ${_common_include_directories}/backends/mediatek/runtime/include ) add_library(llm_helper STATIC) -target_sources(llm_helper - PRIVATE - llm_helper/mask_builder.cpp - llm_helper/rotary_embedding.cpp - llm_helper/token_embedding.cpp +target_sources( + llm_helper + PRIVATE llm_helper/mask_builder.cpp llm_helper/rotary_embedding.cpp + llm_helper/token_embedding.cpp ) -target_link_libraries(llm_helper - PRIVATE - executorch -) -target_include_directories(llm_helper - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - llm_helper/include -) -target_compile_options(llm_helper - PRIVATE - ${_common_compile_options} +target_link_libraries(llm_helper PRIVATE executorch) +target_include_directories( + llm_helper PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} llm_helper/include ) +target_compile_options(llm_helper PRIVATE ${_common_compile_options}) add_library(mtk_llama_executor_lib STATIC) -target_link_libraries(mtk_llama_executor_lib - PRIVATE - ${_executor_runner_libs} - ${NEURON_BUFFER_ALLOCATOR_LIB} - neuron_backend - log - llm_helper -) -target_sources(mtk_llama_executor_lib - INTERFACE - MultiModelLoader.h - ModelChunk.h - LlamaModelChunk.h - LlamaRuntime.h - PRIVATE - MultiModelLoader.cpp - ModelChunk.cpp - LlamaModelChunk.cpp - LlamaRuntime.cpp -) -target_compile_options(mtk_llama_executor_lib - PUBLIC - ${_common_compile_options} -) +target_link_libraries( + mtk_llama_executor_lib + PRIVATE ${_executor_runner_libs} ${NEURON_BUFFER_ALLOCATOR_LIB} + neuron_backend log llm_helper +) +target_sources( + mtk_llama_executor_lib + INTERFACE MultiModelLoader.h ModelChunk.h LlamaModelChunk.h LlamaRuntime.h + PRIVATE MultiModelLoader.cpp ModelChunk.cpp LlamaModelChunk.cpp + LlamaRuntime.cpp +) +target_compile_options(mtk_llama_executor_lib PUBLIC ${_common_compile_options}) diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt index 2c9696f69eb..c99a54982aa 100644 --- a/examples/models/llama2/runner/CMakeLists.txt +++ b/examples/models/llama2/runner/CMakeLists.txt @@ -42,8 +42,9 @@ target_include_directories( ) if(EXECUTORCH_USE_TIKTOKEN) - list(APPEND _llama_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp + list( + APPEND _llama_runner__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp ) list(APPEND _llama_runner__srcs ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index a1a6fc8c939..9d7e47812e2 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -21,7 +21,6 @@ project(llava) # Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) - include(CMakeDependentOption) # # pthreadpool: build pthreadpool library. 
Disable on unsupported platforms diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 94af209cb6c..542d7f03460 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -41,7 +41,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. # set(EXECUTORCH_SRCS_FILE - "${CMAKE_CURRENT_BINARY_DIR}/../../executorch_srcs.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/../../executorch_srcs.cmake" ) extract_sources(${EXECUTORCH_SRCS_FILE}) include(${EXECUTORCH_SRCS_FILE}) @@ -50,7 +50,6 @@ get_filename_component( EXECUTORCH_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../.." ABSOLUTE ) - # portable_ops_lib gen_selected_ops(LIB_NAME "full_portable_ops_lib" INCLUDE_ALL_OPS "ON") generate_bindings_for_kernels( @@ -68,21 +67,13 @@ target_include_directories( ) # build qnn_executor_runner -add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/executor_runner -) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/executor_runner) # build qnn_llama_runner -add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2 -) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2) # build qaihub_llama2_7b_runner and qaihub_llama3_8b_runner -add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama -) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama) # build qaihub_stable_diffusion_runner -add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/stable_diffusion -) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/stable_diffusion) diff --git a/examples/qualcomm/executor_runner/CMakeLists.txt b/examples/qualcomm/executor_runner/CMakeLists.txt index 73106d9368f..b950a4f82fd 100644 --- a/examples/qualcomm/executor_runner/CMakeLists.txt +++ b/examples/qualcomm/executor_runner/CMakeLists.txt @@ -9,7 +9,9 @@ set(_qnn_executor_runner__srcs ${_executor_runner__srcs}) # preprocess executor runner src files list(TRANSFORM _qnn_executor_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") list(FILTER _qnn_executor_runner__srcs EXCLUDE REGEX ".*executor_runner.cpp$") -list(PREPEND _qnn_executor_runner__srcs ${CMAKE_CURRENT_LIST_DIR}/qnn_executor_runner.cpp) +list(PREPEND _qnn_executor_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qnn_executor_runner.cpp +) # build executor runner add_executable(qnn_executor_runner ${_qnn_executor_runner__srcs}) diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt index 2f13f017d3b..7b59120d713 100644 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -9,25 +9,21 @@ set(_qnn_llama_runner__srcs ${_llama_runner__srcs}) # preprocess qnn llama runner src files list(TRANSFORM _qnn_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") list(FILTER _qnn_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") -list(PREPEND _qnn_llama_runner__srcs +list( + PREPEND + _qnn_llama_runner__srcs ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h ) - # build qnn llama runner add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs}) target_include_directories( qnn_llama_runner PUBLIC ${_common_include_directories} ) -target_link_libraries(qnn_llama_runner - qnn_executorch_backend - full_portable_ops_lib - extension_data_loader - extension_module - gflags -) -target_compile_options(qnn_llama_runner - PUBLIC ${_common_compile_options} 
+target_link_libraries( + qnn_llama_runner qnn_executorch_backend full_portable_ops_lib + extension_data_loader extension_module gflags ) +target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index d2cbbc183cb..674aa2b72fe 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -6,45 +6,42 @@ # preprocess qaihub runner src files for llama2,3 set(_qaihub_llama_runner__srcs ${_llama_runner__srcs}) -list(TRANSFORM _qaihub_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _qaihub_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") -list(PREPEND _qaihub_llama_runner__srcs +list(TRANSFORM _qaihub_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _qaihub_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") +list( + PREPEND + _qaihub_llama_runner__srcs ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h ) - # preprocess qaihub llama2 7b runner src files set(_qaihub_llama2_7b_runner__srcs ${_qaihub_llama_runner__srcs}) list(PREPEND _qaihub_llama2_7b_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_llama2_7b_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_llama2_7b_runner.cpp ) # build qaihub llama2 7b runner add_executable(qaihub_llama2_7b_runner ${_qaihub_llama2_7b_runner__srcs}) -target_include_directories(qaihub_llama2_7b_runner - PUBLIC ${_common_include_directories} +target_include_directories( + qaihub_llama2_7b_runner PUBLIC ${_common_include_directories} ) -target_link_libraries(qaihub_llama2_7b_runner - qnn_executorch_backend - executorch_no_prim_ops - extension_data_loader - extension_module - gflags +target_link_libraries( + qaihub_llama2_7b_runner qnn_executorch_backend executorch_no_prim_ops + extension_data_loader extension_module gflags ) -target_compile_options(qaihub_llama2_7b_runner - PUBLIC ${_common_compile_options} +target_compile_options( + qaihub_llama2_7b_runner PUBLIC ${_common_compile_options} ) - # preprocess qaihub llama3 8b runner src files set(_qaihub_llama3_8b_runner__srcs ${_qaihub_llama_runner__srcs}) list(PREPEND _qaihub_llama3_8b_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/llama3/qaihub_llama3_8b_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/llama3/qaihub_llama3_8b_runner.cpp ) # Adding a compile option to differentiate llama2 with llama3 logic @@ -65,23 +62,25 @@ add_subdirectory( ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - -list(APPEND _qaihub_llama3_8b_runner__srcs +list( + APPEND _qaihub_llama3_8b_runner__srcs ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp ) -list(APPEND _qaihub_llama3_8b_runner__srcs +list( + APPEND + _qaihub_llama3_8b_runner__srcs ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama2/tokenizer/llama_tiktoken.cpp ) set(_preprocessor_flag -DET_USE_TIKTOKEN) - # build qaihub llama3 8b runner add_executable(qaihub_llama3_8b_runner ${_qaihub_llama3_8b_runner__srcs}) -target_include_directories(qaihub_llama3_8b_runner - PUBLIC ${_common_include_directories} +target_include_directories( + qaihub_llama3_8b_runner PUBLIC ${_common_include_directories} ) -target_link_libraries(qaihub_llama3_8b_runner +target_link_libraries( + qaihub_llama3_8b_runner qnn_executorch_backend executorch_no_prim_ops extension_data_loader @@ -89,6 +88,6 @@ 
target_link_libraries(qaihub_llama3_8b_runner gflags re2::re2 ) -target_compile_options(qaihub_llama3_8b_runner - PUBLIC ${_common_compile_options} +target_compile_options( + qaihub_llama3_8b_runner PUBLIC ${_common_compile_options} ) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt index c897f5f9f84..b0cec2d3005 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -6,21 +6,22 @@ # preprocess qaihub_stable_diffusion_runner_src files set(_qaihub_stable_diffusion_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/qaihub_stable_diffusion_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h + ${CMAKE_CURRENT_LIST_DIR}/qaihub_stable_diffusion_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h ) # build qaihub_stable_diffusion_runner -add_executable(qaihub_stable_diffusion_runner ${_qaihub_stable_diffusion_runner__srcs}) -target_include_directories(qaihub_stable_diffusion_runner - PUBLIC ${_common_include_directories} +add_executable( + qaihub_stable_diffusion_runner ${_qaihub_stable_diffusion_runner__srcs} ) -target_link_libraries(qaihub_stable_diffusion_runner - qnn_executorch_backend - executorch_no_prim_ops - extension_data_loader - extension_module - gflags +target_include_directories( + qaihub_stable_diffusion_runner PUBLIC ${_common_include_directories} +) +target_link_libraries( + qaihub_stable_diffusion_runner qnn_executorch_backend executorch_no_prim_ops + extension_data_loader extension_module gflags +) +target_compile_options( + qaihub_stable_diffusion_runner PUBLIC ${_common_compile_options} ) -target_compile_options(qaihub_stable_diffusion_runner PUBLIC ${_common_compile_options}) diff --git a/examples/sdk/CMakeLists.txt b/examples/sdk/CMakeLists.txt index af7e9b15bc5..29248d10738 100644 --- a/examples/sdk/CMakeLists.txt +++ b/examples/sdk/CMakeLists.txt @@ -65,26 +65,25 @@ target_link_libraries( ) if(EXECUTORCH_BUILD_COREML) -find_library(ACCELERATE_FRAMEWORK Accelerate) -find_library(COREML_FRAMEWORK CoreML) -find_library(FOUNDATION_FRAMEWORK Foundation) -find_library(SQLITE_LIBRARY sqlite3) - -set(PROTOBUF_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../backends/apple/coreml/third-party/coremltools/deps/protobuf/cmake) -find_library(PROTOBUF_LITE REQUIRED NAMES libprotobuf-lite.a PATHS ${PROTOBUF_LIB_DIR} NO_DEFAULT_PATH) - -target_link_libraries( - sdk_example_runner - "-Wl,-force_load" - coremldelegate -) - -target_link_libraries( - sdk_example_runner - ${PROTOBUF_LITE} - ${ACCELERATE_FRAMEWORK} - ${COREML_FRAMEWORK} - ${FOUNDATION_FRAMEWORK} - ${SQLITE_LIBRARY} -) + find_library(ACCELERATE_FRAMEWORK Accelerate) + find_library(COREML_FRAMEWORK CoreML) + find_library(FOUNDATION_FRAMEWORK Foundation) + find_library(SQLITE_LIBRARY sqlite3) + + set(PROTOBUF_LIB_DIR + ${CMAKE_CURRENT_BINARY_DIR}/../../backends/apple/coreml/third-party/coremltools/deps/protobuf/cmake + ) + find_library( + PROTOBUF_LITE REQUIRED + NAMES libprotobuf-lite.a + PATHS ${PROTOBUF_LIB_DIR} + NO_DEFAULT_PATH + ) + + target_link_libraries(sdk_example_runner "-Wl,-force_load" coremldelegate) + + target_link_libraries( + sdk_example_runner ${PROTOBUF_LITE} ${ACCELERATE_FRAMEWORK} + ${COREML_FRAMEWORK} ${FOUNDATION_FRAMEWORK} ${SQLITE_LIBRARY} + ) endif() diff --git a/exir/backend/test/demos/rpc/CMakeLists.txt 
b/exir/backend/test/demos/rpc/CMakeLists.txt index cf39248c384..cd1b6e73ff2 100644 --- a/exir/backend/test/demos/rpc/CMakeLists.txt +++ b/exir/backend/test/demos/rpc/CMakeLists.txt @@ -26,10 +26,10 @@ include(${EXECUTORCH_ROOT}/build/Utils.cmake) set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(_common_compile_options -Wno-deprecated-declarations -fPIC) -add_library(executor_backend STATIC ExecutorBackendRegister.cpp ExecutorBackend.cpp) -target_link_libraries( - executor_backend PRIVATE executorch_no_prim_ops +add_library( + executor_backend STATIC ExecutorBackendRegister.cpp ExecutorBackend.cpp ) +target_link_libraries(executor_backend PRIVATE executorch_no_prim_ops) target_include_directories( executor_backend PUBLIC ${_common_include_directories} @@ -38,4 +38,5 @@ install( TARGETS executor_backend DESTINATION lib INCLUDES - DESTINATION ${_common_include_directories}) + DESTINATION ${_common_include_directories} +) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 8edfbfc85b2..c1ffa954606 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -80,16 +80,15 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) # Add a AOT library find_package(Torch CONFIG REQUIRED) add_library( - custom_ops_aot_lib SHARED ${_custom_ops__srcs} - ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop.cpp + custom_ops_aot_lib SHARED + ${_custom_ops__srcs} ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop.cpp ) target_include_directories( custom_ops_aot_lib PUBLIC "${_common_include_directories}" ) target_include_directories( - custom_ops_aot_lib - PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include" + custom_ops_aot_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include" ) if(TARGET portable_lib) # If we have portable_lib built, custom_ops_aot_lib gives the ability to use @@ -103,8 +102,8 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) target_link_libraries(custom_ops_aot_lib PUBLIC cpublas torch) if(WIN32) - # There is no direct replacement for libpthread.so on Windows. - # For the Windows build, link directly against pthreadpool and cpuinfo. + # There is no direct replacement for libpthread.so on Windows. For the + # Windows build, link directly against pthreadpool and cpuinfo. target_link_libraries(custom_ops_aot_lib PUBLIC pthreadpool cpuinfo) endif() target_compile_options( diff --git a/kernels/prim_ops/test/CMakeLists.txt b/kernels/prim_ops/test/CMakeLists.txt index 93d53523a0b..a25e15acc48 100644 --- a/kernels/prim_ops/test/CMakeLists.txt +++ b/kernels/prim_ops/test/CMakeLists.txt @@ -26,6 +26,4 @@ target_link_options_shared_lib(executorch) set(_test_srcs prim_ops_test.cpp) -et_cxx_test( - kernels_prim_ops_test SOURCES ${_test_srcs} -) +et_cxx_test(kernels_prim_ops_test SOURCES ${_test_srcs}) diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt index dbc9edcb973..f073835c935 100644 --- a/kernels/quantized/CMakeLists.txt +++ b/kernels/quantized/CMakeLists.txt @@ -88,13 +88,14 @@ if(NOT CMAKE_GENERATOR STREQUAL "Xcode" LIB_NAME "quantized_ops_aot_lib" KERNEL_SOURCES "${_quantized_sources}" ) - # Register quantized ops to portable_lib, so that they're available - # via pybindings. + # Register quantized ops to portable_lib, so that they're available via + # pybindings. 
if(TARGET portable_lib) add_library(quantized_pybind_kernels_lib ${_quantized_kernels__srcs}) target_link_libraries(quantized_pybind_kernels_lib PRIVATE portable_lib) target_compile_options( - quantized_pybind_kernels_lib PUBLIC ${_common_compile_options}) + quantized_pybind_kernels_lib PUBLIC ${_common_compile_options} + ) target_include_directories( quantized_pybind_kernels_lib PUBLIC "${_common_include_directories}" ) @@ -104,18 +105,14 @@ if(NOT CMAKE_GENERATOR STREQUAL "Xcode" generate_bindings_for_kernels( LIB_NAME "quantized_ops_pybind_lib" CUSTOM_OPS_YAML "${_yaml_file}" ) - # Build a library for pybind usage. - # quantized_ops_pybind_lib: Register quantized ops kernels into - # Executorch runtime for pybind. + # Build a library for pybind usage. quantized_ops_pybind_lib: Register + # quantized ops kernels into Executorch runtime for pybind. gen_operators_lib( - LIB_NAME "quantized_ops_pybind_lib" - KERNEL_LIBS quantized_pybind_kernels_lib - DEPS portable_lib + LIB_NAME "quantized_ops_pybind_lib" KERNEL_LIBS + quantized_pybind_kernels_lib DEPS portable_lib ) target_link_libraries( - quantized_ops_aot_lib - PUBLIC - quantized_ops_pybind_lib + quantized_ops_aot_lib PUBLIC quantized_ops_pybind_lib ) endif() endif() diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 92350a2c0db..791c2184e9f 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -31,7 +31,8 @@ foreach(kernel ${_kernels}) add_custom_command( OUTPUT "${_wrapper_path}" COMMAND mkdir -p ${_wrapper_dir} - COMMAND echo "#include " > "${_wrapper_path}" + COMMAND echo "#include " > + "${_wrapper_path}" WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" COMMENT "Generating ${_wrapper_path}" VERBATIM @@ -243,11 +244,19 @@ set(_optimized_kernels_test_sources ) # We don't have sleef on OSS so we don't have gelu and log_softmax -list(REMOVE_ITEM _optimized_kernels_test_sources "op_gelu_test.cpp" "op_log_softmax_test.cpp") +list(REMOVE_ITEM _optimized_kernels_test_sources "op_gelu_test.cpp" + "op_log_softmax_test.cpp" +) et_cxx_test( - optimized_kernels_test SOURCES ${_optimized_kernels_test_sources} EXTRA_LIBS - optimized_kernels optimized_ops_lib portable_kernels eigen_blas + optimized_kernels_test + SOURCES + ${_optimized_kernels_test_sources} + EXTRA_LIBS + optimized_kernels + optimized_ops_lib + portable_kernels + eigen_blas ) add_dependencies(optimized_kernels_test generate_wrapper) target_include_directories( diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt index f89381f5120..21eb4feae0f 100644 --- a/runtime/core/portable_type/test/CMakeLists.txt +++ b/runtime/core/portable_type/test/CMakeLists.txt @@ -23,8 +23,8 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs optional_test.cpp tensor_test.cpp half_test.cpp - scalar_test.cpp tensor_impl_test.cpp +set(_test_srcs optional_test.cpp tensor_test.cpp half_test.cpp scalar_test.cpp + tensor_impl_test.cpp ) et_cxx_test(runtime_core_portable_type_test SOURCES ${_test_srcs} EXTRA_LIBS) diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt index a69f751bf2a..5a4013f43e0 100644 --- a/schema/CMakeLists.txt +++ b/schema/CMakeLists.txt @@ -49,7 +49,8 @@ function(generate_program_schema _schema_srcs _schema_name) # and some users need an alignment larger than the default, which is typically # 32. 
target_compile_definitions( - ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=1024) + ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=1024 + ) target_include_directories( ${_schema_name} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1cd4c824978..5dbe47c8671 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -62,8 +62,9 @@ endif() # add_executable(size_test_all_ops ${_size_test__srcs}) target_link_options_shared_lib(portable_ops_lib) -target_link_libraries(size_test_all_ops executorch - portable_ops_lib portable_kernels) +target_link_libraries( + size_test_all_ops executorch portable_ops_lib portable_kernels +) if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() From 8bcaca2a4fb9ed498180b13fb0b4b8c5e1736eb9 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 28 Aug 2024 18:12:49 -0700 Subject: [PATCH 100/531] Remove unused metadata util. Differential Revision: D61948644 Pull Request resolved: https://github.com/pytorch/executorch/pull/4960 --- extension/llm/runner/metadata_util.h | 53 ---------------------------- extension/llm/runner/targets.bzl | 11 ------ 2 files changed, 64 deletions(-) delete mode 100644 extension/llm/runner/metadata_util.h diff --git a/extension/llm/runner/metadata_util.h b/extension/llm/runner/metadata_util.h deleted file mode 100644 index 5f55dad538d..00000000000 --- a/extension/llm/runner/metadata_util.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/** - * Constant metadata can be serialized in .pte files, this helper enables - * easy access to the metadata. 
- */ -#pragma once - -#include - -namespace executorch { -namespace extension { -namespace llm { - -template -T get_module_metadata( - Module* module, - const std::string& method_name, - T default_val) { - const auto method_names = module->method_names(); - ET_CHECK_MSG(method_names.ok(), "Failed to read method names from model"); - auto model_methods = method_names.get(); - - T res = default_val; - if (model_methods.count(method_name)) { - ::executorch::runtime::Result> - outputs = module->execute(method_name); - if (outputs.ok()) { - std::vector<::executorch::runtime::EValue> outs = outputs.get(); - if (outs.size() > 0) { - res = outs[0].to(); - } - } - } else { - ET_LOG( - Info, - "The model does not contain %s method, using default value %lld", - method_name.c_str(), - (long long)default_val); - } - ET_LOG(Info, "%s: %lld", method_name.c_str(), (long long)res); - return res; -} - -} // namespace llm -} // namespace extension -} // namespace executorch diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 1a2fa252c8f..4d715980afe 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -70,17 +70,6 @@ def define_common_targets(): ], ) - runtime.cxx_library( - name = "metadata_util" + aten_suffix, - exported_headers = ["metadata_util.h"], - visibility = [ - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/extension/module:module" + aten_suffix, - ], - ) - runtime.cxx_library( name = "runner_lib" + aten_suffix, exported_headers = [ From 4cfdd22d25b9194c030331d1d12ca5d3aea016c6 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:46:59 -0700 Subject: [PATCH 101/531] [ET] Introduce `TensorInfo::is_memory_planned`` API Differential Revision: D61923695 Pull Request resolved: https://github.com/pytorch/executorch/pull/4946 --- runtime/executor/method_meta.cpp | 14 +++++++++++--- runtime/executor/method_meta.h | 11 ++++++++++- runtime/executor/test/method_meta_test.cpp | 1 + 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index c6a855358d6..309ecf0ec85 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -68,10 +68,12 @@ size_t calculate_nbytes( TensorInfo::TensorInfo( Span sizes, Span dim_order, - exec_aten::ScalarType scalar_type) + exec_aten::ScalarType scalar_type, + const bool is_memory_planned) : sizes_(sizes), dim_order_(dim_order), scalar_type_(scalar_type), + is_memory_planned_(is_memory_planned), nbytes_(calculate_nbytes(sizes_, scalar_type_)) {} Span TensorInfo::sizes() const { @@ -86,6 +88,10 @@ exec_aten::ScalarType TensorInfo::scalar_type() const { return scalar_type_; } +bool TensorInfo::is_memory_planned() const { + return is_memory_planned_; +} + size_t TensorInfo::nbytes() const { return nbytes_; } @@ -132,7 +138,8 @@ Result MethodMeta::input_tensor_meta(size_t index) const { tensor_value->sizes()->data(), tensor_value->sizes()->size()), Span( tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), - static_cast(tensor_value->scalar_type())); + static_cast(tensor_value->scalar_type()), + tensor_value->allocation_info() != nullptr); } size_t MethodMeta::num_outputs() const { @@ -170,7 +177,8 @@ Result MethodMeta::output_tensor_meta(size_t index) const { tensor_value->sizes()->data(), tensor_value->sizes()->size()), Span( tensor_value->dim_order()->data(), tensor_value->dim_order()->size()), - 
static_cast(tensor_value->scalar_type())); + static_cast(tensor_value->scalar_type()), + tensor_value->allocation_info() != nullptr); } size_t MethodMeta::num_memory_planned_buffers() const { diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h index c67b9d268e0..7817583fc3c 100644 --- a/runtime/executor/method_meta.h +++ b/runtime/executor/method_meta.h @@ -52,6 +52,11 @@ class TensorInfo final { */ exec_aten::ScalarType scalar_type() const; + /** + * Returns whether the tensor's memory was planned during export. + */ + bool is_memory_planned() const; + /** * Returns the size of the tensor in bytes. */ @@ -64,7 +69,8 @@ class TensorInfo final { TensorInfo( Span sizes, Span dim_order, - exec_aten::ScalarType scalar_type); + exec_aten::ScalarType scalar_type, + const bool is_memory_planned); /** * The sizes of the tensor. @@ -85,6 +91,9 @@ class TensorInfo final { /// The scalar type of the tensor. exec_aten::ScalarType scalar_type_; + /// Whether the tensor's memory was planned during export. + bool is_memory_planned_; + /// The size in bytes of the tensor. size_t nbytes_; }; diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp index 7e0e06099ee..bd48f64d98f 100644 --- a/runtime/executor/test/method_meta_test.cpp +++ b/runtime/executor/test/method_meta_test.cpp @@ -61,6 +61,7 @@ void check_tensor(const TensorInfo& tensor_info) { EXPECT_EQ(dim_order.size(), 2); EXPECT_EQ(dim_order[0], 0); EXPECT_EQ(dim_order[1], 1); + EXPECT_EQ(tensor_info.is_memory_planned(), true); EXPECT_EQ(tensor_info.nbytes(), 16); } } // namespace From eeb52d5c861c649b8004283f8948c169f24a6a85 Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Wed, 28 Aug 2024 19:06:49 -0700 Subject: [PATCH 102/531] Fix sym int deserialization and add to shape_env.var_to_val Differential Revision: D61948307 Pull Request resolved: https://github.com/pytorch/executorch/pull/4959 --- exir/serde/export_serialize.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index f0549ba4160..8ec864e489b 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -1474,6 +1474,12 @@ def deserialize_sym_int(self, s: SymInt) -> Union[int, torch.SymInt]: if val.expr_str in self.symbol_name_to_symbol: sym = self.symbol_name_to_symbol[val.expr_str] + if ( + isinstance(sym, sympy.Symbol) + and sym not in self.shape_env.var_to_val + ): + if hint is not None: + self.shape_env.add_var_to_val(sym, hint) else: sym = sympy.sympify(val.expr_str, locals=self.symbol_name_to_symbol) # NOTE(avik): Assumptions on symbols are not explicitly serialized. 
From 66b2f73c5fd37bd974b71ba17a801fa99418b095 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Thu, 29 Aug 2024 04:47:05 +0200 Subject: [PATCH 103/531] Implement bmm op for Arm backend Differential Revision: D61852906 Pull Request resolved: https://github.com/pytorch/executorch/pull/4926 --- backends/arm/arm_partitioner.py | 1 + backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_bmm.py | 82 +++++++++++ .../quantization_annotation/mm_annotator.py | 2 +- backends/arm/test/ops/test_bmm.py | 135 ++++++++++++++++++ 5 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 backends/arm/operators/op_bmm.py create mode 100644 backends/arm/test/ops/test_bmm.py diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index 0dc3d36b5c6..bee8b8a27f9 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -40,6 +40,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.expand_copy.default, exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.convolution.default, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index dc1fcc8e2c3..fb5e46c5c2d 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -9,6 +9,7 @@ op_addmm, op_avg_pool2d, op_batch_norm, + op_bmm, op_cat, op_conv2d, op_dequant, diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py new file mode 100644 index 00000000000..8d0235ebe73 --- /dev/null +++ b/backends/arm/operators/op_bmm.py @@ -0,0 +1,82 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args +from executorch.backends.arm.tosa_utils import get_two_inputs +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class BMMVisitor(NodeVisitor): + target = "aten.bmm.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + input0, input1 = get_two_inputs(node) + + # aten.bmm maps directly to MATMUL + # NOTE: For now, only INT8 & FP32 is supported + + # For INT8, we need to get the zero points and add an intermediate tensor + # for a later rescale. + if is_quant_node: + input0_zp = get_quant_node_args(input0).zp + input1_zp = get_quant_node_args(input1).zp + bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) + bmm_output_name = bmm_result.name + else: + input0_zp, input1_zp = 0, 0 + bmm_output_name = output.name + + # Add the MATMUL to the TOSA graph. 
+ attr = ts.TosaSerializerAttribute() + attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp) + + tosa_graph.addOperator( + TosaOp.Op().MATMUL, + [input0.name, input1.name], + [bmm_output_name], + attr, + ) + + # As INT8 accumulates into INT32, we need to rescale it back to INT8 + if is_quant_node: + input0_q_params = get_quant_node_args(input0) + input1_q_params = get_quant_node_args(input1) + output_q_params = get_quant_node_args(list(node.users)[0]) + + final_output_scale = ( + input0_q_params.scale * input1_q_params.scale + ) / output_q_params.scale + + build_rescale( + tosa_fb=tosa_graph, + scale=final_output_scale, + input_node=bmm_result, + output_name=output.name, + output_type=ts.DType.INT8, + output_shape=bmm_result.shape, + input_zp=0, + output_zp=output_q_params.zp, + is_double_round=False, + ) diff --git a/backends/arm/quantizer/quantization_annotation/mm_annotator.py b/backends/arm/quantizer/quantization_annotation/mm_annotator.py index 969f0131ffd..7fb5c51b224 100644 --- a/backends/arm/quantizer/quantization_annotation/mm_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mm_annotator.py @@ -22,7 +22,7 @@ def _annotate_mm( quantization_config: QuantizationConfig, filter_fn: Optional[Callable[[Node], bool]] = None, ) -> Optional[List[List[Node]]]: - mm_partitions = get_source_partitions(gm.graph, [torch.mm], filter_fn) + mm_partitions = get_source_partitions(gm.graph, [torch.mm, torch.bmm], filter_fn) mm_partitions = list(itertools.chain.from_iterable(mm_partitions.values())) annotated_partitions = [] for mm_partition in mm_partitions: diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py new file mode 100644 index 00000000000..30f45261247 --- /dev/null +++ b/backends/arm/test/ops/test_bmm.py @@ -0,0 +1,135 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +torch.manual_seed(1) + + +class TestBMM(unittest.TestCase): + """Tests Batch MatMul""" + + class BMM(torch.nn.Module): + test_parameters = [ + (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), + (torch.rand(2, 1, 1), torch.rand(2, 1, 1)), + (torch.ones(1, 55, 3), torch.ones(1, 3, 44)), + (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)), + (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)), + ] + + def forward(self, x, y): + return torch.bmm(x, y) + + class BMMSingleInput(torch.nn.Module): + test_parameters = [ + (torch.rand(20, 3, 3),), + (torch.ones(2, 128, 128),), + (10000 * torch.randn(4, 25, 25),), + (5 + 5 * torch.randn(3, 64, 64),), + ] + + def forward(self, x): + return torch.bmm(x, x) + + def _test_bmm_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] 
+ ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_MI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy + @parameterized.expand(BMMSingleInput.test_parameters) + @unittest.expectedFailure + def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_u55_BI_pipeline(self.BMMSingleInput(), test_data) From 2553b85d8e4a31ebb6070bc642fc766931f6dc36 Mon Sep 17 00:00:00 2001 From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com> Date: Thu, 29 Aug 2024 04:54:40 +0200 Subject: [PATCH 104/531] Arm backend: extend Softmax to handle dim < 0 Differential Revision: D61852817 Pull Request resolved: https://github.com/pytorch/executorch/pull/4819 --- backends/arm/operators/op_softmax.py | 2 +- backends/arm/test/ops/test_softmax.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/backends/arm/operators/op_softmax.py b/backends/arm/operators/op_softmax.py 
index 627fa64aed1..6baf4ea16f6 100644 --- a/backends/arm/operators/op_softmax.py +++ b/backends/arm/operators/op_softmax.py @@ -33,7 +33,7 @@ def define_node( input_name = inputs[0].name dim_order = inputs[0].dim_order input_shape = tosa_shape(inputs[0].shape, dim_order) - dim_value = dim_order.index(inputs[1].number) + dim_value = dim_order.index(inputs[1].number % len(dim_order)) ## softmax = exp(logits - max(logits)) / reduce_sum(exp(logits - max(logits)), -1) # FP32 diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index b3b6230daa7..20da65b687f 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging import unittest from typing import Tuple @@ -15,15 +14,17 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, dim) - ("zeros", torch.zeros(10, 10, 10, 10), 1), + ("zeros", torch.zeros(10, 10, 10, 10), 0), + ("zeros_neg_dim", torch.zeros(10, 10, 10, 10), -4), ("ones", torch.ones(10, 10, 10, 10), 1), + ("ones_neg_dim", torch.ones(10, 10, 10, 10), -1), ("rand", torch.rand(10, 10, 10, 10), 2), + ("rand_neg_dim", torch.rand(10, 10, 10, 10), -2), ("randn", torch.randn(10, 10, 10, 10), 3), + ("randn_neg_dim", torch.randn(10, 10, 10, 10), -3), ] From 1ae997c30334f3aeef6358f8600508150c47db29 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:09:00 -0700 Subject: [PATCH 105/531] [executorch] Ignore leading 1 dimensions when checking optimized path for op_mul (#4963) A 1 x 1 x ... x m x n tensor can be element-wise multiplied with a m x n tensor just fine. 
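For illustration only (not part of the original change): a minimal standalone sketch of the size check this patch introduces, written against plain std::vector rather than the runtime's ArrayRef/Tensor types. Stripping leading 1 dims up front lets the optimized path treat both operands as the same flat buffer of numel() elements.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Shapes are considered equivalent if they differ only by leading 1 dims,
// e.g. {1, 1, 4, 5} vs. {4, 5}: the flat element layout is identical.
static bool sizes_match_ignoring_leading_1s(
    const std::vector<int64_t>& lhs,
    const std::vector<int64_t>& rhs) {
  auto lhs_begin =
      std::find_if(lhs.begin(), lhs.end(), [](int64_t d) { return d != 1; });
  auto rhs_begin =
      std::find_if(rhs.begin(), rhs.end(), [](int64_t d) { return d != 1; });
  // Compare the remaining tails element by element.
  return (lhs.end() - lhs_begin) == (rhs.end() - rhs_begin) &&
      std::equal(lhs_begin, lhs.end(), rhs_begin);
}

int main() {
  assert(sizes_match_ignoring_leading_1s({1, 1, 4, 5}, {4, 5}));  // same layout
  assert(!sizes_match_ignoring_leading_1s({2, 4, 5}, {4, 5}));    // real broadcast
  return 0;
}
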
Pull Request resolved: https://github.com/pytorch/executorch/pull/4806 Co-authored-by: Scott Wolchok --- kernels/optimized/cpu/op_mul.cpp | 23 ++++++++++++++++++++++- kernels/test/op_mul_test.cpp | 15 +++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index adcd8999150..b4bb7955279 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -22,6 +22,25 @@ using ScalarType = exec_aten::ScalarType; namespace { +bool sizes_match_ignoring_leading_1s( + ArrayRef lhs, + ArrayRef rhs) { + auto lhs_begin = lhs.begin(); + auto lhs_end = lhs.end(); + while (lhs_begin != lhs_end && *lhs_begin == 1) { + ++lhs_begin; + } + + auto rhs_begin = rhs.begin(); + auto rhs_end = rhs.end(); + while (rhs_begin != rhs_end && *rhs_begin == 1) { + ++rhs_begin; + } + + return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) && + std::equal(lhs_begin, lhs_end, rhs_begin); +} + // Move to generic util as this is applicable to all binary ops bool can_use_optimized_path( const Tensor& a, @@ -38,7 +57,9 @@ bool can_use_optimized_path( (a_type != ScalarType::Half && b_type != ScalarType::Half); can_use_optimized_path = can_use_optimized_path && (a.sizes().equals(b.sizes()) || - (a.numel() == b.numel() && a.numel() == out.numel())); + (a.numel() == b.numel() && + (a.numel() == out.numel() || + sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))); return can_use_optimized_path; } diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 029a74ca944..a59cf4ec5a6 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -165,6 +165,21 @@ TEST_F(OpMulOutTest, BoolTensors) { EXPECT_TENSOR_EQ(out, tf.make(sizes, /*data=*/{false, false, true, false})); } +TEST_F(OpMulOutTest, OptimizedPathIgnoresLeading1Dimensions) { + TensorFactory tf; + + const std::vector sizes1 = {1, 1, 2, 2}; + const std::vector sizes2 = {1, 2, 2}; + + // Destination for the mul. + Tensor out = tf.zeros(sizes1); + + // Multiply two tensors + op_mul_out( + tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes2), out); + EXPECT_TENSOR_CLOSE(out, tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8})); +} + // Mismatched shape tests. 
TEST_F(OpMulOutTest, MismatchedInputShapesDies) { if (SupportedFeatures::get()->is_aten) { From 89ebeb08b849dab6e2fad00db386e29db345b0c4 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:25:04 -0700 Subject: [PATCH 106/531] Move threadpool to extension Differential Revision: D61938189 Pull Request resolved: https://github.com/pytorch/executorch/pull/4954 --- backends/xnnpack/runtime/XNNCompiler.cpp | 2 +- backends/xnnpack/targets.bzl | 2 +- backends/xnnpack/test/CMakeLists.txt | 6 ++++-- configurations/targets.bzl | 2 +- examples/models/llama2/CMakeLists.txt | 6 +++--- examples/models/llama2/main.cpp | 4 ++-- examples/models/llama2/targets.bzl | 4 ++-- examples/models/llava/CMakeLists.txt | 6 +++--- examples/models/llava/main.cpp | 4 ++-- extension/android/CMakeLists.txt | 2 +- extension/android/jni/jni_layer_llama.cpp | 4 ++-- extension/llm/custom_ops/CMakeLists.txt | 4 ++-- extension/llm/custom_ops/op_sdpa.cpp | 2 +- extension/llm/custom_ops/targets.bzl | 2 +- extension/parallel/targets.bzl | 2 +- extension/parallel/test/CMakeLists.txt | 4 ++-- extension/parallel/thread_parallel.cpp | 2 +- {backends/xnnpack => extension}/threadpool/TARGETS | 0 .../xnnpack => extension}/threadpool/cpuinfo_utils.cpp | 0 {backends/xnnpack => extension}/threadpool/cpuinfo_utils.h | 0 {backends/xnnpack => extension}/threadpool/targets.bzl | 0 {backends/xnnpack => extension}/threadpool/test/TARGETS | 0 {backends/xnnpack => extension}/threadpool/test/targets.bzl | 2 +- .../threadpool/test/threadpool_test.cpp | 4 ++-- {backends/xnnpack => extension}/threadpool/threadpool.cpp | 4 ++-- {backends/xnnpack => extension}/threadpool/threadpool.h | 0 .../xnnpack => extension}/threadpool/threadpool_guard.cpp | 2 +- .../xnnpack => extension}/threadpool/threadpool_guard.h | 0 28 files changed, 36 insertions(+), 34 deletions(-) rename {backends/xnnpack => extension}/threadpool/TARGETS (100%) rename {backends/xnnpack => extension}/threadpool/cpuinfo_utils.cpp (100%) rename {backends/xnnpack => extension}/threadpool/cpuinfo_utils.h (100%) rename {backends/xnnpack => extension}/threadpool/targets.bzl (100%) rename {backends/xnnpack => extension}/threadpool/test/TARGETS (100%) rename {backends/xnnpack => extension}/threadpool/test/targets.bzl (89%) rename {backends/xnnpack => extension}/threadpool/test/threadpool_test.cpp (97%) rename {backends/xnnpack => extension}/threadpool/threadpool.cpp (97%) rename {backends/xnnpack => extension}/threadpool/threadpool.h (100%) rename {backends/xnnpack => extension}/threadpool/threadpool_guard.cpp (89%) rename {backends/xnnpack => extension}/threadpool/threadpool_guard.h (100%) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index ac53831b04c..7999bb9a71f 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 4fd0ee519cb..dc8cd5917b3 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -47,7 +47,7 @@ def define_common_targets(): deps = [ third_party_dep("XNNPACK"), "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core/exec_aten/util:tensor_util", ], # XnnpackBackend.cpp needs to compile with 
executor as whole diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index d0fbddae237..4b787e80eed 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -23,8 +23,10 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs # We can't put runtime/test_runtime_utils.cpp because we don't # build aten - runtime/test_xnnexecutor.cpp ../threadpool/threadpool.cpp - ../threadpool/threadpool_guard.cpp ../threadpool/test/threadpool_test.cpp + runtime/test_xnnexecutor.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/test/threadpool_test.cpp ) et_cxx_test( diff --git a/configurations/targets.bzl b/configurations/targets.bzl index dc88c137441..60a09d36269 100644 --- a/configurations/targets.bzl +++ b/configurations/targets.bzl @@ -20,7 +20,7 @@ def define_common_targets(): runtime.cxx_library( name = "executor_cpu_optimized", exported_deps = [ - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", ] + get_all_cpu_backend_targets(), visibility = [ "//executorch/test/...", diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index 81089a438de..8b82fdda12f 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -147,8 +147,8 @@ if(EXECUTORCH_BUILD_PTHREADPOOL) list(APPEND link_libraries pthreadpool) # These 2 source files are included in xnnpack_backend if(NOT TARGET xnnpack_backend) - list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/threadpool.cpp - ${XNNPACK_ROOT}/threadpool/threadpool_guard.cpp + list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp ) endif() list(APPEND _common_include_directories @@ -159,7 +159,7 @@ endif() # Extra sources for cpuinfo if(EXECUTORCH_BUILD_CPUINFO) list(APPEND link_libraries cpuinfo) - list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp) + list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/cpuinfo_utils.cpp) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/cpuinfo/include ) diff --git a/examples/models/llama2/main.cpp b/examples/models/llama2/main.cpp index 10a355a6037..1e1c3a10b3b 100644 --- a/examples/models/llama2/main.cpp +++ b/examples/models/llama2/main.cpp @@ -11,8 +11,8 @@ #include #if defined(ET_USE_THREADPOOL) -#include -#include +#include +#include #endif DEFINE_string( diff --git a/examples/models/llama2/targets.bzl b/examples/models/llama2/targets.bzl index 6cf398097d0..57e84256a49 100644 --- a/examples/models/llama2/targets.bzl +++ b/examples/models/llama2/targets.bzl @@ -17,8 +17,8 @@ def define_common_targets(): deps = [ "//executorch/examples/models/llama2/runner:runner" + aten_suffix, "//executorch/extension/evalue_util:print_evalue", - "//executorch/backends/xnnpack/threadpool:threadpool", - "//executorch/backends/xnnpack/threadpool:cpuinfo_utils", + "//executorch/extension/threadpool:threadpool", + "//executorch/extension/threadpool:cpuinfo_utils", ], external_deps = [ "gflags", diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 9d7e47812e2..abd455a6b17 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -130,8 +130,8 @@ if(EXECUTORCH_BUILD_PTHREADPOOL) list(APPEND link_libraries pthreadpool) # These 2 source files are 
included in xnnpack_backend if(NOT TARGET xnnpack_backend) - list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/threadpool.cpp - ${XNNPACK_ROOT}/threadpool/threadpool_guard.cpp + list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp ) endif() list(APPEND _common_include_directories @@ -142,7 +142,7 @@ endif() # Extra sources for cpuinfo if(EXECUTORCH_BUILD_CPUINFO) list(APPEND link_libraries cpuinfo) - list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp) + list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/cpuinfo_utils.cpp) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/cpuinfo/include ) diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 6f4911180ca..431f86c906e 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -11,8 +11,8 @@ #include #if defined(ET_USE_THREADPOOL) -#include -#include +#include +#include #endif DEFINE_string( diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 5d836171411..5982cd16e10 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -95,7 +95,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) if(TARGET pthreadpool) set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp - ../../backends/xnnpack/threadpool/cpuinfo_utils.cpp + ../../extension/threadpool/cpuinfo_utils.cpp ) else() set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index f3cb6103c83..4f67d04396c 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -23,8 +23,8 @@ #include #if defined(ET_USE_THREADPOOL) -#include -#include +#include +#include #endif #include diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index c1ffa954606..5822352f306 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -53,8 +53,8 @@ if(NOT EXECUTORCH_BUILD_XNNPACK) list( APPEND _custom_ops__srcs - "${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/threadpool.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/threadpool_guard.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/threadpool/threadpool.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/threadpool/threadpool_guard.cpp" ) else() list(APPEND custom_ops_libs xnnpack_backend) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index d31cbaf3697..56db1c208ea 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -20,8 +20,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index 55273c1f4ff..b90b636f7c4 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -17,7 +17,7 @@ def define_common_targets(): "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", "//executorch/extension/parallel:thread_parallel", - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", ], compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], visibility = [ diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl index 7cb8a2d28a8..b1da51b6171 100644 
--- a/extension/parallel/targets.bzl +++ b/extension/parallel/targets.bzl @@ -23,7 +23,7 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], deps = [ - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core:core", "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, ], diff --git a/extension/parallel/test/CMakeLists.txt b/extension/parallel/test/CMakeLists.txt index 7c6f6a27d75..1453a868920 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/parallel/test/CMakeLists.txt @@ -23,8 +23,8 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp - ${EXECUTORCH_ROOT}/backends/xnnpack/threadpool/threadpool.cpp - ${EXECUTORCH_ROOT}/backends/xnnpack/threadpool/threadpool_guard.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp ) et_cxx_test( diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp index aac47cca2e8..fb2d3e7b0ae 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/parallel/thread_parallel.cpp @@ -8,8 +8,8 @@ #include -#include #include +#include #include #include diff --git a/backends/xnnpack/threadpool/TARGETS b/extension/threadpool/TARGETS similarity index 100% rename from backends/xnnpack/threadpool/TARGETS rename to extension/threadpool/TARGETS diff --git a/backends/xnnpack/threadpool/cpuinfo_utils.cpp b/extension/threadpool/cpuinfo_utils.cpp similarity index 100% rename from backends/xnnpack/threadpool/cpuinfo_utils.cpp rename to extension/threadpool/cpuinfo_utils.cpp diff --git a/backends/xnnpack/threadpool/cpuinfo_utils.h b/extension/threadpool/cpuinfo_utils.h similarity index 100% rename from backends/xnnpack/threadpool/cpuinfo_utils.h rename to extension/threadpool/cpuinfo_utils.h diff --git a/backends/xnnpack/threadpool/targets.bzl b/extension/threadpool/targets.bzl similarity index 100% rename from backends/xnnpack/threadpool/targets.bzl rename to extension/threadpool/targets.bzl diff --git a/backends/xnnpack/threadpool/test/TARGETS b/extension/threadpool/test/TARGETS similarity index 100% rename from backends/xnnpack/threadpool/test/TARGETS rename to extension/threadpool/test/TARGETS diff --git a/backends/xnnpack/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl similarity index 89% rename from backends/xnnpack/threadpool/test/targets.bzl rename to extension/threadpool/test/targets.bzl index 7bbcd8c4c03..b8a39d8969a 100644 --- a/backends/xnnpack/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -15,6 +15,6 @@ def define_common_targets(): name = "threadpool_test", srcs = _THREADPOOL_TESTS, deps = [ - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", ], ) diff --git a/backends/xnnpack/threadpool/test/threadpool_test.cpp b/extension/threadpool/test/threadpool_test.cpp similarity index 97% rename from backends/xnnpack/threadpool/test/threadpool_test.cpp rename to extension/threadpool/test/threadpool_test.cpp index a63a264e27b..c244b8fcf23 100644 --- a/backends/xnnpack/threadpool/test/threadpool_test.cpp +++ b/extension/threadpool/test/threadpool_test.cpp @@ -11,8 +11,8 @@ #include #include -#include -#include +#include +#include using namespace ::testing; diff --git a/backends/xnnpack/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp similarity index 97% rename from 
backends/xnnpack/threadpool/threadpool.cpp rename to extension/threadpool/threadpool.cpp index 4757ebf64c5..3de179de10d 100644 --- a/backends/xnnpack/threadpool/threadpool.cpp +++ b/extension/threadpool/threadpool.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#include +#include #include #include diff --git a/backends/xnnpack/threadpool/threadpool.h b/extension/threadpool/threadpool.h similarity index 100% rename from backends/xnnpack/threadpool/threadpool.h rename to extension/threadpool/threadpool.h diff --git a/backends/xnnpack/threadpool/threadpool_guard.cpp b/extension/threadpool/threadpool_guard.cpp similarity index 89% rename from backends/xnnpack/threadpool/threadpool_guard.cpp rename to extension/threadpool/threadpool_guard.cpp index a7f2a1803db..ac4103fbbc7 100644 --- a/backends/xnnpack/threadpool/threadpool_guard.cpp +++ b/extension/threadpool/threadpool_guard.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include namespace torch { namespace executorch { diff --git a/backends/xnnpack/threadpool/threadpool_guard.h b/extension/threadpool/threadpool_guard.h similarity index 100% rename from backends/xnnpack/threadpool/threadpool_guard.h rename to extension/threadpool/threadpool_guard.h From ba06861dcb22a51626dec509fba249c1320f4564 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:30:42 -0700 Subject: [PATCH 107/531] Add vectorized scalar path for single-element Tensor passed to optimized mul (#4964) We are currently doing slow broadcasting for this case. After this diff, we should get nice vectorization. Differential Revision: D61560825 Pull Request resolved: https://github.com/pytorch/executorch/pull/4807 --------- Co-authored-by: Scott Wolchok --- kernels/optimized/cpu/op_mul.cpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index b4bb7955279..14f55ed5344 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -119,6 +119,34 @@ Tensor& opt_mul_out( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); + if (b.numel() == 1) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + auto error = resize_tensor(out, a.sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.out", CTYPE, [&]() { + ET_SWITCH_REALB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + CTYPE_B b_val = *b.const_data_ptr(); + CTYPE b_casted = static_cast(b_val); + + using Vec = executorch::vec::Vectorized; + executorch::vec::map( + [b_casted](Vec x) { return x * Vec(b_casted); }, + out.mutable_data_ptr(), + a.const_data_ptr(), + out.numel()); + }); + }); + return out; + } + } else if (a.numel() == 1) { + return opt_mul_out(ctx, b, a, out); + } + if (can_use_optimized_path(a, b, out)) { // Resize for dynamic shape auto error = resize_tensor(out, a.sizes()); From 58efb8b67e074e7822b380d100cfc0e6ca5bf053 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:46:36 -0700 Subject: [PATCH 108/531] Optimized 2D-by-1D broadcasting in optimized op_mul (#4965) Detect that we are doing an elementwise multiplication for a 2D tensor and a 1D tensor. Dispatch to a vectorized kernel for this case. 
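For illustration only (not part of this change): a scalar reference of the 2-D-by-1-D broadcast the new kernel vectorizes. The actual implementation added below maps executorch::vec::Vectorized over each row in SIMD-width chunks with a masked tail; this sketch only shows the loop structure, assuming contiguous row-major float data.

#include <cassert>
#include <cstdint>
#include <vector>

// out, input: size * size2 elements (row-major); input2: size2 elements,
// broadcast across every row of input.
static void broadcasting_mul_2d_by_1d_reference(
    float* out,
    const float* input,
    const float* input2,
    int64_t size,
    int64_t size2) {
  for (int64_t row = 0; row < size; ++row) {
    const float* in_row = input + row * size2;
    float* out_row = out + row * size2;
    for (int64_t col = 0; col < size2; ++col) {
      out_row[col] = in_row[col] * input2[col];
    }
  }
}

int main() {
  std::vector<float> a = {1, 2, 3, 4, 5, 6};  // shape (2, 3)
  std::vector<float> b = {10, 20, 30};        // shape (3,), broadcast to (2, 3)
  std::vector<float> out(6);
  broadcasting_mul_2d_by_1d_reference(out.data(), a.data(), b.data(), 2, 3);
  assert(out[0] == 10.0f && out[5] == 180.0f);
  return 0;
}
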
Differential Revision: [D61560826](https://our.internmc.facebook.com/intern/diff/D61560826/) Pull Request resolved: https://github.com/pytorch/executorch/pull/4808 --------- Co-authored-by: Scott Wolchok --- kernels/optimized/cpu/op_mul.cpp | 110 ++++++++++++++++++------ kernels/optimized/vec/functional_base.h | 35 ++++++++ kernels/test/op_mul_test.cpp | 29 ++++--- 3 files changed, 139 insertions(+), 35 deletions(-) diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 14f55ed5344..38f99c62536 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -22,45 +22,74 @@ using ScalarType = exec_aten::ScalarType; namespace { +// NOTE: we bake ArrayRef iterators being pointers into the return +// type here because we assume that iterators are portable across +// ArrayRef copies. +const Tensor::SizesType* arrayref_begin_ignoring_leading_1s( + ArrayRef arr) { + return std::find_if( + arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; }); +} + bool sizes_match_ignoring_leading_1s( ArrayRef lhs, ArrayRef rhs) { - auto lhs_begin = lhs.begin(); + auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs); auto lhs_end = lhs.end(); - while (lhs_begin != lhs_end && *lhs_begin == 1) { - ++lhs_begin; - } - auto rhs_begin = rhs.begin(); + auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs); auto rhs_end = rhs.end(); - while (rhs_begin != rhs_end && *rhs_begin == 1) { - ++rhs_begin; - } return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) && std::equal(lhs_begin, lhs_end, rhs_begin); } // Move to generic util as this is applicable to all binary ops -bool can_use_optimized_path( - const Tensor& a, - const Tensor& b, - const Tensor& out) { +enum class ElementwiseOptimizedPath { + kNone, + kTreatAs1d, + kBroadcast2dBy1d, + kBroadcast2dBy1dReverseArguments, +}; + +ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path( + const Tensor& lhs, + const Tensor& rhs) { + auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes()); + auto lhs_end = lhs.sizes().end(); + + auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes()); + auto rhs_end = rhs.sizes().end(); + + const auto lhs_size = lhs_end - lhs_begin; + const auto rhs_size = rhs_end - rhs_begin; + if (lhs_size == 2 && rhs_size == 1 && lhs_begin[1] == rhs_begin[0]) { + return ElementwiseOptimizedPath::kBroadcast2dBy1d; + } + + if (lhs_size == 1 && rhs_size == 2 && rhs_begin[1] == lhs_begin[0]) { + return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments; + } + + return ElementwiseOptimizedPath::kNone; +} + +ElementwiseOptimizedPath +select_optimized_path(const Tensor& a, const Tensor& b, const Tensor& out) { ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - bool can_use_optimized_path = true; - can_use_optimized_path = - can_use_optimized_path && ((a_type == b_type) && (a_type == out_type)); - can_use_optimized_path = can_use_optimized_path && - (a_type != ScalarType::Half && b_type != ScalarType::Half); - can_use_optimized_path = can_use_optimized_path && - (a.sizes().equals(b.sizes()) || - (a.numel() == b.numel() && - (a.numel() == out.numel() || - sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))); - return can_use_optimized_path; + if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half) { + return ElementwiseOptimizedPath::kNone; + } + if (a.sizes().equals(b.sizes()) || + (a.numel() == b.numel() && + (a.numel() == out.numel() || + 
sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))) { + return ElementwiseOptimizedPath::kTreatAs1d; + } + return select_broadcast_2d_by_1d_optimized_path(a, b); } template < @@ -147,7 +176,8 @@ Tensor& opt_mul_out( return opt_mul_out(ctx, b, a, out); } - if (can_use_optimized_path(a, b, out)) { + auto selected_optimized_path = select_optimized_path(a, b, out); + if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { // Resize for dynamic shape auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( @@ -166,6 +196,38 @@ Tensor& opt_mul_out( b.const_data_ptr(), out.numel()); }); + } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) { + const Tensor* lhs; + const Tensor* rhs; + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { + lhs = &b; + rhs = &a; + } else { + // Catch failure to update logic when adding new broadcasting possibility. + ET_DCHECK( + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1d); + lhs = &a; + rhs = &b; + } + auto error = resize_tensor(out, lhs->sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() { + using Vec = executorch::vec::Vectorized; + executorch::vec::broadcasting_map_2d_by_1d( + [](Vec x, Vec y) { return x * y; }, + out.mutable_data_ptr(), + lhs->const_data_ptr(), + rhs->const_data_ptr(), + lhs->sizes()[lhs->dim() - 2], + lhs->sizes()[lhs->dim() - 1]); + }); } else { ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/optimized/vec/functional_base.h b/kernels/optimized/vec/functional_base.h index 50141c2ec3f..7edb043abc9 100644 --- a/kernels/optimized/vec/functional_base.h +++ b/kernels/optimized/vec/functional_base.h @@ -325,5 +325,40 @@ inline void map4( } } + +// Map vec_fun across input_data and input_data2, where input_data is +// a two-dimensional array of size (size, size2), input_data2 is a +// one-dimensional array of size size2, and input_data2 is broadcast +// to be of size (size, size2). +template +inline void broadcasting_map_2d_by_1d( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data, + const scalar_t* input_data2, + int64_t size, + int64_t size2) { + using Vec = vec::Vectorized; + for (int64_t outer_idx = 0; outer_idx < size; ++outer_idx) { + const scalar_t* input_data_row = input_data + outer_idx * size2; + scalar_t* output_data_row = output_data + outer_idx * size2; + int64_t inner_idx = 0; + for (; inner_idx < size2 - (size2 % Vec::size()); inner_idx += Vec::size()) { + Vec data_vec = Vec::loadu(input_data_row + inner_idx); + Vec data_vec2 = Vec::loadu(input_data2 + inner_idx); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data_row + inner_idx); + } + if (size2 - inner_idx > 0) { + Vec data_vec = Vec::loadu(input_data_row + inner_idx, size2 - inner_idx); + Vec data_vec2 = Vec::loadu(input_data2 + inner_idx, size2 - inner_idx); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data_row + inner_idx, size2 - inner_idx); + } + } +} + + + } // namespace vec } // namespace executorch diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index a59cf4ec5a6..32b69352ef1 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -181,7 +181,7 @@ TEST_F(OpMulOutTest, OptimizedPathIgnoresLeading1Dimensions) { } // Mismatched shape tests. 
-TEST_F(OpMulOutTest, MismatchedInputShapesDies) { +TEST_F(OpMulOutTest, MismatchedNonBroadcastableInputShapesDies) { if (SupportedFeatures::get()->is_aten) { GTEST_SKIP() << "ATen currently supports mismatched shapes"; } @@ -189,11 +189,11 @@ TEST_F(OpMulOutTest, MismatchedInputShapesDies) { TensorFactory tf; // Input tensors with different shapes. - Tensor a = tf.ones(/*sizes=*/{1, 2}); + Tensor a = tf.ones(/*sizes=*/{4, 2}); Tensor b = tf.ones(/*sizes=*/{2, 2}); // Output tensor; matches the shape of one of the inputs. - Tensor out = tf.zeros(/*sizes=*/{4}); + Tensor out = tf.zeros(/*sizes=*/{8}); // Multiplying the two mismatched tensors should cause an assertion and kill // the test process. @@ -204,16 +204,22 @@ TEST_F(OpMulOutTest, MismatchedInputShapesDies) { TEST_F(OpMulOutTest, BroadcastA2BTest) { TensorFactory tf_a; - // a and b of different shapes - Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); - Tensor b = tf_a.make({2}, /*data=*/{2, 2}); + std::vector> b_sizeses = { + {2}, + {1, 2}, + }; + for (const auto& b_sizes : b_sizeses) { + // a and b of different shapes + Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); + Tensor b = tf_a.make(b_sizes, /*data=*/{2, 2}); - // Destination for output of mul. - Tensor out = tf_a.zeros({2, 2}); + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2}); - // Check that it matches the expected output. - EXPECT_TENSOR_CLOSE( - op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE( + op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + } } // Broadcast tensor a's size to tensor b's size @@ -262,6 +268,7 @@ TEST_F(OpMulOutTest, ScalarInputBroadcastTest) { // Check that it matches the expected output. 
EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); + EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); } TEST_F(OpMulOutTest, MismatchedOutputShapesDies) { From 0c6a77e54c01de49d52d08546163ca3b6ee8c071 Mon Sep 17 00:00:00 2001 From: winskuo-quic <143469905+winskuo-quic@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:46:55 +0800 Subject: [PATCH 109/531] Refine tokenizer (#4940) --- backends/qualcomm/scripts/build.sh | 4 +- backends/qualcomm/tests/test_qnn_delegate.py | 7 +- ...d-run-qualcomm-ai-engine-direct-backend.md | 28 ++--- .../android/ExecuTorchDemo/README.md | 2 +- examples/qualcomm/README.md | 4 +- .../qualcomm/oss_scripts/llama2/README.md | 2 +- .../qualcomm/qaihub_scripts/llama/README.md | 4 +- .../llama/llama2/qaihub_llama2_7b.py | 70 ++++++------ .../llama/llama2/qaihub_llama2_7b_runner.cpp | 11 +- .../llama/llama3/qaihub_llama3_8b.py | 101 ++++++++---------- .../llama/llama3/qaihub_llama3_8b_runner.cpp | 16 ++- .../qaihub_scripts/llama/runner/runner.cpp | 72 +++++++++---- .../qaihub_scripts/llama/runner/runner.h | 12 ++- .../qaihub_scripts/stable_diffusion/README.md | 2 +- .../qaihub_stable_diffusion.py | 99 +++-------------- .../qualcomm/qaihub_scripts/utils/README.md | 2 +- .../qualcomm/qaihub_scripts/utils/utils.py | 87 +++++++++++++++ examples/qualcomm/utils.py | 9 +- 18 files changed, 290 insertions(+), 242 deletions(-) create mode 100644 examples/qualcomm/qaihub_scripts/utils/utils.py diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index aafd6252e79..b63ea6fe8d9 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -70,7 +70,7 @@ if [ "$BUILD_AARCH64" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT else # Force rebuild flatccrt for the correct platform - cd $BUILD_ROOT/sdk && make clean + cd $BUILD_ROOT/devtools && make clean fi cd $BUILD_ROOT @@ -112,7 +112,7 @@ if [ "$BUILD_X86_64" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT else # Force rebuild flatccrt for the correct platform - cd $BUILD_ROOT/sdk && make clean + cd $BUILD_ROOT/devtools && make clean fi cd $BUILD_ROOT diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 79b8443dc71..cba23f935c2 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -2036,7 +2036,12 @@ def test_llama3_8b(self): self.fail(msg["Error"]) else: model_out = msg["result"] - self.assertTrue(model_out.startswith(prompt)) + expected_result = ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + + prompt + + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + ) + self.assertTrue(model_out.startswith(expected_result)) def test_stable_diffusion(self): if not self.required_envs(): diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index c774ae57b43..5abaaeb7cef 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -126,8 +126,8 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT -mkdir cmake-out -cd cmake-out +mkdir build-x86 +cd build-x86 # Note that the below command might change. # Please refer to the above build.sh for latest workable commands. cmake .. 
\ @@ -158,8 +158,8 @@ Commands to build `qnn_executor_runner` for Android: ```bash cd $EXECUTORCH_ROOT -mkdir cmake-out-android -cd cmake-out-android +mkdir build-android +cd build-android # build executorch & qnn_executorch_backend cmake .. \ -DCMAKE_INSTALL_PREFIX=$PWD \ @@ -189,7 +189,7 @@ cmake ../examples/qualcomm \ cmake --build examples/qualcomm -j$(nproc) # qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/cmake-out-android/examples/qualcomm/qnn_executor_runner +# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/qnn_executor_runner ls examples/qualcomm ``` @@ -209,7 +209,7 @@ cd $EXECUTORCH_ROOT cp schema/program.fbs exir/_serialize/program.fbs cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --compile_only --download +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --compile_only --download ``` You might see something like below: @@ -239,7 +239,7 @@ We can test model inferences before deploying it to a device by HTP emulator. Let's build `qnn_executor_runner` for a x64 host: ```bash # assuming the AOT component is built. -cd $EXECUTORCH_ROOT/cmake-out +cd $EXECUTORCH_ROOT/build-x86 cmake ../examples/qualcomm \ -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ @@ -249,14 +249,14 @@ cmake ../examples/qualcomm \ cmake --build examples/qualcomm -j$(nproc) # qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/cmake-out/examples/qualcomm/qnn_executor_runner +# The full path is $EXECUTORCH_ROOT/build-x86/examples/qualcomm/qnn_executor_runner ls examples/qualcomm/ ``` To run the HTP emulator, the dynamic linker need to access QNN libraries and `libqnn_executorch_backend.so`. We set the below two paths to `LD_LIBRARY_PATH` environment variable: 1. `$QNN_SDK_ROOT/lib/x86_64-linux-clang/` - 2. `$EXECUTORCH_ROOT/cmake-out/lib/` + 2. `$EXECUTORCH_ROOT/build-x86/lib/` The first path is for QNN libraries including HTP emulator. It has been configured in the AOT compilation section. @@ -264,8 +264,8 @@ The second path is for `libqnn_executorch_backend.so`. So, we can run `./deeplab_v3/dlv3_qnn.pte` by: ```bash -cd $EXECUTORCH_ROOT/cmake-out -export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/cmake-out/lib/:$LD_LIBRARY_PATH +cd $EXECUTORCH_ROOT/build-x86 +export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/build-x86/lib/:$LD_LIBRARY_PATH examples/qualcomm/qnn_executor_runner --model_path ../deeplab_v3/dlv3_qnn.pte ``` @@ -308,8 +308,8 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/cmake-out-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/cmake-out-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ @@ -333,7 +333,7 @@ I 00:00:00.364875 executorch:qnn_executor_runner.cpp:425] Write etdump to etdump The model is merely executed. 
If we want to feed real inputs and get model outputs, we can use ```bash cd $EXECUTORCH_ROOT -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --download -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --download -s ``` The `` can be found by `adb devices` command. diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 89d8c34ee39..807561f44b5 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -53,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. ```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8450 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index 3e7a018ac74..fa73c92ee11 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -53,12 +53,12 @@ cd $EXECUTORCH_ROOT/examples/qualcomm/scripts #### For MobileNet_v2 ```bash -python mobilenet_v2.py -s -m "SM8550" -b path/to/cmake-out-android/ -d /path/to/imagenet-mini/val +python mobilenet_v2.py -s -m "SM8550" -b path/to/build-android/ -d /path/to/imagenet-mini/val ``` #### For DeepLab_v3 ```bash -python deeplab_v3.py -s -m "SM8550" -b path/to/cmake-out-android/ --download +python deeplab_v3.py -s -m "SM8550" -b path/to/build-android/ --download ``` #### Check context binary version diff --git a/examples/qualcomm/oss_scripts/llama2/README.md b/examples/qualcomm/oss_scripts/llama2/README.md index ec15545a6f5..d83902a6de8 100644 --- a/examples/qualcomm/oss_scripts/llama2/README.md +++ b/examples/qualcomm/oss_scripts/llama2/README.md @@ -32,7 +32,7 @@ echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": Default example generates the story based on the given prompt, "Once". ```bash # 16a4w quant: -python examples/qualcomm/oss_scripts/llama2/llama.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" +python examples/qualcomm/oss_scripts/llama2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" ``` #### (Note) Customized PTQ data set diff --git a/examples/qualcomm/qaihub_scripts/llama/README.md b/examples/qualcomm/qaihub_scripts/llama/README.md index d7c5f80d334..d49ca4cc946 100644 --- a/examples/qualcomm/qaihub_scripts/llama/README.md +++ b/examples/qualcomm/qaihub_scripts/llama/README.md @@ -27,7 +27,7 @@ python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o token #### Step3: Run default examples ```bash # AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized -python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?" 
+python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?" ``` ## Llama-3-8b-chat-hf @@ -48,5 +48,5 @@ Note that the pre-compiled context binaries could not be futher fine-tuned for o #### Step3: Run default examples ```bash # AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v3_8b_chat_quantized -python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?" +python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?" ``` \ No newline at end of file diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py index d680c973d35..9966d665aec 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import gc import json import os from multiprocessing.connection import Client @@ -15,18 +14,19 @@ QcomChipset, ) from executorch.backends.qualcomm.utils.utils import ( - canonicalize_program, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, generate_qnn_executorch_option, ) +from executorch.examples.qualcomm.qaihub_scripts.utils.utils import ( + gen_pte_from_ctx_bin, + get_encoding, +) from executorch.examples.qualcomm.utils import ( setup_common_args_and_variables, SimpleADB, ) -from executorch.exir.backend.backend_api import to_backend -from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass def main(args): @@ -55,45 +55,25 @@ def main(args): is_from_context_binary=True, ) - pte_name = ( - "qaihub_llama2_7b_prompt" - if args.use_prompt_processor - else "qaihub_llama2_7b_token" - ) + if args.use_prompt_processor: + pte_name = "qaihub_llama2_7b_prompt" + last_shard_num_inputs = 4 + last_shard_num_outputs = 513 + else: + pte_name = "qaihub_llama2_7b_token" + last_shard_num_inputs = 516 + last_shard_num_outputs = 513 + if args.pre_gen_pte is None: # create custom operators as context loader bundle_programs = [ from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}") for i, target in enumerate(target_names) ] - # lower with QnnBackend - lowered_modules = [ - to_backend("QnnBackend", prog["edge_program"], compiler_specs) - for prog in bundle_programs - ] - # setup spill-fill buffer for relieving runtime memory usage - canonicalize_program(lowered_modules) - # export pte files - pte_files = [] - for i in range(len(target_names)): - print(f"pte {i} generating...") - memory_planning_pass = MemoryPlanningPass( - memory_planning_algo="greedy", - alloc_graph_input=False, - alloc_graph_output=False, - ) - pte_files.append(f"{args.artifact}/{pte_name}_{i}.pte") - with open(pte_files[-1], "wb") as file: - file.write( - lowered_modules[0].buffer( - extract_delegate_segments=True, - memory_planning=memory_planning_pass, - ) - ) - # gc for reducing host memory consuming - 
bundle_programs.pop(0) - lowered_modules.pop(0) - gc.collect() + pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))] + pte_files = gen_pte_from_ctx_bin( + args.artifact, pte_names, compiler_specs, bundle_programs + ) else: pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)] @@ -125,7 +105,16 @@ def get_logit_encoding(path_to_last_shard: str): ) output_file = "result.txt" pos_embs_file = ["freq_cos", "freq_sin"] - scale, offset = get_logit_encoding(target_names[-1]) + encoding = get_encoding( + path_to_shard=f"{args.context_binaries}/{target_names[-1]}", + compiler_specs=compiler_specs, + get_input=False, + get_output=True, + num_input=last_shard_num_inputs, + num_output=last_shard_num_outputs, + )[0] + scale = encoding["scale"][-1] + offset = encoding["offset"][-1] outputs = [] runner_args = [ *[ @@ -173,7 +162,8 @@ def post_process(): freq = (freq / scale + offset).clip(min=0, max=65535).detach() freq.to(dtype=torch.uint16).numpy().tofile(custom_files[-1]) - adb.push(files=custom_files) + if not args.skip_push: + adb.push(files=custom_files) adb.execute(custom_runner_cmd=runner_cmds) adb.pull(args.artifact, callback=post_process) if args.ip and args.port != -1: @@ -230,7 +220,7 @@ def post_process(): parser.add_argument( "--temperature", help="sampling temperature for llama2", - default=0.8, + default=0.0, type=float, ) diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp index 3aabdb93091..49782cf8789 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp @@ -36,8 +36,8 @@ DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); DEFINE_double( temperature, - 0.8f, - "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); + 0.0f, + "Temperature; Default is 0.0f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); DEFINE_int32( eval_mode, 0, @@ -75,9 +75,10 @@ int main(int argc, char** argv) { // generate tokens & store inference output std::ofstream fout(FLAGS_output_path.c_str()); - runner.generate(FLAGS_prompt, FLAGS_seq_len, [&](const std::string& piece) { - fout << piece; - }); + runner.generate( + FLAGS_prompt, "", FLAGS_seq_len, [&](const std::string& piece) { + fout << piece; + }); fout.close(); return 0; } diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py index 9e4f3caf661..bdcd7ad6a2e 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py @@ -4,30 +4,28 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import gc import json import os from multiprocessing.connection import Client -import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor - import torch from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( # noqa: F401 QcomChipset, ) + from executorch.backends.qualcomm.utils.utils import ( - canonicalize_program, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, - generate_qnn_executorch_option, +) +from executorch.examples.qualcomm.qaihub_scripts.utils.utils import ( + gen_pte_from_ctx_bin, + get_encoding, ) from executorch.examples.qualcomm.utils import ( setup_common_args_and_variables, SimpleADB, ) -from executorch.exir.backend.backend_api import to_backend -from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass def main(args): @@ -56,67 +54,33 @@ def main(args): is_from_context_binary=True, ) - pte_name = ( - "qaihub_llama3_8b_prompt" - if args.use_prompt_processor - else "qaihub_llama3_8b_token" - ) + if args.use_prompt_processor: + pte_name = "qaihub_llama3_8b_prompt" + last_shard_num_inputs = 4 + last_shard_num_outputs = 65 + custom_spill_fill = 128974848 + else: + pte_name = "qaihub_llama3_8b_token" + last_shard_num_inputs = 68 + last_shard_num_outputs = 65 + custom_spill_fill = 3932160 + if args.pre_gen_pte is None: # create custom operators as context loader bundle_programs = [ from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}") for i, target in enumerate(target_names) ] - # lower with QnnBackend - lowered_modules = [ - to_backend("QnnBackend", prog["edge_program"], compiler_specs) - for prog in bundle_programs - ] - # TODO: QNN seems to have an expected spill fill size that can be found through log. - # Find a way to set this value instead of manually go through the log to retrieve the value. 
- custom_spill_fill = 128974848 if args.use_prompt_processor else 3932160 - # setup spill-fill buffer for relieving runtime memory usage - canonicalize_program(lowered_modules, custom_buffer_size=custom_spill_fill) - # export pte files - pte_files = [] - for i in range(len(target_names)): - print(f"pte {i} generating...") - memory_planning_pass = MemoryPlanningPass( - memory_planning_algo="greedy", - alloc_graph_input=False, - alloc_graph_output=False, - ) - pte_files.append(f"{args.artifact}/{pte_name}_{i}.pte") - with open(pte_files[-1], "wb") as file: - file.write( - lowered_modules[0].buffer( - extract_delegate_segments=True, - memory_planning=memory_planning_pass, - ) - ) - # gc for reducing host memory consuming - bundle_programs.pop(0) - lowered_modules.pop(0) - gc.collect() + pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))] + pte_files = gen_pte_from_ctx_bin( + args.artifact, pte_names, compiler_specs, bundle_programs, custom_spill_fill + ) else: pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(5)] if args.compile_only: return - def get_logit_encoding(path_to_last_shard: str): - with open(f"{args.context_binaries}/{path_to_last_shard}", "rb") as f: - ctx_bin = f.read() - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), ctx_bin - ) - assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() - logits = qnn_mgr.GetGraphOutputs()[-1] - encoding = logits.GetEncodings() - qnn_mgr.Destroy() - return encoding.data["scale"].item(), encoding.data["offset"].item() - adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=args.build_folder, @@ -129,7 +93,17 @@ def get_logit_encoding(path_to_last_shard: str): ) output_file = "result.txt" pos_embs_file = ["freq_cos", "freq_sin"] - scale, offset = get_logit_encoding(target_names[-1]) + + encoding = get_encoding( + path_to_shard=f"{args.context_binaries}/{target_names[-1]}", + compiler_specs=compiler_specs, + get_input=False, + get_output=True, + num_input=last_shard_num_inputs, + num_output=last_shard_num_outputs, + )[0] + scale = encoding["scale"][-1] + offset = encoding["offset"][-1] outputs = [] runner_args = [ *[ @@ -145,6 +119,7 @@ def get_logit_encoding(path_to_last_shard: str): f"--eval_mode {0 if args.use_prompt_processor else 1}", f"--logits_scale {scale}", f"--logits_offset {-offset}", + f"--system_prompt '{args.system_prompt}'", ] runner_cmds = " ".join( [ @@ -177,7 +152,8 @@ def post_process(): freq = (freq / scale + offset).clip(min=0, max=65535).detach() freq.to(dtype=torch.uint16).numpy().tofile(custom_files[-1]) - adb.push(files=custom_files) + if not args.skip_push: + adb.push(files=custom_files) adb.execute(custom_runner_cmd=runner_cmds) adb.pull(args.artifact, callback=post_process) if args.ip and args.port != -1: @@ -234,7 +210,7 @@ def post_process(): parser.add_argument( "--temperature", help="sampling temperature for llama3", - default=0.8, + default=0.0, type=float, ) @@ -245,6 +221,13 @@ def post_process(): type=str, ) + parser.add_argument( + "--system_prompt", + help="Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. 
Default is None", + default="", + type=str, + ) + parser.add_argument( "--pre_gen_pte", help="folder path to pre-compiled ptes", diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp index d5c2208c386..aae18434c61 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp @@ -35,10 +35,14 @@ DEFINE_string(freq_sin_path, "", "Path to precomputed position embeddings"); DEFINE_string(output_path, "outputs", "Executorch inference data output path."); DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); +DEFINE_string( + system_prompt, + "", + "Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. Default is None"); DEFINE_double( temperature, - 0.8f, - "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); + 0.0f, + "Temperature; Default is 0.0f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); DEFINE_int32( eval_mode, 0, @@ -77,9 +81,11 @@ int main(int argc, char** argv) { // generate tokens & store inference output std::ofstream fout(FLAGS_output_path.c_str()); - runner.generate(FLAGS_prompt, FLAGS_seq_len, [&](const std::string& piece) { - fout << piece; - }); + runner.generate( + FLAGS_prompt, + FLAGS_system_prompt, + FLAGS_seq_len, + [&](const std::string& piece) { fout << piece; }); fout.close(); return 0; } diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index 30568b4b067..ec13cec37c5 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -50,8 +50,6 @@ Runner::Runner( const int logits_offset) : tokenizer_path_(tokenizer_path), temperature_(temperature), - bos_id_(1), - eos_id_(2), n_bos_(1), n_eos_(1), vocab_size_(QAIHUB_LLAMA_LOGITS), @@ -67,6 +65,21 @@ Runner::Runner( } ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); +// load tokenizer +#if defined(QAIHUB_LLAMA3_RUNNER) + tokenizer_ = get_tiktoken_for_llama(); + tokenizer_->load(tokenizer_path_); + eos_id_.insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); + version_ = LlamaVersion::kLlama3; +#else + tokenizer_ = std::make_unique(); + tokenizer_->load(tokenizer_path_); + version_ = LlamaVersion::kLlama2; +#endif + + bos_id_ = tokenizer_->bos_tok(); + eos_id_.insert(tokenizer_->eos_tok()); + switch (eval_mode_) { case EvalMode::kBert: io_mem_ = @@ -98,14 +111,6 @@ Error Runner::load() { ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); } -// load tokenizer -#if defined(QAIHUB_LLAMA3_RUNNER) - tokenizer_ = get_tiktoken_for_llama(); -#else - tokenizer_ = std::make_unique(); -#endif - tokenizer_->load(tokenizer_path_); - // create sampler sampler_ = std::make_unique( vocab_size_, @@ -157,6 +162,7 @@ void Runner::run_model_step(std::vector>& inputs) { // TODO: add overloaded method for on-device tokenize Error Runner::generate( const std::string& prompt, + const std::string& system_prompt, int32_t seq_len, std::function token_callback, std::function stats_callback) { @@ -185,13 +191,43 @@ Error Runner::generate( } stats_.inference_start_ms = 
util::time_in_ms(); - shouldStop_ = false; seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_; + std::string post_process_prompt; + switch (version_) { + case LlamaVersion::kLlama2: + post_process_prompt.append(prompt); + break; + case LlamaVersion::kLlama3: + if (!system_prompt.empty()) { + post_process_prompt.append( + "<|start_header_id|>system<|end_header_id|>\n\n"); + post_process_prompt.append(system_prompt); + post_process_prompt.append("<|eot_id|>\n"); + } + post_process_prompt.append( + "<|start_header_id|>user<|end_header_id|>\n\n"); + post_process_prompt.append(prompt); + post_process_prompt.append( + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"); + // tokenizer_->encode will add <|begin_of_text|> token for us. + // For now, do token call back so the output format looks the same as + // llama3 model card. + if (token_callback && eval_mode_ == EvalMode::kKVCached) { + token_callback("<|begin_of_text|>"); + } + break; + default: + ET_CHECK_MSG(false, "unsupported llama version"); + break; + } + Result> encode_res = - tokenizer_->encode(prompt, n_bos_, 0); + tokenizer_->encode(post_process_prompt, n_bos_, 0); ET_CHECK_OK_OR_RETURN_ERROR( - encode_res.error(), "failed to encode prompt %s", prompt.c_str()); + encode_res.error(), + "failed to encode prompt %s", + post_process_prompt.c_str()); std::vector prompt_tokens = encode_res.get(); int num_prompt_tokens = prompt_tokens.size(); @@ -264,11 +300,7 @@ Error Runner::generate( token_callback(piece_res.get().c_str()); } - if (shouldStop_) { - break; - } - - if (pos >= num_prompt_tokens && cur_token == eos_id_) { + if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) { ET_LOG(Info, "\nReached to the end of generation"); break; } @@ -367,10 +399,6 @@ std::string statsToJsonString(const Runner::Stats& stats) { } } // namespace -void Runner::stop() { - shouldStop_ = true; -} - std::vector> Runner::get_methods_meta() { std::vector> methods_meta; methods_meta.reserve(modules_.size()); diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index f0b644071bb..b9849a21327 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -68,6 +68,7 @@ class Runner { Error load(); Error generate( const std::string& prompt, + const std::string& system_prompt, int32_t seq_len, std::function token_callback = {}, std::function stats_callback = {}); @@ -81,11 +82,16 @@ class Runner { kUnsupported, }; + enum LlamaVersion { + kLlama2 = 0, + kLlama3, + }; + int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); void run_model_step(std::vector>& inputs); // metadata - const int32_t bos_id_; - const int32_t eos_id_; + int32_t bos_id_; + std::unordered_set eos_id_; const int32_t n_bos_; const int32_t n_eos_; const int32_t vocab_size_; @@ -96,11 +102,11 @@ class Runner { float temperature_; std::unique_ptr tokenizer_; std::unique_ptr sampler_; - bool shouldStop_{false}; Stats stats_; std::unique_ptr io_mem_; const float logits_scale_; const int32_t logits_offset_; + LlamaVersion version_; }; } // namespace executor diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md index 21b3370df70..3b5a74c8238 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/README.md @@ -29,7 +29,7 @@ sh 
examples/qualcomm/qaihub_scripts/stable_diffusion/install_requirements.sh #### Step4: Run default example In this example, we execute the script for 20 time steps with the `prompt` 'a photo of an astronaut riding a horse on mars': ```bash -python examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py -a ${ARTIFACTS} -b build_android -m ${SOC_MODEL} --s ${SERIAL_NUM} --text_encoder_bin ${PATH_TO_TEXT_ENCODER_CONTEXT_BINARY} --unet_bin ${PATH_TO_UNET_CONTEXT_BINARY} --vae_bin ${PATH_TO_VAE_CONTEXT_BINARY} --vocab_json ${PATH_TO_VOCAB_JSON_FILE} --num_time_steps 20 --prompt "a photo of an astronaut riding a horse on mars" +python examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py -b build-android -m ${SOC_MODEL} --s ${SERIAL_NUM} --text_encoder_bin ${PATH_TO_TEXT_ENCODER_CONTEXT_BINARY} --unet_bin ${PATH_TO_UNET_CONTEXT_BINARY} --vae_bin ${PATH_TO_VAE_CONTEXT_BINARY} --vocab_json ${PATH_TO_VOCAB_JSON_FILE} --num_time_steps 20 --prompt "a photo of an astronaut riding a horse on mars" ``` - Please replace `${PATH_TO_TEXT_ENCODER_CONTEXT_BINARY}`, `${PATH_TO_UNET_CONTEXT_BINARY}`, and `${PATH_TO_VAE_CONTEXT_BINARY}` with the actual paths to your AI Hub context binary files. - Please replace `${PATH_TO_VOCAB_JSON_FILE}` with the actual path to your vocab.json file. diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py index 862db31f174..64393fddfee 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py @@ -4,12 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import gc import json import os from multiprocessing.connection import Client -import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor import numpy as np import piq import torch @@ -18,23 +16,24 @@ from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( QcomChipset, ) + from executorch.backends.qualcomm.utils.utils import ( - canonicalize_program, from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, - generate_qnn_executorch_option, ) from executorch.examples.qualcomm.qaihub_scripts.stable_diffusion.stable_diffusion_lib import ( StableDiffusion, ) +from executorch.examples.qualcomm.qaihub_scripts.utils.utils import ( + gen_pte_from_ctx_bin, + get_encoding, +) from executorch.examples.qualcomm.utils import ( setup_common_args_and_variables, SimpleADB, ) -from executorch.exir.backend.backend_api import to_backend -from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from PIL import Image from torchvision.transforms import ToTensor @@ -54,42 +53,6 @@ def get_quant_data( return quant_data.to(dtype=torch.uint16) -def get_encoding( - path_to_shard: str, - compiler_specs: str, - get_input: bool, - get_output: bool, - num_input: int, - num_output: int, -): - encoding_list = [] - with open(path_to_shard, "rb") as f: - ctx_bin = f.read() - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), ctx_bin - ) - assert qnn_mgr.Init().value == 0, "failed to load context binary" - qnn_mgr.AllocateTensor() - if get_input: - encoding_input = {"scale": [], "offset": []} - for i in range(num_input): - inputs = qnn_mgr.GetGraphInputs()[i] - encoding = inputs.GetEncodings() - encoding_input["scale"].append(encoding.data["scale"].item()) - encoding_input["offset"].append(encoding.data["offset"].item()) - encoding_list.append(encoding_input) - if get_output: - encoding_output = {"scale": [], "offset": []} - for i in range(num_output): - outputs = qnn_mgr.GetGraphOutputs()[i] - encoding = outputs.GetEncodings() - encoding_output["scale"].append(encoding.data["scale"].item()) - encoding_output["offset"].append(encoding.data["offset"].item()) - encoding_list.append(encoding_output) - qnn_mgr.Destroy() - return encoding_list - - def get_encodings( path_to_shard_encoder: str, path_to_shard_unet: str, @@ -248,44 +211,6 @@ def save_result(output_image): print(f"Output image saved at {save_path}") -def gen_pte_from_ctx_bin(args, compiler_specs): - # Create custom operators as context loader - bundle_programs = [ - from_context_binary(args.text_encoder_bin, "ctx_loader_0"), - from_context_binary(args.unet_bin, "ctx_loader_1"), - from_context_binary(args.vae_bin, "ctx_loader_2"), - ] - - # Lower with QnnBackend - lowered_modules = [ - to_backend("QnnBackend", prog["edge_program"], compiler_specs) - for prog in bundle_programs - ] - # Setup spill-fill buffer for relieving runtime memory usage - canonicalize_program(lowered_modules) - # export pte files - pte_files = [] - for target_name in target_names: - memory_planning_pass = MemoryPlanningPass( - memory_planning_algo="greedy", - alloc_graph_input=False, - alloc_graph_output=False, - ) - pte_files.append(f"{args.artifact}/{args.pte_prefix}_{target_name}.pte") - with open(pte_files[-1], "wb") as file: - file.write( - lowered_modules[0].buffer( - extract_delegate_segments=True, memory_planning=memory_planning_pass - ) - ) - # GC for reducing host memory consuming - bundle_programs.pop(0) - lowered_modules.pop(0) - gc.collect() 
- - return pte_files - - def inference(args, compiler_specs, pte_files): # Loading a pretrained EulerDiscreteScheduler from the https://huggingface.co/stabilityai/stable-diffusion-2-1-base. scheduler = EulerDiscreteScheduler.from_pretrained( @@ -408,7 +333,8 @@ def inference(args, compiler_specs, pte_files): file.write(flattened_tensor.numpy().tobytes()) files.append(os.path.join(args.artifact, "latents.raw")) - adb.push(inputs=input_unet, input_list=input_list_unet, files=files) + if not args.skip_push: + adb.push(inputs=input_unet, input_list=input_list_unet, files=files) adb.execute(custom_runner_cmd=qnn_executor_runner_args) output_image = [] @@ -442,7 +368,16 @@ def main(args): ) if args.pre_gen_pte is None: - pte_files = gen_pte_from_ctx_bin(args, compiler_specs) + # Create custom operators as context loader + bundle_programs = [ + from_context_binary(args.text_encoder_bin, "ctx_loader_0"), + from_context_binary(args.unet_bin, "ctx_loader_1"), + from_context_binary(args.vae_bin, "ctx_loader_2"), + ] + pte_names = [f"{args.pte_prefix}_{target_name}" for target_name in target_names] + pte_files = gen_pte_from_ctx_bin( + args.artifact, pte_names, compiler_specs, bundle_programs + ) assert ( len(pte_files) == 3 ), f"Error: Expected 3 PTE files, but got {len(pte_files)} files." diff --git a/examples/qualcomm/qaihub_scripts/utils/README.md b/examples/qualcomm/qaihub_scripts/utils/README.md index facc1da76e8..61f465f3926 100644 --- a/examples/qualcomm/qaihub_scripts/utils/README.md +++ b/examples/qualcomm/qaihub_scripts/utils/README.md @@ -20,7 +20,7 @@ If users are interested in well-known applications, [Qualcomm AI HUB](https://ai ### Dependencies * Register for Qualcomm AI HUB. -* Download the corresponding QNN SDK via shit [link](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) which your favorite model is compiled with. Ths link will automatically download the latest version at this moment (users should be able to specify version soon, please refer to [this](../../../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md#software) for earlier releases). +* Download the corresponding QNN SDK via [link](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) which your favorite model is compiled with. Ths link will automatically download the latest version at this moment (users should be able to specify version soon, please refer to [this](../../../../docs/source/build-run-qualcomm-ai-engine-direct-backend.md#software) for earlier releases). ### Target Model diff --git a/examples/qualcomm/qaihub_scripts/utils/utils.py b/examples/qualcomm/qaihub_scripts/utils/utils.py new file mode 100644 index 00000000000..67d519a688e --- /dev/null +++ b/examples/qualcomm/qaihub_scripts/utils/utils.py @@ -0,0 +1,87 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import gc + +import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor + +from executorch.backends.qualcomm.utils.utils import ( + canonicalize_program, + generate_qnn_executorch_option, +) +from executorch.exir.backend.backend_api import to_backend +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass + + +def get_encoding( + path_to_shard: str, + compiler_specs: str, + get_input: bool, + get_output: bool, + num_input: int, + num_output: int, +): + encoding_list = [] + with open(path_to_shard, "rb") as f: + ctx_bin = f.read() + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), ctx_bin + ) + assert qnn_mgr.Init().value == 0, "failed to load context binary" + qnn_mgr.AllocateTensor() + if get_input: + encoding_input = {"scale": [], "offset": []} + for i in range(num_input): + inputs = qnn_mgr.GetGraphInputs()[i] + encoding = inputs.GetEncodings() + encoding_input["scale"].append(encoding.data["scale"].item()) + encoding_input["offset"].append(encoding.data["offset"].item()) + encoding_list.append(encoding_input) + if get_output: + encoding_output = {"scale": [], "offset": []} + for i in range(num_output): + outputs = qnn_mgr.GetGraphOutputs()[i] + encoding = outputs.GetEncodings() + encoding_output["scale"].append(encoding.data["scale"].item()) + encoding_output["offset"].append(encoding.data["offset"].item()) + encoding_list.append(encoding_output) + qnn_mgr.Destroy() + return encoding_list + + +def gen_pte_from_ctx_bin( + artifact, pte_names, compiler_specs, bundle_programs, custom_spill_fill=None +): + + # Lower with QnnBackend + lowered_modules = [ + to_backend("QnnBackend", prog["edge_program"], compiler_specs) + for prog in bundle_programs + ] + # Setup spill-fill buffer for relieving runtime memory usage + canonicalize_program(lowered_modules, custom_buffer_size=custom_spill_fill) + # export pte files + pte_files = [] + for pte_name in pte_names: + print(f"{pte_name} generating...") + memory_planning_pass = MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=False, + alloc_graph_output=False, + ) + pte_files.append(f"{artifact}/{pte_name}.pte") + with open(pte_files[-1], "wb") as file: + file.write( + lowered_modules[0].buffer( + extract_delegate_segments=True, memory_planning=memory_planning_pass + ) + ) + # GC for reducing host memory consuming + bundle_programs.pop(0) + lowered_modules.pop(0) + gc.collect() + + return pte_files diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 20641c6dc84..ef21892f96c 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -353,7 +353,7 @@ def setup_common_args_and_variables(): parser.add_argument( "-b", "--build_folder", - help="path to cmake binary directory for android, e.g., /path/to/cmake-out-android", + help="path to cmake binary directory for android, e.g., /path/to/build-android", type=str, required=True, ) @@ -418,6 +418,13 @@ def setup_common_args_and_variables(): action="store_true", ) + parser.add_argument( + "--skip_push", + help="If specified, skip pushing files to device.", + action="store_true", + default=False, + ) + # QNN_SDK_ROOT might also be an argument, but it is used in various places. # So maybe it's fine to just use the environment. 
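
Both AI Hub example scripts (Llama 3 and Stable Diffusion) now delegate PTE generation and encoding extraction to the shared `qaihub_scripts/utils/utils.py` module introduced above. A condensed sketch of the intended call pattern; the shard paths, artifact directory, and `compiler_specs` are placeholders here, since the real scripts build them from CLI arguments and `generate_qnn_executorch_compiler_spec`:

```python
from executorch.backends.qualcomm.utils.utils import from_context_binary
from executorch.examples.qualcomm.qaihub_scripts.utils.utils import (
    gen_pte_from_ctx_bin,
    get_encoding,
)

# Placeholder shard paths and a previously prepared `compiler_specs`.
ctx_bins = ["shard_0.bin", "shard_1.bin"]
bundle_programs = [
    from_context_binary(path, f"ctx_loader_{i}") for i, path in enumerate(ctx_bins)
]

# Lower every shard and write one .pte file per shard into ./artifacts.
pte_files = gen_pte_from_ctx_bin(
    "./artifacts", ["model_0", "model_1"], compiler_specs, bundle_programs
)

# Read the output quantization parameters of the last shard (e.g. for logits).
encoding = get_encoding(
    path_to_shard=ctx_bins[-1],
    compiler_specs=compiler_specs,
    get_input=False,
    get_output=True,
    num_input=0,
    num_output=1,
)[0]
scale, offset = encoding["scale"][-1], encoding["offset"][-1]
```
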
if "QNN_SDK_ROOT" not in os.environ: From 7b3549bf6aabfbd65f22f3079b86f81bd7c7cf5d Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:55:02 -0700 Subject: [PATCH 110/531] Propagate mul optimizations from D61504544/D61560825/D61560826 to add/sub/div (#4966) These diffs only updated the optimized mul operator; make the corresponding changes to the other binary arithmetic operators. Differential Revision: D61577411 Pull Request resolved: https://github.com/pytorch/executorch/pull/4816 --------- Co-authored-by: Scott Wolchok --- kernels/optimized/cpu/binary_ops.h | 91 ++++++++++++++++++++ kernels/optimized/cpu/op_add.cpp | 74 +++++++++++++++- kernels/optimized/cpu/op_div.cpp | 132 ++++++++++++++++++++++++----- kernels/optimized/cpu/op_mul.cpp | 71 +--------------- kernels/optimized/cpu/op_sub.cpp | 110 +++++++++++++++++++++++- kernels/optimized/cpu/targets.bzl | 11 +++ kernels/test/op_add_test.cpp | 59 ++++++++++++- kernels/test/op_div_test.cpp | 6 ++ kernels/test/op_sub_test.cpp | 12 ++- 9 files changed, 465 insertions(+), 101 deletions(-) create mode 100644 kernels/optimized/cpu/binary_ops.h diff --git a/kernels/optimized/cpu/binary_ops.h b/kernels/optimized/cpu/binary_ops.h new file mode 100644 index 00000000000..01f3eed401e --- /dev/null +++ b/kernels/optimized/cpu/binary_ops.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch { +namespace executor { +namespace internal { +// NOTE: we bake ArrayRef iterators being pointers into the return +// type here because we assume that iterators are portable across +// ArrayRef copies. 
+inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s( + ArrayRef arr) { + return std::find_if( + arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; }); +} + +inline bool sizes_match_ignoring_leading_1s( + ArrayRef lhs, + ArrayRef rhs) { + auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs); + auto lhs_end = lhs.end(); + + auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs); + auto rhs_end = rhs.end(); + + return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) && + std::equal(lhs_begin, lhs_end, rhs_begin); +} +} // namespace internal + +enum class ElementwiseOptimizedPath { + kNone, + kTreatAs1d, + kBroadcast2dBy1d, + kBroadcast2dBy1dReverseArguments, +}; + +namespace internal { +inline ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path( + const Tensor& lhs, + const Tensor& rhs) { + auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes()); + auto lhs_end = lhs.sizes().end(); + + auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes()); + auto rhs_end = rhs.sizes().end(); + + const auto lhs_size = lhs_end - lhs_begin; + const auto rhs_size = rhs_end - rhs_begin; + if (lhs_size == 2 && rhs_size == 1 && lhs_begin[1] == rhs_begin[0]) { + return ElementwiseOptimizedPath::kBroadcast2dBy1d; + } + + if (lhs_size == 1 && rhs_size == 2 && rhs_begin[1] == lhs_begin[0]) { + return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments; + } + + return ElementwiseOptimizedPath::kNone; +} +} // namespace internal + +ElementwiseOptimizedPath inline select_optimized_path( + const Tensor& a, + const Tensor& b, + const Tensor& out) { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half) { + return ElementwiseOptimizedPath::kNone; + } + if (a.sizes().equals(b.sizes()) || + (a.numel() == b.numel() && + (a.numel() == out.numel() || + internal::sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))) { + return ElementwiseOptimizedPath::kTreatAs1d; + } + return internal::select_broadcast_2d_by_1d_optimized_path(a, b); +} + +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index b62c3b154fa..a2a05891e54 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include @@ -81,8 +82,41 @@ Tensor& opt_add_out( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes()) && - a_type != ScalarType::Half) { + if (b.numel() == 1) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + auto error = resize_tensor(out, a.sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REALB_TYPES(a_type, ctx, "add.out", CTYPE, [&]() { + ET_SWITCH_REALB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { + CTYPE alpha_val; + ET_KERNEL_CHECK( + ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + CTYPE_B b_val = *b.const_data_ptr(); + CTYPE b_casted = static_cast(b_val); + + using Vec = executorch::vec::Vectorized; + executorch::vec::map( + [alpha_val, b_casted](Vec x) { + return x + Vec(alpha_val * b_casted); + }, + out.mutable_data_ptr(), + a.const_data_ptr(), + out.numel()); + }); + }); + return out; + } + } else if (a.numel() == 1) { + return opt_add_out(ctx, b, a, alpha, out); + } + + auto selected_optimized_path = select_optimized_path(a, b, out); + if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { // Resize for dynamic shape auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( @@ -105,6 +139,42 @@ Tensor& opt_add_out( b.const_data_ptr(), out.numel()); }); + } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) { + const Tensor* lhs; + const Tensor* rhs; + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { + lhs = &b; + rhs = &a; + } else { + // Catch failure to update logic when adding new broadcasting possibility. + ET_DCHECK( + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1d); + lhs = &a; + rhs = &b; + } + auto error = resize_tensor(out, lhs->sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REALB_TYPES(out_type, ctx, "add.out", CTYPE, [&]() { + CTYPE alpha_val; + ET_KERNEL_CHECK( + ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + + using Vec = executorch::vec::Vectorized; + executorch::vec::broadcasting_map_2d_by_1d( + [alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; }, + out.mutable_data_ptr(), + lhs->const_data_ptr(), + rhs->const_data_ptr(), + lhs->sizes()[lhs->dim() - 2], + lhs->sizes()[lhs->dim() - 1]); + }); } else { ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index cdd156910b4..ae3fd7b943c 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include @@ -48,7 +49,57 @@ Tensor& opt_div_out( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes())) { + if (a.numel() == 1 || b.numel() == 1) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + const Tensor* tensor; + const Tensor* scalar; + ScalarType tensor_type; + ScalarType scalar_type; + if (a.numel() == 1) { + tensor = &b; + tensor_type = b_type; + scalar = &a; + scalar_type = a_type; + } else { + tensor = &a; + tensor_type = a_type; + scalar = &b; + scalar_type = b_type; + } + auto error = resize_tensor(out, tensor->sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REALB_TYPES(tensor_type, ctx, "div.out", CTYPE, [&]() { + ET_SWITCH_REALB_TYPES(scalar_type, ctx, "div.out", CTYPE_SCALAR, [&]() { + CTYPE_SCALAR scalar_val = *scalar->const_data_ptr(); + CTYPE scalar_casted = static_cast(scalar_val); + + using Vec = executorch::vec::Vectorized; + if (a.numel() == 1) { + executorch::vec::map( + [scalar_casted](Vec x) { return Vec(scalar_casted) / x; }, + out.mutable_data_ptr(), + tensor->const_data_ptr(), + out.numel()); + } else { + executorch::vec::map( + [scalar_casted](Vec x) { return x / Vec(scalar_casted); }, + out.mutable_data_ptr(), + tensor->const_data_ptr(), + out.numel()); + } + }); + }); + return out; + } + } + + auto selected_optimized_path = select_optimized_path(a, b, out); + if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { // Resize for dynamic shape auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( @@ -67,6 +118,49 @@ Tensor& opt_div_out( b.const_data_ptr(), out.numel()); }); + } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) { + const Tensor* lhs; + const Tensor* rhs; + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { + lhs = &b; + rhs = &a; + } else { + // Catch failure to update logic when subing new broadcasting possibility. 
+ ET_DCHECK( + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1d); + lhs = &a; + rhs = &b; + } + auto error = resize_tensor(out, lhs->sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() { + using Vec = executorch::vec::Vectorized; + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { + executorch::vec::broadcasting_map_2d_by_1d( + [](Vec x, Vec y) { return y / x; }, + out.mutable_data_ptr(), + lhs->const_data_ptr(), + rhs->const_data_ptr(), + lhs->sizes()[lhs->dim() - 2], + lhs->sizes()[lhs->dim() - 1]); + } else { + executorch::vec::broadcasting_map_2d_by_1d( + [](Vec x, Vec y) { return x / y; }, + out.mutable_data_ptr(), + lhs->const_data_ptr(), + rhs->const_data_ptr(), + lhs->sizes()[lhs->dim() - 2], + lhs->sizes()[lhs->dim() - 1]); + } + }); } else { ScalarType common_type = get_compute_type(a_type, b_type); ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); @@ -77,25 +171,23 @@ Tensor& opt_div_out( InvalidArgument, out); - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES_AND( - Bool, common_type, ctx, "div.out", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES_AND( - Bool, out_type, ctx, "div.out", CTYPE_OUT, [&]() { - apply_binary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted / b_casted; - - return static_cast(value); - }, - a, - b, - out); - }); - }); + ET_SWITCH_REALB_TYPES(a_type, ctx, "div.out", CTYPE_A, [&]() { + ET_SWITCH_REALB_TYPES(b_type, ctx, "div.out", CTYPE_B, [&]() { + ET_SWITCH_REALB_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() { + ET_SWITCH_REALB_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = a_casted / b_casted; + + return static_cast(value); + }, + a, + b, + out); + }); + }); }); }); } diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 38f99c62536..3b93870a610 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -22,76 +23,6 @@ using ScalarType = exec_aten::ScalarType; namespace { -// NOTE: we bake ArrayRef iterators being pointers into the return -// type here because we assume that iterators are portable across -// ArrayRef copies. 
-const Tensor::SizesType* arrayref_begin_ignoring_leading_1s( - ArrayRef arr) { - return std::find_if( - arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; }); -} - -bool sizes_match_ignoring_leading_1s( - ArrayRef lhs, - ArrayRef rhs) { - auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs); - auto lhs_end = lhs.end(); - - auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs); - auto rhs_end = rhs.end(); - - return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) && - std::equal(lhs_begin, lhs_end, rhs_begin); -} - -// Move to generic util as this is applicable to all binary ops -enum class ElementwiseOptimizedPath { - kNone, - kTreatAs1d, - kBroadcast2dBy1d, - kBroadcast2dBy1dReverseArguments, -}; - -ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path( - const Tensor& lhs, - const Tensor& rhs) { - auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes()); - auto lhs_end = lhs.sizes().end(); - - auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes()); - auto rhs_end = rhs.sizes().end(); - - const auto lhs_size = lhs_end - lhs_begin; - const auto rhs_size = rhs_end - rhs_begin; - if (lhs_size == 2 && rhs_size == 1 && lhs_begin[1] == rhs_begin[0]) { - return ElementwiseOptimizedPath::kBroadcast2dBy1d; - } - - if (lhs_size == 1 && rhs_size == 2 && rhs_begin[1] == lhs_begin[0]) { - return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments; - } - - return ElementwiseOptimizedPath::kNone; -} - -ElementwiseOptimizedPath -select_optimized_path(const Tensor& a, const Tensor& b, const Tensor& out) { - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType out_type = out.scalar_type(); - - if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half) { - return ElementwiseOptimizedPath::kNone; - } - if (a.sizes().equals(b.sizes()) || - (a.numel() == b.numel() && - (a.numel() == out.numel() || - sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))) { - return ElementwiseOptimizedPath::kTreatAs1d; - } - return select_broadcast_2d_by_1d_optimized_path(a, b); -} - template < bool can_cast, typename CTYPE_A, diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 87368f3ed76..252bee8aee8 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include @@ -83,9 +84,64 @@ Tensor& opt_sub_out( ScalarType out_type = out.scalar_type(); ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); + if (a.numel() == 1 || b.numel() == 1) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + const Tensor* tensor; + const Tensor* scalar; + ScalarType tensor_type; + ScalarType scalar_type; + if (a.numel() == 1) { + tensor = &b; + tensor_type = b_type; + scalar = &a; + scalar_type = a_type; + } else { + tensor = &a; + tensor_type = a_type; + scalar = &b; + scalar_type = b_type; + } + auto error = resize_tensor(out, tensor->sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REAL_TYPES(tensor_type, ctx, "sub.out", CTYPE, [&]() { + ET_SWITCH_REAL_TYPES(scalar_type, ctx, "sub.out", CTYPE_SCALAR, [&]() { + CTYPE alpha_val; + ET_KERNEL_CHECK( + ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + CTYPE_SCALAR scalar_val = *scalar->const_data_ptr(); + CTYPE scalar_casted = static_cast(scalar_val); - if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes()) && - a_type != ScalarType::Half) { + using Vec = executorch::vec::Vectorized; + if (a.numel() == 1) { + executorch::vec::map( + [alpha_val, scalar_casted](Vec x) { + return Vec(scalar_casted) - Vec(alpha_val) * x; + }, + out.mutable_data_ptr(), + tensor->const_data_ptr(), + out.numel()); + } else { + executorch::vec::map( + [alpha_val, scalar_casted](Vec x) { + return x - Vec(alpha_val * scalar_casted); + }, + out.mutable_data_ptr(), + tensor->const_data_ptr(), + out.numel()); + } + }); + }); + } + return out; + } + + auto selected_optimized_path = select_optimized_path(a, b, out); + if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { // Resize for dynamic shape auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( @@ -95,7 +151,7 @@ Tensor& opt_sub_out( out, "Failed to resize output tensor."); - ET_SWITCH_REAL_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() { + ET_SWITCH_REAL_TYPES(a_type, ctx, "sub.out", CTYPE, [&]() { CTYPE alpha_val; ET_KERNEL_CHECK( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); @@ -108,6 +164,53 @@ Tensor& opt_sub_out( b.const_data_ptr(), out.numel()); }); + } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) { + const Tensor* lhs; + const Tensor* rhs; + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { + lhs = &b; + rhs = &a; + } else { + // Catch failure to update logic when subing new broadcasting possibility. 
+ ET_DCHECK( + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1d); + lhs = &a; + rhs = &b; + } + auto error = resize_tensor(out, lhs->sizes()); + ET_KERNEL_CHECK_MSG( + ctx, + error == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + ET_SWITCH_REAL_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() { + CTYPE alpha_val; + ET_KERNEL_CHECK( + ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + + using Vec = executorch::vec::Vectorized; + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { + executorch::vec::broadcasting_map_2d_by_1d( + [alpha_val](Vec x, Vec y) { return y - Vec(alpha_val) * x; }, + out.mutable_data_ptr(), + lhs->const_data_ptr(), + rhs->const_data_ptr(), + lhs->sizes()[lhs->dim() - 2], + lhs->sizes()[lhs->dim() - 1]); + } else { + executorch::vec::broadcasting_map_2d_by_1d( + [alpha_val](Vec x, Vec y) { return x - Vec(alpha_val) * y; }, + out.mutable_data_ptr(), + lhs->const_data_ptr(), + rhs->const_data_ptr(), + lhs->sizes()[lhs->dim() - 2], + lhs->sizes()[lhs->dim() - 1]); + } + }); } else { ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); @@ -128,6 +231,7 @@ Tensor& opt_sub_out( CTYPE_IN alpha_val; ET_KERNEL_CHECK( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + SubInner< can_cast::value, CTYPE_A, diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index d62568f3130..e7bb2d36bf4 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -5,6 +5,7 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_add", deps = [ + ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], @@ -18,6 +19,7 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_div", deps = [ + ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], @@ -53,6 +55,7 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_mul", deps = [ + ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], @@ -68,6 +71,7 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_sub", deps = [ + ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], @@ -90,6 +94,13 @@ def define_common_targets(): aten_op_targets = [":{}".format(op["name"]) for op in enabled_ops] all_op_targets = aten_op_targets + runtime.cxx_library( + name = "binary_ops", + exported_headers = ["binary_ops.h"], + visibility = ["//executorch/kernels/optimized/cpu/..."], + exported_deps = ["//executorch/runtime/core:core"], + ) + runtime.cxx_library( name = "cpu_optimized", srcs = [], diff --git a/kernels/test/op_add_test.cpp b/kernels/test/op_add_test.cpp index 33de8153a31..79a58a0c7ce 100644 --- a/kernels/test/op_add_test.cpp +++ b/kernels/test/op_add_test.cpp @@ -288,6 +288,59 @@ TEST_F(OpAddOutKernelTest, BroadcastSupported) { EXPECT_TENSOR_EQ(out, tf.ones({5, 2, 3, 4})); } +TEST_F(OpAddOutKernelTest, BroadcastOneElementTensor) { + TensorFactory tf; + Tensor x = tf.make({1}, {1.75}); + Tensor y = tf.make({3, 2}, {-1.5, -1, -0.5, 0, 0.5, 1.5}); + + Tensor out = tf.zeros({3, 2}); + + Tensor ret = op_add_out(x, y, 1, out); + + Tensor expected = tf.make( + {3, 2}, + { + 0.25, + 0.75, + 1.25, + 1.75, + 2.25, + 3.25, + }); + + EXPECT_TENSOR_EQ(out, expected); + + out = op_add_out(y, x, 1, out); + 
EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpAddOutKernelTest, BroadcastOneElementTensorTypePromotion) { + TensorFactory tf; + TensorFactory tfDouble; + Tensor x = tfDouble.make({1}, {1.75}); + Tensor y = tf.make({3, 2}, {-1.5, -1, -0.5, 0, 0.5, 1.5}); + + Tensor out = tfDouble.zeros({3, 2}); + + Tensor ret = op_add_out(x, y, 1, out); + + Tensor expected = tfDouble.make( + {3, 2}, + { + 0.25, + 0.75, + 1.25, + 1.75, + 2.25, + 3.25, + }); + + EXPECT_TENSOR_EQ(out, expected); + + out = op_add_out(y, x, 1, out); + EXPECT_TENSOR_EQ(out, expected); +} + // // Death Tests // @@ -355,15 +408,15 @@ TEST_F(OpAddOutKernelTest, BoolOutputWithIntegralInput) { ET_EXPECT_KERNEL_FAILURE(context_, op_add_out(a, b, /*alpha=*/1, out)); } -TEST_F(OpAddOutKernelTest, MismatchedInputShapesDies) { +TEST_F(OpAddOutKernelTest, MismatchedNonBroadcastableInputShapesDies) { TensorFactory tf; // Addends with different shapes. - Tensor a = tf.ones(/*sizes=*/{4}); + Tensor a = tf.ones(/*sizes=*/{4, 2}); Tensor b = tf.ones(/*sizes=*/{2, 2}); // Destination for the sum; matches the shape of one of the inputs. - Tensor out = tf.zeros(/*sizes=*/{4}); + Tensor out = tf.zeros(/*sizes=*/{8}); // Adding the two mismatched tensors should cause an assertion and kill the // test process. diff --git a/kernels/test/op_div_test.cpp b/kernels/test/op_div_test.cpp index bb302b63140..eb01893d48b 100644 --- a/kernels/test/op_div_test.cpp +++ b/kernels/test/op_div_test.cpp @@ -229,6 +229,12 @@ TEST_F(OpDivOutTest, BroadcastScalarSupported2) { Tensor ret = tf.make({3, 1, 1}, {4, 2, 1}); EXPECT_TENSOR_EQ(out, ret); + + std::swap(a, b); + out = tf.zeros({3, 1, 1}); + op_div_out(a, b, out); + ret = tf.make({3, 1, 1}, {0.25, 0.5, 1}); + EXPECT_TENSOR_EQ(out, ret); } TEST_F(OpDivOutTest, BroadcastDimSizeIsOneAB) { diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp index 210f9891d1e..4bfc22bfcbb 100644 --- a/kernels/test/op_sub_test.cpp +++ b/kernels/test/op_sub_test.cpp @@ -198,6 +198,12 @@ TEST_F(OpSubOutTest, BroadcastScalarSupported2) { Tensor ret = tf.make({3, 1, 1}, {6, 4, 0}); EXPECT_TENSOR_EQ(out, ret); + + std::swap(a, b); + out = tf.zeros({3, 1, 1}); + op_sub_out(a, b, 1, out); + ret = tf.make({3, 1, 1}, {-6, -4, 0}); + EXPECT_TENSOR_EQ(out, ret); } // @@ -265,15 +271,15 @@ TEST_F(OpSubOutTest, BoolOutputWithIntegralInput) { ET_EXPECT_KERNEL_FAILURE(context_, op_sub_out(a, b, /*alpha=*/1, out)); } -TEST_F(OpSubOutTest, MismatchedInputShapesDies) { +TEST_F(OpSubOutTest, MismatchedNonBroadcastableInputShapesDies) { TensorFactory tf; // Subtrahend and minuend with different shapes. - Tensor a = tf.ones(/*sizes=*/{4}); + Tensor a = tf.ones(/*sizes=*/{4, 2}); Tensor b = tf.ones(/*sizes=*/{2, 2}); // Destination for the subtraction; matches the shape of one of the inputs. - Tensor out = tf.zeros(/*sizes=*/{4}); + Tensor out = tf.zeros(/*sizes=*/{8}); // Performing substraction on two mismatched tensors should cause an assertion // and kill the test process. From e49d4dd8cbb743ac1a7ad0d4574b65a366aaa27e Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 29 Aug 2024 00:36:20 -0700 Subject: [PATCH 111/531] Let Module set_output_data_ptr accept an EValue. 
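
In practice the change means callers hand the forward output to the module as an `EValue` instead of a raw `Tensor` reference. A minimal sketch of the new call shape; the namespace spellings assume the `executorch::` migration from earlier in this series, and how the output `EValue` is created is deliberately left out:

```cpp
#include <executorch/extension/module/module.h>

// Sketch only: `out_value` is assumed to wrap a tensor whose memory the
// caller owns; creating it is outside the scope of this example.
executorch::runtime::Error bind_first_output(
    executorch::extension::Module& module,
    executorch::runtime::EValue out_value) {
  // The output is now passed as an EValue rather than as a Tensor&.
  return module.set_output_data_ptr(out_value, /*output_index=*/0);
}
```
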
Differential Revision: D61957435 Pull Request resolved: https://github.com/pytorch/executorch/pull/4969 --- extension/module/module.cpp | 3 ++- extension/module/module.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 6d5aedd8007..f9f1d6be0f7 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -197,8 +197,9 @@ Result> Module::execute( return outputs; } -Error Module::set_output_data_ptr(Tensor& output_tensor, size_t output_index) { +Error Module::set_output_data_ptr(EValue output_value, size_t output_index) { ET_CHECK_OK_OR_RETURN_ERROR(load_method("forward")); + auto& output_tensor = output_value.toTensor(); auto& method = methods_.at("forward").method; return method->set_output_data_ptr( output_tensor.mutable_data_ptr(), output_tensor.nbytes(), output_index); diff --git a/extension/module/module.h b/extension/module/module.h index e4fd3aa1068..689fef5cd29 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -330,13 +330,13 @@ class Module final { /** * Set output data pointer for forward method. * - * @param[in] output_tensor A Tensor for the output of 'forward' method. + * @param[in] output_value A Tensor for the output of 'forward' method. * @param[in] output_index Index of the output in 'forward' method. * * @returns An Error to indicate success or failure of the loading process. */ ::executorch::runtime::Error set_output_data_ptr( - exec_aten::Tensor& output_tensor, + ::executorch::runtime::EValue output_value, size_t output_index); private: From 9de12bc77566b1b163f839a675b778eb49b92960 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:32:39 -0400 Subject: [PATCH 112/531] Add custom op: batch box cox Differential Revision: D61934829 Pull Request resolved: https://github.com/pytorch/executorch/pull/4956 --- runtime/core/exec_aten/testing_util/targets.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/core/exec_aten/testing_util/targets.bzl b/runtime/core/exec_aten/testing_util/targets.bzl index 57764771afb..d3219c58839 100644 --- a/runtime/core/exec_aten/testing_util/targets.bzl +++ b/runtime/core/exec_aten/testing_util/targets.bzl @@ -30,6 +30,7 @@ def define_common_targets(): "//executorch/kernels/quantized/test/...", "//executorch/kernels/optimized/test/...", "//executorch/kernels/test/...", + "//executorch/kernels/fb/custom_ops/...", "//executorch/runtime/core/test/...", "//executorch/test/...", "//executorch/backends/fb/qnnpack/test/...", From db5abf6263b0550a8765b363fbf6c28ca3d173fe Mon Sep 17 00:00:00 2001 From: Esteb37 <35089867+Esteb37@users.noreply.github.com> Date: Thu, 29 Aug 2024 12:33:29 -0400 Subject: [PATCH 113/531] Add int4pack_mm shader and operator Differential Revision: D61669488 Pull Request resolved: https://github.com/pytorch/executorch/pull/4949 --- .../runtime/graph/ops/glsl/q_4w_linear.glsl | 93 +++++++++++ .../runtime/graph/ops/glsl/q_4w_linear.yaml | 16 ++ .../graph/ops/impl/QuantizedMatMul.cpp | 158 ++++++++++++++++++ .../vulkan/runtime/graph/ops/impl/Staging.cpp | 2 +- .../graph/ops/impl/utils/QPackUtils.cpp | 72 ++++++++ .../runtime/graph/ops/impl/utils/QPackUtils.h | 25 +++ backends/vulkan/test/utils/test_utils.cpp | 14 ++ backends/vulkan/test/utils/test_utils.h | 5 + .../vulkan/test/vulkan_compute_api_test.cpp | 118 +++++++++++++ 9 files changed, 502 insertions(+), 1 deletion(-) create mode 100644 
backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml create mode 100644 backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp create mode 100644 backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp create mode 100644 backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl new file mode 100644 index 00000000000..71ecf162362 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl @@ -0,0 +1,93 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#include "indexing_utils.h" + +#define PRECISION ${PRECISION} + +#define FLOAT_T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type(STORAGE)} + +${define_required_extensions(DTYPE)} +${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} +${layout_declare_tensor(2, "r", "t_mat2", "int8", STORAGE)} +${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)} + +${layout_declare_ubo(4, "ivec4", "out_sizes")} +${layout_declare_ubo(5, "ivec4", "out_strides")} +${layout_declare_ubo(6, "ivec4", "mat1_strides")} +${layout_declare_ubo(7, "ivec4", "mat2_sizes")} +${layout_declare_ubo(8, "ivec4", "mat2_strides")} +${layout_declare_ubo(9, "ivec4", "scales_strides")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 1; + +void main() { + + const ivec4 out_pos = ivec4( + gl_GlobalInvocationID.x, // n = 0..N-1 + gl_GlobalInvocationID.y, // m = 0..M-1 + gl_GlobalInvocationID.z % out_sizes.z, + gl_GlobalInvocationID.z / out_sizes.z); + + if (any(greaterThanEqual(out_pos, out_sizes))) { + return; + } + + const uint K = mat2_sizes.x * 2; + const uint N = mat2_sizes.y; + const uint n = out_pos.x; + const uint m = out_pos.y; + const uint k_block = (K + group_size - 1) / group_size; + const uint mask = uint(0x0f); + ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); + ivec4 zero_pos = ivec4(0, n, 1, out_pos.w); + + float rc = 0.0; + int k = 0; + + for (int kb = 0; kb < k_block; kb++) { + scale_pos.x = kb; + const int scale_id = to_buffer_id(scale_pos, scales_strides); + const float scale = float(t_scales_and_zeros[scale_id]); + + zero_pos.x = kb; + const int zero_id = to_buffer_id(zero_pos, scales_strides); + const float zero = float(t_scales_and_zeros[zero_id]) - scale * 8.0; + + for(uint idx = 0; idx < group_size && k < K; idx++, k++) { + mat1_pos.x = k; + const int mat1_id = to_buffer_id(mat1_pos, mat1_strides); + const float mat1_val = float(t_mat1[mat1_id]); + + mat2_pos.x = k / 2; + const int mat2_id = to_buffer_id(mat2_pos, mat2_strides); + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (t_mat2[mat2_id] & 0xFF); + mat2_val = (k & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); + + rc += mat1_val * (scale * float(mat2_val) + zero); + } + } + + const int out_id = to_buffer_id(out_pos, out_strides); + t_out[out_id] = FLOAT_T(rc); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml new file mode 100644 index 00000000000..a3585c998e8 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q_4w_linear: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: q_4w_linear diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp new file mode 100644 index 00000000000..b2796d26dd1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace vkcompute { + +void check_q_matmul_args( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size_data, + const ValueRef scales_and_zeros, + const ValueRef out) { + const std::vector mat1_sizes = graph.sizes_of(mat1); + const std::vector mat2_sizes = graph.sizes_of(mat2_data); + const std::vector scales_and_zeros_sizes = + graph.sizes_of(scales_and_zeros); + + const uint32_t group_size = graph.extract_scalar(group_size_data); + + VK_CHECK_COND(mat1_sizes.size() == 2); + VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); + + VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + + const int mat1_K = utils::val_at(-1, mat1_sizes); + const int mat2_K = utils::val_at(-1, mat2_sizes) * 2; + const int N = utils::val_at(-2, mat2_sizes); + + VK_CHECK_COND(mat1_K == mat2_K); + + VK_CHECK_COND(mat2_K % group_size == 0); + + const uint32_t k_groups = mat2_K / group_size; + + VK_CHECK_COND(scales_and_zeros_sizes.size() == 3); + VK_CHECK_COND(utils::val_at(-1, scales_and_zeros_sizes) == k_groups); + VK_CHECK_COND(utils::val_at(-2, scales_and_zeros_sizes) == N); + VK_CHECK_COND(utils::val_at(-3, scales_and_zeros_sizes) == 2); + + // Match https://fburl.com/code/6ostkknm + std::vector valid_group_sizes = {32, 64, 128, 256}; + + bool is_valid_group_size = false; + for (auto valid_group_size : valid_group_sizes) { + if (group_size == valid_group_size) { + is_valid_group_size = true; + break; + } + } + + VK_CHECK_COND(is_valid_group_size); +} + +void resize_q_matmul_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); + + const int out_cols = utils::val_at(-2, mat1->sizes()); + const int out_rows = utils::val_at(-2, mat2->sizes()); + + std::vector new_out_sizes(3); + if (mat1->sizes().size() == 2) { + new_out_sizes.resize(2); + new_out_sizes.at(0) = out_cols; + new_out_sizes.at(1) = out_rows; + } else { 
+ new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = out_cols; + new_out_sizes.at(2) = out_rows; + } + + out->virtual_resize(new_out_sizes); +} + +void add_q_matmul_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros_data, + const ValueRef out) { + ValueRef mat2 = + prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + ValueRef scales_and_zeros = + prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); + + std::string kernel_name = "q_4w_linear"; + + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + const uint32_t group_size_val = graph.extract_scalar(group_size); + + vkapi::ParamsBindList ubos({}); + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.strides_ubo(out)); + ubos.append(graph.strides_ubo(mat1)); + ubos.append(graph.sizes_ubo(mat2)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + + auto out_sizes = graph.sizes_of(out); + uint32_t N = utils::val_at(-1, out_sizes); + uint32_t M = utils::val_at(-2, out_sizes); + + utils::uvec3 global_wg_size = {N, M, 1}; + + utils::uvec3 local_wg_size = adaptive_work_group_size(global_wg_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2, scales_and_zeros}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + ubos, + // Specialization Constants + {SV(group_size_val)}, + // Resizing Logic + resize_q_matmul_node, + {})); +} + +void int4pack_mm(ComputeGraph& graph, const std::vector& args) { + check_q_matmul_args(graph, args[0], args[1], args[2], args[3], args[4]); + return add_q_matmul_node( + graph, + args[0], // mat1 + args[1], // mat2 + args[2], // group_size + args[3], // scales_and_zeros + args[4] // out + ); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten._weight_int4pack_mm.default, int4pack_mm); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index c40d57c8b52..9df5b73c1a1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -136,7 +136,7 @@ ValueRef prepack_buffer( ComputeGraph& graph, const ValueRef vref, const utils::GPUMemoryLayout layout) { - ValueRef v = graph.add_tensor_like(vref, layout); + ValueRef v = graph.add_tensor_like(vref, utils::kBuffer, layout); vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR("buffer_to_buffer"); diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp new file mode 100644 index 00000000000..4cf678a9dcb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace vkcompute { + +void pack4(const uint8_t* w_ptr, uint8_t* b_ptr, uint32_t N, uint32_t K) { + for (int32_t n = 0; n < N; n++) { + for (int32_t k2 = 0; k2 < K / 2; k2++) { + uint8_t src_val0 = w_ptr[n * K + k2 * 2]; + uint8_t src_val1 = w_ptr[n * K + k2 * 2 + 1]; + b_ptr[n * (K / 2) + k2] = (uint8_t(src_val1) << 4) | uint8_t(src_val0); + } + } +} + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr) { + const int32_t N = utils::val_at(-1, W_sizes); + const int32_t K = utils::val_at(-2, W_sizes); + + const auto numel = K * N; + std::vector w_ptr_T(numel); + std::vector b_ptr(utils::div_up(numel, 2)); + + // Transpose the weights + for (int32_t k = 0; k < K; k++) { + for (int32_t n = 0; n < N; n++) { + w_ptr_T[n * K + k] = w_ptr[k * N + n]; + } + } + + // Pack two int4s into each int8 + pack4(w_ptr_T.data(), b_ptr.data(), N, K); + + return b_ptr; +} + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros) { + const int64_t N = utils::val_at(-1, W_sizes); + const int64_t K = utils::val_at(-2, W_sizes); + + std::vector w_ptr_deq(K * N); + const int k_groups = K / group_size; + const int zeros_stride = k_groups * N; + + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) { + const int kb = k / group_size; + const int scale_idx = k_groups * n + kb; + const float scale = scales_and_zeros[scale_idx]; + const float zero = + scales_and_zeros[scale_idx + zeros_stride] - scale * 8.0; + w_ptr_deq[k * N + n] = w_ptr[k * N + n] * scale + zero; + } + } + + return w_ptr_deq; +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h new file mode 100644 index 00000000000..4c4cf26d504 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace vkcompute { + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr); + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros); + +} // namespace vkcompute diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 6c056cc9d90..4a3a41d6c72 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -377,6 +377,20 @@ std::vector create_random_float_buffer( return data; } +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min, + const uint8_t max) { + std::vector data(numel); + std::default_random_engine rng; + std::uniform_real_distribution dist(min, max); + + for (size_t i = 0; i < data.size(); ++i) { + data[i] = (uint8_t)dist(rng); + } + return data; +} + void fill_vtensor( ComputeGraph& graph, const IOValueRef idx, diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index bf549446170..c8af5470862 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -144,6 +144,11 @@ std::vector create_random_float_buffer( const float min = 0, const float max = 1); +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min = 0, + const uint8_t max = 255); + void fill_vtensor( ComputeGraph& graph, const IOValueRef idx, diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index e24e2ea4e06..3d172f490cf 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -21,6 +21,8 @@ #include +#include + #include using namespace vkcompute::api; @@ -2659,3 +2661,119 @@ TEST(VulkanComputeGraphOpsTest, grid_priors_test) { /*offset = */ 0.5, /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); } + +void test_int4pack_mm(std::vector MKN, uint32_t group_size) { + GraphConfig config; + ComputeGraph graph(config); + + const uint32_t M = MKN[0]; + const uint32_t K = MKN[1]; + const uint32_t N = MKN[2]; + + const std::vector mat1_size = {M, K}; + const std::vector mat2_size = {K, N}; + const std::vector mat2_q_size = {N, K / 2}; // Transposed and packed + const std::vector out_size = {M, N}; + + std::vector A_data = create_random_float_buffer(M * K); + IOValueRef A = + graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kBuffer); + graph.copy_into_staging(A.staging, A_data.data(), A_data.size()); + + // Quantized but un-packed weights + std::vector B_quant_data = create_random_uint8_buffer(K * N, 0, 16); + + // Pack and transpose weights to correspond to int4 weight format + std::vector B_int4_data = + int4mm_pack_weights(mat2_size, B_quant_data.data()); + + IOValueRef B_int4 = + graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer); + graph.copy_into_staging( + B_int4.staging, B_int4_data.data(), B_int4_data.size()); + + const int k_groups = K / group_size; + + // Random scales and zeroes. 
Keep scales small to avoid overflow and zeroes in + // int4 range + IOValueRef scales_and_zeros = + graph.add_input_tensor({2, N, k_groups}, vkapi::kFloat, utils::kBuffer); + std::vector s_data(graph.numel_of(scales_and_zeros.value)); + const int zeros_stride = s_data.size() / 2; + for (size_t i = 0; i < zeros_stride; i++) { + s_data[i] = rand() % 100; + s_data[i + zeros_stride] = rand() % 16; + } + + graph.copy_into_staging( + scales_and_zeros.staging, s_data.data(), s_data.size()); + + IOValueRef out_int4; + out_int4.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + + VK_GET_OP_FN("aten._weight_int4pack_mm.default") + (graph, + {A.value, + B_int4.value, + graph.add_scalar(group_size), + scales_and_zeros.value, + out_int4.value}); + + out_int4.staging = graph.set_output_tensor(out_int4.value); + + // Dequantized matmul for comparison + IOValueRef B_deq = + graph.add_input_tensor(mat2_size, vkapi::kFloat, utils::kBuffer); + std::vector B_deq_data = int4mm_dequantize_weights( + mat2_size, B_quant_data.data(), group_size, s_data.data()); + graph.copy_into_staging(B_deq.staging, B_deq_data.data(), B_deq_data.size()); + + IOValueRef out_deq; + out_deq.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + + VK_GET_OP_FN("aten.mm.default") + (graph, {A.value, B_deq.value, out_deq.value}); + + out_deq.staging = graph.set_output_tensor(out_deq.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + graph.propagate_resize(); + graph.execute(); + + // Compare outputs + std::vector out_int4_data(graph.numel_of(out_int4.value)); + graph.copy_from_staging( + out_int4.staging, out_int4_data.data(), out_int4_data.size()); + + std::vector out_deq_data(graph.numel_of(out_deq.value)); + graph.copy_from_staging( + out_deq.staging, out_deq_data.data(), out_deq_data.size()); + + for (int i = 0; i < out_int4_data.size(); i++) { + CHECK_VALUE(out_int4_data, i, out_deq_data[i]); + } +} + +TEST(VulkanComputeGraphOpsTest, int4pack_mm_test) { + if (!context()->adapter_ptr()->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + + // Vector multiplication, single group per row + test_int4pack_mm({1, 32, 1}, 32); + + // Vector multiplication, multiple groups per row + test_int4pack_mm({1, 256, 1}, 64); + + // Square matrices, single group per row + test_int4pack_mm({32, 32, 32}, 32); + + // Irregular matrices, single group per row + test_int4pack_mm({37, 32, 19}, 32); + + // Irregular matrices, multiple groups per row + test_int4pack_mm({37, 256, 19}, 64); +} From 455ddaafb2b475ea90907f3e3416d7fe308586f5 Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:25:27 -0700 Subject: [PATCH 114/531] Update pytorch pin for ET Differential Revision: D61974631 Pull Request resolved: https://github.com/pytorch/executorch/pull/4971 --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 3251f4ee9a7..14422e45d7c 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -c42ac54d9e817bf0a0366eb78e6c8beba4d5eff5 +e4cd76cf8283c8ddbf95674b020fbfcff467cb4b diff --git a/install_requirements.py b/install_requirements.py index 4b7dedc0a49..1f5982c80e0 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -94,7 +94,7 @@ def python_is_compatible(): # NOTE: If a 
newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20240821" +NIGHTLY_VERSION = "dev20240829" # The pip repository that hosts nightly torch packages. TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" @@ -121,7 +121,7 @@ def python_is_compatible(): # TODO: Make each example publish its own requirements.txt EXAMPLES_REQUIREMENTS = [ "timm==1.0.7", - f"torchaudio==2.4.0.{NIGHTLY_VERSION}", + f"torchaudio==2.5.0.{NIGHTLY_VERSION}", "torchsr==1.0.4", "transformers==4.42.4", ] From 9fd8e53db646802171349078f96ad6407cf868d4 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:14:35 -0400 Subject: [PATCH 115/531] Add op: scatter.value_out Differential Revision: D61871642 Pull Request resolved: https://github.com/pytorch/executorch/pull/4957 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_scatter.cpp | 104 +++++ kernels/portable/cpu/util/index_util.cpp | 9 + kernels/portable/cpu/util/index_util.h | 7 + kernels/portable/functions.yaml | 5 + kernels/test/op_scatter_test.cpp | 362 ++++++++++++++++++ kernels/test/targets.bzl | 1 + .../kernels/portable/op_registration_util.bzl | 7 + 8 files changed, 497 insertions(+) create mode 100644 kernels/portable/cpu/op_scatter.cpp create mode 100644 kernels/test/op_scatter_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index e06830acabd..f28cfb48b36 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -321,6 +321,8 @@ - op: scalar_tensor.out +- op: scatter.value_out + - op: scatter_add.out - op: select_copy.int_out diff --git a/kernels/portable/cpu/op_scatter.cpp b/kernels/portable/cpu/op_scatter.cpp new file mode 100644 index 00000000000..9696ab4f14d --- /dev/null +++ b/kernels/portable/cpu/op_scatter.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +namespace { + +template +void scatter_value_helper( + const Tensor& in, + int64_t dim, + const Tensor& index, + CTYPE_VAL val, + Tensor& out) { + const CTYPE* in_data = in.const_data_ptr(); + const long* index_data = index.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + memcpy(out_data, in_data, in.nbytes()); + + if (index.dim() == 0) { + out_data[index_data[0]] = static_cast(val); + return; + } + + for (size_t ix = 0; ix < index.numel(); ++ix) { + size_t ix_coord[kTensorDimensionLimit]; + indexToCoordinate(index, ix, ix_coord); + + size_t out_coord[kTensorDimensionLimit]; + for (size_t i = 0; i < out.dim(); ++i) { + if (i == dim) { + out_coord[i] = index_data[ix]; + } else { + out_coord[i] = ix_coord[i]; + } + } + size_t out_ix = coordinateToIndex(out, out_coord); + + out_data[out_ix] = static_cast(val); + } +} + +} // namespace + +Tensor& scatter_value_out( + RuntimeContext& ctx, + const Tensor& in, + int64_t dim, + const Tensor& index, + const Scalar& value, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + check_scatter_value_args(in, dim, index, value, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + + if (dim < 0) { + dim += nonzero_dim(in); + } + + ScalarType val_type = utils::get_scalar_dtype(value); + + constexpr auto name = "scatter.value_out"; + + ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, name, CTYPE_VAL, [&] { + CTYPE_VAL val; + utils::extract_scalar(value, &val); + + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { + scatter_value_helper(in, dim, index, val, out); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/index_util.cpp b/kernels/portable/cpu/util/index_util.cpp index c8b89788109..ca9900773a1 100644 --- a/kernels/portable/cpu/util/index_util.cpp +++ b/kernels/portable/cpu/util/index_util.cpp @@ -191,6 +191,15 @@ bool check_scatter_add_args( return true; } +bool check_scatter_value_args( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& value, + Tensor& out) { + return check_gather_args(self, dim, index, false, out); +} + bool check_select_scatter_args( const Tensor& in, const Tensor& src, diff --git a/kernels/portable/cpu/util/index_util.h b/kernels/portable/cpu/util/index_util.h index 7c296832924..ae6654be52b 100644 --- a/kernels/portable/cpu/util/index_util.h +++ b/kernels/portable/cpu/util/index_util.h @@ -43,6 +43,13 @@ bool check_scatter_add_args( const Tensor& src, Tensor& out); +bool check_scatter_value_args( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& value, + Tensor& out); + bool check_select_scatter_args( const Tensor& in, const Tensor& src, diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index bdf3cea671a..21258329aa8 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -737,6 +737,11 @@ - arg_meta: null kernel_name: torch::executor::scalar_tensor_out +- op: scatter.value_out + kernels: + - arg_meta: null + kernel_name: torch::executor::scatter_value_out + - op: scatter_add.out kernels: - arg_meta: null diff --git a/kernels/test/op_scatter_test.cpp b/kernels/test/op_scatter_test.cpp new file mode 100644 index 
00000000000..2335c839d00 --- /dev/null +++ b/kernels/test/op_scatter_test.cpp @@ -0,0 +1,362 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include +#include + +using namespace ::testing; +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpScatterValueOutTest : public OperatorTest { + protected: + Tensor& op_scatter_value_out( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& value, + Tensor& out) { + return torch::executor::aten::scatter_outf( + context_, self, dim, index, value, out); + } + + // Common testing for the operator + template + void test_scatter_value_out() { + TensorFactory tf_index; + TensorFactory tf_data; + + const Scalar& value = 1; + + const std::vector sizes = {3, 5}; + Tensor self = tf_data.zeros(sizes); + Tensor out = tf_data.zeros(sizes); + Tensor index = tf_index.make({2, 3}, {0, 1, 2, 0, 1, 2}); + + op_scatter_value_out(self, 0, index, value, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, tf_data.make( + sizes, + { + 1, 0, 0, 0, 0, + 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0 + })); + // clang-format on + + op_scatter_value_out(self, 1, index, value, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, tf_data.make(sizes, + { + 1, 1, 1, 0, 0, + 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0 + })); + + const Scalar& value2 = 2; + self = tf_data.ones(/*sizes=*/{2, 3, 3}); + out = tf_data.zeros(/*sizes=*/{2, 3, 3}); + // clang-format off + index = tf_index.make( + /*sizes=*/{1, 3, 2}, + { + 0, 1, + 1, 2, + 0, 2 + }); + // clang-format on + + op_scatter_value_out(self, 1, index, value2, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, + tf_data.make( + /*sizes=*/{2, 3, 3}, + { + // [0, :, :] + 2, 1, 1, + 2, 2, 1, + 1, 2, 1, + + // [1, :, :] + 1, 1, 1, + 1, 1, 1, + 1, 1, 1 + })); + // clang-format on + + out = tf_data.zeros(/*sizes=*/{2, 3, 3}); + op_scatter_value_out(self, 2, index, value2, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, + tf_data.make( + /*sizes=*/{2, 3, 3}, + { + // [0, :, :] + 2, 2, 1, + 1, 2, 2, + 2, 1, 2, + + // [1, :, :] + 1, 1, 1, + 1, 1, 1, + 1, 1, 1 + })); + // clang-format on + } + + // Invalid dimensions + template + void test_scatter_value_out_invalid_dim() { + TensorFactory tf_index; + TensorFactory tf_data; + // clang-format off + Tensor self = tf_data.make(/*sizes=*/{2, 5}, + { + 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10 + }); + const std::vector sizes = {2, 3}; + Tensor index = tf_index.make(sizes, + { + 0, 1, 0, + 1, 0, 1, + }); + // clang-format on + const Scalar& value = 1; + Tensor out = tf_data.zeros(sizes); + + // Invalid dim should die + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, -3, index, value, out)); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 2, index, value, out)); + + // Self and index hsould have same number of dimensions + index = tf_index.zeros(/*sizes=*/{2, 2, 2}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 0, index, value, out)); + + // Size of dimension of index should be smaller than the size of that + // dimension of self if dimension != dim + index = tf_index.zeros(/*sizes=*/{3, 5}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 1, index, value, out)); + + 
// Index out of bound for self in dim + index = tf_index.make(/*sizes=*/{2, 3}, {0, 1, 2, 0, 1, 2}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 0, index, value, out)); + } + + void test_dynamic_shape( + const std::vector& out_shape, + enum torch::executor::TensorShapeDynamism dynamism) { + TensorFactory tf; + TensorFactory tf_index; + + Tensor input = tf.ones({2, 3, 4}); + Tensor index = tf_index.zeros({2, 3, 4}); + const Scalar& value = 1; + Tensor expected = tf.ones({2, 3, 4}); + Tensor out = tf.zeros(out_shape, dynamism); + + op_scatter_value_out(input, 2, index, value, out); + EXPECT_TENSOR_EQ(out, expected); + } +}; + +TEST_F(OpScatterValueOutTest, AllValidInputOutputSupport) { +#define TEST_ENTRY(CTYPE, DTYPE) test_scatter_value_out(); + ET_FORALL_REAL_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpScatterValueOutTest, InfinityAndNANTest) { + TensorFactory tf_index; + TensorFactory tf_data; + // clang-format off + Tensor self = tf_data.make( + /*sizes=*/{2, 5}, + { + 0.0, -INFINITY, NAN, 2.33, NAN, + NAN, INFINITY, -INFINITY, -INFINITY, 2.33 + }); + // clang-format on + Tensor index = tf_index.make({2, 3}, {0, 1, 0, 1, 0, 1}); + const Scalar& value = INFINITY; + Tensor out = tf_data.zeros({2, 5}); + + // Valid input should give the expected output + op_scatter_value_out(self, 0, index, value, out); + // clang-format off + EXPECT_TENSOR_CLOSE( + out, + tf_data.make(/*sizes=*/{2, 5}, + { + INFINITY, INFINITY, INFINITY, 2.33, NAN, + INFINITY, INFINITY, INFINITY, -INFINITY, 2.33 + })); + // clang-format on +} + +TEST_F(OpScatterValueOutTest, InvalidDimensionsDies) { +#define TEST_ENTRY(CTYPE, DTYPE) \ + test_scatter_value_out_invalid_dim(); + ET_FORALL_REAL_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpScatterValueOutTest, MismatchedInputDtypesDies) { + TensorFactory tf_byte; + TensorFactory tf_char; + TensorFactory tf_long; + + Tensor self = tf_char.make({2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + const std::vector sizes = {2, 3}; + Tensor index = tf_byte.make(sizes, {0, 1, 0, 0, 1, 0}); + const Scalar& value = 5; + Tensor out = tf_char.zeros(sizes); + + // Types other than long for index should die + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 0, index, value, out)); + + // Mismatched dtype of self and out should die + self = tf_byte.make(/*sizes=*/{2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + index = tf_long.make(sizes, {0, 1, 0, 1, 0, 1}); + out = tf_char.zeros(sizes); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 0, index, value, out)); +} + +TEST_F(OpScatterValueOutTest, DynamicShapeUpperBoundSameAsExpected) { + test_dynamic_shape( + {2, 3, 4}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpScatterValueOutTest, DynamicShapeUpperBoundLargerThanExpected) { + test_dynamic_shape( + {10, 10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpScatterValueOutTest, DynamicShapeUnbound) { + if (!torch::executor::testing::SupportedFeatures::get()->output_resize) { + GTEST_SKIP() << "Dynamic shape not supported"; + } + test_dynamic_shape( + {1, 1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); +} + +TEST_F(OpScatterValueOutTest, EmptyIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.ones({2, 5}); + Tensor index = tf_index.zeros({2, 0, 3}); + const Scalar& value = 5; + Tensor out = tf_data.zeros({2, 5}); + op_scatter_value_out(self, 0, index, value, out); + EXPECT_TENSOR_CLOSE(out, tf_data.ones({2, 5})); +} + 
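(Reviewer aside, not part of the patch: a tiny standalone model of the 2-D scatter-by-value semantics these tests exercise — `out` starts as a copy of `self`, then `out[index[i][j]][j] = value` for dim 0, or `out[i][index[i][j]] = value` for dim 1. The flat-vector layout and names are ours, not ExecuTorch APIs.)

```cpp
// Illustrative sketch only; mirrors the first dim = 0 case in the tests.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<float> scatter_value_2d(
    std::vector<float> self, // row-major [rows, cols]; copied into the result
    int64_t rows, int64_t cols, int64_t dim,
    const std::vector<int64_t>& index, // row-major [idx_rows, idx_cols]
    int64_t idx_rows, int64_t idx_cols, float value) {
  (void)rows; // bounds are assumed valid here; the kernel checks them
  for (int64_t i = 0; i < idx_rows; ++i) {
    for (int64_t j = 0; j < idx_cols; ++j) {
      const int64_t k = index[i * idx_cols + j];
      if (dim == 0) {
        self[k * cols + j] = value; // scatter along rows
      } else {
        self[i * cols + k] = value; // scatter along columns
      }
    }
  }
  return self;
}

int main() {
  // 3x5 zeros, index = {{0, 1, 2}, {0, 1, 2}}, dim = 0, value = 1:
  // expect ones at positions (0,0), (1,1), (2,2), zeros elsewhere.
  std::vector<float> self(3 * 5, 0.f);
  std::vector<int64_t> index = {0, 1, 2, 0, 1, 2};
  auto out = scatter_value_2d(self, 3, 5, /*dim=*/0, index, 2, 3, 1.f);
  assert(out[0 * 5 + 0] == 1.f && out[1 * 5 + 1] == 1.f && out[2 * 5 + 2] == 1.f);
  assert(out[0 * 5 + 1] == 0.f);
  return 0;
}
```

The kernel in `op_scatter.cpp` differs only in mechanics: it walks flat indices via `indexToCoordinate`/`coordinateToIndex` over arbitrary rank and special-cases 0-D index tensors.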
+TEST_F(OpScatterValueOutTest, ValidZeroDim) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + Tensor index = tf_index.zeros({}); + const Scalar& value = 5; + Tensor out = tf_data.zeros({}); + op_scatter_value_out(self, 0, index, value, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({}, {5})); +} + +TEST_F(OpScatterValueOutTest, InvalidZeroDimInput) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.ones({}); + Tensor index = tf_index.make({2, 3}, {0, 0, 0, 0, 0, 0}); + const Scalar& value = 5; + Tensor out = tf_data.zeros({}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 0, index, value, out)); +} + +TEST_F(OpScatterValueOutTest, InvalidZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({2, 3}, {1, 2, 3, 4, 5, 6}); + Tensor index = tf_index.make({}, {2}); + const Scalar& value = 5; + Tensor out = tf_data.zeros({2, 3}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 1, index, value, out)); +} + +TEST_F(OpScatterValueOutTest, ValidZeroDimInputAndOneDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + Tensor index = tf_index.make({3}, {0, 0, 0}); + const Scalar& value = 5; + Tensor out = tf_data.make({}, {2.71}); + op_scatter_value_out(self, 0, index, value, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({}, {5})); +} + +TEST_F(OpScatterValueOutTest, ValidOneDimInputAndZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({3}, {10, 20, 30}); + Tensor index = tf_index.make({}, {2}); + const Scalar& value = 5; + Tensor out = tf_data.make({3}, {1729, 1729, 1729}); + op_scatter_value_out(self, 0, index, value, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({3}, {10, 20, 5})); +} + +TEST_F(OpScatterValueOutTest, InvalidZeroDimInputAndOneDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + Tensor index = tf_index.make({3}, {10, 100, 1000}); + const Scalar& value = 5; + Tensor out = tf_data.make({}, {2.71}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 0, index, value, out)); +} + +TEST_F(OpScatterValueOutTest, InvalidOneDimInputAndZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({3}, {10, 20, 30}); + Tensor index = tf_index.make({}, {100}); + const Scalar& value = 5; + Tensor out = tf_data.make({3}, {1729, 1729, 1729}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_value_out(self, 0, index, value, out)); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index e44769841b2..69f4e176ff9 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -273,6 +273,7 @@ def define_common_targets(): _common_op_test("op_rsqrt_test", ["aten", "portable"]) _common_op_test("op_rsub_test", ["aten", "portable"]) _common_op_test("op_scalar_tensor_test", ["aten", "portable"]) + _common_op_test("op_scatter_test", ["aten", "portable"]) _common_op_test("op_scatter_add_test", ["aten", "portable"]) _common_op_test("op_select_scatter_test", ["aten", "portable"]) _common_op_test("op_select_copy_test", ["aten", "portable"]) diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index a0200cb1a6f..0cc9ab5fd0e 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ 
b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -979,6 +979,13 @@ ATEN_OPS = ( name = "op_scalar_tensor", deps = [":scalar_utils"], ), + op_target( + name = "op_scatter", + deps = [ + ":scalar_utils", + "//executorch/kernels/portable/cpu/util:index_util", + ], + ), op_target( name = "op_scatter_add", deps = [ From 959bb1be0b7c846c387adb3b7076a7a711e2ac48 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Thu, 29 Aug 2024 14:22:20 -0700 Subject: [PATCH 116/531] Update ExecuTorch for XNNPACK 87ee0b4 (#4916) (#4916) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4916 Reviewed By: digantdesai Differential Revision: D61822607 Pulled By: GregoryComer --- .gitmodules | 2 +- backends/xnnpack/cmake/Dependencies.cmake | 4 + backends/xnnpack/runtime/XNNCompiler.cpp | 73 +++++++++++++++++-- .../xnnpack/serialization/runtime_schema.fbs | 1 + backends/xnnpack/serialization/schema.fbs | 1 + backends/xnnpack/test/ops/linear.py | 12 +-- backends/xnnpack/third-party/XNNPACK | 2 +- .../third-party/generate-xnnpack-wrappers.py | 27 +++++-- backends/xnnpack/third-party/xnnpack.buck.bzl | 28 +------ .../xnnpack/third-party/xnnpack_src_defs.bzl | 13 ---- .../third-party/xnnpack_wrapper_defs.bzl | 13 ---- 11 files changed, 102 insertions(+), 74 deletions(-) diff --git a/.gitmodules b/.gitmodules index 0999bdb9356..71ff854bb03 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,7 +21,7 @@ url = https://github.com/Maratyszcza/FXdiv.git [submodule "backends/xnnpack/third-party/XNNPACK"] path = backends/xnnpack/third-party/XNNPACK - url = https://github.com/digantdesai/XNNPACK.git + url = https://github.com/google/XNNPACK.git [submodule "backends/xnnpack/third-party/cpuinfo"] path = backends/xnnpack/third-party/cpuinfo url = https://github.com/pytorch/cpuinfo.git diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake index 40e4e72c38b..b76c54bee60 100644 --- a/backends/xnnpack/cmake/Dependencies.cmake +++ b/backends/xnnpack/cmake/Dependencies.cmake @@ -36,6 +36,10 @@ set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "" ) +set(XNNPACK_ENABLE_KLEIDIAI + OFF + CACHE BOOL "" +) add_subdirectory("${XNNPACK_SOURCE_DIR}") include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR}) list(APPEND xnnpack_third_party XNNPACK) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 7999bb9a71f..2145ea15199 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -21,6 +21,25 @@ namespace executor { namespace xnnpack { namespace delegate { +/* + * Provide compile-time allocation. + */ +class CompileAllocator { + public: + /* + * Allocate memory which will be automatically freed at the end + * of the compilation process. + */ + void* allocateTemporary(size_t size) { + auto mem = new uint8_t[size]; + temporaries_.emplace_back(mem); + return mem; + } + + private: + std::vector> temporaries_; +}; + // Flatbuffer types using ValuePtr = const fb_xnnpack::XValue*; using NodePtr = const fb_xnnpack::XNode*; @@ -35,6 +54,23 @@ using DefineNodeFunc = Error (*)( const std::unordered_map&, NodePtr) noexcept; +/* +Convert a tensor from fp32 to bf16. +*/ +void convertF32TensorToBF16( + const float* f32_data, + uint16_t* bf16_data_out, + size_t numel) { + for (auto i = 0u; i < numel; i++) { + // Adjust the f32 value such that it rounds properly after truncation. + // Constant factor scales 1+2^-8 to 1+2e-7. 
+ float f32_adjusted = f32_data[i] * 1.00389105f; + uint32_t f32_bits; + memcpy(&f32_bits, &f32_adjusted, sizeof(float)); + bf16_data_out[i] = static_cast(f32_bits >> 16); + } +} + /* Gets the output min and output max for a given node operator */ @@ -152,7 +188,8 @@ Error defineTensor( GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, std::vector& input_ids, - std::vector& output_ids) { + std::vector& output_ids, + CompileAllocator& allocator) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -356,12 +393,31 @@ Error defineTensor( size_t group_size = qparams->group_size(); size_t output_channels = tensor_value->dims()->Get(0); size_t input_channels = tensor_value->dims()->Get(1); + + const uint16_t* scale_data = nullptr; + uint32_t scale_numel = 0; + + // Block scales are preferably serialized as bf16 but can also be + // serialized as fp32 for backwards compatability. + if (qparams->scale_bf16() != nullptr) { + scale_data = + static_cast(qparams->scale_bf16()->data()); + scale_numel = qparams->scale_bf16()->size(); + } else { + // Read fp32 scales, convert to bf16. + auto conv_buffer = static_cast(allocator.allocateTemporary( + qparams->scale()->size() * sizeof(uint16_t))); + scale_numel = qparams->scale()->size(); + convertF32TensorToBF16( + qparams->scale()->data(), conv_buffer, scale_numel); + scale_data = conv_buffer; + } + ET_CHECK_OR_RETURN_ERROR( - qparams->scale()->size() == - output_channels * input_channels / group_size, + scale_numel == output_channels * input_channels / group_size, Internal, "scale size %zu != output channels %zu * group size %zu", - (size_t)qparams->scale()->size(), + static_cast(scale_numel), output_channels, group_size); int32_t zero_point = @@ -370,18 +426,19 @@ Error defineTensor( Debug, "define quant tensor (per channel group): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, grpup_size: %zu, output_channels: %zu, dtype: %u, zero_point: %d, datatype: %d\n", buffer_ptr, - qparams->scale()->size(), + scale_numel, qparams->channel_dim(), group_size, output_channels, datatype, zero_point, datatype); + status = xnn_define_blockwise_quantized_tensor_value( /*subgraph=*/subgraph_ptr, /*datatype=*/datatype, /*zero_point=*/zero_point, - /*scale=*/qparams->scale()->data(), + /*scale=*/scale_data, /*num_dims=*/tensor_value->num_dims(), /*channel_dim=*/qparams->channel_dim(), /*block_size=*/qparams->group_size(), @@ -1617,6 +1674,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; + CompileAllocator compile_allocator; // Header status can only either be Error::Ok or Error::NotFound if (header.ok()) { @@ -1688,7 +1746,8 @@ ET_NODISCARD Error XNNCompiler::compileModel( flatbuffer_graph, constant_data, input_ids, - output_ids); + output_ids, + compile_allocator); if (err != Error::Ok) { return err; diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index f32e7c60637..efe717e085e 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -63,6 +63,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table XNNTensorValue { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index 773a459bbf6..33571195d63 100644 
--- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -48,6 +48,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table PerChannelQuant { diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py index d886ce26694..d8de79f283d 100644 --- a/backends/xnnpack/test/ops/linear.py +++ b/backends/xnnpack/test/ops/linear.py @@ -407,8 +407,8 @@ def test_qd8_per_channel_linear_parallel_and_sequential(self): ) def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: @@ -430,8 +430,8 @@ def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): ) def test_qd8_fp16_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: @@ -602,8 +602,8 @@ def _test_groupwise_dq_linear( use_bias: bool = False, group_size: int = 8, num_linears: int = 1, - atol: float = 1e-3, - rtol: float = 1e-3, + atol: float = 5e-3, + rtol: float = 5e-3, ): quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size)) unwrap_tensor_subclass(mod) diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 1d139a3b4b7..87ee0b46b83 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 1d139a3b4b7155889c88c31f370a82c48e7ca89c +Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3 diff --git a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py index bda79527178..e9b23e4a784 100644 --- a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py +++ b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import print_function +from pathlib import Path import collections import os import sys @@ -36,8 +37,8 @@ "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)", "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)", @@ -46,7 +47,7 @@ # add non-prod microkernel sources here: } -SRC_NAMES = set([ +SRC_NAMES = { "OPERATOR_SRCS", "SUBGRAPH_SRCS", "LOGGING_SRCS", @@ -81,30 +82,42 @@ "PROD_AVX512F_MICROKERNEL_SRCS", "PROD_AVX512SKX_MICROKERNEL_SRCS", "PROD_AVX512VBMI_MICROKERNEL_SRCS", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_AVX512VNNI_MICROKERNEL_SRCS", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_RVV_MICROKERNEL_SRCS", "PROD_AVXVNNI_MICROKERNEL_SRCS", 
"AARCH32_ASM_MICROKERNEL_SRCS", "AARCH64_ASM_MICROKERNEL_SRCS", # add non-prod microkernel sources here: -]) +} def handle_singleline_parse(line): start_index = line.find("(") end_index = line.find(")") line = line[start_index+1:end_index] key_val = line.split(" ") - return key_val[0], list(map(lambda x: x[4:], key_val[1:])) + return key_val[0], [x[4:] for x in key_val[1:]] def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"): + print(f"Updating sources from {cmakefile}") sources = collections.defaultdict(list) with open(os.path.join(xnnpack_path, cmakefile)) as cmake: lines = cmake.readlines() i = 0 while i < len(lines): line = lines[i] + + if lines[i].startswith("INCLUDE"): + file, _ = handle_singleline_parse(line) + if file.startswith("cmake/gen/"): + path = Path(xnnpack_path) / "XNNPACK" / file + local_sources = update_sources(xnnpack_path, path.absolute().as_posix()) + for k,v in local_sources.items(): + if k in sources: + sources[k] = sources[k] + local_sources[k] + else: + sources[k] = local_sources[k] if lines[i].startswith("SET") and "src/" in lines[i]: name, val = handle_singleline_parse(line) @@ -132,7 +145,7 @@ def gen_wrappers(xnnpack_path): xnnpack_sources = collections.defaultdict(list) sources = update_sources(xnnpack_path) - microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/microkernels.cmake") + microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/gen/microkernels.cmake") for key in microkernels_sources: sources[key] = microkernels_sources[key] @@ -186,6 +199,8 @@ def gen_wrappers(xnnpack_path): def main(argv): + print("Generating wrappers...") + if argv is None or len(argv) == 0: gen_wrappers(".") else: diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index a1add446643..7f0a8ca6f21 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -1,7 +1,6 @@ load("//third-party:glob_defs.bzl", "subdir_glob") load( ":xnnpack_src_defs.bzl", - "JIT_SRCS", "LOGGING_SRCS", "OPERATOR_SRCS", "SUBGRAPH_SRCS", @@ -69,27 +68,6 @@ def define_xnnpack(): ], ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. - native.cxx_library( - name = "jit_memory", - srcs = JIT_SRCS, - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ]), - header_namespace = "", - compiler_flags = [ - "-std=c++17", - ], - preferred_linkage = "static", - preprocessor_flags = [ - "-DXNN_LOG_LEVEL=0", - ], - exported_deps = [ - ":clog", - ":interface", - ], - ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
native.cxx_library( name = "operators", @@ -139,7 +117,6 @@ def define_xnnpack(): preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_SPARSE=0", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0", "-DXNN_ENABLE_MEMOPT", @@ -1223,7 +1200,6 @@ def define_xnnpack(): ] ARM_XNNPACK_DEPS = [ - ":jit_memory", ":ukernels_armsimd32", ":ukernels_fp16arith", ":ukernels_asm", @@ -1246,11 +1222,10 @@ def define_xnnpack(): "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", "XNNPACK/src/operator-run.c", - "XNNPACK/src/operators/post-operation.c", "XNNPACK/src/microkernel-utils.c", ], headers = subdir_glob([ - ("XNNPACK/src", "xnnpack/*.h"), + ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h"), ]), exported_headers = { @@ -1271,7 +1246,6 @@ def define_xnnpack(): "-DXNN_NO_X8_OPERATORS", "-DXNN_ENABLE_MEMOPT", "-DXNN_ENABLE_SPARSE=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_ASSEMBLY", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION", "-DXNN_ENABLE_ARM_DOTPROD", diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 0a0beba7efd..d8ebe7c72bb 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -200,7 +200,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/xop.c", ] PROD_AVX512F_MICROKERNEL_SRCS = [ @@ -493,30 +492,18 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", 
"XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] XNNPACK_SRCS = [ diff --git a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl index 2dbb41ff01b..a9d4af95ccf 100644 --- a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl @@ -92,7 +92,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/xop.c", ] PROD_FMA3_MICROKERNEL_SRCS = [ @@ -447,28 +446,16 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - 
"xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] From f3077486f94b32986ab6e99d9cbe38e3b933bd4c Mon Sep 17 00:00:00 2001 From: Chester Hu Date: Thu, 29 Aug 2024 16:29:42 -0700 Subject: [PATCH 117/531] Add tiktoken support in setup-with-qnn.sh (#4991) Adding flags in the build process for QNN backend to enable tiktoken for llama3. --- examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 86a9e051c65..5e3ac6fc011 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -34,6 +34,7 @@ cmake examples/models/llama2 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="$ANDROID_ABI" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 @@ -45,6 +46,7 @@ cmake extension/android \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ + -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android From 49b4dde59a2d2a85546da009e17b5b964879e980 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Thu, 29 Aug 2024 16:36:58 -0700 Subject: [PATCH 118/531] Update cpuinfo dependency version Differential Revision: D61999049 Pull Request resolved: https://github.com/pytorch/executorch/pull/4988 --- backends/xnnpack/third-party/cpuinfo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index d6860c477c9..16bfc1622c6 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit d6860c477c99f1fce9e28eb206891af3c0e1a1d7 +Subproject commit 16bfc1622c6902d6f91d316ec54894910c620325 From 7608ab8fa7c37f334fdaa9a946730fdbb1c97d34 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:13:54 -0700 Subject: [PATCH 119/531] Decouple model exporting from dataset downloading Differential Revision: D61950464 Pull Request resolved: https://github.com/pytorch/executorch/pull/4961 --- .ci/scripts/test.sh | 19 +++++++++++++++++-- .github/workflows/android-perf.yml | 4 ++-- .github/workflows/trunk.yml | 2 +- examples/qualcomm/scripts/deeplab_v3.py | 11 ++++++++--- examples/qualcomm/scripts/inception_v3.py | 13 ++++++++----- examples/qualcomm/scripts/inception_v4.py | 13 ++++++++----- examples/qualcomm/scripts/mobilenet_v2.py | 13 ++++++++----- examples/qualcomm/scripts/mobilenet_v3.py | 13 ++++++++----- 8 files changed, 60 insertions(+), 28 deletions(-) diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh index 1f20042f02a..338f2868e33 100755 --- a/.ci/scripts/test.sh +++ b/.ci/scripts/test.sh @@ -156,9 +156,24 @@ test_model_with_qnn() { export PYTHONPATH=$EXECUTORCH_ROOT/.. 
if [[ "${MODEL_NAME}" == "dl3" ]]; then - "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.deeplab_v3 -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only --download - EXPORTED_MODEL=./deeplab_v3/dlv3_qnn.pte + EXPORT_SCRIPT=deeplab_v3 + EXPORTED_MODEL_NAME=dlv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv3" ]]; then + EXPORT_SCRIPT=mobilenet_v3 + EXPORTED_MODEL_NAME=mv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv2" ]]; then + EXPORT_SCRIPT=mobilenet_v2 + EXPORTED_MODEL_NAME=mv2_qnn.pte + elif [[ "${MODEL_NAME}" == "ic4" ]]; then + EXPORT_SCRIPT=inception_v4 + EXPORTED_MODEL_NAME=ic4_qnn.pte + elif [[ "${MODEL_NAME}" == "ic3" ]]; then + EXPORT_SCRIPT=inception_v3 + EXPORTED_MODEL_NAME=ic3_qnn.pte fi + + "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only + EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME} } if [[ "${BACKEND}" == "portable" ]]; then diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 49d07516b15..285bf930918 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -84,9 +84,9 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. - CRON_DEFAULT_MODELS: "stories110M" + CRON_DEFAULT_MODELS: "dl3,mv3,mv2,ic4,ic3" CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x" - CRON_DEFAULT_DELEGATES: "xnnpack" + CRON_DEFAULT_DELEGATES: "xnnpack,qnn" run: | set -ex MODELS="${{ inputs.models }}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 31887da855b..6d08675b8e2 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -305,7 +305,7 @@ jobs: strategy: matrix: dtype: [fp32] - model: [dl3] + model: [dl3, mv3, mv2, ic4, ic3] fail-fast: false with: runner: linux.2xlarge diff --git a/examples/qualcomm/scripts/deeplab_v3.py b/examples/qualcomm/scripts/deeplab_v3.py index 34a94c8a76a..7f24d616182 100755 --- a/examples/qualcomm/scripts/deeplab_v3.py +++ b/examples/qualcomm/scripts/deeplab_v3.py @@ -12,6 +12,7 @@ from multiprocessing.connection import Client import numpy as np +import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model @@ -74,9 +75,13 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( - data_size=data_num, dataset_dir=args.artifact, download=args.download - ) + if args.compile_only: + inputs = [(torch.rand(1, 3, 224, 224),)] + else: + inputs, targets, input_list = get_dataset( + data_size=data_num, dataset_dir=args.artifact, download=args.download + ) + pte_filename = "dlv3_qnn" instance = DeepLabV3ResNet101Model() diff --git a/examples/qualcomm/scripts/inception_v3.py b/examples/qualcomm/scripts/inception_v3.py index 82b290d253d..9cc35463d41 100755 --- a/examples/qualcomm/scripts/inception_v3.py +++ b/examples/qualcomm/scripts/inception_v3.py @@ -71,10 +71,13 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( - dataset_path=f"{args.dataset}", - data_size=data_num, - ) + if args.compile_only: + inputs = [(torch.rand(1, 3, 224, 224),)] + else: + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) pte_filename = "ic3_qnn" instance = InceptionV3Model() build_executorch_binary( @@ -142,7 +145,7 @@ def main(args): "for 
https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" ), type=str, - required=True, + required=False, ) parser.add_argument( diff --git a/examples/qualcomm/scripts/inception_v4.py b/examples/qualcomm/scripts/inception_v4.py index e7f2fea1cd7..9a19de1a37a 100755 --- a/examples/qualcomm/scripts/inception_v4.py +++ b/examples/qualcomm/scripts/inception_v4.py @@ -70,10 +70,13 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( - dataset_path=f"{args.dataset}", - data_size=data_num, - ) + if args.compile_only: + inputs = [(torch.rand(1, 3, 224, 224),)] + else: + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) pte_filename = "ic4_qnn" instance = InceptionV4Model() build_executorch_binary( @@ -141,7 +144,7 @@ def main(args): "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" ), type=str, - required=True, + required=False, ) parser.add_argument( diff --git a/examples/qualcomm/scripts/mobilenet_v2.py b/examples/qualcomm/scripts/mobilenet_v2.py index 7cc0226e250..a915e26c6be 100755 --- a/examples/qualcomm/scripts/mobilenet_v2.py +++ b/examples/qualcomm/scripts/mobilenet_v2.py @@ -71,10 +71,13 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( - dataset_path=f"{args.dataset}", - data_size=data_num, - ) + if args.compile_only: + inputs = [(torch.rand(1, 3, 224, 224),)] + else: + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) pte_filename = "mv2_qnn" instance = MV2Model() build_executorch_binary( @@ -142,7 +145,7 @@ def main(args): "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" ), type=str, - required=True, + required=False, ) parser.add_argument( diff --git a/examples/qualcomm/scripts/mobilenet_v3.py b/examples/qualcomm/scripts/mobilenet_v3.py index 08c65904631..068e9cba3a7 100644 --- a/examples/qualcomm/scripts/mobilenet_v3.py +++ b/examples/qualcomm/scripts/mobilenet_v3.py @@ -70,10 +70,13 @@ def main(args): ) data_num = 100 - inputs, targets, input_list = get_dataset( - dataset_path=f"{args.dataset}", - data_size=data_num, - ) + if args.compile_only: + inputs = [(torch.rand(1, 3, 224, 224),)] + else: + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) pte_filename = "mv3_qnn" instance = MV3Model() build_executorch_binary( @@ -140,7 +143,7 @@ def main(args): "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" ), type=str, - required=True, + required=False, ) parser.add_argument( From ff4a73602d413e12fbb50cf43e8163cef98a1746 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Thu, 29 Aug 2024 17:30:32 -0700 Subject: [PATCH 120/531] Hide and simplify backend registry internals Differential Revision: D61928651 Pull Request resolved: https://github.com/pytorch/executorch/pull/4947 --- runtime/backend/interface.cpp | 47 ++++++++++++++++--------------- runtime/backend/interface.h | 52 +++++++---------------------------- 2 files changed, 33 insertions(+), 66 deletions(-) diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp index 44785cfcd09..d7f0489db5e 100644 --- a/runtime/backend/interface.cpp +++ b/runtime/backend/interface.cpp @@ -14,45 +14,44 @@ namespace runtime { PyTorchBackendInterface::~PyTorchBackendInterface() {} -// TODO(T128866626): Remove global static variables. -// We want to be able to run multiple Executor instances -// and having a global registration isn't a viable solution -// in the long term. 
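(Reviewer aside, not part of the patch: the replacement below reduces the registry to a fixed-size static table plus a linear lookup by name. A standalone toy version of that scheme, with invented names, for anyone skimming the diff:)

```cpp
// Illustrative sketch only -- a toy re-creation of the registration scheme
// (static table of {name, pointer} pairs, duplicate names rejected, lookup by
// strcmp). Not the real ExecuTorch API; ToyBackend stands in for Backend.
#include <cassert>
#include <cstddef>
#include <cstring>

struct ToyBackend {
  const char* name;
  int* impl; // stand-in for PyTorchBackendInterface*
};

constexpr std::size_t kMaxToyBackends = 16;
static ToyBackend toy_table[kMaxToyBackends];
static std::size_t toy_count = 0;

int* get_toy_backend(const char* name) {
  for (std::size_t i = 0; i < toy_count; ++i) {
    if (std::strcmp(toy_table[i].name, name) == 0) {
      return toy_table[i].impl;
    }
  }
  return nullptr;
}

bool register_toy_backend(const ToyBackend& b) {
  if (toy_count >= kMaxToyBackends || get_toy_backend(b.name) != nullptr) {
    return false; // table full or duplicate name
  }
  toy_table[toy_count++] = b;
  return true;
}

int main() {
  static int impl = 0;
  assert(register_toy_backend({"ToyXnnpack", &impl}));
  assert(!register_toy_backend({"ToyXnnpack", &impl})); // duplicates rejected
  assert(get_toy_backend("ToyXnnpack") == &impl);
  assert(get_toy_backend("Missing") == nullptr);
  return 0;
}
```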
-BackendRegistry& getBackendRegistry(); -BackendRegistry& getBackendRegistry() { - static BackendRegistry backend_reg; - return backend_reg; -} +namespace { -PyTorchBackendInterface* get_backend_class(const char* name) { - return getBackendRegistry().get_backend_class(name); -} +// The max number of backends that can be registered globally. +constexpr size_t kMaxRegisteredBackends = 16; + +// TODO(T128866626): Remove global static variables. We want to be able to run +// multiple Executor instances and having a global registration isn't a viable +// solution in the long term. + +/// Global table of registered backends. +Backend registered_backends[kMaxRegisteredBackends]; -PyTorchBackendInterface* BackendRegistry::get_backend_class(const char* name) { - for (size_t idx = 0; idx < registrationTableSize_; idx++) { - Backend backend = backend_table_[idx]; - if (strcmp(backend.name_, name) == 0) { - return backend.interface_ptr_; +/// The number of backends registered in the table. +size_t num_registered_backends = 0; + +} // namespace + +PyTorchBackendInterface* get_backend_class(const char* name) { + for (size_t i = 0; i < num_registered_backends; i++) { + Backend backend = registered_backends[i]; + if (strcmp(backend.name, name) == 0) { + return backend.backend; } } return nullptr; } Error register_backend(const Backend& backend) { - return getBackendRegistry().register_backend(backend); -} - -Error BackendRegistry::register_backend(const Backend& backend) { - if (registrationTableSize_ >= kRegistrationTableMaxSize) { + if (num_registered_backends >= kMaxRegisteredBackends) { return Error::Internal; } // Check if the name already exists in the table - if (this->get_backend_class(backend.name_) != nullptr) { + if (get_backend_class(backend.name) != nullptr) { return Error::InvalidArgument; } - backend_table_[registrationTableSize_++] = backend; + registered_backends[num_registered_backends++] = backend; return Error::Ok; } diff --git a/runtime/backend/interface.h b/runtime/backend/interface.h index 02cbb1e8d4d..0b77283a358 100644 --- a/runtime/backend/interface.h +++ b/runtime/backend/interface.h @@ -110,46 +110,6 @@ class PyTorchBackendInterface { virtual void destroy(ET_UNUSED DelegateHandle* handle) const {} }; -struct Backend { - const char* name_; - PyTorchBackendInterface* interface_ptr_; -}; - -// The max number of backends that can be registered in -// an app. It's hard coded to 16 because it's not estimated -// to have more than 16 backends in a system. Each table -// element has two pointers, represented by Backend struct. -// The memory overhead for this table is minimum (only a few bytes). -constexpr size_t kRegistrationTableMaxSize = 16; - -class BackendRegistry { - public: - BackendRegistry() : registrationTableSize_(0) {} - - /** - * Registers the Backend object (i.e. string name and PyTorchBackendInterface - * pair) so that it could be called via the name during the runtime. - * @param[in] backend Backend object of the user-defined backend delegate. - * @retval Error code representing whether registration was successful. - */ - ET_NODISCARD Error register_backend(const Backend& backend); - - /** - * Returns the corresponding object pointer for a given string name. - * The mapping is populated using register_backend method. - * - * @param[in] name Name of the user-defined backend delegate. - * @retval Pointer to the appropriate object that implements - * PyTorchBackendInterface. Nullptr if it can't find anything - * with the given name. 
- */ - PyTorchBackendInterface* get_backend_class(const char* name); - - private: - Backend backend_table_[kRegistrationTableMaxSize]; - size_t registrationTableSize_; -}; - /** * Returns the corresponding object pointer for a given string name. * The mapping is populated using register_backend method. @@ -161,6 +121,16 @@ class BackendRegistry { */ PyTorchBackendInterface* get_backend_class(const char* name); +/** + * A named instance of a backend. + */ +struct Backend { + /// The name of the backend. Must match the string used in the PTE file. + const char* name; + /// The instance of the backend to use when loading and executing programs. + PyTorchBackendInterface* backend; +}; + /** * Registers the Backend object (i.e. string name and PyTorchBackendInterface * pair) so that it could be called via the name during the runtime. @@ -178,11 +148,9 @@ namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. using ::executorch::runtime::Backend; -using ::executorch::runtime::BackendRegistry; using ::executorch::runtime::CompileSpec; using ::executorch::runtime::DelegateHandle; using ::executorch::runtime::get_backend_class; -// using ::executorch::runtime::kRegistrationTableMaxSize; using ::executorch::runtime::PyTorchBackendInterface; using ::executorch::runtime::register_backend; using ::executorch::runtime::SizedBuffer; From f99e25f42c29d8eadcde13d5fe7007854fa09660 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 29 Aug 2024 18:10:16 -0700 Subject: [PATCH 121/531] [llama] Build the runner with tiktoken by default Differential Revision: D61830302 Pull Request resolved: https://github.com/pytorch/executorch/pull/4921 --- .ci/scripts/build-qnn-sdk.sh | 1 + backends/qualcomm/scripts/build.sh | 1 + build/build_android_llm_demo.sh | 7 --- .../demo-apps/android/LlamaDemo/README.md | 12 +---- examples/demo-apps/android/LlamaDemo/setup.sh | 2 - examples/models/llama2/CMakeLists.txt | 19 -------- examples/models/llama2/README.md | 3 -- examples/models/llama2/runner/CMakeLists.txt | 36 +++++++++----- examples/models/llama2/runner/runner.cpp | 25 +++++----- examples/models/llama2/runner/targets.bzl | 8 +--- .../oss_scripts/llama2/CMakeLists.txt | 2 +- .../qaihub_scripts/llama/CMakeLists.txt | 3 +- .../stable_diffusion/CMakeLists.txt | 2 +- extension/android/CMakeLists.txt | 29 ++++++------ extension/llm/third-party/TARGETS | 47 +++++++++++++++++++ shim/xplat/executorch/build/env_interface.bzl | 2 +- 16 files changed, 108 insertions(+), 91 deletions(-) create mode 100644 extension/llm/third-party/TARGETS diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index 260072f7342..ec3a8a39e37 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -6,6 +6,7 @@ # LICENSE file in the root directory of this source tree. set -eux +set -o xtrace build_qnn_backend() { echo "Start building qnn backend." diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index b63ea6fe8d9..61b363f1a77 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
set -e +set -o xtrace if [[ -z ${QNN_SDK_ROOT} ]]; then echo "Please export QNN_SDK_ROOT=/path/to/qnn_sdk" diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index a11e54f932d..61f54c47cba 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -20,11 +20,6 @@ build_android_native_library() { TOKENIZER="$2" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" - if [[ $TOKENIZER == "tiktoken" ]]; then - EXECUTORCH_USE_TIKTOKEN=ON - else - EXECUTORCH_USE_TIKTOKEN=OFF - fi cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ @@ -54,7 +49,6 @@ build_android_native_library() { -DANDROID_ABI="$ANDROID_ABI" \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ @@ -72,7 +66,6 @@ build_android_native_library() { -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index 7bb36657da3..fc58d70a2f1 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -64,22 +64,14 @@ Note: `` is the root for the NDK, which is usually under `~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. -3. (Optional) If you need to use tiktoken as the tokenizer (for LLaMA3), set -`EXECUTORCH_USE_TIKTOKEN=ON` and later CMake will use it as the tokenizer. -If you need to run other models like LLaMA2, skip this skip. - -```bash -export EXECUTORCH_USE_TIKTOKEN=ON # Only for LLaMA3 -``` - -4. Build the Android Java extension code: +3. Build the Android Java extension code: ```bash pushd extension/android ./gradlew build popd ``` -5. Run the following command set up the required JNI library: +4. 
Run the following command set up the required JNI library: ```bash pushd examples/demo-apps/android/LlamaDemo ./gradlew :app:setup diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 39a50f9b968..5b3244fbcce 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -35,7 +35,6 @@ cmake examples/models/llama2 \ -DANDROID_ABI="$ANDROID_ABI" \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ @@ -50,7 +49,6 @@ cmake extension/android \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index 8b82fdda12f..b517641f408 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -21,8 +21,6 @@ project(llama_runner) # Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) -option(EXECUTORCH_USE_TIKTOKEN "Use Tiktoken as a tokenizer" OFF) - include(CMakeDependentOption) # # pthreadpool: build pthreadpool library. Disable on unsupported platforms @@ -94,23 +92,6 @@ endif() # llama_runner library add_subdirectory(runner) -if(EXECUTORCH_USE_TIKTOKEN) - # find RE2 for tokenizer - set(ABSL_ENABLE_INSTALL ON) - set(ABSL_PROPAGATE_CXX_STD ON) - set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/abseil-cpp - ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp - ) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/re2 - ${CMAKE_CURRENT_BINARY_DIR}/re2 - ) - set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - target_link_libraries(llama_runner PUBLIC re2::re2) -endif() set(link_libraries gflags) set(_srcs main.cpp) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index b8a260865b5..ea95c7f965c 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -227,8 +227,6 @@ Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the cmake --build cmake-out/examples/models/llama2 -j16 --config Release ``` -For Llama3, add `-DEXECUTORCH_USE_TIKTOKEN=ON` option when building the llama runner. - 3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40). ``` cmake-out/examples/models/llama2/llama_main --model_path= --tokenizer_path= --prompt= @@ -283,7 +281,6 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release ``` -For Llama3, add `-DEXECUTORCH_USE_TIKTOKEN=ON` option when building the llama runner. **2. 
Run on Android via adb shell** diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt index c99a54982aa..abad63a3b5f 100644 --- a/examples/models/llama2/runner/CMakeLists.txt +++ b/examples/models/llama2/runner/CMakeLists.txt @@ -41,16 +41,13 @@ target_include_directories( extension_module INTERFACE ${_common_include_directories} ) -if(EXECUTORCH_USE_TIKTOKEN) - list( - APPEND _llama_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp - ) - list(APPEND _llama_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp - ) - set(_preprocessor_flag -DET_USE_TIKTOKEN) -endif() +list( + APPEND _llama_runner__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp +) +list(APPEND _llama_runner__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp +) if(CMAKE_TOOLCHAIN_IOS OR ANDROID @@ -63,7 +60,24 @@ else() add_library(llama_runner SHARED ${_llama_runner__srcs}) endif() -set(llama_runner_deps executorch extension_module extension_data_loader) +# find RE2 for tokenizer, build tiktoken +set(ABSL_ENABLE_INSTALL ON) +set(ABSL_PROPAGATE_CXX_STD ON) +set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/third-party/abseil-cpp + ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp +) +add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/third-party/re2 + ${CMAKE_CURRENT_BINARY_DIR}/re2 +) +set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) + +set(llama_runner_deps executorch extension_module extension_data_loader + re2::re2 +) target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 7a2fa676628..8b9e6865516 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -16,11 +16,8 @@ #include #include -#if ET_USE_TIKTOKEN #include -#else /* BPE */ #include -#endif /* ET_USE_TIKTOKEN*/ namespace torch::executor { namespace { @@ -46,13 +43,6 @@ Runner::Runner( : temperature_(temperature), module_(std::make_unique(model_path, Module::LoadMode::File)), tokenizer_path_(tokenizer_path), - tokenizer_( -#if ET_USE_TIKTOKEN - get_tiktoken_for_llama() -#else - std::make_unique() -#endif - ), metadata_({ {kAppendEosToPrompt, false}, {kEnableDynamicShape, false}, @@ -79,8 +69,19 @@ Error Runner::load() { return Error::Ok; } ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - - tokenizer_->load(tokenizer_path_); + // load tokenizer + tokenizer_ = nullptr; + tokenizer_ = std::make_unique(); + Error err = tokenizer_->load(tokenizer_path_); + if (err == Error::InvalidArgument) { + ET_LOG( + Info, + "Failed to load %s as a BPETokenizer artifact, trying Tiktoken", + tokenizer_path_.c_str()); + tokenizer_.reset(); + tokenizer_ = get_tiktoken_for_llama(); + tokenizer_->load(tokenizer_path_); + } ET_LOG(Info, "Reading metadata from model"); diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index 3ffc10421fc..475c5d92ab1 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -8,9 +8,6 @@ def _get_operator_lib(aten = False): else: return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/extension/llm/custom_ops:custom_ops"] -def use_tiktoken(): - return native.read_config("llama", "use_tiktoken", "0") == "1" - def 
define_common_targets(): for aten in (True, False): aten_suffix = "_aten" if aten else "" @@ -26,7 +23,6 @@ def define_common_targets(): preprocessor_flags = [ "-DUSE_ATEN_LIB", ] if aten else [], - exported_preprocessor_flags = ["-DET_USE_TIKTOKEN"] if use_tiktoken() else [], visibility = [ "@EXECUTORCH_CLIENTS", ], @@ -43,11 +39,9 @@ def define_common_targets(): "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ] + ([ "//executorch/examples/models/llama2/tokenizer:tiktoken", - ] if use_tiktoken() else [ "//executorch/extension/llm/tokenizer:bpe_tokenizer", - ]) + (_get_operator_lib(aten)) + ([ + ] + (_get_operator_lib(aten)) + ([ # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE) # Therefore enable it explicitly for now to avoid failing tests "//executorch/backends/vulkan:vulkan_backend_lib", diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt index 7b59120d713..f02da300334 100644 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -24,6 +24,6 @@ target_include_directories( ) target_link_libraries( qnn_llama_runner qnn_executorch_backend full_portable_ops_lib - extension_data_loader extension_module gflags + extension_data_loader extension_module gflags re2::re2 ) target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index 674aa2b72fe..2ca3364905c 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -31,7 +31,7 @@ target_include_directories( ) target_link_libraries( qaihub_llama2_7b_runner qnn_executorch_backend executorch_no_prim_ops - extension_data_loader extension_module gflags + extension_data_loader extension_module gflags re2::re2 ) target_compile_options( qaihub_llama2_7b_runner PUBLIC ${_common_compile_options} @@ -71,7 +71,6 @@ list( _qaihub_llama3_8b_runner__srcs ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama2/tokenizer/llama_tiktoken.cpp ) -set(_preprocessor_flag -DET_USE_TIKTOKEN) # build qaihub llama3 8b runner add_executable(qaihub_llama3_8b_runner ${_qaihub_llama3_8b_runner__srcs}) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt index b0cec2d3005..affe666234a 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -20,7 +20,7 @@ target_include_directories( ) target_link_libraries( qaihub_stable_diffusion_runner qnn_executorch_backend executorch_no_prim_ops - extension_data_loader extension_module gflags + extension_data_loader extension_module gflags re2::re2 ) target_compile_options( qaihub_stable_diffusion_runner PUBLIC ${_common_compile_options} diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 5982cd16e10..4c2abeb4f6e 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -129,19 +129,18 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) quantized_ops_lib ) target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) - if(EXECUTORCH_USE_TIKTOKEN) - set(ABSL_ENABLE_INSTALL ON) - set(_pic_flag 
${CMAKE_POSITION_INDEPENDENT_CODE}) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp - ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp - ) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2 - ${CMAKE_CURRENT_BINARY_DIR}/re2 - ) - set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - target_link_libraries(executorch_llama_jni re2::re2) - endif() + # link re2 + set(ABSL_ENABLE_INSTALL ON) + set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp + ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp + ) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2 + ${CMAKE_CURRENT_BINARY_DIR}/re2 + ) + set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) + target_link_libraries(executorch_llama_jni re2::re2) endif() diff --git a/extension/llm/third-party/TARGETS b/extension/llm/third-party/TARGETS new file mode 100644 index 00000000000..978c12371fe --- /dev/null +++ b/extension/llm/third-party/TARGETS @@ -0,0 +1,47 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.cxx_library( + name = "abseil", + public_include_directories = ["abseil-cpp"], + srcs = glob( + ["abseil-cpp/absl/**/*.cc"], + exclude = [ + "abseil-cpp/absl/**/*test*.cc", + "abseil-cpp/absl/**/*mock*.cc", + "abseil-cpp/absl/**/*matchers*.cc", + "abseil-cpp/absl/**/*benchmark*.cc", + ], + ), + exported_linker_flags = select( + { + "DEFAULT": [], + "ovr_config//os:macos": ["-Wl,-framework,CoreFoundation"], + }, + ), + visibility = ["PUBLIC"], + _is_external_target = True, +) + +runtime.cxx_library( + name = "re2", + public_include_directories = ["re2"], + srcs = glob( + [ + "re2/re2/**/*.cc", + "re2/util/**/*.cc", + ], + exclude = [ + "re2/re2/**/*test*.cc", + "re2/re2/testing/*.cc", + "re2/re2/fuzzing/*.cc", + "re2/re2/**/*benchmark*.cc", + ], + ), + exported_deps = [ + ":abseil", + ], + visibility = ["PUBLIC"], + _is_external_target = True, +) diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim/xplat/executorch/build/env_interface.bzl index 27d2887b668..5b0acd36dab 100644 --- a/shim/xplat/executorch/build/env_interface.bzl +++ b/shim/xplat/executorch/build/env_interface.bzl @@ -41,7 +41,7 @@ _EXTERNAL_DEPS = { "libtorch_python": "//third-party:libtorch_python", "prettytable": "//third-party:prettytable", "pybind11": "//third-party:pybind11", - "re2": [], # TODO(larryliu0820): Add support + "re2": "//extension/llm/third-party:re2", "sentencepiece-py": [], # Core C++ PyTorch functionality like Tensor and ScalarType. 
"torch-core-cpp": "//third-party:libtorch", From 369f804e709a9fcf5f7d46bac47496a6fc97a646 Mon Sep 17 00:00:00 2001 From: lucylq Date: Thu, 29 Aug 2024 19:06:23 -0700 Subject: [PATCH 122/531] Update default segment alignment to 128 Differential Revision: D61104820 Pull Request resolved: https://github.com/pytorch/executorch/pull/4994 --- exir/_serialize/_program.py | 2 +- exir/_serialize/test/test_program.py | 4 +++- exir/capture/_config.py | 2 +- exir/lowered_backend_module.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index a82b947cec3..d22de71d1c2 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -348,7 +348,7 @@ def serialize_pte_binary( mutable_data: Optional[List[Buffer]] = None, extract_delegate_segments: bool = False, extract_constant_segment: bool = False, - segment_alignment: int = 4096, + segment_alignment: int = 128, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, ) -> Cord: diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index 54f8c7b6225..09927ad9648 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import copy import difflib import json @@ -36,7 +38,7 @@ ) from executorch.exir.tests.common import get_test_program -SEGMENT_ALIGNMENT: int = 4096 +SEGMENT_ALIGNMENT: int = 128 CONSTANT_TENSOR_ALIGNMENT: int = 16 diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 42dc170c19d..c0f7b71baf9 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -73,7 +73,7 @@ class ExecutorchBackendConfig: # When extracting segments, the starting offset of each segment will be # aligned to this value (in bytes). Must be a power of two. - segment_alignment: int = 4096 + segment_alignment: int = 128 # If provided, the minimum alignment of tensor buffers in the program. Must # be a power of 2. If not provided, uses the value in the schema file. diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index d93905a2bd0..6ba3b6bcb05 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -137,7 +137,7 @@ def original_module(self) -> ExportedProgram: def buffer( self, extract_delegate_segments: bool = False, - segment_alignment: int = 4096, + segment_alignment: int = 128, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, memory_planning: MemoryPlanningPass = None, # pyre-fixme[9] From 12039af2aaa0845eaa46f574e9ce334105846ff9 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 29 Aug 2024 22:19:42 -0700 Subject: [PATCH 123/531] Format Cmake files. 
Differential Revision: D62013660 Pull Request resolved: https://github.com/pytorch/executorch/pull/4999 --- examples/qualcomm/oss_scripts/llama2/CMakeLists.txt | 9 +++++++-- examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt | 9 +++++++-- .../qaihub_scripts/stable_diffusion/CMakeLists.txt | 9 +++++++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt index f02da300334..006e0f75174 100644 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -23,7 +23,12 @@ target_include_directories( qnn_llama_runner PUBLIC ${_common_include_directories} ) target_link_libraries( - qnn_llama_runner qnn_executorch_backend full_portable_ops_lib - extension_data_loader extension_module gflags re2::re2 + qnn_llama_runner + qnn_executorch_backend + full_portable_ops_lib + extension_data_loader + extension_module + gflags + re2::re2 ) target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index 2ca3364905c..c1fd5dc6538 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -30,8 +30,13 @@ target_include_directories( qaihub_llama2_7b_runner PUBLIC ${_common_include_directories} ) target_link_libraries( - qaihub_llama2_7b_runner qnn_executorch_backend executorch_no_prim_ops - extension_data_loader extension_module gflags re2::re2 + qaihub_llama2_7b_runner + qnn_executorch_backend + executorch_no_prim_ops + extension_data_loader + extension_module + gflags + re2::re2 ) target_compile_options( qaihub_llama2_7b_runner PUBLIC ${_common_compile_options} diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt index affe666234a..e6af95595b7 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -19,8 +19,13 @@ target_include_directories( qaihub_stable_diffusion_runner PUBLIC ${_common_include_directories} ) target_link_libraries( - qaihub_stable_diffusion_runner qnn_executorch_backend executorch_no_prim_ops - extension_data_loader extension_module gflags re2::re2 + qaihub_stable_diffusion_runner + qnn_executorch_backend + executorch_no_prim_ops + extension_data_loader + extension_module + gflags + re2::re2 ) target_compile_options( qaihub_stable_diffusion_runner PUBLIC ${_common_compile_options} From 12639645db567fa6b8f46e16381ef1d56d93d4bd Mon Sep 17 00:00:00 2001 From: lucylq Date: Thu, 29 Aug 2024 23:54:23 -0700 Subject: [PATCH 124/531] Preprocess C++ Differential Revision: D61833480 Pull Request resolved: https://github.com/pytorch/executorch/pull/4987 --- .../models/flamingo/preprocess/preprocess.cpp | 118 ++++++++++++++++++ .../models/flamingo/preprocess/preprocess.h | 41 ++++++ .../flamingo/preprocess/preprocess_test.cpp | 113 +++++++++++++++++ .../models/flamingo/preprocess/targets.bzl | 20 +++ 4 files changed, 292 insertions(+) create mode 100644 examples/models/flamingo/preprocess/preprocess.cpp create mode 100644 examples/models/flamingo/preprocess/preprocess.h create mode 100644 examples/models/flamingo/preprocess/preprocess_test.cpp create mode 100644 examples/models/flamingo/preprocess/targets.bzl diff --git 
a/examples/models/flamingo/preprocess/preprocess.cpp b/examples/models/flamingo/preprocess/preprocess.cpp new file mode 100644 index 00000000000..ff46070f669 --- /dev/null +++ b/examples/models/flamingo/preprocess/preprocess.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "executorch/examples/models/flamingo/preprocess/preprocess.h" + +#include +#include + +std::vector _get_factors(int n) { + std::vector factors; + for (int i = 1; i <= n; i++) { + if (n % i == 0) { + factors.push_back(i); + } + } + return factors; +} + +std::vector> find_supported_resolutions( + int max_num_tiles, + int tile_size) { + std::vector> supported_resolutions; + for (int _tile_size = max_num_tiles; _tile_size > 0; _tile_size--) { + auto factors = _get_factors(_tile_size); + for (int i = 0; i < factors.size(); i++) { + int height = factors[i]; + int width = _tile_size / factors[i]; + supported_resolutions.push_back({height * tile_size, width * tile_size}); + } + } + return supported_resolutions; +} + +std::vector get_canvas_best_fit( + std::vector image_size, + std::vector> possible_resolutions, + bool resize_to_max_canvas) { + assert(image_size.size() == 2); + int image_h = image_size[0]; + int image_w = image_size[1]; + + float best_scale = -0.1; + std::vector best_resolution; + int best_area = 0; + + for (int i = 0; i < possible_resolutions.size(); i++) { + assert(possible_resolutions[i].size() == 2); + float scale_h = possible_resolutions[i][0] / (float)image_h; + float scale_w = possible_resolutions[i][1] / (float)image_w; + + // Get limiting side scaling -> no distortion + float scale = scale_h < scale_w ? scale_h : scale_w; + + bool is_candidate = false; + + if (scale >= 1.0) { + // Upscaling options. + if (resize_to_max_canvas) { + is_candidate = scale >= best_scale; + } else { + is_candidate = ((scale <= best_scale) || (best_resolution.size() == 0)); + } + } else { + // If no upscaling options, find the minimum downscaling (max scale for + // scales < 1) + is_candidate = ((scale >= best_scale) || (best_resolution.size() == 0)); + } + + // Select the best resolution. + if (is_candidate) { + // @lint-ignore CLANGTIDY facebook-hte-ParameterUncheckedArrayBounds + int area = possible_resolutions[i][0] * possible_resolutions[i][1]; + if (scale == best_scale) { + // If there are multiple resolutions, get the one with minimum area to + // reduce padding. 
+ if (scale >= 1.0 && area < best_area) { + best_resolution = possible_resolutions[i]; + best_area = area; + } + } else { + best_resolution = possible_resolutions[i]; + best_scale = scale; + best_area = area; + } + } + } + return best_resolution; +} + +std::vector get_inscribed_size( + std::vector image_size, + std::vector target_size, + int max_size) { + assert(image_size.size() == 2); + assert(target_size.size() == 2); + + int target_height = target_size[0]; + int target_width = target_size[1]; + + if (max_size > 0) { + target_height = std::min(std::max(image_size[0], max_size), target_size[0]); + target_width = std::min(std::max(image_size[1], max_size), target_size[1]); + } + + int resize_height = std::min( + (int)(image_size[0] * (target_width / (float)image_size[1])), + target_height); + int resize_width = std::min( + (int)(image_size[1] * (target_height / (float)image_size[0])), + target_width); + + return {resize_height, resize_width}; +} diff --git a/examples/models/flamingo/preprocess/preprocess.h b/examples/models/flamingo/preprocess/preprocess.h new file mode 100644 index 00000000000..f6c7b813e95 --- /dev/null +++ b/examples/models/flamingo/preprocess/preprocess.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +// C++ implementation of the python functions in torchtune: +// https://github.com/pytorch/torchtune/tree/main/torchtune/modules/transforms/vision_utils + +// Calculate all factors of a given number. +std::vector _get_factors(int n); + +// Computes all combinations of resolutions, multiple of tile_size, +// that contain up to max_num_tiles. Useful for when dividing an image into +// tiles. For example, if we want at most 2 tiles per image, then we can support +// the following resolutions: (1x1, 1x2, 2x1) * tile_size Returns a vector of +// tuples of (height, width). +std::vector> find_supported_resolutions( + int max_num_tiles, + int tile_size); + +// Determines the best canvas possible from a list of possible resolutions to +// resize an image to, without distortion. +std::vector get_canvas_best_fit( + std::vector image_size, + std::vector> possible_resolutions, + bool resize_to_max_canvas); + +// Calculates the size of an image, if it was resized to be inscribed within the +// target_size. It is upscaled or downscaled such that one size is equal to the +// target_size, and the second size is less than or equal to the target_size. +std::vector get_inscribed_size( + std::vector image_size, + std::vector canvas_size, + int max_size); diff --git a/examples/models/flamingo/preprocess/preprocess_test.cpp b/examples/models/flamingo/preprocess/preprocess_test.cpp new file mode 100644 index 00000000000..deede877223 --- /dev/null +++ b/examples/models/flamingo/preprocess/preprocess_test.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +using namespace ::testing; + +// Mirror the torchtune python testing: +// https://github.com/pytorch/torchtune/tree/main/tests/torchtune/modules/transforms + +void test_find_supported_resolutions( + int max_num_tiles, + int tile_size, + std::vector> expected_resolutions) { + std::vector> resolutions = + find_supported_resolutions(max_num_tiles, tile_size); + + EXPECT_EQ(resolutions.size(), expected_resolutions.size()); + + for (int i = 0; i < resolutions.size(); i++) { + EXPECT_EQ(resolutions[i].size(), expected_resolutions[i].size()); + EXPECT_EQ(resolutions[i][0], expected_resolutions[i][0]); // height + EXPECT_EQ(resolutions[i][1], expected_resolutions[i][1]); // width + } +} + +TEST(PreprocessTest, TestFindSupportedResolution) { + test_find_supported_resolutions(1, 224, {{224, 224}}); + test_find_supported_resolutions(2, 100, {{100, 200}, {200, 100}, {100, 100}}); + test_find_supported_resolutions( + 3, 50, {{50, 150}, {150, 50}, {50, 100}, {100, 50}, {50, 50}}); + test_find_supported_resolutions( + 4, + 300, + { + {300, 1200}, + {600, 600}, + {1200, 300}, + {300, 900}, + {900, 300}, + {300, 600}, + {600, 300}, + {300, 300}, + }); +} + +void test_get_canvas_best_fit( + std::vector image_size, + std::vector> possible_resolutions, + bool resize_to_max_canvas, + std::vector expected_best_resolution) { + std::vector best_resolution = get_canvas_best_fit( + image_size, possible_resolutions, resize_to_max_canvas); + EXPECT_EQ(best_resolution[0], expected_best_resolution[0]); // height + EXPECT_EQ(best_resolution[1], expected_best_resolution[1]); // width +} + +TEST(PreprocessTest, TestGetCanvasBestFit_200x300_F) { + std::vector> possible_resolutions = { + {224, 896}, + {448, 448}, + {224, 224}, + {896, 224}, + {224, 672}, + {672, 224}, + {224, 448}, + {448, 224}, + }; + test_get_canvas_best_fit( + {200, 300}, + possible_resolutions, + false, // resize_to_max_canvas + {224, 448}); + + test_get_canvas_best_fit( + {200, 500}, + possible_resolutions, + true, // resize_to_max_canvas + {224, 672}); + test_get_canvas_best_fit( + {200, 200}, + possible_resolutions, + false, // resize_to_max_canvas + {224, 224}); + test_get_canvas_best_fit( + {200, 100}, + possible_resolutions, + true, // resize_to_max_canvas + {448, 224}); +} + +void test_get_inscribed_size( + std::vector image_size, + std::vector target_size, + int max_size, + std::vector expected_target_size) { + std::vector result = + get_inscribed_size(image_size, target_size, max_size); + EXPECT_EQ(result[0], expected_target_size[0]); // height + EXPECT_EQ(result[1], expected_target_size[1]); // width +} +TEST(PreprocessTest, GetInscribedSize) { + test_get_inscribed_size({200, 100}, {1000, 1200}, 600, {600, 300}); + test_get_inscribed_size({2000, 200}, {1000, 1200}, 2000, {1000, 100}); + test_get_inscribed_size({400, 200}, {1000, 1200}, -1, {1000, 500}); + test_get_inscribed_size({1000, 500}, {400, 300}, -1, {400, 200}); +} diff --git a/examples/models/flamingo/preprocess/targets.bzl b/examples/models/flamingo/preprocess/targets.bzl new file mode 100644 index 00000000000..fd60d94a907 --- /dev/null +++ b/examples/models/flamingo/preprocess/targets.bzl @@ -0,0 +1,20 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + runtime.cxx_library( + name = "preprocess", + srcs = ["preprocess.cpp"], + exported_headers = ["preprocess.h"], + ) + + runtime.cxx_test( + name = "preprocess_test", + srcs = ["preprocess_test.cpp"], + deps = [":preprocess"], + ) From b95e4b3c5a049b227824760067cf4433fa23bd77 Mon Sep 17 00:00:00 2001 From: shewu-quic <138087975+shewu-quic@users.noreply.github.com> Date: Fri, 30 Aug 2024 22:07:41 +0800 Subject: [PATCH 125/531] Qualcomm AI Engine Direct - Check the version QNN API and backend API (#4998) Summary: QNN API version format is major.minor.patch. If major given by the user does not match the built major, it will return and show error message. If minor does not match, it will show warning message. --- backends/qualcomm/runtime/QnnManager.cpp | 3 + .../runtime/backends/QnnBackendCommon.cpp | 79 +++++++++++++++++++ .../runtime/backends/QnnBackendCommon.h | 9 +++ .../runtime/backends/QnnBackendFactory.cpp | 9 ++- .../runtime/backends/htpbackend/HtpBackend.h | 10 +++ 5 files changed, 108 insertions(+), 2 deletions(-) diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 3027c184d95..38245ca7f96 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -281,6 +282,8 @@ Error QnnManager::Init() { options_->backend_options()->backend_type()); backend_params_ptr_ = QnnBackendFactory().Create( qnn_loaded_backend_, logger_.get(), qnn_context_blob_, options_); + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend.") ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_backend_ptr_->Configure() == Error::Ok, Internal, diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp index 3e286c07b02..c67f9b52f5d 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp @@ -53,6 +53,85 @@ Error QnnBackend::Configure() { } return Error::Ok; } + +Error QnnBackend::VerifyQNNSDKVersion( + const QnnExecuTorchBackendType backend_id) { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + + Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT}; + Qnn_ErrorHandle_t error = + qnn_interface.qnn_backend_get_api_version(&qnn_version); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR("Failed to get Qnn API version."); + return Error::Internal; + } + + Qnn_ApiVersion_t expected_version = {QNN_VERSION_INIT}; + expected_version.coreApiVersion.major = QNN_API_VERSION_MAJOR; + expected_version.coreApiVersion.minor = QNN_API_VERSION_MINOR; + expected_version.coreApiVersion.patch = QNN_API_VERSION_PATCH; + expected_version.backendApiVersion = GetExpectedBackendVersion(); + const char* backend_type = EnumNameQnnExecuTorchBackendType(backend_id); + + Error status = VersionChecker( + qnn_version.coreApiVersion, expected_version.coreApiVersion, "Qnn API"); + if (status == Error::Ok) { + status = VersionChecker( + qnn_version.backendApiVersion, + expected_version.backendApiVersion, + backend_type); + } + + return status; +} + +Error QnnBackend::VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix) { + if (qnn_version.major != expected.major) { + QNN_EXECUTORCH_LOG_ERROR( + "%s version %u.%u.%u is not supported. " + "The minimum supported version is %u.%u.%u. 
Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + return Error::Internal; + } + if (qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor < expected.minor) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is mismatched. " + "The minimum supported version is %u.%u.%u. Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + if ((qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor > expected.minor)) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is used. " + "The version is tested against %u.%u.%u.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + return Error::Ok; +} } // namespace qnn } // namespace executor } // namespace torch diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.h b/backends/qualcomm/runtime/backends/QnnBackendCommon.h index e6ea0adff8b..de007898e5d 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.h @@ -13,8 +13,10 @@ #include +#include "HTP/QnnHtpCommon.h" #include "QnnBackend.h" #include "QnnCommon.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -43,7 +45,10 @@ class QnnBackend { return handle_; } + Error VerifyQNNSDKVersion(const QnnExecuTorchBackendType backend_id); + protected: + virtual Qnn_Version_t GetExpectedBackendVersion() const = 0; virtual Error MakeConfig(std::vector& config) { return Error::Ok; }; @@ -52,6 +57,10 @@ class QnnBackend { Qnn_BackendHandle_t handle_; const QnnImplementation& implementation_; QnnLogger* logger_; + Error VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix); }; } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index acb95524682..9fb292613a3 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -16,6 +16,7 @@ std::unique_ptr QnnBackendFactory::Create( const QnnExecuTorchContextBinary& qnn_context_blob, const QnnExecuTorchOptions* options) { auto backend_params = std::make_unique(); + switch (options->backend_options()->backend_type()) { case QnnExecuTorchBackendType::kHtpBackend: { auto htp_options = options->backend_options()->htp_options(); @@ -51,6 +52,7 @@ std::unique_ptr QnnBackendFactory::Create( } backend_params->qnn_backend_ptr_ = std::make_unique(implementation, logger); + backend_params->qnn_device_ptr_ = std::make_unique( implementation, logger, options->soc_info(), htp_options); @@ -72,7 +74,6 @@ std::unique_ptr QnnBackendFactory::Create( backend_params->qnn_mem_manager_ptr_ = std::make_unique( implementation, backend_params->qnn_context_ptr_.get()); backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED; - return backend_params; } break; case QnnExecuTorchBackendType::kGpuBackend: case QnnExecuTorchBackendType::kDspBackend: @@ -81,7 +82,11 @@ std::unique_ptr QnnBackendFactory::Create( return nullptr; } - // should not reach here + if 
(backend_params->qnn_backend_ptr_->VerifyQNNSDKVersion( + options->backend_options()->backend_type()) == Error::Ok) { + return backend_params; + } + return nullptr; } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h index d4b14178a43..d00bd50cdc3 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h @@ -8,7 +8,9 @@ #pragma once #include +#include "HTP/QnnHtpCommon.h" #include "HTP/QnnHtpProfile.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -24,6 +26,14 @@ class HtpBackend : public QnnBackend { event_type == QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE); } + Qnn_Version_t GetExpectedBackendVersion() const override { + Qnn_Version_t backend_version; + backend_version.major = QNN_HTP_API_VERSION_MAJOR; + backend_version.minor = QNN_HTP_API_VERSION_MINOR; + backend_version.patch = QNN_HTP_API_VERSION_PATCH; + return backend_version; + } + protected: Error MakeConfig(std::vector& config) override { return Error::Ok; From 2520d50b12b17b030c9fb6c47882dfa6042ced72 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:01:17 -0700 Subject: [PATCH 126/531] Include stories with qnn in benchinfra Differential Revision: D62035079 Pull Request resolved: https://github.com/pytorch/executorch/pull/4995 --- .github/workflows/android-perf.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 285bf930918..f9f11d9abe0 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -84,7 +84,7 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. 
- CRON_DEFAULT_MODELS: "dl3,mv3,mv2,ic4,ic3" + CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3" CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x" CRON_DEFAULT_DELEGATES: "xnnpack,qnn" run: | @@ -162,6 +162,11 @@ jobs: # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "qnn" ]]; then + DELEGATE_CONFIG="qnn" + else + echo "Unsupported delegate ${{ matrix.delegate }}" + exit 1 fi PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" else From 1d6662dacf81fdebc4ad2ae7f38484d0d5029def Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:14:21 -0700 Subject: [PATCH 127/531] Remove tokenizer variants for Android build Now we only have one aar with both tiktoken and sentencepiece Pull Request resolved: https://github.com/pytorch/executorch/pull/5004 --- .github/workflows/android-perf.yml | 5 +---- .github/workflows/android.yml | 16 +++------------- build/build_android_llm_demo.sh | 13 +++++-------- examples/demo-apps/android/LlamaDemo/setup.sh | 1 - 4 files changed, 9 insertions(+), 26 deletions(-) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index f9f11d9abe0..cf89944abd7 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -206,9 +206,6 @@ jobs: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main needs: set-parameters - strategy: - matrix: - tokenizer: [bpe] with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -227,7 +224,7 @@ jobs: # TODO: This needs to be replaced with a generic loader .apk # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} + bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat upload-android-apps: diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index e33b6e78334..4c693a90e61 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -24,9 +24,6 @@ jobs: build-llm-demo: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - tokenizer: [bpe, tiktoken] with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -44,7 +41,7 @@ jobs: export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} + bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat upload-artifacts: @@ -155,13 +152,6 @@ jobs: id-token: write contents: read uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main - strategy: - matrix: - # https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/README.md#alternative-2-build-from-local-machine - # mentions that tiktoken is only for Llama3. 
So, we can export it later in another archive - # like https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip when this is - # updated to run Llama3 - tokenizer: [bpe] with: device-type: android runner: linux.2xlarge @@ -171,8 +161,8 @@ jobs: # This is the custom Android device pool that only includes Samsung Galaxy S2x device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa # Uploaded to S3 from the previous job, the name of the app comes from the project itself - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug-androidTest.apk + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug-androidTest.apk test-spec: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml # Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30 # days and the job will automatically re-upload the file when that happens. diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 61f54c47cba..4d34eb95b23 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -17,7 +17,6 @@ build_jar() { build_android_native_library() { ANDROID_ABI="$1" - TOKENIZER="$2" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" @@ -100,9 +99,8 @@ build_android_llm_demo_app() { } collect_artifacts_to_be_uploaded() { - TOKENIZER="$1" - ARTIFACTS_DIR_NAME="$2" - DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo_${TOKENIZER}" + ARTIFACTS_DIR_NAME="$1" + DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo" # The app directory is named using its build flavor as a suffix. mkdir -p "${DEMO_APP_DIR}" # Collect the app and its test suite @@ -124,13 +122,12 @@ export BUILD_AAR_DIR ANDROID_ABIS=("arm64-v8a" "x86_64") export ANDROID_ABIS -TOKENIZER="${1:-bpe}" -ARTIFACTS_DIR_NAME="$2" +ARTIFACTS_DIR_NAME="$1" build_jar for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do - build_android_native_library ${ANDROID_ABI} ${TOKENIZER} + build_android_native_library ${ANDROID_ABI} done build_aar build_android_llm_demo_app -collect_artifacts_to_be_uploaded ${TOKENIZER} ${ARTIFACTS_DIR_NAME} +collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME} diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 5b3244fbcce..ccb2a788d6e 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -8,7 +8,6 @@ set -eu CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" -EXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN:-OFF}" # Note: Set up ANDROID_NDK and ANDROID_ABI cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ From cd8aed6631c672cdd407319090135f6b4587e96b Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:12:41 -0700 Subject: [PATCH 128/531] Use a separate threadpool library Differential Revision: D61940492 Pull Request resolved: https://github.com/pytorch/executorch/pull/4967 --- CMakeLists.txt | 7 +++++ backends/xnnpack/test/CMakeLists.txt | 1 + build/Test.cmake | 2 ++ build/executorch-config.cmake | 1 + examples/models/llama2/CMakeLists.txt | 11 ++----- examples/models/llava/CMakeLists.txt | 11 ++----- extension/android/CMakeLists.txt | 10 ++----- extension/llm/custom_ops/CMakeLists.txt | 11 ++----- extension/parallel/test/CMakeLists.txt | 4 +-- extension/threadpool/CMakeLists.txt | 40 +++++++++++++++++++++++++ extension/threadpool/threadpool.cpp | 1 + 11 files changed, 61 insertions(+), 38 deletions(-) create mode 100644 extension/threadpool/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 20bb1bb122a..721e29f426d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -637,6 +637,13 @@ if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) endif() +if(EXECUTORCH_BUILD_PTHREADPOOL + AND EXECUTORCH_BUILD_CPUINFO + AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 +) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) +endif() + if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11) diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index 4b787e80eed..02852871fe0 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -34,6 +34,7 @@ et_cxx_test( SOURCES ${_test_srcs} EXTRA_LIBS + extension_threadpool xnnpack_backend XNNPACK pthreadpool diff --git a/build/Test.cmake b/build/Test.cmake index 20d5cc58f84..d6ef124793c 100644 --- a/build/Test.cmake +++ b/build/Test.cmake @@ -25,9 +25,11 @@ find_package(executorch CONFIG REQUIRED) enable_testing() find_package(GTest CONFIG REQUIRED) +target_link_options_shared_lib(cpuinfo) target_link_options_shared_lib(extension_data_loader) target_link_options_shared_lib(portable_kernels) target_link_options_shared_lib(portable_ops_lib) +target_link_options_shared_lib(pthreadpool) target_link_options_shared_lib(quantized_ops_lib) # Add code coverage flags to supported compilers diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 962990d7c82..695c8e455ba 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -46,6 +46,7 @@ set(lib_list extension_module extension_module_static extension_runner_util + extension_threadpool xnnpack_backend XNNPACK cpuinfo diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index b517641f408..7a9b69d65b1 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -125,13 +125,7 @@ set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) list(APPEND _common_compile_options -DET_USE_THREADPOOL) - list(APPEND link_libraries pthreadpool) - # These 2 source files are included in xnnpack_backend - if(NOT TARGET xnnpack_backend) - list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp - 
${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp - ) - endif() + list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include ) @@ -139,8 +133,7 @@ endif() # Extra sources for cpuinfo if(EXECUTORCH_BUILD_CPUINFO) - list(APPEND link_libraries cpuinfo) - list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/cpuinfo_utils.cpp) + list(APPEND link_libraries extension_threadpool cpuinfo) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/cpuinfo/include ) diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index abd455a6b17..444f6b33892 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -127,13 +127,7 @@ set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) list(APPEND _common_compile_options -DET_USE_THREADPOOL) - list(APPEND link_libraries pthreadpool) - # These 2 source files are included in xnnpack_backend - if(NOT TARGET xnnpack_backend) - list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp - ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp - ) - endif() + list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include ) @@ -141,8 +135,7 @@ endif() # Extra sources for cpuinfo if(EXECUTORCH_BUILD_CPUINFO) - list(APPEND link_libraries cpuinfo) - list(APPEND _srcs ${EXECUTORCH_ROOT}/extension/threadpool/cpuinfo_utils.cpp) + list(APPEND link_libraries extension_threadpool cpuinfo) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/cpuinfo/include ) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 4c2abeb4f6e..daa9c7c2496 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -32,7 +32,7 @@ find_package(executorch CONFIG REQUIRED) target_link_options_shared_lib(executorch) set(link_libraries) -list(APPEND link_libraries extension_data_loader extension_module executorch +list(APPEND link_libraries extension_data_loader extension_module extension_threadpool executorch fbjni ) @@ -93,13 +93,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) target_link_options_shared_lib(quantized_ops_lib) - if(TARGET pthreadpool) - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp - ../../extension/threadpool/cpuinfo_utils.cpp - ) - else() - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) - endif() + set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) if(TARGET pthreadpool) target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 5822352f306..1d9cf1e1f24 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -47,17 +47,10 @@ list(APPEND custom_ops_libs eigen_blas) list(TRANSFORM _custom_ops__srcs PREPEND "${EXECUTORCH_ROOT}/") -# TODO: Consider moving xnnpack/threadpool in a separate lib since it's now used -# by custom ops too. 
if(NOT EXECUTORCH_BUILD_XNNPACK) - list( - APPEND - _custom_ops__srcs - "${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/threadpool/threadpool.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/threadpool/threadpool_guard.cpp" - ) + list(APPEND custom_ops_libs extension_threadpool) else() - list(APPEND custom_ops_libs xnnpack_backend) + list(APPEND custom_ops_libs extension_threadpool xnnpack_backend) endif() add_library(custom_ops ${_custom_ops__srcs}) diff --git a/extension/parallel/test/CMakeLists.txt b/extension/parallel/test/CMakeLists.txt index 1453a868920..9f1ff1871a2 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/parallel/test/CMakeLists.txt @@ -23,12 +23,10 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp - ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp - ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp ) et_cxx_test( - extension_parallel_test SOURCES ${_test_srcs} EXTRA_LIBS pthreadpool cpuinfo + extension_parallel_test SOURCES ${_test_srcs} EXTRA_LIBS pthreadpool cpuinfo extension_threadpool ) target_include_directories( extension_parallel_test diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt new file mode 100644 index 00000000000..674d3136e1a --- /dev/null +++ b/extension/threadpool/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +cmake_minimum_required(VERSION 3.19) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +add_library(extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp) +target_link_libraries(extension_threadpool PUBLIC executorch cpuinfo pthreadpool) +target_include_directories(extension_threadpool PUBLIC ${EXECUTORCH_ROOT}/..) 
+target_include_directories(
+  extension_threadpool
+  PUBLIC
+  ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
+  ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
+)
+target_compile_options(extension_threadpool PUBLIC ${_common_compile_options})
+
+# Install libraries
+install(
+  TARGETS extension_threadpool
+  DESTINATION lib
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp
index 3de179de10d..e8f2ea5f704 100644
--- a/extension/threadpool/threadpool.cpp
+++ b/extension/threadpool/threadpool.cpp
@@ -14,6 +14,7 @@
 
 #include 
 #include 
+#include 
 
 namespace torch {
 namespace executorch {

From 9c1a52cd4e700c7b29946ee38423e39f798691b0 Mon Sep 17 00:00:00 2001
From: Guang Yang <42389959+guangy10@users.noreply.github.com>
Date: Fri, 30 Aug 2024 14:06:01 -0700
Subject: [PATCH 129/531] Fix export vit w/ QNN delegate

Differential Revision: D62012121

Pull Request resolved: https://github.com/pytorch/executorch/pull/4997
---
 .ci/scripts/test.sh                          |  3 +++
 .github/workflows/android-perf.yml           |  2 +-
 .github/workflows/trunk.yml                  |  2 +-
 backends/qualcomm/partition/common_defs.py   |  6 ++++-
 examples/qualcomm/scripts/torchvision_vit.py | 23 ++++++++++++++------
 5 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh
index 338f2868e33..1dbf4a8ce9e 100755
--- a/.ci/scripts/test.sh
+++ b/.ci/scripts/test.sh
@@ -170,6 +170,9 @@ test_model_with_qnn() {
   elif [[ "${MODEL_NAME}" == "ic3" ]]; then
     EXPORT_SCRIPT=inception_v3
     EXPORTED_MODEL_NAME=ic3_qnn.pte
+  elif [[ "${MODEL_NAME}" == "vit" ]]; then
+    EXPORT_SCRIPT=torchvision_vit
+    EXPORTED_MODEL_NAME=vit_qnn.pte
   fi
 
   "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index cf89944abd7..028cd4c5a22 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -84,7 +84,7 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3"
+          CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit"
           CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x"
           CRON_DEFAULT_DELEGATES: "xnnpack,qnn"
         run: |
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 6d08675b8e2..9d41f39172b 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -305,7 +305,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        model: [dl3, mv3, mv2, ic4, ic3]
+        model: [dl3, mv3, mv2, ic4, ic3, vit]
         fail-fast: false
     with:
       runner: linux.2xlarge
diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py
index 353169bc186..d68441c2f79 100644
--- a/backends/qualcomm/partition/common_defs.py
+++ b/backends/qualcomm/partition/common_defs.py
@@ -17,7 +17,11 @@
 ]
 
 to_be_implemented_operator = [
-    exir_ops.edge.aten.where.default,
+    exir_ops.edge.aten.any.dim,
+    exir_ops.edge.aten.eq.Scalar,
+    exir_ops.edge.aten.full_like.default,
+    exir_ops.edge.aten.logical_not.default,
+    exir_ops.edge.aten.where.self,
 ]
 
 allow_list_operator = [
diff --git a/examples/qualcomm/scripts/torchvision_vit.py b/examples/qualcomm/scripts/torchvision_vit.py
index dc9459bb13c..c9fc988d560 100755
--- a/examples/qualcomm/scripts/torchvision_vit.py
+++ b/examples/qualcomm/scripts/torchvision_vit.py
@@ -6,6 +6,7 @@
 
 import json
 import os
+import sys
 from multiprocessing.connection import Client
 
 import numpy as np
@@ -61,10 +62,14 @@ def main(args):
     os.makedirs(args.artifact, exist_ok=True)
 
     data_num = 100
-    inputs, targets, input_list = get_dataset(
-        dataset_path=f"{args.dataset}",
-        data_size=data_num,
-    )
+    if args.compile_only:
+        inputs = [(torch.rand(1, 3, 224, 224),)]
+    else:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=data_num,
+        )
+
     pte_filename = "vit_qnn"
     instance = TorchVisionViTModel()
     build_executorch_binary(
@@ -77,6 +82,9 @@ def main(args):
         shared_buffer=args.shared_buffer,
     )
 
+    if args.compile_only:
+        sys.exit(0)
+
     adb = SimpleADB(
         qnn_sdk=os.getenv("QNN_SDK_ROOT"),
         build_path=f"{args.build_folder}",
@@ -126,13 +134,14 @@ def main(args):
             "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
         ),
         type=str,
-        required=True,
+        required=False,
     )
     parser.add_argument(
         "-a",
         "--artifact",
-        help="path for storing generated artifacts by this example. " "Default ./vit",
-        default="./vit",
+        help="path for storing generated artifacts by this example. 
" + "Default ./torchvision_vit", + default="./torchvision_vit", type=str, ) From 0a8547a9f392751a9c2851519236a8cb29809c72 Mon Sep 17 00:00:00 2001 From: meta-emilian <162623112+meta-emilian@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:36:05 -0700 Subject: [PATCH 130/531] Enable MKL on x86 to get around long-context discrepancies with torch.nn.functional.scaled_dot_product_attention Differential Revision: D61931885 Pull Request resolved: https://github.com/pytorch/executorch/pull/4948 --- extension/llm/custom_ops/TARGETS | 2 +- extension/llm/custom_ops/targets.bzl | 81 ++++++++--------- kernels/optimized/lib_defs.bzl | 99 +++++++++++++-------- shim/tools/build_defs/fb_native_wrapper.bzl | 10 +++ 4 files changed, 112 insertions(+), 80 deletions(-) create mode 100644 shim/tools/build_defs/fb_native_wrapper.bzl diff --git a/extension/llm/custom_ops/TARGETS b/extension/llm/custom_ops/TARGETS index ff3fde6e2cc..8fe776ab095 100644 --- a/extension/llm/custom_ops/TARGETS +++ b/extension/llm/custom_ops/TARGETS @@ -14,7 +14,7 @@ runtime.python_test( "test_sdpa_with_kv_cache.py", ], preload_deps = [ - ":custom_ops_aot_lib", + ":custom_ops_aot_lib_mkl_noomp", ":custom_ops_aot_py", ], deps = [ diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index b90b636f7c4..099266de1bb 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -6,47 +6,48 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ - runtime.cxx_library( - name = "custom_ops", - srcs = ["op_sdpa.cpp", "op_fallback.cpp"], - exported_headers = ["op_sdpa.h", "op_fallback.h"], - exported_deps = [ - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/kernels/optimized:libblas", - "//executorch/kernels/optimized:libvec", - "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", - "//executorch/extension/threadpool:threadpool", - ], - compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], - visibility = [ - "//executorch/...", - "//executorch/extension/llm/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - # @lint-ignore BUCKLINT link_whole - link_whole = True, - force_static = True, - ) + for mkl_dep in ["", "_mkl_noomp"]: + runtime.cxx_library( + name = "custom_ops" + mkl_dep, + srcs = ["op_sdpa.cpp", "op_fallback.cpp"], + exported_headers = ["op_sdpa.h", "op_fallback.h"], + exported_deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/kernels/optimized:libblas{}".format(mkl_dep), + "//executorch/kernels/optimized:libvec", + "//executorch/extension/kernel_util:kernel_util", + "//executorch/extension/parallel:thread_parallel", + "//executorch/extension/threadpool:threadpool", + ], + compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"], + visibility = [ + "//executorch/...", + "//executorch/extension/llm/custom_ops/...", + "@EXECUTORCH_CLIENTS", + ], + # @lint-ignore BUCKLINT link_whole + link_whole = True, + force_static = True, + ) - runtime.cxx_library( - name = "custom_ops_aot_lib", - srcs = [ - "op_sdpa_aot.cpp", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - external_deps = [ - "libtorch", - ], - deps = [ - ":custom_ops", - "//executorch/extension/aten_util:aten_bridge", - ], - ) + runtime.cxx_library( + name = 
"custom_ops_aot_lib" + mkl_dep, + srcs = [ + "op_sdpa_aot.cpp", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + external_deps = [ + "libtorch", + ], + deps = [ + ":custom_ops" + mkl_dep, + "//executorch/extension/aten_util:aten_bridge", + ], + ) runtime.python_library( name = "custom_ops_aot_py", diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 5af9b423ad0..16ce446df40 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -1,4 +1,5 @@ load("@fbsource//tools/build_defs:default_platform_defs.bzl", "DEVSERVER_PLATFORM_REGEX") +load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") # Because vec exists as a collection of header files, compile and preprocessor @@ -99,44 +100,64 @@ def define_libs(): ], ) - runtime.cxx_library( - name = "libblas", - srcs = native.glob([ - "blas/**/*.cpp", - ]), - exported_headers = native.glob([ - "blas/**/*.h", - ]), - header_namespace = "executorch/kernels/optimized", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - fbandroid_platform_preprocessor_flags = [ - ( - "^android-arm64.*$", - [ - "-DET_BUILD_WITH_BLAS", - ], - ), - ], - fbandroid_platform_deps = [ - ( - "^android-arm64.*$", - [ - "fbsource//third-party/openblas:openblas", - ], - ), - ], - fbobjc_exported_preprocessor_flags = [ - "-DET_BUILD_WITH_BLAS", - "-DET_BUILD_FOR_APPLE", - ], - fbobjc_frameworks = [ - "Accelerate", - ], - exported_deps = [ - "//executorch/kernels/optimized:libutils", - "//executorch/runtime/core/exec_aten:lib", + # OSS doesn't have ovr_config//os:linux-x86_64 + fb_native.config_setting( + name = "linux-x86_64", + constraint_values = [ + "ovr_config//os/constraints:linux", + "ovr_config//cpu/constraints:x86_64", ], ) + + for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: + runtime.cxx_library( + name = libblas_name, + srcs = native.glob([ + "blas/**/*.cpp", + ]), + exported_headers = native.glob([ + "blas/**/*.h", + ]), + header_namespace = "executorch/kernels/optimized", + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + preprocessor_flags = select({ + ":linux-x86_64": [ + "-DET_BUILD_WITH_BLAS", + ] if not runtime.is_oss else [], + "DEFAULT": [], + }), + fbandroid_platform_preprocessor_flags = [ + ( + "^android-arm64.*$", + [ + "-DET_BUILD_WITH_BLAS", + ], + ), + ], + fbandroid_platform_deps = [ + ( + "^android-arm64.*$", + [ + "fbsource//third-party/openblas:openblas", + ], + ), + ], + fbobjc_exported_preprocessor_flags = [ + "-DET_BUILD_WITH_BLAS", + "-DET_BUILD_FOR_APPLE", + ], + fbobjc_frameworks = [ + "Accelerate", + ], + deps = select({ + ":linux-x86_64": [mkl_dep] if not runtime.is_oss else [], + "DEFAULT": [], + }), + exported_deps = [ + "//executorch/kernels/optimized:libutils", + "//executorch/runtime/core/exec_aten:lib", + ], + ) diff --git a/shim/tools/build_defs/fb_native_wrapper.bzl b/shim/tools/build_defs/fb_native_wrapper.bzl new file mode 100644 index 00000000000..d67b9384fe9 --- /dev/null +++ b/shim/tools/build_defs/fb_native_wrapper.bzl @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under both the MIT license found in the +# LICENSE-MIT file in the root directory of this source tree and the Apache +# License, Version 2.0 found in the LICENSE-APACHE file in the root directory +# of this source tree. + +fb_native = struct( + config_setting = native.config_setting, +) From 47ac24ba6e7bb3c962583abbd258ce89c18cbc4f Mon Sep 17 00:00:00 2001 From: meta-emilian <162623112+meta-emilian@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:57:55 -0700 Subject: [PATCH 131/531] Changing sdpa_with_kv_cache tests to use a wider dynamic range. Differential Revision: D61403179 Pull Request resolved: https://github.com/pytorch/executorch/pull/4892 --- .../llm/custom_ops/test_sdpa_with_kv_cache.py | 91 ++++++++++++++++--- 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py index a1b36e688f9..dd63c68f138 100644 --- a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py +++ b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py @@ -392,17 +392,50 @@ def setUp(self): self.max_seq_len = 2048 self.setup_caches() + def _scale_tensor(self, tensor, min_value, max_value, scale=True): + normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min()) + + scaled_tensor = normalized_tensor * (max_value - min_value) + min_value + + return scaled_tensor if scale else tensor + def _test_sdpa_common( - self, n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len=1 + self, + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + seq_len, + next_iter_seq_len=1, + scale_tensors=False, ): + # Range arbitrarily chosen to reproduce a numerical error on x86 in some of the long context tests + tensor_scale_max = 20 + tensor_scale_min = -20 self.n_heads_kv = n_heads_kv self.n_heads_q = n_heads_q self.head_dim = head_dim self.max_seq_len = max_seq_len self.setup_caches() - q = torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)) - k = torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)) - v = torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)) + q = self._scale_tensor( + torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + k = self._scale_tensor( + torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + v = self._scale_tensor( + torch.rand((1, seq_len, self.n_heads_kv, self.head_dim)), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + start_pos = 0 attn_mask = self.mask[start_pos : start_pos + seq_len, :] attn_mask = attn_mask[:, : start_pos + seq_len] @@ -412,11 +445,27 @@ def _test_sdpa_common( op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, self.k_cache, self.v_cache, start_pos, seq_len, None, 0, True ) - self.assertTrue(torch.allclose(ref_output, op_output)) + self.assertTrue(torch.allclose(ref_output, op_output, atol=1e-6)) + + q = self._scale_tensor( + torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + k = self._scale_tensor( + torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) + v = self._scale_tensor( + torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)), + tensor_scale_max, + tensor_scale_min, + scale_tensors, + ) - q = torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)) - k = 
torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)) - v = torch.rand((1, next_iter_seq_len, self.n_heads_kv, self.head_dim)) start_pos = seq_len seq_len = q.size(1) attn_mask = self.mask[start_pos : start_pos + seq_len, :] @@ -427,7 +476,7 @@ def _test_sdpa_common( op_output = torch.ops.llama.sdpa_with_kv_cache( q, k, v, self.k_cache, self.v_cache, start_pos, seq_len, None, 0, True ) - self.assertTrue(torch.allclose(ref_output, op_output)) + self.assertTrue(torch.allclose(ref_output, op_output, atol=1e-6)) class SDPATestForLargeSeqLength(SDPATestCommon): @@ -438,7 +487,9 @@ def test_sdpa_with_cache_seq_len_130(self): head_dim = 128 max_seq_len = 2048 seq_len = 130 - self._test_sdpa_common(n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len) + self._test_sdpa_common( + n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, True + ) def test_sdpa_with_cache_seq_len_small(self): n_heads_kv = 4 @@ -462,7 +513,9 @@ def test_sdpa_with_cache_seq_len_130_gqa(self): head_dim = 128 max_seq_len = 2048 seq_len = 130 - self._test_sdpa_common(n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len) + self._test_sdpa_common( + n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, True + ) def test_sdpa_with_cache_seq_len_llava_example_gqa(self): n_heads_kv = 16 @@ -483,7 +536,13 @@ def test_sdpa_with_cache_seq_len_130(self): seq_len = 130 next_iter_seq_len = 17 self._test_sdpa_common( - n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + seq_len, + next_iter_seq_len, + True, ) def test_sdpa_with_cache_seq_len_llava_example(self): @@ -505,7 +564,13 @@ def test_sdpa_with_cache_seq_len_130_gqa(self): seq_len = 130 next_iter_seq_len = 33 self._test_sdpa_common( - n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len + n_heads_kv, + n_heads_q, + head_dim, + max_seq_len, + seq_len, + next_iter_seq_len, + True, ) def test_sdpa_with_cache_seq_len_llava_example_gqa(self): From e7e86478b4807d18ec0ac68c91f6d7e5c6d0f14e Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:05:27 -0700 Subject: [PATCH 132/531] Add a new MiniBench apk for benchmarking - app skeleton So far it supports generic model (not LLM). 
Later we can combine two Build: ``` cd extension/android/benchmark mkdir app/libs cp app/libs/executorch.aar ./gradlew :app:installDebug When we have library changes ready (#5000), we can start trying it adb shell am start -n org.pytorch.minibench/org.pytorch.minibench.BenchmarkActivity --es model_path /data/local/tmp/model.pte adb shell run-as org.pytorch.minibench cat files/benchmark_results.txt ``` Pull Request resolved: https://github.com/pytorch/executorch/pull/5015 --- extension/android/benchmark/.gitignore | 16 ++ extension/android/benchmark/app/.gitignore | 1 + .../android/benchmark/app/build.gradle.kts | 41 ++++ .../android/benchmark/app/proguard-rules.pro | 21 ++ .../minibench/ExampleInstrumentedTest.java | 26 +++ .../app/src/main/AndroidManifest.xml | 21 ++ .../pytorch/minibench/BenchmarkActivity.java | 46 +++++ .../app/src/main/res/values/colors.xml | 10 + .../app/src/main/res/values/strings.xml | 3 + .../app/src/main/res/values/themes.xml | 5 + .../pytorch/minibench/ExampleUnitTest.java | 17 ++ extension/android/benchmark/build.gradle.kts | 4 + extension/android/benchmark/gradle.properties | 21 ++ .../gradle/wrapper/gradle-wrapper.jar | Bin 0 -> 59203 bytes .../gradle/wrapper/gradle-wrapper.properties | 6 + extension/android/benchmark/gradlew | 185 ++++++++++++++++++ extension/android/benchmark/gradlew.bat | 89 +++++++++ .../android/benchmark/settings.gradle.kts | 17 ++ 18 files changed, 529 insertions(+) create mode 100644 extension/android/benchmark/.gitignore create mode 100644 extension/android/benchmark/app/.gitignore create mode 100644 extension/android/benchmark/app/build.gradle.kts create mode 100644 extension/android/benchmark/app/proguard-rules.pro create mode 100644 extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java create mode 100644 extension/android/benchmark/app/src/main/AndroidManifest.xml create mode 100644 extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java create mode 100644 extension/android/benchmark/app/src/main/res/values/colors.xml create mode 100644 extension/android/benchmark/app/src/main/res/values/strings.xml create mode 100644 extension/android/benchmark/app/src/main/res/values/themes.xml create mode 100644 extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java create mode 100644 extension/android/benchmark/build.gradle.kts create mode 100644 extension/android/benchmark/gradle.properties create mode 100644 extension/android/benchmark/gradle/wrapper/gradle-wrapper.jar create mode 100644 extension/android/benchmark/gradle/wrapper/gradle-wrapper.properties create mode 100755 extension/android/benchmark/gradlew create mode 100644 extension/android/benchmark/gradlew.bat create mode 100644 extension/android/benchmark/settings.gradle.kts diff --git a/extension/android/benchmark/.gitignore b/extension/android/benchmark/.gitignore new file mode 100644 index 00000000000..0d02171028f --- /dev/null +++ b/extension/android/benchmark/.gitignore @@ -0,0 +1,16 @@ +*.iml +.gradle +/local.properties +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild +.cxx +local.properties +*.aar diff --git a/extension/android/benchmark/app/.gitignore b/extension/android/benchmark/app/.gitignore new file mode 100644 index 00000000000..42afabfd2ab --- /dev/null +++ b/extension/android/benchmark/app/.gitignore @@ -0,0 +1 @@ +/build \ 
No newline at end of file diff --git a/extension/android/benchmark/app/build.gradle.kts b/extension/android/benchmark/app/build.gradle.kts new file mode 100644 index 00000000000..b48404f8ff7 --- /dev/null +++ b/extension/android/benchmark/app/build.gradle.kts @@ -0,0 +1,41 @@ +plugins { + id("com.android.application") +} + +android { + namespace = "org.pytorch.minibench" + compileSdk = 34 + + defaultConfig { + applicationId = "org.pytorch.minibench" + minSdk = 28 + targetSdk = 33 + versionCode = 1 + versionName = "1.0" + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } +} + +dependencies { + implementation(files("libs/executorch.aar")) + implementation("com.facebook.soloader:soloader:0.10.5") + implementation("com.facebook.fbjni:fbjni:0.5.1") + testImplementation("junit:junit:4.13.2") + androidTestImplementation("androidx.test.ext:junit:1.2.1") + androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") +} diff --git a/extension/android/benchmark/app/proguard-rules.pro b/extension/android/benchmark/app/proguard-rules.pro new file mode 100644 index 00000000000..481bb434814 --- /dev/null +++ b/extension/android/benchmark/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java b/extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java new file mode 100644 index 00000000000..c5887aebccf --- /dev/null +++ b/extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java @@ -0,0 +1,26 @@ +package org.pytorch.minibench; + +import android.content.Context; + +import androidx.test.platform.app.InstrumentationRegistry; +import androidx.test.ext.junit.runners.AndroidJUnit4; + +import org.junit.Test; +import org.junit.runner.RunWith; + +import static org.junit.Assert.*; + +/** + * Instrumented test, which will execute on an Android device. + * + * @see Testing documentation + */ +@RunWith(AndroidJUnit4.class) +public class ExampleInstrumentedTest { + @Test + public void useAppContext() { + // Context of the app under test. 
+ Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); + assertEquals("org.pytorch.minibench", appContext.getPackageName()); + } +} diff --git a/extension/android/benchmark/app/src/main/AndroidManifest.xml b/extension/android/benchmark/app/src/main/AndroidManifest.xml new file mode 100644 index 00000000000..49711b6830e --- /dev/null +++ b/extension/android/benchmark/app/src/main/AndroidManifest.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java new file mode 100644 index 00000000000..17897d0d36e --- /dev/null +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.minibench; + +import android.app.Activity; +import android.content.Intent; +import android.os.Bundle; + +import org.pytorch.executorch.Module; + +import java.io.FileWriter; +import java.io.IOException; + +public class BenchmarkActivity extends Activity { + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + Intent intent = getIntent(); + String modelPath = intent.getStringExtra("model_path"); + int numIter = intent.getIntExtra("num_iter", 10); + + // TODO: Format the string with a parsable format + StringBuilder resultText = new StringBuilder(); + + Module module = Module.load(modelPath); + for (int i = 0; i < numIter; i++) { + long start = System.currentTimeMillis(); + module.forward(); + long forwardMs = System.currentTimeMillis() - start; + resultText.append(forwardMs).append(";"); + } + + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { + writer.write(resultText.toString()); + } catch (IOException e) { + e.printStackTrace(); + } + + } +} diff --git a/extension/android/benchmark/app/src/main/res/values/colors.xml b/extension/android/benchmark/app/src/main/res/values/colors.xml new file mode 100644 index 00000000000..ca1931bca99 --- /dev/null +++ b/extension/android/benchmark/app/src/main/res/values/colors.xml @@ -0,0 +1,10 @@ + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + diff --git a/extension/android/benchmark/app/src/main/res/values/strings.xml b/extension/android/benchmark/app/src/main/res/values/strings.xml new file mode 100644 index 00000000000..34062786b93 --- /dev/null +++ b/extension/android/benchmark/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + MiniBench + \ No newline at end of file diff --git a/extension/android/benchmark/app/src/main/res/values/themes.xml b/extension/android/benchmark/app/src/main/res/values/themes.xml new file mode 100644 index 00000000000..8a63cb8955a --- /dev/null +++ b/extension/android/benchmark/app/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + + diff --git a/extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java b/extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java new file mode 100644 index 00000000000..134410482b8 --- /dev/null +++ b/extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java @@ -0,0 +1,17 @@ +package 
org.pytorch.minibench; + +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + * Example local unit test, which will execute on the development machine (host). + * + * @see Testing documentation + */ +public class ExampleUnitTest { + @Test + public void addition_isCorrect() { + assertEquals(4, 2 + 2); + } +} diff --git a/extension/android/benchmark/build.gradle.kts b/extension/android/benchmark/build.gradle.kts new file mode 100644 index 00000000000..cc9db8a5cc0 --- /dev/null +++ b/extension/android/benchmark/build.gradle.kts @@ -0,0 +1,4 @@ +// Top-level build file where you can add configuration options common to all sub-projects/modules. +plugins { + id("com.android.application") version "8.1.0" apply false +} diff --git a/extension/android/benchmark/gradle.properties b/extension/android/benchmark/gradle.properties new file mode 100644 index 00000000000..a03b3548962 --- /dev/null +++ b/extension/android/benchmark/gradle.properties @@ -0,0 +1,21 @@ +# Project-wide Gradle settings. +# IDE (e.g. Android Studio) users: +# Gradle settings configured through the IDE *will override* +# any settings specified in this file. +# For more details on how to configure your build environment visit +# http://www.gradle.org/docs/current/userguide/build_environment.html +# Specifies the JVM arguments used for the daemon process. +# The setting is particularly useful for tweaking memory settings. +org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8 +# When configured, Gradle will run in incubating parallel mode. +# This option should only be used with decoupled projects. More details, visit +# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects +# org.gradle.parallel=true +# AndroidX package structure to make it clearer which packages are bundled with the +# Android operating system, and which are packaged with your app's APK +# https://developer.android.com/topic/libraries/support-library/androidx-rn +android.useAndroidX=true +# Enables namespacing of each library's R class so that its R class includes only the +# resources declared in the library itself and none from the library's dependencies, +# thereby reducing the size of the R class for that library +android.nonTransitiveRClass=true diff --git a/extension/android/benchmark/gradle/wrapper/gradle-wrapper.jar b/extension/android/benchmark/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000000000000000000000000000000000000..e708b1c023ec8b20f512888fe07c5bd3ff77bb8f GIT binary patch literal 59203 zcma&O1CT9Y(k9%tZQHhO+qUh#ZQHhO+qmuS+qP|E@9xZO?0h@l{(r>DQ>P;GjjD{w zH}lENr;dU&FbEU?00aa80D$0M0RRB{U*7-#kbjS|qAG&4l5%47zyJ#WrfA#1$1Ctx zf&Z_d{GW=lf^w2#qRJ|CvSJUi(^E3iv~=^Z(zH}F)3Z%V3`@+rNB7gTVU{Bb~90p|f+0(v;nz01EG7yDMX9@S~__vVgv%rS$+?IH+oZ03D5zYrv|^ zC1J)SruYHmCki$jLBlTaE5&dFG9-kq3!^i>^UQL`%gn6)jz54$WDmeYdsBE9;PqZ_ zoGd=P4+|(-u4U1dbAVQrFWoNgNd;0nrghPFbQrJctO>nwDdI`Q^i0XJDUYm|T|RWc zZ3^Qgo_Qk$%Fvjj-G}1NB#ZJqIkh;kX%V{THPqOyiq)d)0+(r9o(qKlSp*hmK#iIY zA^)Vr$-Hz<#SF=0@tL@;dCQsm`V9s1vYNq}K1B)!XSK?=I1)tX+bUV52$YQu*0%fnWEukW>mxkz+%3-S!oguE8u#MGzST8_Dy^#U?fA@S#K$S@9msUiX!gd_ow>08w5)nX{-KxqMOo7d?k2&?Vf z&diGDtZr(0cwPe9z9FAUSD9KC)7(n^lMWuayCfxzy8EZsns%OEblHFSzP=cL6}?J| z0U$H!4S_TVjj<`6dy^2j`V`)mC;cB%* z8{>_%E1^FH!*{>4a7*C1v>~1*@TMcLK{7nEQ!_igZC}ikJ$*<$yHy>7)oy79A~#xE zWavoJOIOC$5b6*q*F_qN1>2#MY)AXVyr$6x4b=$x^*aqF*L?vmj>Mgv+|ITnw_BoW zO?jwHvNy^prH{9$rrik1#fhyU^MpFqF2fYEt(;4`Q&XWOGDH8k6M=%@fics4ajI;st# zCU^r1CK&|jzUhRMv;+W~6N;u<;#DI6cCw-otsc@IsN3MoSD^O`eNflIoR~l4*&-%RBYk@gb^|-JXs&~KuSEmMxB}xSb 
z@K76cXD=Y|=I&SNC2E+>Zg?R6E%DGCH5J1nU!A|@eX9oS(WPaMm==k2s_ueCqdZw| z&hqHp)47`c{BgwgvY2{xz%OIkY1xDwkw!<0veB#yF4ZKJyabhyyVS`gZepcFIk%e2 zTcrmt2@-8`7i-@5Nz>oQWFuMC_KlroCl(PLSodswHqJ3fn<;gxg9=}~3x_L3P`9Sn zChIf}8vCHvTriz~T2~FamRi?rh?>3bX1j}%bLH+uFX+p&+^aXbOK7clZxdU~6Uxgy z8R=obwO4dL%pmVo*Ktf=lH6hnlz_5k3cG;m8lgaPp~?eD!Yn2kf)tU6PF{kLyn|oI@eQ`F z3IF7~Blqg8-uwUuWZScRKn%c2_}dXB6Dx_&xR*n9M9LXasJhtZdr$vBY!rP{c@=)& z#!?L$2UrkvClwQO>U*fSMs67oSj2mxiJ$t;E|>q%Kh_GzzWWO&3;ufU%2z%ucBU8H z3WIwr$n)cfCXR&>tyB7BcSInK>=ByZA%;cVEJhcg<#6N{aZC4>K41XF>ZgjG`z_u& zGY?;Ad?-sgiOnI`oppF1o1Gurqbi*;#x2>+SSV6|1^G@ooVy@fg?wyf@0Y!UZ4!}nGuLeC^l)6pwkh|oRY`s1Pm$>zZ3u-83T|9 zGaKJIV3_x+u1>cRibsaJpJqhcm%?0-L;2 zitBrdRxNmb0OO2J%Y&Ym(6*`_P3&&5Bw157{o7LFguvxC$4&zTy#U=W*l&(Q2MNO} zfaUwYm{XtILD$3864IA_nn34oVa_g^FRuHL5wdUd)+W-p-iWCKe8m_cMHk+=? zeKX)M?Dt(|{r5t7IenkAXo%&EXIb-i^w+0CX0D=xApC=|Xy(`xy+QG^UyFe z+#J6h_&T5i#sV)hj3D4WN%z;2+jJcZxcI3*CHXGmOF3^)JD5j&wfX)e?-|V0GPuA+ zQFot%aEqGNJJHn$!_}#PaAvQ^{3-Ye7b}rWwrUmX53(|~i0v{}G_sI9uDch_brX&6 zWl5Ndj-AYg(W9CGfQf<6!YmY>Ey)+uYd_JNXH=>|`OH-CDCmcH(0%iD_aLlNHKH z7bcW-^5+QV$jK?R*)wZ>r9t}loM@XN&M-Pw=F#xn(;u3!(3SXXY^@=aoj70;_=QE9 zGghsG3ekq#N||u{4We_25U=y#T*S{4I{++Ku)> zQ!DZW;pVcn>b;&g2;YE#+V`v*Bl&Y-i@X6D*OpNA{G@JAXho&aOk(_j^weW{#3X5Y z%$q_wpb07EYPdmyH(1^09i$ca{O<}7) zRWncXdSPgBE%BM#by!E>tdnc$8RwUJg1*x($6$}ae$e9Knj8gvVZe#bLi!<+&BkFj zg@nOpDneyc+hU9P-;jmOSMN|*H#>^Ez#?;%C3hg_65leSUm;iz)UkW)jX#p)e&S&M z1|a?wDzV5NVnlhRBCd_;F87wp>6c<&nkgvC+!@KGiIqWY4l}=&1w7|r6{oBN8xyzh zG$b#2=RJp_iq6)#t5%yLkKx(0@D=C3w+oiXtSuaQ%I1WIb-eiE$d~!)b@|4XLy!CZ z9p=t=%3ad@Ep+<9003D2KZ5VyP~_n$=;~r&YUg5UZ0KVD&tR1DHy9x)qWtKJp#Kq# zP*8p#W(8JJ_*h_3W}FlvRam?<4Z+-H77^$Lvi+#vmhL9J zJ<1SV45xi;SrO2f=-OB(7#iNA5)x1uNC-yNxUw|!00vcW2PufRm>e~toH;M0Q85MQLWd?3O{i8H+5VkR@l9Dg-ma ze2fZ%>G(u5(k9EHj2L6!;(KZ8%8|*-1V|B#EagbF(rc+5iL_5;Eu)L4Z-V;0HfK4d z*{utLse_rvHZeQ>V5H=f78M3Ntg1BPxFCVD{HbNA6?9*^YIq;B-DJd{Ca2L#)qWP? zvX^NhFmX?CTWw&Ns}lgs;r3i+Bq@y}Ul+U%pzOS0Fcv9~aB(0!>GT0)NO?p=25LjN z2bh>6RhgqD7bQj#k-KOm@JLgMa6>%-ok1WpOe)FS^XOU{c?d5shG(lIn3GiVBxmg`u%-j=)^v&pX1JecJics3&jvPI)mDut52? 
z3jEA)DM%}BYbxxKrizVYwq?(P&19EXlwD9^-6J+4!}9{ywR9Gk42jjAURAF&EO|~N z)?s>$Da@ikI4|^z0e{r`J8zIs>SpM~Vn^{3fArRu;?+43>lD+^XtUcY1HidJwnR6+ z!;oG2=B6Z_=M%*{z-RaHc(n|1RTKQdNjjV!Pn9lFt^4w|AeN06*j}ZyhqZ^!-=cyGP_ShV1rGxkx8t zB;8`h!S{LD%ot``700d0@Grql(DTt4Awgmi+Yr0@#jbe=2#UkK%rv=OLqF)9D7D1j z!~McAwMYkeaL$~kI~90)5vBhBzWYc3Cj1WI0RS`z000R8-@ET0dA~*r(gSiCJmQMN&4%1D zyVNf0?}sBH8zNbBLn>~(W{d3%@kL_eQ6jEcR{l>C|JK z(R-fA!z|TTRG40|zv}7E@PqCAXP3n`;%|SCQ|ZS%ym$I{`}t3KPL&^l5`3>yah4*6 zifO#{VNz3)?ZL$be;NEaAk9b#{tV?V7 zP|wf5YA*1;s<)9A4~l3BHzG&HH`1xNr#%){4xZ!jq%o=7nN*wMuXlFV{HaiQLJ`5G zBhDi#D(m`Q1pLh@Tq+L;OwuC52RdW7b8}~60WCOK5iYMUad9}7aWBuILb({5=z~YF zt?*Jr5NG+WadM{mDL>GyiByCuR)hd zA=HM?J6l1Xv0Dl+LW@w$OTcEoOda^nFCw*Sy^I@$sSuneMl{4ys)|RY#9&NxW4S)9 zq|%83IpslTLoz~&vTo!Ga@?rj_kw{|k{nv+w&Ku?fyk4Ki4I?);M|5Axm)t+BaE)D zm(`AQ#k^DWrjbuXoJf2{Aj^KT zFb1zMSqxq|vceV+Mf-)$oPflsO$@*A0n0Z!R{&(xh8s}=;t(lIy zv$S8x>m;vQNHuRzoaOo?eiWFe{0;$s`Bc+Osz~}Van${u;g(su`3lJ^TEfo~nERfP z)?aFzpDgnLYiERsKPu|0tq4l2wT)Atr6Qb%m-AUn6HnCue*yWICp7TjW$@sO zm5rm4aTcPQ(rfi7a`xP7cKCFrJD}*&_~xgLyr^-bmsL}y;A5P|al8J3WUoBSjqu%v zxC;mK!g(7r6RRJ852Z~feoC&sD3(6}^5-uLK8o)9{8L_%%rItZK9C){UxB|;G>JbP zsRRtS4-3B*5c+K2kvmgZK8472%l>3cntWUOVHxB|{Ay~aOg5RN;{PJgeVD*H%ac+y!h#wi%o2bF2Ca8IyMyH{>4#{E_8u^@+l-+n=V}Sq?$O z{091@v%Bd*3pk0^2UtiF9Z+(a@wy6 zUdw8J*ze$K#=$48IBi1U%;hmhO>lu!uU;+RS}p&6@rQila7WftH->*A4=5W|Fmtze z)7E}jh@cbmr9iup^i%*(uF%LG&!+Fyl@LFA-}Ca#bxRfDJAiR2dt6644TaYw1Ma79 zt8&DYj31j^5WPNf5P&{)J?WlCe@<3u^78wnd(Ja4^a>{^Tw}W>|Cjt^If|7l^l)^Q zbz|7~CF(k_9~n|h;ysZ+jHzkXf(*O*@5m zLzUmbHp=x!Q|!9NVXyipZ3)^GuIG$k;D)EK!a5=8MFLI_lpf`HPKl=-Ww%z8H_0$j ztJ||IfFG1lE9nmQ0+jPQy zCBdKkjArH@K7jVcMNz);Q(Q^R{d5G?-kk;Uu_IXSyWB)~KGIizZL(^&qF;|1PI7!E zTP`%l)gpX|OFn&)M%txpQ2F!hdA~hX1Cm5)IrdljqzRg!f{mN%G~H1&oqe`5eJCIF zHdD7O;AX-{XEV(a`gBFJ9ews#CVS2y!&>Cm_dm3C8*n3MA*e67(WC?uP@8TXuMroq z{#w$%z@CBIkRM7?}Xib+>hRjy?%G!fiw8! 
z8(gB+8J~KOU}yO7UGm&1g_MDJ$IXS!`+*b*QW2x)9>K~Y*E&bYMnjl6h!{17_8d!%&9D`a7r&LKZjC<&XOvTRaKJ1 zUY@hl5^R&kZl3lU3njk`3dPzxj$2foOL26r(9zsVF3n_F#v)s5vv3@dgs|lP#eylq62{<-vczqP!RpVBTgI>@O6&sU>W|do17+#OzQ7o5A$ICH z?GqwqnK^n2%LR;$^oZM;)+>$X3s2n}2jZ7CdWIW0lnGK-b#EG01)P@aU`pg}th&J-TrU`tIpb5t((0eu|!u zQz+3ZiOQ^?RxxK4;zs=l8q!-n7X{@jSwK(iqNFiRColuEOg}!7cyZi`iBX4g1pNBj zAPzL?P^Ljhn;1$r8?bc=#n|Ed7wB&oHcw()&*k#SS#h}jO?ZB246EGItsz*;^&tzp zu^YJ0=lwsi`eP_pU8}6JA7MS;9pfD;DsSsLo~ogzMNP70@@;Fm8f0^;>$Z>~}GWRw!W5J3tNX*^2+1f3hz{~rIzJo z6W%J(H!g-eI_J1>0juX$X4Cl6i+3wbc~k146UIX&G22}WE>0ga#WLsn9tY(&29zBvH1$`iWtTe zG2jYl@P!P)eb<5DsR72BdI7-zP&cZNI{7q3e@?N8IKc4DE#UVr->|-ryuJXk^u^>4 z$3wE~=q390;XuOQP~TNoDR?#|NSPJ%sTMInA6*rJ%go|=YjGe!B>z6u$IhgQSwoV* zjy3F2#I>uK{42{&IqP59)Y(1*Z>>#W8rCf4_eVsH)`v!P#^;BgzKDR`ARGEZzkNX+ zJUQu=*-ol=Xqqt5=`=pA@BIn@6a9G8C{c&`i^(i+BxQO9?YZ3iu%$$da&Kb?2kCCo zo7t$UpSFWqmydXf@l3bVJ=%K?SSw)|?srhJ-1ZdFu*5QhL$~-IQS!K1s@XzAtv6*Y zl8@(5BlWYLt1yAWy?rMD&bwze8bC3-GfNH=p zynNFCdxyX?K&G(ZZ)afguQ2|r;XoV^=^(;Cku#qYn4Lus`UeKt6rAlFo_rU`|Rq z&G?~iWMBio<78of-2X(ZYHx~=U0Vz4btyXkctMKdc9UM!vYr~B-(>)(Hc|D zMzkN4!PBg%tZoh+=Gba!0++d193gbMk2&krfDgcbx0jI92cq?FFESVg0D$>F+bil} zY~$)|>1HZsX=5sAZ2WgPB5P=8X#TI+NQ(M~GqyVB53c6IdX=k>Wu@A0Svf5#?uHaF zsYn|koIi3$(%GZ2+G+7Fv^lHTb#5b8sAHSTnL^qWZLM<(1|9|QFw9pnRU{svj}_Al zL)b9>fN{QiA($8peNEJyy`(a{&uh-T4_kdZFIVsKKVM(?05}76EEz?#W za^fiZOAd14IJ4zLX-n7Lq0qlQ^lW8Cvz4UKkV9~P}>sq0?xD3vg+$4vLm~C(+ zM{-3Z#qnZ09bJ>}j?6ry^h+@PfaD7*jZxBEY4)UG&daWb??6)TP+|3#Z&?GL?1i+280CFsE|vIXQbm| zM}Pk!U`U5NsNbyKzkrul-DzwB{X?n3E6?TUHr{M&+R*2%yOiXdW-_2Yd6?38M9Vy^ z*lE%gA{wwoSR~vN0=no}tP2Ul5Gk5M(Xq`$nw#ndFk`tcpd5A=Idue`XZ!FS>Q zG^0w#>P4pPG+*NC9gLP4x2m=cKP}YuS!l^?sHSFftZy{4CoQrb_ z^20(NnG`wAhMI=eq)SsIE~&Gp9Ne0nD4%Xiu|0Fj1UFk?6avDqjdXz{O1nKao*46y zT8~iA%Exu=G#{x=KD;_C&M+Zx4+n`sHT>^>=-1YM;H<72k>$py1?F3#T1*ef9mLZw z5naLQr?n7K;2l+{_uIw*_1nsTn~I|kkCgrn;|G~##hM;9l7Jy$yJfmk+&}W@JeKcF zx@@Woiz8qdi|D%aH3XTx5*wDlbs?dC1_nrFpm^QbG@wM=i2?Zg;$VK!c^Dp8<}BTI zyRhAq@#%2pGV49*Y5_mV4+OICP|%I(dQ7x=6Ob}>EjnB_-_18*xrY?b%-yEDT(wrO z9RY2QT0`_OpGfMObKHV;QLVnrK%mc?$WAdIT`kJQT^n%GuzE7|9@k3ci5fYOh(287 zuIbg!GB3xLg$YN=n)^pHGB0jH+_iIiC=nUcD;G6LuJsjn2VI1cyZx=a?ShCsF==QK z;q~*m&}L<-cb+mDDXzvvrRsybcgQ;Vg21P(uLv5I+eGc7o7tc6`;OA9{soHFOz zT~2?>Ts}gprIX$wRBb4yE>ot<8+*Bv`qbSDv*VtRi|cyWS>)Fjs>fkNOH-+PX&4(~ z&)T8Zam2L6puQl?;5zg9h<}k4#|yH9czHw;1jw-pwBM*O2hUR6yvHATrI%^mvs9q_ z&ccT0>f#eDG<^WG^q@oVqlJrhxH)dcq2cty@l3~|5#UDdExyXUmLQ}f4#;6fI{f^t zDCsgIJ~0`af%YR%Ma5VQq-p21k`vaBu6WE?66+5=XUd%Ay%D$irN>5LhluRWt7 zov-=f>QbMk*G##&DTQyou$s7UqjjW@k6=!I@!k+S{pP8R(2=e@io;N8E`EOB;OGoI zw6Q+{X1_I{OO0HPpBz!X!@`5YQ2)t{+!?M_iH25X(d~-Zx~cXnS9z>u?+If|iNJbx zyFU2d1!ITX64D|lE0Z{dLRqL1Ajj=CCMfC4lD3&mYR_R_VZ>_7_~|<^o*%_&jevU+ zQ4|qzci=0}Jydw|LXLCrOl1_P6Xf@c0$ieK2^7@A9UbF{@V_0p%lqW|L?5k>bVM8|p5v&2g;~r>B8uo<4N+`B zH{J)h;SYiIVx@#jI&p-v3dwL5QNV1oxPr8J%ooezTnLW>i*3Isb49%5i!&ac_dEXv zvXmVUck^QHmyrF8>CGXijC_R-y(Qr{3Zt~EmW)-nC!tiH`wlw5D*W7Pip;T?&j%kX z6DkZX4&}iw>hE(boLyjOoupf6JpvBG8}jIh!!VhnD0>}KSMMo{1#uU6kiFcA04~|7 zVO8eI&x1`g4CZ<2cYUI(n#wz2MtVFHx47yE5eL~8bot~>EHbevSt}LLMQX?odD{Ux zJMnam{d)W4da{l7&y-JrgiU~qY3$~}_F#G7|MxT)e;G{U`In&?`j<5D->}cb{}{T(4DF0BOk-=1195KB-E*o@c?`>y#4=dMtYtSY=&L{!TAjFVcq0y@AH`vH! 
z$41+u!Ld&}F^COPgL(EE{0X7LY&%D7-(?!kjFF7=qw<;`V{nwWBq<)1QiGJgUc^Vz ztMUlq1bZqKn17|6x6iAHbWc~l1HcmAxr%$Puv!znW)!JiukwIrqQ00|H$Z)OmGG@= zv%A8*4cq}(?qn4rN6o`$Y))(MyXr8R<2S^J+v(wmFmtac!%VOfN?&(8Nr!T@kV`N; z*Q33V3t`^rN&aBiHet)18wy{*wi1=W!B%B-Q6}SCrUl$~Hl{@!95ydml@FK8P=u4s z4e*7gV2s=YxEvskw2Ju!2%{8h01rx-3`NCPc(O zH&J0VH5etNB2KY6k4R@2Wvl^Ck$MoR3=)|SEclT2ccJ!RI9Nuter7u9@;sWf-%um;GfI!=eEIQ2l2p_YWUd{|6EG ze{yO6;lMc>;2tPrsNdi@&1K6(1;|$xe8vLgiouj%QD%gYk`4p{Ktv9|j+!OF-P?@p z;}SV|oIK)iwlBs+`ROXkhd&NK zzo__r!B>tOXpBJMDcv!Mq54P+n4(@dijL^EpO1wdg~q+!DT3lB<>9AANSe!T1XgC=J^)IP0XEZ()_vpu!!3HQyJhwh?r`Ae%Yr~b% zO*NY9t9#qWa@GCPYOF9aron7thfWT`eujS4`t2uG6)~JRTI;f(ZuoRQwjZjp5Pg34 z)rp$)Kr?R+KdJ;IO;pM{$6|2y=k_siqvp%)2||cHTe|b5Ht8&A{wazGNca zX$Ol?H)E_R@SDi~4{d-|8nGFhZPW;Cts1;08TwUvLLv&_2$O6Vt=M)X;g%HUr$&06 zISZb(6)Q3%?;3r~*3~USIg=HcJhFtHhIV(siOwV&QkQe#J%H9&E21!C*d@ln3E@J* zVqRO^<)V^ky-R|%{(9`l-(JXq9J)1r$`uQ8a}$vr9E^nNiI*thK8=&UZ0dsFN_eSl z(q~lnD?EymWLsNa3|1{CRPW60>DSkY9YQ;$4o3W7Ms&@&lv9eH!tk~N&dhqX&>K@} zi1g~GqglxkZ5pEFkllJ)Ta1I^c&Bt6#r(QLQ02yHTaJB~- zCcE=5tmi`UA>@P=1LBfBiqk)HB4t8D?02;9eXj~kVPwv?m{5&!&TFYhu>3=_ zsGmYZ^mo*-j69-42y&Jj0cBLLEulNRZ9vXE)8~mt9C#;tZs;=#M=1*hebkS;7(aGf zcs7zH(I8Eui9UU4L--))yy`&d&$In&VA2?DAEss4LAPCLd>-$i?lpXvn!gu^JJ$(DoUlc6wE98VLZ*z`QGQov5l4Fm_h?V-;mHLYDVOwKz7>e4+%AzeO>P6v}ndPW| zM>m#6Tnp7K?0mbK=>gV}=@k*0Mr_PVAgGMu$j+pWxzq4MAa&jpCDU&-5eH27Iz>m^ zax1?*HhG%pJ((tkR(V(O(L%7v7L%!_X->IjS3H5kuXQT2!ow(;%FDE>16&3r){!ex zhf==oJ!}YU89C9@mfDq!P3S4yx$aGB?rbtVH?sHpg?J5C->!_FHM%Hl3#D4eplxzQ zRA+<@LD%LKSkTk2NyWCg7u=$%F#;SIL44~S_OGR}JqX}X+=bc@swpiClB`Zbz|f!4 z7Ysah7OkR8liXfI`}IIwtEoL}(URrGe;IM8%{>b1SsqXh)~w}P>yiFRaE>}rEnNkT z!HXZUtxUp1NmFm)Dm@-{FI^aRQqpSkz}ZSyKR%Y}YHNzBk)ZIp} zMtS=aMvkgWKm9&oTcU0?S|L~CDqA+sHpOxwnswF-fEG)cXCzUR?ps@tZa$=O)=L+5 zf%m58cq8g_o}3?Bhh+c!w4(7AjxwQ3>WnVi<{{38g7yFboo>q|+7qs<$8CPXUFAN< zG&}BHbbyQ5n|qqSr?U~GY{@GJ{(Jny{bMaOG{|IkUj7tj^9pa9|FB_<+KHLxSxR;@ zHpS$4V)PP+tx}22fWx(Ku9y+}Ap;VZqD0AZW4gCDTPCG=zgJmF{|x;(rvdM|2|9a}cex6xrMkERnkE;}jvU-kmzd%_J50$M`lIPCKf+^*zL=@LW`1SaEc%=m zQ+lT06Gw+wVwvQ9fZ~#qd430v2HndFsBa9WjD0P}K(rZYdAt^5WQIvb%D^Q|pkVE^ zte$&#~zmULFACGfS#g=2OLOnIf2Of-k!(BIHjs77nr!5Q1*I9 z1%?=~#Oss!rV~?-6Gm~BWJiA4mJ5TY&iPm_$)H1_rTltuU1F3I(qTQ^U$S>%$l z)Wx1}R?ij0idp@8w-p!Oz{&*W;v*IA;JFHA9%nUvVDy7Q8woheC#|8QuDZb-L_5@R zOqHwrh|mVL9b=+$nJxM`3eE{O$sCt$UK^2@L$R(r^-_+z?lOo+me-VW=Zw z-Bn>$4ovfWd%SPY`ab-u9{INc*k2h+yH%toDHIyqQ zO68=u`N}RIIs7lsn1D){)~%>ByF<>i@qFb<-axvu(Z+6t7v<^z&gm9McRB~BIaDn$ z#xSGT!rzgad8o>~kyj#h1?7g96tOcCJniQ+*#=b7wPio>|6a1Z?_(TS{)KrPe}(8j z!#&A=k(&Pj^F;r)CI=Z{LVu>uj!_W1q4b`N1}E(i%;BWjbEcnD=mv$FL$l?zS6bW!{$7j1GR5ocn94P2u{ z70tAAcpqtQo<@cXw~@i-@6B23;317|l~S>CB?hR5qJ%J3EFgyBdJd^fHZu7AzHF(BQ!tyAz^L0`X z23S4Fe{2X$W0$zu9gm%rg~A>ijaE#GlYlrF9$ds^QtaszE#4M(OLVP2O-;XdT(XIC zatwzF*)1c+t~c{L=fMG8Z=k5lv>U0;C{caN1NItnuSMp)6G3mbahu>E#sj&oy94KC zpH}8oEw{G@N3pvHhp{^-YaZeH;K+T_1AUv;IKD<=mv^&Ueegrb!yf`4VlRl$M?wsl zZyFol(2|_QM`e_2lYSABpKR{{NlxlDSYQNkS;J66aT#MSiTx~;tUmvs-b*CrR4w=f z8+0;*th6kfZ3|5!Icx3RV11sp=?`0Jy3Fs0N4GZQMN=8HmT6%x9@{Dza)k}UwL6JT zHRDh;%!XwXr6yuuy`4;Xsn0zlR$k%r%9abS1;_v?`HX_hI|+EibVnlyE@3aL5vhQq zlIG?tN^w@0(v9M*&L+{_+RQZw=o|&BRPGB>e5=ys7H`nc8nx)|-g;s7mRc7hg{GJC zAe^vCIJhajmm7C6g! 
zL&!WAQ~5d_5)00?w_*|*H>3$loHrvFbitw#WvLB!JASO?#5Ig5$Ys10n>e4|3d;tS zELJ0|R4n3Az(Fl3-r^QiV_C;)lQ1_CW{5bKS15U|E9?ZgLec@%kXr84>5jV2a5v=w z?pB1GPdxD$IQL4)G||B_lI+A=08MUFFR4MxfGOu07vfIm+j=z9tp~5i_6jb`tR>qV z$#`=BQ*jpCjm$F0+F)L%xRlnS%#&gro6PiRfu^l!EVan|r3y}AHJQOORGx4~ z&<)3=K-tx518DZyp%|!EqpU!+X3Et7n2AaC5(AtrkW>_57i}$eqs$rupubg0a1+WO zGHZKLN2L0D;ab%{_S1Plm|hx8R?O14*w*f&2&bB050n!R2by zw!@XOQx$SqZ5I<(Qu$V6g>o#A!JVwErWv#(Pjx=KeS0@hxr4?13zj#oWwPS(7Ro|v z>Mp@Kmxo79q|}!5qtX2-O@U&&@6s~!I&)1WQIl?lTnh6UdKT_1R640S4~f=_xoN3- zI+O)$R@RjV$F=>Ti7BlnG1-cFKCC(t|Qjm{SalS~V-tX#+2ekRhwmN zZr`8{QF6y~Z!D|{=1*2D-JUa<(1Z=;!Ei!KiRNH?o{p5o3crFF=_pX9O-YyJchr$~ zRC`+G+8kx~fD2k*ZIiiIGR<8r&M@3H?%JVOfE>)})7ScOd&?OjgAGT@WVNSCZ8N(p zuQG~76GE3%(%h1*vUXg$vH{ua0b`sQ4f0*y=u~lgyb^!#CcPJa2mkSEHGLsnO^kb$ zru5_l#nu=Y{rSMWiYx?nO{8I!gH+?wEj~UM?IrG}E|bRIBUM>UlY<`T1EHpRr36vv zBi&dG8oxS|J$!zoaq{+JpJy+O^W(nt*|#g32bd&K^w-t>!Vu9N!k9eA8r!Xc{utY> zg9aZ(D2E0gL#W0MdjwES-7~Wa8iubPrd?8-$C4BP?*wok&O8+ykOx{P=Izx+G~hM8 z*9?BYz!T8~dzcZr#ux8kS7u7r@A#DogBH8km8Ry4slyie^n|GrTbO|cLhpqgMdsjX zJ_LdmM#I&4LqqsOUIXK8gW;V0B(7^$y#h3h>J0k^WJfAMeYek%Y-Dcb_+0zPJez!GM zAmJ1u;*rK=FNM0Nf}Y!!P9c4)HIkMnq^b;JFd!S3?_Qi2G#LIQ)TF|iHl~WKK6JmK zbv7rPE6VkYr_%_BT}CK8h=?%pk@3cz(UrZ{@h40%XgThP*-Oeo`T0eq9 zA8BnWZKzCy5e&&_GEsU4*;_k}(8l_&al5K-V*BFM=O~;MgRkYsOs%9eOY6s6AtE*<7GQAR2ulC3RAJrG_P1iQK5Z~&B z&f8X<>yJV6)oDGIlS$Y*D^Rj(cszTy5c81a5IwBr`BtnC6_e`ArI8CaTX_%rx7;cn zR-0?J_LFg*?(#n~G8cXut(1nVF0Oka$A$1FGcERU<^ggx;p@CZc?3UB41RY+wLS`LWFNSs~YP zuw1@DNN3lTd|jDL7gjBsd9}wIw}4xT2+8dBQzI00m<@?c2L%>}QLfK5%r!a-iII`p zX@`VEUH)uj^$;7jVUYdADQ2k*!1O3WdfgF?OMtUXNpQ1}QINamBTKDuv19^{$`8A1 zeq%q*O0mi@(%sZU>Xdb0Ru96CFqk9-L3pzLVsMQ`Xpa~N6CR{9Rm2)A|CI21L(%GW zh&)Y$BNHa=FD+=mBw3{qTgw)j0b!Eahs!rZnpu)z!!E$*eXE~##yaXz`KE5(nQM`s zD!$vW9XH)iMxu9R>r$VlLk9oIR%HxpUiW=BK@4U)|1WNQ=mz9a z^!KkO=>GaJ!GBXm{KJj^;kh-MkUlEQ%lza`-G&}C5y1>La1sR6hT=d*NeCnuK%_LV zOXt$}iP6(YJKc9j-Fxq~*ItVUqljQ8?oaysB-EYtFQp9oxZ|5m0^Hq(qV!S+hq#g( z?|i*H2MIr^Kxgz+3vIljQ*Feejy6S4v~jKEPTF~Qhq!(ms5>NGtRgO5vfPPc4Z^AM zTj!`5xEreIN)vaNxa|q6qWdg>+T`Ol0Uz)ckXBXEGvPNEL3R8hB3=C5`@=SYgAju1 z!)UBr{2~=~xa{b8>x2@C7weRAEuatC)3pkRhT#pMPTpSbA|tan%U7NGMvzmF?c!V8 z=pEWxbdXbTAGtWTyI?Fml%lEr-^AE}w#l(<7OIw;ctw}imYax&vR4UYNJZK6P7ZOd zP87XfhnUHxCUHhM@b*NbTi#(-8|wcv%3BGNs#zRCVV(W?1Qj6^PPQa<{yaBwZ`+<`w|;rqUY_C z&AeyKwwf*q#OW-F()lir=T^<^wjK65Lif$puuU5+tk$;e_EJ;Lu+pH>=-8=PDhkBg z8cWt%@$Sc#C6F$Vd+0507;{OOyT7Hs%nKS88q-W!$f~9*WGBpHGgNp}=C*7!RiZ5s zn1L_DbKF@B8kwhDiLKRB@lsXVVLK|ph=w%_`#owlf@s@V(pa`GY$8h%;-#h@TsO|Y8V=n@*!Rog7<7Cid%apR|x zOjhHCyfbIt%+*PCveTEcuiDi%Wx;O;+K=W?OFUV%)%~6;gl?<0%)?snDDqIvkHF{ zyI02)+lI9ov42^hL>ZRrh*HhjF9B$A@=H94iaBESBF=eC_KT$8A@uB^6$~o?3Wm5t1OIaqF^~><2?4e3c&)@wKn9bD? 
zoeCs;H>b8DL^F&>Xw-xjZEUFFTv>JD^O#1E#)CMBaG4DX9bD(Wtc8Rzq}9soQ8`jf zeSnHOL}<+WVSKp4kkq&?SbETjq6yr@4%SAqOG=9E(3YeLG9dtV+8vmzq+6PFPk{L; z(&d++iu=^F%b+ea$i2UeTC{R*0Isk;vFK!no<;L+(`y`3&H-~VTdKROkdyowo1iqR zbVW(3`+(PQ2>TKY>N!jGmGo7oeoB8O|P_!Ic@ zZ^;3dnuXo;WJ?S+)%P>{Hcg!Jz#2SI(s&dY4QAy_vRlmOh)QHvs_7c&zkJCmJGVvV zX;Mtb>QE+xp`KyciG$Cn*0?AK%-a|=o!+7x&&yzHQOS>8=B*R=niSnta^Pxp1`=md z#;$pS$4WCT?mbiCYU?FcHGZ#)kHVJTTBt^%XE(Q};aaO=Zik0UgLcc0I(tUpt(>|& zcxB_|fxCF7>&~5eJ=Dpn&5Aj{A^cV^^}(7w#p;HG&Q)EaN~~EqrE1qKrMAc&WXIE;>@<&)5;gD2?={Xf@Mvn@OJKw=8Mgn z!JUFMwD+s==JpjhroT&d{$kQAy%+d`a*XxDEVxy3`NHzmITrE`o!;5ClXNPb4t*8P zzAivdr{j_v!=9!^?T3y?gzmqDWX6mkzhIzJ-3S{T5bcCFMr&RPDryMcdwbBuZbsgN zGrp@^i?rcfN7v0NKGzDPGE#4yszxu=I_`MI%Z|10nFjU-UjQXXA?k8Pk|OE<(?ae) zE%vG#eZAlj*E7_3dx#Zz4kMLj>H^;}33UAankJiDy5ZvEhrjr`!9eMD8COp}U*hP+ zF}KIYx@pkccIgyxFm#LNw~G&`;o&5)2`5aogs`1~7cMZQ7zj!%L4E`2yzlQN6REX20&O<9 zKV6fyr)TScJPPzNTC2gL+0x#=u>(({{D7j)c-%tvqls3#Y?Z1m zV5WUE)zdJ{$p>yX;^P!UcXP?UD~YM;IRa#Rs5~l+*$&nO(;Ers`G=0D!twR(0GF@c zHl9E5DQI}Oz74n zfKP>&$q0($T4y$6w(p=ERAFh+>n%iaeRA%!T%<^+pg?M)@ucY<&59$x9M#n+V&>}=nO9wCV{O~lg&v#+jcUj(tQ z`0u1YH)-`U$15a{pBkGyPL0THv1P|4e@pf@3IBZS4dVJPo#H>pWq%Lr0YS-SeWash z8R7=jb28KPMI|_lo#GEO|5B?N_e``H*23{~a!AmUJ+fb4HX-%QI@lSEUxKlGV7z7Q zSKw@-TR>@1RL%w{x}dW#k1NgW+q4yt2Xf1J62Bx*O^WG8OJ|FqI4&@d3_o8Id@*)4 zYrk=>@!wv~mh7YWv*bZhxqSmFh2Xq)o=m;%n$I?GSz49l1$xRpPu_^N(vZ>*>Z<04 z2+rP70oM=NDysd!@fQdM2OcyT?3T^Eb@lIC-UG=Bw{BjQ&P`KCv$AcJ;?`vdZ4){d z&gkoUK{$!$$K`3*O-jyM1~p-7T*qb)Ys>Myt^;#1&a%O@x8A+E>! zY8=eD`ZG)LVagDLBeHg>=atOG?Kr%h4B%E6m@J^C+U|y)XX@f z8oyJDW|9g=<#f<{JRr{y#~euMnv)`7j=%cHWLc}ngjq~7k**6%4u>Px&W%4D94(r* z+akunK}O0DC2A%Xo9jyF;DobX?!1I(7%}@7F>i%&nk*LMO)bMGg2N+1iqtg+r(70q zF5{Msgsm5GS7DT`kBsjMvOrkx&|EU!{{~gL4d2MWrAT=KBQ-^zQCUq{5PD1orxlIL zq;CvlWx#f1NWvh`hg011I%?T_s!e38l*lWVt|~z-PO4~~1g)SrJ|>*tXh=QfXT)%( z+ex+inPvD&O4Ur;JGz>$sUOnWdpSLcm1X%aQDw4{dB!cnj`^muI$CJ2%p&-kULVCE z>$eMR36kN$wCPR+OFDM3-U(VOrp9k3)lI&YVFqd;Kpz~K)@Fa&FRw}L(SoD z9B4a+hQzZT-BnVltst&=kq6Y(f^S4hIGNKYBgMxGJ^;2yrO}P3;r)(-I-CZ)26Y6? 
z&rzHI_1GCvGkgy-t1E;r^3Le30|%$ebDRu2+gdLG)r=A~Qz`}~&L@aGJ{}vVs_GE* zVUjFnzHiXfKQbpv&bR&}l2bzIjAooB)=-XNcYmrGmBh(&iu@o!^hn0^#}m2yZZUK8 zufVm7Gq0y`Mj;9b>`c?&PZkU0j4>IL=UL&-Lp3j&47B5pAW4JceG{!XCA)kT<%2nqCxj<)uy6XR_uws~>_MEKPOpAQ!H zkn>FKh)<9DwwS*|Y(q?$^N!6(51O0 z^JM~Ax{AI1Oj$fs-S5d4T7Z_i1?{%0SsIuQ&r8#(JA=2iLcTN+?>wOL532%&dMYkT z*T5xepC+V6zxhS@vNbMoi|i)=rpli@R9~P!39tWbSSb904ekv7D#quKbgFEMTb48P zuq(VJ+&L8aWU(_FCD$3^uD!YM%O^K(dvy~Wm2hUuh6bD|#(I39Xt>N1Y{ZqXL`Fg6 zKQ?T2htHN!(Bx;tV2bfTtIj7e)liN-29s1kew>v(D^@)#v;}C4-G=7x#;-dM4yRWm zyY`cS21ulzMK{PoaQ6xChEZ}o_#}X-o}<&0)$1#3we?+QeLt;aVCjeA)hn!}UaKt< zat1fHEx13y-rXNMvpUUmCVzocPmN~-Y4(YJvQ#db)4|%B!rBsgAe+*yor~}FrNH08 z3V!97S}D7d$zbSD{$z;@IYMxM6aHdypIuS*pr_U6;#Y!_?0i|&yU*@16l z*dcMqDQgfNBf}?quiu4e>H)yTVfsp#f+Du0@=Kc41QockXkCkvu>FBd6Q+@FL!(Yx z2`YuX#eMEiLEDhp+9uFqME_E^faV&~9qjBHJkIp~%$x^bN=N)K@kvSVEMdDuzA0sn z88CBG?`RX1@#hQNd`o^V{37)!w|nA)QfiYBE^m=yQKv-fQF+UCMcuEe1d4BH7$?>b zJl-r9@0^Ie=)guO1vOd=i$_4sz>y3x^R7n4ED!5oXL3@5**h(xr%Hv)_gILarO46q+MaDOF%ChaymKoI6JU5Pg;7#2n9-18|S1;AK+ zgsn6;k6-%!QD>D?cFy}8F;r@z8H9xN1jsOBw2vQONVqBVEbkiNUqgw~*!^##ht>w0 zUOykwH=$LwX2j&nLy=@{hr)2O&-wm-NyjW7n~Zs9UlH;P7iP3 zI}S(r0YFVYacnKH(+{*)Tbw)@;6>%=&Th=+Z6NHo_tR|JCI8TJiXv2N7ei7M^Q+RM z?9o`meH$5Yi;@9XaNR#jIK^&{N|DYNNbtdb)XW1Lv2k{E>;?F`#Pq|&_;gm~&~Zc9 zf+6ZE%{x4|{YdtE?a^gKyzr}dA>OxQv+pq|@IXL%WS0CiX!V zm$fCePA%lU{%pTKD7|5NJHeXg=I0jL@$tOF@K*MI$)f?om)D63K*M|r`gb9edD1~Y zc|w7N)Y%do7=0{RC|AziW7#am$)9jciRJ?IWl9PE{G3U+$%FcyKs_0Cgq`=K3@ttV z9g;M!3z~f_?P%y3-ph%vBMeS@p7P&Ea8M@97+%XEj*(1E6vHj==d zjsoviB>j^$_^OI_DEPvFkVo(BGRo%cJeD){6Uckei=~1}>sp299|IRjhXe)%?uP0I zF5+>?0#Ye}T^Y$u_rc4=lPcq4K^D(TZG-w30-YiEM=dcK+4#o*>lJ8&JLi+3UcpZk z!^?95S^C0ja^jwP`|{<+3cBVog$(mRdQmadS+Vh~z zS@|P}=|z3P6uS+&@QsMp0no9Od&27O&14zHXGAOEy zh~OKpymK5C%;LLb467@KgIiVwYbYd6wFxI{0-~MOGfTq$nBTB!{SrWmL9Hs}C&l&l#m?s*{tA?BHS4mVKHAVMqm63H<|c5n0~k)-kbg zXidai&9ZUy0~WFYYKT;oe~rytRk?)r8bptITsWj(@HLI;@=v5|XUnSls7$uaxFRL+ zRVMGuL3w}NbV1`^=Pw*0?>bm8+xfeY(1PikW*PB>>Tq(FR`91N0c2&>lL2sZo5=VD zQY{>7dh_TX98L2)n{2OV=T10~*YzX27i2Q7W86M4$?gZIXZaBq#sA*{PH8){|GUi;oM>e?ua7eF4WFuFYZSG| zze?srg|5Ti8Og{O zeFxuw9!U+zhyk?@w zjsA6(oKD=Ka;A>Ca)oPORxK+kxH#O@zhC!!XS4@=swnuMk>t+JmLmFiE^1aX3f<)D@`%K0FGK^gg1a1j>zi z2KhV>sjU7AX3F$SEqrXSC}fRx64GDoc%!u2Yag68Lw@w9v;xOONf@o)Lc|Uh3<21ctTYu-mFZuHk*+R{GjXHIGq3p)tFtQp%TYqD=j1&y)>@zxoxUJ!G@ zgI0XKmP6MNzw>nRxK$-Gbzs}dyfFzt>#5;f6oR27ql!%+{tr+(`(>%51|k`ML} zY4eE)Lxq|JMas(;JibNQds1bUB&r}ydMQXBY4x(^&fY_&LlQC)3hylc$~8&~|06-D z#T+%66rYbHX%^KuqJED_wuGB+=h`nWA!>1n0)3wZrBG3%`b^Ozv6__dNa@%V14|!D zQ?o$z5u0^8`giv%qE!BzZ!3j;BlDlJDk)h@9{nSQeEk!z9RGW) z${RSF3phEM*ce*>Xdp}585vj$|40=&S{S-GTiE?Op*vY&Lvr9}BO$XWy80IF+6@%n z5*2ueT_g@ofP#u5pxb7n*fv^Xtt7&?SRc{*2Ka-*!BuOpf}neHGCiHy$@Ka1^Dint z;DkmIL$-e)rj4o2WQV%Gy;Xg(_Bh#qeOsTM2f@KEe~4kJ8kNLQ+;(!j^bgJMcNhvklP5Z6I+9Fq@c&D~8Fb-4rmDT!MB5QC{Dsb;BharP*O;SF4& zc$wj-7Oep7#$WZN!1nznc@Vb<_Dn%ga-O#J(l=OGB`dy=Sy&$(5-n3zzu%d7E#^8`T@}V+5B;PP8J14#4cCPw-SQTdGa2gWL0*zKM z#DfSXs_iWOMt)0*+Y>Lkd=LlyoHjublNLefhKBv@JoC>P7N1_#> zv=mLWe96%EY;!ZGSQDbZWb#;tzqAGgx~uk+-$+2_8U`!ypbwXl z^2E-FkM1?lY@yt8=J3%QK+xaZ6ok=-y%=KXCD^0r!5vUneW>95PzCkOPO*t}p$;-> ze5j-BLT_;)cZQzR2CEsm@rU7GZfFtdp*a|g4wDr%8?2QkIGasRfDWT-Dvy*U{?IHT z*}wGnzdlSptl#ZF^sf)KT|BJs&kLG91^A6ls{CzFprZ6-Y!V0Xysh%9p%iMd7HLsS zN+^Un$tDV)T@i!v?3o0Fsx2qI(AX_$dDkBzQ@fRM%n zRXk6hb9Py#JXUs+7)w@eo;g%QQ95Yq!K_d=z{0dGS+pToEI6=Bo8+{k$7&Z zo4>PH(`ce8E-Ps&uv`NQ;U$%t;w~|@E3WVOCi~R4oj5wP?%<*1C%}Jq%a^q~T7u>K zML5AKfQDv6>PuT`{SrKHRAF+^&edg6+5R_#H?Lz3iGoWo#PCEd0DS;)2U({{X#zU^ zw_xv{4x7|t!S)>44J;KfA|DC?;uQ($l+5Vp7oeqf7{GBF9356nx|&B~gs+@N^gSdd 
zvb*>&W)|u#F{Z_b`f#GVtQ`pYv3#||N{xj1NgB<#=Odt6{eB%#9RLt5v zIi|0u70`#ai}9fJjKv7dE!9ZrOIX!3{$z_K5FBd-Kp-&e4(J$LD-)NMTp^_pB`RT; zftVVlK2g@+1Ahv2$D){@Y#cL#dUj9*&%#6 zd2m9{1NYp>)6=oAvqdCn5#cx{AJ%S8skUgMglu2*IAtd+z1>B&`MuEAS(D(<6X#Lj z?f4CFx$)M&$=7*>9v1ER4b6!SIz-m0e{o0BfkySREchp?WdVPpQCh!q$t>?rL!&Jg zd#heM;&~A}VEm8Dvy&P|J*eAV&w!&Nx6HFV&B8jJFVTmgLaswn!cx$&%JbTsloz!3 zMEz1d`k==`Ueub_JAy_&`!ogbwx27^ZXgFNAbx=g_I~5nO^r)}&myw~+yY*cJl4$I znNJ32M&K=0(2Dj_>@39`3=FX!v3nZHno_@q^!y}%(yw0PqOo=);6Y@&ylVe>nMOZ~ zd>j#QQSBn3oaWd;qy$&5(5H$Ayi)0haAYO6TH>FR?rhqHmNOO+(})NB zLI@B@v0)eq!ug`>G<@htRlp3n!EpU|n+G+AvXFrWSUsLMBfL*ZB`CRsIVHNTR&b?K zxBgsN0BjfB>UVcJ|x%=-zb%OV7lmZc& zxiupadZVF7)6QuhoY;;FK2b*qL0J-Rn-8!X4ZY$-ZSUXV5DFd7`T41c(#lAeLMoeT z4%g655v@7AqT!i@)Edt5JMbN(=Q-6{=L4iG8RA%}w;&pKmtWvI4?G9pVRp|RTw`g0 zD5c12B&A2&P6Ng~8WM2eIW=wxd?r7A*N+&!Be7PX3s|7~z=APxm=A?5 zt>xB4WG|*Td@VX{Rs)PV0|yK`oI3^xn(4c_j&vgxk_Y3o(-`_5o`V zRTghg6%l@(qodXN;dB#+OKJEEvhfcnc#BeO2|E(5df-!fKDZ!%9!^BJ_4)9P+9Dq5 zK1=(v?KmIp34r?z{NEWnLB3Px{XYwy-akun4F7xTRr2^zeYW{gcK9)>aJDdU5;w5@ zak=<+-PLH-|04pelTb%ULpuuuJC7DgyT@D|p{!V!0v3KpDnRjANN12q6SUR3mb9<- z>2r~IApQGhstZ!3*?5V z8#)hJ0TdZg0M-BK#nGFP>$i=qk82DO z7h;Ft!D5E15OgW)&%lej*?^1~2=*Z5$2VX>V{x8SC+{i10BbtUk9@I#Vi&hX)q
Q!LwySI{Bnv%Sm)yh{^sSVJ8&h_D-BJ_YZe5eCaAWU9b$O2c z$T|{vWVRtOL!xC0DTc(Qbe`ItNtt5hr<)VijD0{U;T#bUEp381_y`%ZIav?kuYG{iyYdEBPW=*xNSc;Rlt6~F4M`5G+VtOjc z*0qGzCb@gME5udTjJA-9O<&TWd~}ysBd(eVT1-H82-doyH9RST)|+Pb{o*;$j9Tjs zhU!IlsPsj8=(x3bAKJTopW3^6AKROHR^7wZ185wJGVhA~hEc|LP;k7NEz-@4p5o}F z`AD6naG3(n=NF9HTH81=F+Q|JOz$7wm9I<+#BSmB@o_cLt2GkW9|?7mM;r!JZp89l zbo!Hp8=n!XH1{GwaDU+k)pGp`C|cXkCU5%vcH)+v@0eK>%7gWxmuMu9YLlChA|_D@ zi#5zovN_!a-0?~pUV-Rj*1P)KwdU-LguR>YM&*Nen+ln8Q$?WFCJg%DY%K}2!!1FE zDv-A%Cbwo^p(lzac&_TZ-l#9kq`mhLcY3h9ZTUVCM(Ad&=EriQY5{jJv<5K&g|*Lk zgV%ILnf1%8V2B0E&;Sp4sYbYOvvMebLwYwzkRQ#F8GpTQq#uv=J`uaSJ34OWITeSGo6+-8Xw znCk*n{kdDEi)Hi&u^)~cs@iyCkFWB2SWZU|Uc%^43ZIZQ-vWNExCCtDWjqHs;;tWf$v{}0{p0Rvxkq``)*>+Akq%|Na zA`@~-Vfe|+(AIlqru+7Ceh4nsVmO9p9jc8}HX^W&ViBDXT+uXbT#R#idPn&L>+#b6 zflC-4C5-X;kUnR~L>PSLh*gvL68}RBsu#2l`s_9KjUWRhiqF`j)`y`2`YU(>3bdBj z?>iyjEhe-~$^I5!nn%B6Wh+I`FvLNvauve~eX<+Ipl&04 zT}};W&1a3%W?dJ2=N#0t?e+aK+%t}5q%jSLvp3jZ%?&F}nOOWr>+{GFIa%wO_2`et z=JzoRR~}iKuuR+azPI8;Gf9)z3kyA4EIOSl!sRR$DlW}0>&?GbgPojmjmnln;cTqCt=ADbE zZ8GAnoM+S1(5$i8^O4t`ue;vO4i}z0wz-QEIVe5_u03;}-!G1NyY8;h^}y;tzY}i5 zqQr#Ur3Fy8sSa$Q0ys+f`!`+>9WbvU_I`Sj;$4{S>O3?#inLHCrtLy~!s#WXV=oVP zeE93*Nc`PBi4q@%Ao$x4lw9vLHM!6mn3-b_cebF|n-2vt-zYVF_&sDE--J-P;2WHo z+@n2areE0o$LjvjlV2X7ZU@j+`{*8zq`JR3gKF#EW|#+{nMyo-a>nFFTg&vhyT=b} zDa8+v0(Dgx0yRL@ZXOYIlVSZ0|MFizy0VPW8;AfA5|pe!#j zX}Py^8fl5SyS4g1WSKKtnyP+_PoOwMMwu`(i@Z)diJp~U54*-miOchy7Z35eL>^M z4p<-aIxH4VUZgS783@H%M7P9hX>t{|RU7$n4T(brCG#h9e9p! z+o`i;EGGq3&pF;~5V~eBD}lC)>if$w%Vf}AFxGqO88|ApfHf&Bvu+xdG)@vuF}Yvk z)o;~k-%+0K0g+L`Wala!$=ZV|z$e%>f0%XoLib%)!R^RoS+{!#X?h-6uu zF&&KxORdZU&EwQFITIRLo(7TA3W}y6X{?Y%y2j0It!ekU#<)$qghZtpcS>L3uh`Uj z7GY;6f$9qKynP#oS3$$a{p^{D+0oJQ71`1?OAn_m8)UGZmj3l*ZI)`V-a>MKGGFG< z&^jg#Ok%(hhm>hSrZ5;Qga4u(?^i>GiW_j9%_7M>j(^|Om$#{k+^*ULnEgzW_1gCICtAD^WpC`A z{9&DXkG#01Xo)U$OC(L5Y$DQ|Q4C6CjUKk1UkPj$nXH##J{c8e#K|&{mA*;b$r0E4 zUNo0jthwA(c&N1l=PEe8Rw_8cEl|-eya9z&H3#n`B$t#+aJ03RFMzrV@gowbe8v(c zIFM60^0&lCFO10NU4w@|61xiZ4CVXeaKjd;d?sv52XM*lS8XiVjgWpRB;&U_C0g+`6B5V&w|O6B*_q zsATxL!M}+$He)1eOWECce#eS@2n^xhlB4<_Nn?yCVEQWDs(r`|@2GqLe<#(|&P0U? 
[GIT binary patch (base85-encoded) data omitted]

diff --git a/extension/android/benchmark/gradle/wrapper/gradle-wrapper.properties b/extension/android/benchmark/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 00000000000..ba68b6be2f9
--- /dev/null
+++ b/extension/android/benchmark/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Thu Aug 29 23:29:08 PDT 2024
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
diff --git a/extension/android/benchmark/gradlew b/extension/android/benchmark/gradlew
new file mode 100755
index 00000000000..4f906e0c811
--- /dev/null
+++ b/extension/android/benchmark/gradlew
@@ -0,0 +1,185 @@
+#!/usr/bin/env
sh + +# +# Copyright 2015 the original author or authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin or MSYS, switch paths to Windows format before running java +if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=`expr $i + 1` + done + case $i in + 0) set -- ;; + 1) set -- "$args0" ;; + 2) set -- "$args0" "$args1" ;; + 3) set -- "$args0" "$args1" "$args2" ;; + 4) set -- "$args0" "$args1" "$args2" "$args3" ;; + 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=`save "$@"` + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +exec "$JAVACMD" "$@" diff --git a/extension/android/benchmark/gradlew.bat b/extension/android/benchmark/gradlew.bat new file mode 100644 index 00000000000..ac1b06f9382 --- /dev/null +++ b/extension/android/benchmark/gradlew.bat @@ -0,0 +1,89 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. 
+@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/extension/android/benchmark/settings.gradle.kts b/extension/android/benchmark/settings.gradle.kts new file mode 100644 index 00000000000..f2f5ac42a2c --- /dev/null +++ b/extension/android/benchmark/settings.gradle.kts @@ -0,0 +1,17 @@ +pluginManagement { + repositories { + google() + mavenCentral() + gradlePluginPortal() + } +} +dependencyResolutionManagement { + repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) + repositories { + google() + mavenCentral() + } +} + +rootProject.name = "MiniBench" +include(":app") From 37cad019d28cc963b6fc03f35fcfa540c881622a Mon Sep 17 00:00:00 2001 From: meta-emilian <162623112+meta-emilian@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:29:47 -0700 Subject: [PATCH 133/531] Exposing custom_ops_aot_py to executorch clients. 
Differential Revision: D62011371 Pull Request resolved: https://github.com/pytorch/executorch/pull/4996 --- extension/llm/custom_ops/targets.bzl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index 099266de1bb..ded25054acc 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -54,7 +54,10 @@ def define_common_targets(): srcs = [ "sdpa_with_kv_cache.py", ], - visibility = ["//executorch/..."], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], deps = [ "//caffe2:torch", ], From 91dc80141337ad5936a8c647cc225f846d659f39 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:34:14 -0700 Subject: [PATCH 134/531] [ET-VK] Rename `StorageBuffer` to `StagingBuffer` Differential Revision: D62049779 Pull Request resolved: https://github.com/pytorch/executorch/pull/5012 --- backends/vulkan/runtime/api/api.h | 2 +- .../{StorageBuffer.h => StagingBuffer.h} | 14 +++++++------- backends/vulkan/runtime/graph/ComputeGraph.cpp | 4 ++-- backends/vulkan/runtime/graph/ComputeGraph.h | 2 +- backends/vulkan/runtime/graph/containers/Value.h | 8 ++++---- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 8 ++++---- backends/vulkan/runtime/graph/ops/PrepackNode.h | 2 +- .../runtime/graph/ops/utils/BindingUtils.cpp | 2 +- .../runtime/graph/ops/utils/BindingUtils.h | 2 +- .../runtime/graph/ops/utils/StagingUtils.cpp | 6 +++--- .../runtime/graph/ops/utils/StagingUtils.h | 6 +++--- backends/vulkan/test/utils/test_utils.cpp | 6 +++--- backends/vulkan/test/utils/test_utils.h | 10 +++++----- backends/vulkan/test/vulkan_compute_api_test.cpp | 16 ++++++++-------- .../vulkan/tools/gpuinfo/include/architecture.h | 6 +++--- backends/vulkan/tools/gpuinfo/include/buffers.h | 8 ++++---- backends/vulkan/tools/gpuinfo/include/textures.h | 4 ++-- 17 files changed, 53 insertions(+), 53 deletions(-) rename backends/vulkan/runtime/api/containers/{StorageBuffer.h => StagingBuffer.h} (83%) diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index de77c57fb0e..0f496a4af8a 100644 --- a/backends/vulkan/runtime/api/api.h +++ b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StorageBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h similarity index 83% rename from backends/vulkan/runtime/api/containers/StorageBuffer.h rename to backends/vulkan/runtime/api/containers/StagingBuffer.h index 17c34706057..ab650c09a43 100644 --- a/backends/vulkan/runtime/api/containers/StorageBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -17,7 +17,7 @@ namespace vkcompute { namespace api { -class StorageBuffer final { +class StagingBuffer final { private: Context* context_p_; vkapi::ScalarType dtype_; @@ -26,7 +26,7 @@ class StorageBuffer final { vkapi::VulkanBuffer vulkan_buffer_; public: - StorageBuffer( + StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, const size_t numel, @@ -39,13 +39,13 @@ class StorageBuffer final { nbytes_, gpuonly)) {} - StorageBuffer(const StorageBuffer&) = delete; - StorageBuffer& operator=(const StorageBuffer&) = delete; + StagingBuffer(const StagingBuffer&) = delete; + StagingBuffer& operator=(const StagingBuffer&) = delete; - StorageBuffer(StorageBuffer&&) = default; - StorageBuffer& operator=(StorageBuffer&&) = default; + 
StagingBuffer(StagingBuffer&&) = default; + StagingBuffer& operator=(StagingBuffer&&) = default; - ~StorageBuffer() { + ~StagingBuffer() { context_p_->register_buffer_cleanup(vulkan_buffer_); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 9fa0091b298..6c3ec88eaa7 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,7 +38,7 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) +VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) @@ -236,7 +236,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); return idx; } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 5740d24a448..9b04b08a70e 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -58,7 +58,7 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index ba82213c6f8..6e03bbd4a21 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -53,7 +53,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StorageBuffer as_staging; + api::StagingBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -108,7 +108,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); + TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); // Scalar lists @@ -152,7 +152,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StorageBuffer(); + payload.as_staging.~StagingBuffer(); break; case TypeTag::TENSORREF: payload.as_tensorref.~TensorRef(); @@ -247,7 +247,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StorageBuffer, + api::StagingBuffer, Staging, TypeTag::STAGING, as_staging); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index b77c62920dd..a9c2f6c9b6a 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,14 +45,14 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StagingBuffer 
PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StorageBuffer staging(graph->context(), packed->dtype(), numel); + api::StagingBuffer staging(graph->context(), packed->dtype(), numel); size_t nbytes = numel * vkapi::element_size(packed->dtype()); set_staging_zeros(staging, nbytes); return staging; @@ -60,7 +60,7 @@ api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StorageBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); copy_ptr_to_staging(tref->data, staging, nbytes); return staging; @@ -70,7 +70,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StorageBuffer staging = create_staging_buffer(graph); + api::StagingBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index c3ac8b963fd..3e713303c3d 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StorageBuffer create_staging_buffer(ComputeGraph* graph); + api::StagingBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index b0964ace225..2cfb34a052e 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 3a7ec029da7..eed39a97979 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 294e36b9a86..9cb715e202a 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -73,7 +73,7 @@ void memcpy_from_mapping( void copy_ptr_to_staging( const void* src, - api::StorageBuffer& staging, + api::StagingBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); mapping.invalidate(); @@ -81,7 +81,7 @@ void copy_ptr_to_staging( } 
void copy_staging_to_ptr( - api::StorageBuffer& staging, + api::StagingBuffer& staging, void* dst, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); @@ -89,7 +89,7 @@ void copy_staging_to_ptr( memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); } -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { +void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); uint8_t* data_ptr = mapping.template data(); memset(data_ptr, 0, staging.nbytes()); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index cabc17f30ee..f16c52ecf33 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -18,14 +18,14 @@ namespace vkcompute { void copy_ptr_to_staging( const void* src, - api::StorageBuffer& staging, + api::StagingBuffer& staging, const size_t nbytes); void copy_staging_to_ptr( - api::StorageBuffer& staging, + api::StagingBuffer& staging, void* dst, const size_t nbytes); -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); +void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes); // // Functions to get shaders diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4a3a41d6c72..4c2972419d0 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -112,7 +112,7 @@ void record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer) { + api::StagingBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; @@ -324,7 +324,7 @@ void record_reference_matmul( _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); #define CASE(ctype, name) \ case vkapi::ScalarType::name: { \ @@ -411,7 +411,7 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer( + api::StagingBuffer staging_buffer( api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index c8af5470862..3bc12c472db 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -37,13 +37,13 @@ using namespace vkcompute; allocate_memory); #define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ + api::StagingBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_nchw_to_image_op( \ api::context(), staging_buffer_##tensor.buffer(), tensor); #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ + api::StagingBuffer staging_buffer_##tensor( \ api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ record_image_to_nchw_op( \ api::context(), tensor, staging_buffer_##tensor.buffer()); @@ -85,7 +85,7 @@ void 
record_image_to_nchw_op( void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer); + api::StagingBuffer& dst_buffer); void record_conv2d_prepack_weights_op( api::Context* const context, @@ -126,7 +126,7 @@ void record_reference_matmul( // inline void -fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { +fill_staging(api::StagingBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } @@ -164,7 +164,7 @@ inline std::vector extract_vtensor(api::vTensor& vten) { } inline void -check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { +check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { if (numel < 0) { numel = staging.numel(); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 3d172f490cf..f3c60a21376 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -336,7 +336,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StorageBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -407,7 +407,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StorageBuffer staging_buffer( + StagingBuffer staging_buffer( context(), vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); @@ -428,7 +428,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StorageBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -2040,7 +2040,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2073,7 +2073,7 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StorageBuffer staging_buffer_in( + StagingBuffer staging_buffer_in( context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); @@ -2084,7 +2084,7 @@ void round_trip_test( data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); // Output staging buffer - StorageBuffer staging_buffer_out( + StagingBuffer staging_buffer_out( context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2538,7 +2538,7 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { @@ -2550,7 +2550,7 @@ void test_conv2d( // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StorageBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), 
vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 0d312ee87c3..20c6254e1a0 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); + StagingBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index c8cf93c4a12..31137b11eea 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); + StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StagingBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -132,8 +132,8 @@ void _bandwidth( // workgroups, once the size of the access excedes the workgroup width. const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( + StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index 7679f11b0ca..c9ff133f1ec 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -173,7 +173,7 @@ void tex_bandwidth(const App& app) { // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer out_buf( + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; From 36c1f54315335d41159360664a0996757721eeec Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 30 Aug 2024 17:45:46 -0700 Subject: [PATCH 135/531] [codegen] Change cmake file to take CMAKE_CURRENT_SOURCE_DIR and add logging Differential Revision: D62037064 Pull Request resolved: https://github.com/pytorch/executorch/pull/5005 --- codegen/tools/gen_oplist.py | 39 ++++++++++++++++++++------- codegen/tools/test/test_gen_oplist.py | 2 +- kernels/portable/CMakeLists.txt | 5 ++-- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/codegen/tools/gen_oplist.py b/codegen/tools/gen_oplist.py index f21fb8dc6b5..fbb191a6a81 100644 --- a/codegen/tools/gen_oplist.py +++ b/codegen/tools/gen_oplist.py @@ -230,7 +230,7 @@ def gen_oplist( if model_file_path: assert os.path.isfile( model_file_path - ), "The value for --model_file_path needs to be a valid file." + ), f"The value for --model_file_path needs to be a valid file, got {model_file_path}" op_set.update(_get_operators(model_file_path)) source_name = model_file_path et_kernel_metadata = merge_et_kernel_metadata( @@ -239,7 +239,7 @@ def gen_oplist( if ops_schema_yaml_path: assert os.path.isfile( ops_schema_yaml_path - ), "The value for --ops_schema_yaml_path needs to be a valid file." + ), f"The value for --ops_schema_yaml_path needs to be a valid file, got {ops_schema_yaml_path}" et_kernel_metadata = merge_et_kernel_metadata( et_kernel_metadata, _get_et_kernel_metadata_from_ops_yaml(ops_schema_yaml_path), @@ -300,14 +300,33 @@ def main(args: List[Any]) -> None: ) options = parser.parse_args(args) - gen_oplist( - output_path=options.output_path, - model_file_path=options.model_file_path, - ops_schema_yaml_path=options.ops_schema_yaml_path, - root_ops=options.root_ops, - ops_dict=options.ops_dict, - include_all_operators=options.include_all_operators, - ) + try: + gen_oplist( + output_path=options.output_path, + model_file_path=options.model_file_path, + ops_schema_yaml_path=options.ops_schema_yaml_path, + root_ops=options.root_ops, + ops_dict=options.ops_dict, + include_all_operators=options.include_all_operators, + ) + except Exception as e: + command = ["python codegen/tools/gen_oplist.py"] + if options.model_file_path: + command.append(f"--model_file_path {options.model_file_path}") + if options.ops_schema_yaml_path: + command.append(f"--ops_schema_yaml_path {options.ops_schema_yaml_path}") + if options.root_ops: + command.append(f"--root_ops {options.root_ops}") + if options.ops_dict: + command.append(f"--ops_dict {options.ops_dict}") + if options.include_all_operators: + command.append("--include-all-operators") + repro_command = " ".join(command) + raise RuntimeError( + f"""Failed to generate selected_operators.yaml. 
Repro command: + {repro_command} + """ + ) from e if __name__ == "__main__": diff --git a/codegen/tools/test/test_gen_oplist.py b/codegen/tools/test/test_gen_oplist.py index d455ddb6899..bd1d0082489 100644 --- a/codegen/tools/test/test_gen_oplist.py +++ b/codegen/tools/test/test_gen_oplist.py @@ -42,7 +42,7 @@ def test_gen_op_list_with_wrong_path( mock_get_operators: NonCallableMock, ) -> None: args = ["--output_path=wrong_path", "--model_file_path=path2"] - with self.assertRaises(AssertionError): + with self.assertRaises(RuntimeError): gen_oplist.main(args) @patch("executorch.codegen.tools.gen_oplist._get_kernel_metadata_for_model") diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index eb3cedd5b3f..885c509246b 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -38,12 +38,11 @@ list(FILTER _portable_kernels__srcs EXCLUDE REGEX "test/*.cpp") list(FILTER _portable_kernels__srcs EXCLUDE REGEX "codegen") # Generate C++ bindings to register kernels into both PyTorch (for AOT) and # Executorch (for runtime). Here select all ops in functions.yaml -set(_yaml "${CMAKE_CURRENT_LIST_DIR}/functions.yaml") +set(_yaml "${CMAKE_CURRENT_SOURCE_DIR}/functions.yaml") gen_selected_ops(LIB_NAME "portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}") # Expect gen_selected_ops output file to be selected_operators.yaml generate_bindings_for_kernels( - LIB_NAME "portable_ops_lib" FUNCTIONS_YAML - ${CMAKE_CURRENT_SOURCE_DIR}/functions.yaml + LIB_NAME "portable_ops_lib" FUNCTIONS_YAML "${_yaml}" ) message("Generated files ${gen_command_sources}") From 61ddee54058bf3fc7e17eb539d61d39d12cc86f8 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 30 Aug 2024 18:07:46 -0700 Subject: [PATCH 136/531] Android library update for benchmarking support Test: ``` cd extension/android/benchmark mkdir app/libs cp app/libs/executorch.aar ./gradlew :app:installDebug adb shell am start -n org.pytorch.minibench/org.pytorch.minibench.BenchmarkActivity --es model_path /data/local/tmp/model.pte adb shell run-as org.pytorch.minibench cat files/benchmark_results.txt ``` --- build/build_android_llm_demo.sh | 1 + extension/android/CMakeLists.txt | 11 +++++++++-- extension/android/jni/jni_layer.cpp | 16 ++++++++++++---- extension/android/jni/jni_layer_constants.h | 4 ++-- .../java/org/pytorch/executorch/Module.java | 6 ++++++ .../org/pytorch/executorch/NativePeer.java | 18 ++++++++++++++++++ extension/module/module.h | 2 ++ 7 files changed, 50 insertions(+), 8 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 4d34eb95b23..7b7150de210 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -30,6 +30,7 @@ build_android_native_library() { -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index daa9c7c2496..6827ae79040 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -32,8 +32,15 @@ find_package(executorch CONFIG REQUIRED) target_link_options_shared_lib(executorch) set(link_libraries) -list(APPEND link_libraries extension_data_loader extension_module extension_threadpool executorch - fbjni +list( + 
APPEND + link_libraries + executorch + extension_data_loader + extension_module + extension_runner_util + extension_threadpool + fbjni ) if(TARGET optimized_native_cpu_ops_lib) diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index c70912a5451..79c6ebc5161 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -18,6 +18,7 @@ #include "jni_layer_constants.h" #include +#include #include #include #include @@ -56,7 +57,7 @@ void et_pal_emit_log_message( using namespace torch::executor; -namespace executorch_jni { +namespace executorch::extension { class TensorHybrid : public facebook::jni::HybridClass { public: constexpr static const char* kJavaDescriptor = @@ -352,19 +353,26 @@ class ExecuTorchJni : public facebook::jni::HybridClass { return jresult; } + jint forward_ones() { + auto&& load_result = module_->load_method("forward"); + auto&& buf = prepare_input_tensors(*(module_->methods_["forward"].method)); + auto&& result = module_->methods_["forward"].method->execute(); + return (jint)result; + } + static void registerNatives() { registerHybrid({ makeNativeMethod("initHybrid", ExecuTorchJni::initHybrid), makeNativeMethod("forward", ExecuTorchJni::forward), makeNativeMethod("execute", ExecuTorchJni::execute), makeNativeMethod("loadMethod", ExecuTorchJni::load_method), + makeNativeMethod("forwardOnes", ExecuTorchJni::forward_ones), }); } }; - -} // namespace executorch_jni +} // namespace executorch::extension JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { return facebook::jni::initialize( - vm, [] { executorch_jni::ExecuTorchJni::registerNatives(); }); + vm, [] { executorch::extension::ExecuTorchJni::registerNatives(); }); } diff --git a/extension/android/jni/jni_layer_constants.h b/extension/android/jni/jni_layer_constants.h index ac52b3a650d..43946ffab6e 100644 --- a/extension/android/jni/jni_layer_constants.h +++ b/extension/android/jni/jni_layer_constants.h @@ -10,7 +10,7 @@ #include -namespace executorch_jni { +namespace executorch::extension { constexpr static int kTensorDTypeUInt8 = 0; constexpr static int kTensorDTypeInt8 = 1; @@ -93,4 +93,4 @@ const std::unordered_map java_dtype_to_scalar_type = { {kTensorDTypeBits16, ScalarType::Bits16}, }; -} // namespace executorch_jni +} // namespace executorch::extension diff --git a/extension/android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/src/main/java/org/pytorch/executorch/Module.java index 5e57174114d..981cfcd8c62 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Module.java +++ b/extension/android/src/main/java/org/pytorch/executorch/Module.java @@ -79,6 +79,12 @@ public static Module load(final String modelPath) { * @return return value from the 'forward' method. */ public EValue[] forward(EValue... 
inputs) { + if (inputs.length == 0) { + // forward default args (ones) + mNativePeer.forwardOnes(); + // discard the return value + return null; + } return mNativePeer.forward(inputs); } diff --git a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java index 865c503765d..6eadbf05097 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java +++ b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java @@ -13,6 +13,7 @@ import com.facebook.soloader.nativeloader.NativeLoader; import java.util.Map; +/** Interface for the native peer object for entry points to the Module */ class NativePeer { static { // Loads libexecutorch.so from jniLibs @@ -29,16 +30,33 @@ private static native HybridData initHybrid( mHybridData = initHybrid(moduleAbsolutePath, extraFiles, loadMode); } + /** Clean up the native resources associated with this instance */ public void resetNative() { mHybridData.resetNative(); } + /** Run a "forward" call with the given inputs */ @DoNotStrip public native EValue[] forward(EValue... inputs); + /** + * Run a "forward" call with the sample inputs (ones) to test a module + * + * @return the outputs of the forward call + * @apiNote This is experimental and test-only API + */ + @DoNotStrip + public native int forwardOnes(); + + /** Run an arbitrary method on the module */ @DoNotStrip public native EValue[] execute(String methodName, EValue... inputs); + /** + * Load a method on this module. + * + * @return the Error code if there was an error loading the method + */ @DoNotStrip public native int loadMethod(String methodName); } diff --git a/extension/module/module.h b/extension/module/module.h index 689fef5cd29..8ae7e436556 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -358,6 +358,8 @@ class Module final { std::unique_ptr<::executorch::runtime::MemoryAllocator> temp_allocator_; std::unique_ptr<::executorch::runtime::EventTracer> event_tracer_; std::unordered_map methods_; + + friend class ExecuTorchJni; }; } // namespace extension From 05010e893aece78585a7e6d2a0f2a8f9dac6a041 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Tue, 3 Sep 2024 09:28:52 -0700 Subject: [PATCH 137/531] [ET-VK] Remove unused Allocator function Differential Revision: D62049777 Pull Request resolved: https://github.com/pytorch/executorch/pull/5013 --- backends/vulkan/runtime/vk_api/memory/Allocator.cpp | 11 ----------- backends/vulkan/runtime/vk_api/memory/Allocator.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 1dadca27a0b..f7428f12b67 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -162,17 +162,6 @@ VulkanBuffer Allocator::create_storage_buffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - - VkBufferUsageFlags buffer_usage = - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); -} - VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize 
size) { VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY | diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 904163cefb4..6d8ee09ae5d 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -67,8 +67,6 @@ class Allocator final { const bool gpu_only = true, const bool allocate_memory = true); - VulkanBuffer create_staging_buffer(const VkDeviceSize); - /* * Create a uniform buffer with a specified size */ From ef3c53d567370e1ede308c0187712cc59fe45dcc Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:57:45 -0400 Subject: [PATCH 138/531] Update pytorch pin for ET Differential Revision: D62081793 Pull Request resolved: https://github.com/pytorch/executorch/pull/5026 --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 14422e45d7c..b291722c3f0 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -e4cd76cf8283c8ddbf95674b020fbfcff467cb4b +00e3eea170ce5db8ea9c62ce5e48f13886cd6d20 diff --git a/install_requirements.py b/install_requirements.py index 1f5982c80e0..64243ec6943 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -94,7 +94,7 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20240829" +NIGHTLY_VERSION = "dev20240901" # The pip repository that hosts nightly torch packages. TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" From 9e8ffbbf2ae6e177d1dfb6a63eee10db5bce66d5 Mon Sep 17 00:00:00 2001 From: Mengtao Yuan Date: Tue, 3 Sep 2024 10:36:51 -0700 Subject: [PATCH 139/531] [evaluate] pte mode: Add error message and instruction for full logits. Differential Revision: D62113387 Pull Request resolved: https://github.com/pytorch/executorch/pull/5031 --- examples/models/llama2/eval_llama_lib.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 9e27b987bb8..3ea4e77a1a6 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -62,15 +62,19 @@ def _model_call(self, inps): # Given inps (tokens), return the logits from a single forward call # inps: Tensor of shape (1, max_seq_len - 1) # logits: Tensor of shape (1, max_seq_len - 1, vocab_size) + result = [] if self._use_kv_cache: pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) result = self._et_model.forward( (inps[:, : self._max_seq_length], pos_tensor) ) - return result[0] else: result = self._et_model.forward((inps,)) - return result[0] + if result[0].dim() != 3: + raise ValueError( + f"Dim of logits must be 3 for evaluation. Got {result[0].dim()} here. Add --generate_full_logits in export_llama to generate a pte file with full logits." 
+ ) + return result[0] class ETRunnerEvalWrapper(EagerEvalWrapper): From 4732749679807c82ed5ef87ab64adbcebea4a979 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Tue, 3 Sep 2024 19:58:32 +0200 Subject: [PATCH 140/531] Add pass for adjusting conv2d input shape to parameters Differential Revision: D62034797 Pull Request resolved: https://github.com/pytorch/executorch/pull/4903 --- backends/arm/operators/op_conv2d.py | 2 +- .../annotate_channels_last_dim_order_pass.py | 4 +- backends/arm/passes/arm_pass_manager.py | 2 + .../arm/passes/size_adjust_conv2d_pass.py | 129 ++++++++++++++++++ backends/arm/test/ops/test_conv.py | 24 +++- 5 files changed, 155 insertions(+), 6 deletions(-) create mode 100644 backends/arm/passes/size_adjust_conv2d_pass.py diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 323b11601cb..9437e96f5e9 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -40,7 +40,7 @@ def adjust_pad_if_needed(self, input, weight, stride, pad, dilation): if mod_remainder > pad: raise RuntimeError( - f"ignoring input element is not currently supported, got a large stride {stride}" + "This case should be handled by the SizeAdjustConv2d pass, is it enabled?" ) return pad - mod_remainder diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py index 9bb45c504a4..ea3c171c580 100644 --- a/backends/arm/passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/passes/annotate_channels_last_dim_order_pass.py @@ -46,7 +46,9 @@ def call(self, graph_module: torch.fx.GraphModule): NHWC_Order = (0, 2, 3, 1) HWCM_Order = (2, 3, 0, 1) for node in graph_module.graph.nodes: - if isinstance(node.meta["val"], tuple): + if isinstance( + node.meta["val"], (tuple, torch.fx.immutable_collections.immutable_list) + ): node_data = node.meta["val"][0].data else: node_data = node.meta["val"].data diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 054d823dbbb..8cac53b1347 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -16,6 +16,7 @@ ConvertSplitToSlicePass, ) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass +from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.pass_manager import PassManager @@ -29,6 +30,7 @@ def transform_to_backend_pipeline( self, graph_module: torch.fx.Graph, compile_spec: CompileSpec ): """Apply passes before transforming program to backend""" + self.add_pass(SizeAdjustConv2DPass()) self.add_pass(RemoveClonePass()) self.add_pass(ConvertExpandCopyToRepeatPass()) self.add_pass(ConvertSplitToSlicePass()) diff --git a/backends/arm/passes/size_adjust_conv2d_pass.py b/backends/arm/passes/size_adjust_conv2d_pass.py new file mode 100644 index 00000000000..25d27e7f40f --- /dev/null +++ b/backends/arm/passes/size_adjust_conv2d_pass.py @@ -0,0 +1,129 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
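+
+# Illustrative background (not part of the original pass): for one spatial
+# dimension a convolution produces
+#
+#   out_length = (in_length + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1
+#
+# output elements, with
+#
+#   remainder = (in_length + 2 * pad - dilation * (kernel - 1) - 1) % stride
+#
+# left over. When remainder > pad, the last (remainder - pad) input elements
+# never contribute to any output, so this pass slices them off. For example,
+# in_length=13, kernel=2, stride=2, pad=0, dilation=1 gives remainder 1, and
+# one element is removed (see the conv2d_2x2_1x1x14x13_st2 test case added in
+# this patch).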
+ +from typing import cast, Optional + +import torch.fx +from executorch.backends.arm.tosa_quant_utils import is_quant_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch._ops import OpOverload + + +def conv_remainder(input_length, pad, dilation, weight, stride): + """ + Returns the size + """ + return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride + + +def insert_q_dq_pair( + graph: torch.fx.Graph, + anchor: torch.fx.Node, + q_params: tuple, +): + with graph.inserting_after(anchor): + q = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + args=(), # We add the argument last + ) + q.meta = anchor.meta + + with graph.inserting_after(q): + dq = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + args=(q,) + q_params, + ) + dq.meta = q.meta + + anchor.replace_all_uses_with(dq) + # We add this last so the replace all uses above does not replace the quantized + # node's first use + q.args = (anchor,) + q_params + return dq + + +def create_node( + graph: torch.fx.Graph, + op_target: OpOverload, + args: tuple = (), + kwargs: Optional[dict] = None, +): + return graph.create_node( + "call_function", + op_target, + args=args, + kwargs=kwargs or {}, + ) + + +class SizeAdjustConv2DPass(ExportPass): + """ + Adjust the convolution input size to match perfectly with the + weight size, padding, stride and dilation parameters. + This is done by inserting a slice op to remove the uneven end of the input. + """ + + conv2d_op = exir_ops.edge.aten.convolution.default + slice_op = exir_ops.edge.aten.slice_copy.Tensor + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + modified_graph = False + for node in graph.nodes: + if node.op != "call_function": + continue + if node.target != self.conv2d_op: + continue + + conv_node = cast(torch.fx.Node, node) + input_node, weight, _, stride_hw, pad_hw, dilation_hw, _, _, _ = ( + conv_node.args + ) + weight_shape = weight.meta["val"].shape + input_shape = input_node.meta["val"].shape + + slice_args = [] + for stride, pad, dilation, dim in zip( + cast(list, stride_hw), + cast(list, pad_hw), + cast(list, dilation_hw), + (2, 3), + ): + remainder = conv_remainder( + input_shape[dim], pad, dilation, weight_shape[dim], stride + ) + if remainder > pad: + adjustment = remainder - pad + args = (dim, 0, input_shape[dim] - adjustment) + slice_args.append(args) + if len(slice_args) == 0: + continue + + with graph_module.graph.inserting_before(node): + last_node = cast(torch.fx.Node, input_node) + for args in slice_args: + slice_node = graph.create_node( + "call_function", self.slice_op, (last_node,) + args + ) + if is_quant_node(last_node): + q_params = last_node.args[1:] + dq_node = insert_q_dq_pair( + graph_module.graph, slice_node, q_params + ) + last_node = dq_node + else: + last_node = slice_node + conv_node.replace_input_with(input_node, last_node) + modified_graph = True + + if modified_graph: + graph_module = super().call(graph_module).graph_module + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index ae1c5a65a83..82748799533 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -155,14 +155,14 @@ def forward(self, x): batches=1, ) -conv2d_2x2_1x1x14x14_st2 = 
Conv2d( +conv2d_2x2_1x1x14x13_st2 = Conv2d( in_channels=1, out_channels=1, kernel_size=(2, 2), stride=2, padding=0, width=14, - height=14, + height=13, batches=1, ) @@ -188,6 +188,18 @@ def forward(self, x): batches=1, ) +conv2d_5x5_1x3x14x15_st3_pd1 = Conv2d( + in_channels=3, + out_channels=16, + kernel_size=(5, 5), + stride=3, + padding=1, + width=14, + height=15, + batches=1, +) + + two_conv2d_nobias = Conv2d( nbr_conv=2, width=256, @@ -221,7 +233,8 @@ def forward(self, x): ("3x3_1x3x256x256_st1", conv2d_3x3_1x3x256x256_st1), ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1), ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1), - ("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2), + ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2), + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1), ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1), ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1), ("two_conv2d_nobias", two_conv2d_nobias), @@ -236,7 +249,10 @@ def forward(self, x): testsuite_u55.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) # Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191. -testsuite_u55.remove(("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2)) +testsuite_u55.remove(("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2)) +testsuite_u55.remove( + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1) +) class TestConv2D(unittest.TestCase): From 1c2e57f40b5c5d739325920aad38f0a0d831306d Mon Sep 17 00:00:00 2001 From: Zingo Andersen Date: Tue, 3 Sep 2024 20:02:54 +0200 Subject: [PATCH 141/531] Arm backend report Ethos-U PMU counters Differential Revision: D61882383 Pull Request resolved: https://github.com/pytorch/executorch/pull/4904 --- backends/arm/runtime/ArmBackendEthosU.cpp | 18 +- examples/arm/executor_runner/CMakeLists.txt | 2 +- .../executor_runner/arm_executor_runner.cpp | 7 + .../arm/executor_runner/arm_perf_monitor.cpp | 173 ++++++++++++++++++ .../arm/executor_runner/arm_perf_monitor.h | 10 + 5 files changed, 207 insertions(+), 3 deletions(-) create mode 100644 examples/arm/executor_runner/arm_perf_monitor.cpp create mode 100644 examples/arm/executor_runner/arm_perf_monitor.h diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 74042935515..9f9ea8ec9fa 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -14,7 +14,6 @@ #include #include -#include #include "executorch/backends/arm/runtime/VelaBinStream.h" #include "executorch/runtime/backend/interface.h" @@ -32,6 +31,21 @@ typedef struct { bool permuted_io_flag; } ExecutionHandle; +extern "C" { +void __attribute__((weak)) ArmBackend_execute_begin() {} +void __attribute__((weak)) ArmBackend_execute_end() {} +} + +class ArmBackendExecuteCallbacks { + public: + ArmBackendExecuteCallbacks() { + ArmBackend_execute_begin(); + } + ~ArmBackendExecuteCallbacks() { + ArmBackend_execute_end(); + } +}; + class ArmBackend final : public PyTorchBackendInterface { public: ArmBackend() {} @@ -83,6 +97,7 @@ class ArmBackend final : public PyTorchBackendInterface { ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle; VelaHandles handles; + ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; // Command stream - we know at this point it's aligned char* data = (char*)execution_handle->processed->data(); ET_LOG(Info, "ArmBackend::execute %p", data); @@ -233,7 +248,6 @@ class ArmBackend final : 
public PyTorchBackendInterface { } } } - return Error::Ok; } diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 136f72ee756..b32b2d8d2bf 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -133,7 +133,7 @@ endif() # The arm_executor_runner executable add_executable(arm_executor_runner) -target_sources(arm_executor_runner PRIVATE arm_executor_runner.cpp) +target_sources(arm_executor_runner PRIVATE arm_executor_runner.cpp arm_perf_monitor.cpp) # Include the target's bare-metal linker script ethosu_eval_link_options(arm_executor_runner) diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 8605038936a..f8f9d34ecfc 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -20,6 +20,8 @@ #include #include +#include "arm_perf_monitor.h" + /** * This header file is generated by the build process based on the .pte file * specified in the ET_PTE_FILE_PATH variable to the cmake build. @@ -350,7 +352,10 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "Input prepared."); ET_LOG(Info, "Starting the model execution..."); + StartMeasurements(); Error status = method->execute(); + StopMeasurements(); + if (status != Error::Ok) { ET_LOG( Info, @@ -368,6 +373,8 @@ int main(int argc, const char* argv[]) { for (int i = 0; i < outputs.size(); ++i) { Tensor t = outputs[i].toTensor(); #ifndef SEMIHOSTING + // The output might be collected and parsed so printf() is used instead + // of ET_LOG() here for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { if (t.scalar_type() == ScalarType::Int) { printf( diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp new file mode 100644 index 00000000000..c53d28baab4 --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -0,0 +1,173 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include "arm_perf_monitor.h" + +#ifdef ETHOSU +#include +#include +#include + +static uint32_t ethosu_inference_count = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCount = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCount = 0; +static uint64_t ethosu_pmuCycleCount = 0; +static std::vector ethosu_pmuEventCounts( + ETHOSU_PMU_Get_NumEventCounters(), + 0); + +static const uint32_t ethosu_pmuCountersUsed = 4; +// ethosu_pmuCountersUsed should match numbers of counters setup in +// ethosu_inference_begin() and not be more then the HW supports +static_assert(ETHOSU_PMU_NCOUNTERS >= ethosu_pmuCountersUsed); + +extern "C" { + +// Callback invoked at start of NPU execution +void ethosu_inference_begin(struct ethosu_driver* drv, void*) { + // Enable PMU + ETHOSU_PMU_Enable(drv); + ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE); + ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE); + + // Setup 4 counters + ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN); + ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_NPU_IDLE); + // Enable 4 counters + ETHOSU_PMU_CNTR_Enable(drv, 0xf); + + ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); + ETHOSU_PMU_CYCCNT_Reset(drv); + + // Reset all counters + ETHOSU_PMU_EVCNTR_ALL_Reset(drv); + + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ethosu_inference_end() + ethosu_ArmWhenNPURunCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of NPU execution +void ethosu_inference_end(struct ethosu_driver* drv, void*) { + ethosu_inference_count++; + ethosu_pmuCycleCount += ETHOSU_PMU_Get_CCNTR(drv); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] += ETHOSU_PMU_Get_EVCNTR(drv, i); + } + ETHOSU_PMU_Disable(drv); + // Add Cortex-M cycle clock used during this NPU execution + ethosu_ArmWhenNPURunCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmWhenNPURunCycleCountStart); +} + +// Callback invoked at start of ArmBackend::execute() +void ArmBackend_execute_begin() { + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ArmBackend_execute_end() + ethosu_ArmBackendExecuteCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of ArmBackend::execute() +void ArmBackend_execute_end() { + // Add Cortex-M cycle clock used during this ArmBackend::execute() + ethosu_ArmBackendExecuteCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmBackendExecuteCycleCountStart); +} +} + +void StartMeasurements() { + ethosu_ArmBackendExecuteCycleCount = 0; + ethosu_ArmWhenNPURunCycleCount = 0; + ethosu_pmuCycleCount = 0; + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] = 0; + } + ARM_PMU_Enable(); + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable + ARM_PMU_CYCCNT_Reset(); + ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); +} + +void StopMeasurements() { + ARM_PMU_CNTR_Disable( + PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk | + PMU_CNTENCLR_CNT1_ENABLE_Msk); + uint32_t cycle_count = ARM_PMU_Get_CCNTR(); + + // Number of comand streams handled by the NPU + ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count); + ET_LOG(Info, "Profiler report, CPU cycles per operator:"); + // This is number 
of CPU cycles for the ethos-u operator from start to finish + // in the framework If there is more then one commandstream the time is added + // together + ET_LOG( + Info, + "ethos-u : cycle_cnt : %d cycles", + ethosu_ArmBackendExecuteCycleCount); + // We could print a list of the cycles used by the other delegates here in the + // future but now we only print ethos-u: this means that "Operator(s) total: + // ..." will be the same number as ethos-u : cycle_cnt and not the sum of all + ET_LOG( + Info, + "Operator(s) total: %d CPU cycles", + ethosu_ArmBackendExecuteCycleCount); + // Total CPU cycles used in the executorch method->execute() + // Other delegates and no delegates are counted in this + ET_LOG(Info, "Inference runtime: %d CPU cycles total", cycle_count); + + ET_LOG( + Info, + "NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency"); + + // Avoid division with zero if ARM_PMU_Get_CCNTR() is not enabled properly. + if (cycle_count == 0) { + ET_LOG(Info, "Inference CPU ratio: ?.?? %%"); + ET_LOG(Info, "Inference NPU ratio: ?.?? %%"); + } else { + ET_LOG( + Info, + "Inference CPU ratio: %.2f %%", + 100.0 * (cycle_count - ethosu_ArmWhenNPURunCycleCount) / cycle_count); + ET_LOG( + Info, + "Inference NPU ratio: %.2f %%", + 100.0 * ethosu_ArmWhenNPURunCycleCount / cycle_count); + } + + // CPU cycles used by NPU, e.g. number of CPU cycles used between + // ethosu_inference_begin() and ethosu_inference_end() + // If there is more then one commandstream the time is added together + ET_LOG( + Info, + "cpu_wait_for_npu_cntr : %" PRIu64 " CPU cycles", + ethosu_ArmWhenNPURunCycleCount); + + ET_LOG(Info, "Ethos-U PMU report:"); + ET_LOG(Info, "ethosu_pmu_cycle_cntr : %" PRIu64, ethosu_pmuCycleCount); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ET_LOG(Info, "ethosu_pmu_cntr%zd : %" PRIu64, i, ethosu_pmuEventCounts[i]); + } + ET_LOG( + Info, + "Ethos-U PMU Events:[ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]"); +} + +#else +void StartMeasurements() {} + +void StopMeasurements() {} + +#endif diff --git a/examples/arm/executor_runner/arm_perf_monitor.h b/examples/arm/executor_runner/arm_perf_monitor.h new file mode 100644 index 00000000000..3925a9a5713 --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.h @@ -0,0 +1,10 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +void StartMeasurements(); +void StopMeasurements(); From f824c1e5a71ce7a857acb85cfd5551597eb1dfd8 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Tue, 3 Sep 2024 20:02:59 +0200 Subject: [PATCH 142/531] Add exp and log ops to Arm backend Differential Revision: D62035484 Pull Request resolved: https://github.com/pytorch/executorch/pull/5001 --- backends/arm/arm_partitioner.py | 2 + backends/arm/operators/__init__.py | 2 + backends/arm/operators/op_exp.py | 81 +++++++++++++ backends/arm/operators/op_log.py | 81 +++++++++++++ backends/arm/quantizer/arm_quantizer.py | 1 + .../quantization_annotation/__init__.py | 1 + .../one_to_one_annotator.py | 55 +++++++++ backends/arm/test/ops/test_exp.py | 110 ++++++++++++++++++ backends/arm/test/ops/test_log.py | 110 ++++++++++++++++++ 9 files changed, 443 insertions(+) create mode 100644 backends/arm/operators/op_exp.py create mode 100644 backends/arm/operators/op_log.py create mode 100644 backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py create mode 100644 backends/arm/test/ops/test_exp.py create mode 100644 backends/arm/test/ops/test_log.py diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index bee8b8a27f9..524316613ff 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -45,6 +45,8 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.convolution.default, exir_ops.edge.aten.div.Tensor, + exir_ops.edge.aten.exp.default, + exir_ops.edge.aten.log.default, exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.mul.Tensor, diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index fb5e46c5c2d..5c1109eec1f 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -14,9 +14,11 @@ op_conv2d, op_dequant, op_div, + op_exp, op_full, op_get_item, op_hardtanh, + op_log, op_mean_dim, op_mm, op_mul, diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py new file mode 100644 index 00000000000..f9319b5ea8b --- /dev/null +++ b/backends/arm/operators/op_exp.py @@ -0,0 +1,81 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class ExpVisitor(NodeVisitor): + target = "aten.exp.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. 
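+            # Illustrative note: each of the 256 table entries maps an int8
+            # input value q to the quantized exp of its dequantized value,
+            # roughly
+            #   table[q] = quantize_value(exp(dequantize_value(q, in_quantargs)), out_quantargs)
+            # so at runtime a float exp() is replaced by a single TOSA TABLE
+            # lookup (see exp_table_8bit below).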
+ input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = exp_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name]) + + +def exp_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to exp([qmin,qmax]) + """ + + def exp(x): + # Convert quantized input to floating point exp input space. + v = dequantize_value(x, in_quantargs) + # Compute exp. + v = np.exp(v) + # Convert exp output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + exp(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py new file mode 100644 index 00000000000..a76eb57f710 --- /dev/null +++ b/backends/arm/operators/op_log.py @@ -0,0 +1,81 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class LogVisitor(NodeVisitor): + target = "aten.log.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. + input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = log_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().LOG, [inputs[0].name], [output.name]) + + +def log_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to log([qmin,qmax]) + """ + + def log(x): + # Convert quantized input to floating point log input space. + v = dequantize_value(x, in_quantargs) + # Compute log. + v = np.log(v) + # Convert log output back to quantized space. 
+ return quantize_value(v, out_quantargs) + + return [ + log(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 26920383524..e8fb78fea49 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -268,6 +268,7 @@ class ArmQuantizer(Quantizer): "sigmoid", "mm", "cat", + "one_to_one", ] def __init__(self) -> None: diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py index 68ad522feeb..f3017c2d7df 100644 --- a/backends/arm/quantizer/quantization_annotation/__init__.py +++ b/backends/arm/quantizer/quantization_annotation/__init__.py @@ -55,6 +55,7 @@ def decorator(annotator: AnnotatorType): max_pool2d_annotator, mm_annotator, mul_annotator, + one_to_one_annotator, sigmoid_annotator, sub_annotator, ) diff --git a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py new file mode 100644 index 00000000000..2c3c485b055 --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py @@ -0,0 +1,55 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, List, Optional + +import torch +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) +from torch.fx import Node + + +@register_annotator("one_to_one") +def _annotate_one_to_one( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """ + This annotator adds the input and output qspec from the quantization config to + ops in 'one_to_one_ops' that have the following properties: + - Have a single input and single output. + - Can handle different qspecs on the input and output. + + Typical ops are ops implemented with a lookup table. + """ + annotated_partitions = [] + one_to_one_ops = (torch.ops.aten.exp.default, torch.ops.aten.log.default) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in one_to_one_ops: + continue + if filter_fn and not filter_fn(node): + continue + input_node = node.args[0] + + if not arm_quantizer_utils.is_annotated(node): + _annotate_input_qspec_map( + node, + input_node, + quantization_config.get_input_act_qspec(), + ) + _annotate_output_qspec(node, quantization_config.get_output_act_qspec()) + + arm_quantizer_utils.mark_nodes_as_annotated([node]) + annotated_partitions.append([node]) + + return annotated_partitions diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py new file mode 100644 index 00000000000..79020ade25c --- /dev/null +++ b/backends/arm/test/ops/test_exp.py @@ -0,0 +1,110 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestExp(unittest.TestCase): + """Tests lowering of aten.exp""" + + class Exp(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.exp(x) + + def _test_exp_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.exp.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_exp_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check(["torch.ops.aten.exp.default"]) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_exp_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.exp.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_exp_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_exp_tosa_MI_pipeline(self.Exp(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_exp_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_exp_tosa_BI_pipeline(self.Exp(), (test_data,)) + + # Fails due to Vela diff from Tosa spec, expected to work with Regor. + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + def test_exp_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_exp_tosa_u55_BI_pipeline(self.Exp(), (test_data,)) diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py new file mode 100644 index 00000000000..80bc17c987f --- /dev/null +++ b/backends/arm/test/ops/test_log.py @@ -0,0 +1,110 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from parameterized import parameterized + +test_data_suite = [ + # (test_name, test_data) + ("ones_rank4", torch.ones(1, 10, 10, 10)), + ("ones_rank3", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) + 0.001), + ("randn_pos", torch.randn(10) + 10), + ("randn_spread", torch.max(torch.Tensor([0.0]), torch.randn(10) * 100)), + ("ramp", torch.arange(0.01, 20, 0.2)), +] + + +class TestLog(unittest.TestCase): + """Tests lowering of aten.log""" + + class Log(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.log(x) + + def _test_log_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.log.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check(["torch.ops.aten.log.default"]) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_u55_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.log.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_log_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_log_tosa_MI_pipeline(self.Log(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_log_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_tosa_BI_pipeline(self.Log(), (test_data,)) + + # Fails due to Vela diff from Tosa spec, logected to work with Regor. + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + def test_log_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_tosa_u55_BI_pipeline(self.Log(), (test_data,)) From b73312f194c9373f203f951943b39e688df0db11 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 11:53:46 -0700 Subject: [PATCH 143/531] Pass images by value to allow rvalue args. 
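Taking the image list by value lets callers hand over a temporary or an
explicitly moved vector without an extra copy; the callee then owns whatever
was passed in. A minimal illustrative calling pattern (the helper name is a
placeholder, not part of this change):

    std::vector<Image> images = load_images(image_paths);
    // The vector is moved into the by-value parameter; no deep copy is made.
    runner.generate(std::move(images), prompt, seq_len);
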
Differential Revision: D62141582 Pull Request resolved: https://github.com/pytorch/executorch/pull/5034 --- examples/models/llava/main.cpp | 2 +- examples/models/llava/runner/llava_runner.cpp | 2 +- examples/models/llava/runner/llava_runner.h | 2 +- examples/models/llava/runner/targets.bzl | 3 +++ extension/android/jni/jni_layer_llama.cpp | 2 +- extension/llm/runner/multimodal_runner.h | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 431f86c906e..171eb77077f 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -103,6 +103,6 @@ int32_t main(int32_t argc, char** argv) { .width = static_cast(image_tensor.size(2)), .height = static_cast(image_tensor.size(1))}}; // generate - runner.generate(images, prompt, seq_len); + runner.generate(std::move(images), prompt, seq_len); return 0; } diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index b186af892f1..cb968ca88d8 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -74,7 +74,7 @@ Error LlavaRunner::load() { } Error LlavaRunner::generate( - std::vector& images, + std::vector images, const std::string& prompt, int32_t seq_len, std::function token_callback, diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index 13d842e30fe..9b14bc9283a 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -31,7 +31,7 @@ class LlavaRunner : public MultimodalRunner { bool is_loaded(); Error load(); Error generate( - std::vector& images, + std::vector images, const std::string& prompt, int32_t seq_len = 1024, std::function token_callback = {}, diff --git a/examples/models/llava/runner/targets.bzl b/examples/models/llava/runner/targets.bzl index 435ab2a8c70..72942acf16f 100644 --- a/examples/models/llava/runner/targets.bzl +++ b/examples/models/llava/runner/targets.bzl @@ -8,6 +8,9 @@ def define_common_targets(): visibility = [ "@EXECUTORCH_CLIENTS", ], + compiler_flags = [ + "-Wno-global-constructors", + ], exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", "//executorch/extension/llm/runner:runner_lib", diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 4f67d04396c..dda9ece589d 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -165,7 +165,7 @@ class ExecuTorchLlamaJni images.push_back(image_runner); } multi_modal_runner_->generate( - images, + std::move(images), prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 745f086f803..dbffac46fce 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -56,7 +56,7 @@ class MultimodalRunner { virtual bool is_loaded() = 0; virtual ::executorch::runtime::Error load() = 0; virtual ::executorch::runtime::Error generate( - std::vector& images, + std::vector images, const std::string& prompt, int32_t seq_len = 1024, std::function token_callback = {}, From 5af5ed09db5515a30d68e9460262d0eee572858f Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:30:08 -0700 Subject: [PATCH 144/531] [ET-VK] Simplify Allocator's 
buffer creation methods Differential Revision: D62049778 Pull Request resolved: https://github.com/pytorch/executorch/pull/5014 --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- .../runtime/api/containers/StagingBuffer.h | 8 ++-- .../vulkan/runtime/api/containers/Tensor.cpp | 2 +- .../runtime/vk_api/memory/Allocator.cpp | 40 +++++++++---------- .../vulkan/runtime/vk_api/memory/Allocator.h | 3 +- install_requirements.py | 2 +- 6 files changed, 28 insertions(+), 29 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index b291722c3f0..14422e45d7c 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -00e3eea170ce5db8ea9c62ce5e48f13886cd6d20 +e4cd76cf8283c8ddbf95674b020fbfcff467cb4b diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h index ab650c09a43..a24728470b0 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -29,15 +29,13 @@ class StagingBuffer final { StagingBuffer( Context* context_p, const vkapi::ScalarType dtype, - const size_t numel, - const bool gpuonly = false) + const size_t numel) : context_p_(context_p), dtype_(dtype), numel_(numel), nbytes_(element_size(dtype_) * numel_), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_storage_buffer( - nbytes_, - gpuonly)) {} + vulkan_buffer_( + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)) {} StagingBuffer(const StagingBuffer&) = delete; StagingBuffer& operator=(const StagingBuffer&) = delete; diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 578898ad194..7b9d30ef658 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -540,7 +540,7 @@ vkapi::VulkanBuffer allocate_buffer( } return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, /*gpu_only = */ true, allocate_memory); + element_size(dtype) * numel, allocate_memory); } vTensorStorage::vTensorStorage( diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index f7428f12b67..b990cf6a119 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -132,9 +132,27 @@ VulkanImage Allocator::create_image( allocate_memory); } +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { + const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + + // Staging buffers are accessed by both the CPU and GPU, so set the + // appropriate flags to indicate that the host device will be accessing + // the data from this buffer. 
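+  // Concretely: HOST_VISIBLE memory is required so the buffer can be mapped,
+  // a host-local heap is preferred, and HOST_COHERENT / HOST_CACHED memory is
+  // preferred so mapped reads and writes stay cheap. Device-only storage
+  // buffers (see create_storage_buffer below) instead keep
+  // VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE and skip the host-access flags.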
+ alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + alloc_create_info.preferredFlags = + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); +} + VulkanBuffer Allocator::create_storage_buffer( const VkDeviceSize size, - const bool gpu_only, const bool allocate_memory) { const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -142,22 +160,6 @@ VulkanBuffer Allocator::create_storage_buffer( alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - // The create storage buffer will be accessed by both the CPU and GPU, so set - // the appropriate flags to indicate that the host device will be accessing - // the data from this buffer. - if (!gpu_only) { - // Deferred memory allocation should only be used for GPU only buffers. - VK_CHECK_COND( - allocate_memory, - "Only GPU-only buffers should use deferred memory allocation"); - - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - } - return VulkanBuffer( allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } @@ -170,9 +172,7 @@ VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VulkanBuffer uniform_buffer( - allocator_, size, alloc_create_info, buffer_usage); - return uniform_buffer; + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 6d8ee09ae5d..7d02ffe54e3 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -62,9 +62,10 @@ class Allocator final { const bool allow_transfer = false, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + VulkanBuffer create_storage_buffer( const VkDeviceSize, - const bool gpu_only = true, const bool allocate_memory = true); /* diff --git a/install_requirements.py b/install_requirements.py index 64243ec6943..1f5982c80e0 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -94,7 +94,7 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20240901" +NIGHTLY_VERSION = "dev20240829" # The pip repository that hosts nightly torch packages. 
TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" From a4092c59066f045043b18a7fbe8844e7216b15fc Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Tue, 3 Sep 2024 13:17:23 -0700 Subject: [PATCH 145/531] introduce dim order tests to op test Differential Revision: D55227304 Pull Request resolved: https://github.com/pytorch/executorch/pull/2637 --- kernels/portable/cpu/op_abs.cpp | 2 + kernels/test/TestUtil.h | 36 ++++ kernels/test/op_abs_test.cpp | 25 +++ .../exec_aten/testing_util/tensor_factory.h | 156 ++++++++++++++++-- .../testing_util/test/tensor_factory_test.cpp | 2 +- runtime/core/exec_aten/util/tensor_util.h | 116 ++++++++----- .../core/exec_aten/util/tensor_util_aten.cpp | 58 +++++++ .../exec_aten/util/tensor_util_portable.cpp | 34 ++++ runtime/core/exec_aten/util/test/targets.bzl | 22 +-- .../exec_aten/util/test/tensor_util_test.cpp | 56 ++++++- 10 files changed, 443 insertions(+), 64 deletions(-) diff --git a/kernels/portable/cpu/op_abs.cpp b/kernels/portable/cpu/op_abs.cpp index 0dd925a0e25..9c2c219832d 100644 --- a/kernels/portable/cpu/op_abs.cpp +++ b/kernels/portable/cpu/op_abs.cpp @@ -28,6 +28,8 @@ Tensor& abs_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { "Failed to resize output tensor."); ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "abs.out", CTYPE, [&] { apply_unary_map_fn( diff --git a/kernels/test/TestUtil.h b/kernels/test/TestUtil.h index ed72dbc4128..8d782d3c2a9 100644 --- a/kernels/test/TestUtil.h +++ b/kernels/test/TestUtil.h @@ -30,6 +30,22 @@ #define ET_EXPECT_KERNEL_FAILURE_WITH_MSG(_context, _statement, _matcher) \ EXPECT_ANY_THROW(_statement) +#define ET_TEST_OP_SUPPORTS_MEMORY_FORMATS( \ + tf, op, input_contiguous, expected_contiguous, channels_last_support) \ + Tensor input_channels_last = tf.channels_last_like(input_contiguous); \ + Tensor expected_channel_last = tf.channels_last_like(expected_contiguous); \ + \ + Tensor output_contiguous = tf.zeros_like(expected_contiguous); \ + Tensor output_channels_last = tf.channels_last_like(output_contiguous); \ + \ + Tensor ret = op(input_channels_last, output_channels_last); \ + if (channels_last_support) { \ + EXPECT_TENSOR_EQ(output_channels_last, expected_channel_last); \ + } else { \ + EXPECT_TENSOR_NE(output_channels_last, expected_channel_last); \ + } \ + EXPECT_TENSOR_EQ(output_channels_last, ret); + #else #define ET_EXPECT_KERNEL_FAILURE(_context, _statement) \ @@ -52,6 +68,26 @@ } \ } while (false) +#define ET_TEST_OP_SUPPORTS_MEMORY_FORMATS( \ + tf, op, input_contiguous, expected_contiguous, channels_last_support) \ + Tensor input_channels_last = tf.channels_last_like(input_contiguous); \ + Tensor expected_channel_last = tf.channels_last_like(expected_contiguous); \ + \ + Tensor output_contiguous = tf.zeros_like(expected_contiguous); \ + Tensor output_channels_last = tf.channels_last_like(output_contiguous); \ + \ + Tensor ret = op(input_channels_last, output_channels_last); \ + if (channels_last_support) { \ + EXPECT_TENSOR_EQ(output_channels_last, expected_channel_last); \ + } else { \ + EXPECT_TENSOR_NE(output_channels_last, expected_channel_last); \ + } \ + EXPECT_TENSOR_EQ(output_channels_last, ret); \ + ET_EXPECT_KERNEL_FAILURE( \ + context_, op(input_channels_last, output_contiguous)); \ + ET_EXPECT_KERNEL_FAILURE( \ + context_, op(input_contiguous, output_channels_last)); + #endif // USE_ATEN_LIB /* diff --git 
a/kernels/test/op_abs_test.cpp b/kernels/test/op_abs_test.cpp index b54cd971567..f596d586d90 100644 --- a/kernels/test/op_abs_test.cpp +++ b/kernels/test/op_abs_test.cpp @@ -38,3 +38,28 @@ TEST_F(OpAbsTest, SanityCheck) { EXPECT_TENSOR_EQ(out, ret); EXPECT_TENSOR_EQ(out, expected); } + +TEST_F(OpAbsTest, MemoryFormatCheck) { + TensorFactory tf; + + std::vector sizes = {2, 3, 1, 5}; + + Tensor input_contiguous = + tf.make(sizes, {0.8737, 0.5359, 0.3743, -0.3040, -0.7800, -0.2306, + -0.7684, -0.5364, 0.3478, -0.3289, 0.0829, 0.2939, + -0.8211, 0.8572, -0.0802, 0.9252, -0.2093, 0.9013, + -0.4197, 0.3987, -0.5291, -0.5567, 0.2691, 0.7819, + -0.8009, -0.4286, -0.9299, 0.2143, 0.2565, -0.5701}); + Tensor expected_contiguous = tf.make( + sizes, {0.8737, 0.5359, 0.3743, 0.3040, 0.7800, 0.2306, 0.7684, 0.5364, + 0.3478, 0.3289, 0.0829, 0.2939, 0.8211, 0.8572, 0.0802, 0.9252, + 0.2093, 0.9013, 0.4197, 0.3987, 0.5291, 0.5567, 0.2691, 0.7819, + 0.8009, 0.4286, 0.9299, 0.2143, 0.2565, 0.5701}); + + ET_TEST_OP_SUPPORTS_MEMORY_FORMATS( + tf, + op_abs_out, + input_contiguous, + expected_contiguous, + /*channels_last_support=*/true); +} diff --git a/runtime/core/exec_aten/testing_util/tensor_factory.h b/runtime/core/exec_aten/testing_util/tensor_factory.h index 8f39cc9911d..3045af55819 100644 --- a/runtime/core/exec_aten/testing_util/tensor_factory.h +++ b/runtime/core/exec_aten/testing_util/tensor_factory.h @@ -3,8 +3,10 @@ #pragma once #include +#include #include +#include #include #include #include @@ -54,7 +56,7 @@ inline size_t sizes_to_numel(const std::vector& sizes) { inline bool check_strides( const std::vector sizes, - const std::vector strides) { + const std::vector strides) { if (sizes.size() != strides.size()) { // The length of stride vector shall equal to size vector. return false; @@ -147,14 +149,14 @@ inline bool check_dim_order( return true; } -inline std::vector strides_from_dim_order( +inline std::vector strides_from_dim_order( const std::vector& sizes, const std::vector& dim_order) { bool legal = check_dim_order(sizes, dim_order); ET_CHECK_MSG(legal, "The input dim_order variable is illegal."); size_t ndim = sizes.size(); - std::vector strides(ndim); + std::vector strides(ndim); strides[dim_order[ndim - 1]] = 1; for (int i = ndim - 2; i >= 0; --i) { uint8_t cur_dim = dim_order[i]; @@ -258,7 +260,7 @@ class TensorFactory { at::Tensor make( const std::vector& sizes, const std::vector& data, - const std::vector strides = {}, + const std::vector strides = {}, ET_UNUSED TensorShapeDynamism dynamism = TensorShapeDynamism::DYNAMIC_UNBOUND) { auto expected_numel = internal::sizes_to_numel(sizes); @@ -344,6 +346,72 @@ class TensorFactory { sizes, data, internal::channels_last_dim_order(sizes.size()), dynamism); } + /** + * Given data in contiguous memory format, returns a new Tensor with the + * specified shape and the same data but in channels last memory format. + * + * @param[in] sizes The sizes of the dimensions of the Tensor. + * @param[in] data The data in contiguous memory format that the Tensor should + * be initialized with. The size of this vector must be equal to the product + * of the elements of `sizes`. + * + * @return A new Tensor with the specified shape and data in channls last + * memory format. 
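+   *
+   * As an illustration of the reordering: for an NCHW input of shape
+   * (N, C, H, W), the element at (n, c, h, w) sits at linear offset
+   * ((n * C + c) * H + h) * W + w in the contiguous input and is copied to
+   * offset ((n * H + h) * W + w) * C + c in the channels-last result.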
+ */ + at::Tensor channels_last_like( + const at::Tensor& input, + TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { + ET_CHECK_MSG( + input.sizes().size() == 4, "Only 4D tensors can be channels last"); + + const std::vector sizes( + input.sizes().begin(), input.sizes().end()); + + std::vector contiguous_dim_order(sizes.size()); + for (uint8_t i = 0; i < sizes.size(); i++) { + contiguous_dim_order[i] = i; + } + std::vector contiguous_strides = + internal::strides_from_dim_order(sizes, contiguous_dim_order); + + for (int32_t i = 0; i < input.dim(); i++) { + ET_CHECK_MSG( + input.strides()[i] == contiguous_strides[i], + "Input tensor is not contiguous"); + } + + int32_t N = sizes[0]; + int32_t C = sizes[1]; + int32_t H = sizes[2]; + int32_t W = sizes[3]; + + std::vector contiguous_data( + input.data_ptr(), input.data_ptr() + input.numel()); + std::vector channels_last_data( + N * C * H * W); // Create a new blob with the same total size to contain + // channels_last data + for (int32_t n = 0; n < N; ++n) { + for (int32_t c = 0; c < C; ++c) { + for (int32_t h = 0; h < H; ++h) { + for (int32_t w = 0; w < W; ++w) { + // Calculate the index in the original blob + int32_t old_index = ((n * C + c) * H + h) * W + w; + // Calculate the index in the new blob + int32_t new_index = ((n * H + h) * W + w) * C + c; + // Copy the data + channels_last_data[new_index] = contiguous_data[old_index]; + } + } + } + } + + return make_with_dimorder( + sizes, + channels_last_data, + internal::channels_last_dim_order(sizes.size()), + dynamism); + } + /** * Returns a new Tensor with the specified shape, containing contiguous * data will all elements set to `value`. @@ -459,14 +527,13 @@ class TensorFactory { */ at::Tensor empty_strided( const std::vector& sizes, - const std::vector& strides, + const std::vector& strides, ET_UNUSED TensorShapeDynamism dynamism = TensorShapeDynamism::DYNAMIC_UNBOUND) { auto sizes64 = vec_32_to_64(sizes); - auto strides64 = vec_32_to_64(strides); return at::empty_strided( sizes64, - strides64, + strides, DTYPE, /*layout_opt=*/at::Layout::Strided, /*device_opt=*/at::Device(at::DeviceType::CPU), @@ -666,7 +733,7 @@ class TensorFactory { torch::executor::Tensor make( const std::vector& sizes, const std::vector& data, - const std::vector strides = {}, + const std::vector strides = {}, TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { std::vector default_strides; // Generate strides from the tensor dimensions, assuming contiguous data if @@ -746,7 +813,7 @@ class TensorFactory { /** * Returns a new Tensor with the specified shape and data in channels last - * memory layout. + * memory format. * * @param[in] sizes The sizes of the dimensions of the Tensor. * @param[in] data The data that the Tensor should be initialized with. The @@ -764,6 +831,60 @@ class TensorFactory { sizes, data, internal::channels_last_dim_order(sizes.size()), dynamism); } + /** + * Given data in contiguous memory format, returns a new Tensor with the + * specified shape and the same data but in channels last memory format. + * + * @param[in] sizes The sizes of the dimensions of the Tensor. + * @param[in] data The data in contiguous memory format that the Tensor should + * be initialized with. The size of this vector must be equal to the product + * of the elements of `sizes`. + * + * @return A new Tensor with the specified shape and data in channls last + * memory format. 
+ */ + torch::executor::Tensor channels_last_like( + const torch::executor::Tensor& input, + TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { + const std::vector sizes( + input.sizes().begin(), input.sizes().end()); + + ET_CHECK_MSG(sizes.size() == 4, "Only 4D tensors can be channels last"); + ET_CHECK_MSG( + is_contiguous_dim_order(input.dim_order().data(), input.dim()) == true, + "Input tensor is not contiguous"); + int32_t N = sizes[0]; + int32_t C = sizes[1]; + int32_t H = sizes[2]; + int32_t W = sizes[3]; + + std::vector contiguous_data( + input.data_ptr(), input.data_ptr() + input.numel()); + std::vector channels_last_data( + N * C * H * W); // Create a new blob with the same total size to contain + // channels_last data + for (int32_t n = 0; n < N; ++n) { + for (int32_t c = 0; c < C; ++c) { + for (int32_t h = 0; h < H; ++h) { + for (int32_t w = 0; w < W; ++w) { + // Calculate the index in the original blob + int32_t old_index = ((n * C + c) * H + h) * W + w; + // Calculate the index in the new blob + int32_t new_index = ((n * H + h) * W + w) * C + c; + // Copy the data + channels_last_data[new_index] = contiguous_data[old_index]; + } + } + } + } + + return make_with_dimorder( + sizes, + channels_last_data, + internal::channels_last_dim_order(sizes.size()), + dynamism); + } + /** * Returns a new Tensor with the specified shape, containing contiguous data * will all elements set to `value`. @@ -799,7 +920,20 @@ class TensorFactory { /** * Returns a new Tensor with the specified shape, containing contiguous data - * with all `0` elements. + * in channels last memory format with all `0` elements. + * + * @param[in] sizes The sizes of the dimensions of the Tensor. + * @return A new Tensor with the specified shape. + */ + torch::executor::Tensor zeros_channels_last( + const std::vector& sizes, + TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC) { + return full_channels_last(sizes, 0, dynamism); + } + + /** + * Returns a new Tensor with the specified shape, containing contiguous data + * in contiguous memory format with all `0` elements. * * @param[in] sizes The sizes of the dimensions of the Tensor. * @return A new Tensor with the specified shape. @@ -878,7 +1012,7 @@ class TensorFactory { std::vector sizes_; std::vector data_; std::vector dim_order_; - std::vector strides_; + std::vector strides_; torch::executor::TensorImpl impl_; }; diff --git a/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp b/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp index a2bc36f4814..8681e9553a6 100644 --- a/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp +++ b/runtime/core/exec_aten/testing_util/test/tensor_factory_test.cpp @@ -449,7 +449,7 @@ TEST_F(TensorFactoryTest, MakeStridedDataIsCopied) { // Create two tensors using the same input data and strided vector. 
std::vector data = {1, 2, 3, 4}; - std::vector strides = {1, 2}; + std::vector strides = {1, 2}; Tensor t1 = tf.make(/*sizes=*/{2, 2}, data, strides); Tensor t2 = tf.make(/*sizes=*/{2, 2}, data, strides); diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index b18cd349a62..4dcb0ef9f69 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -235,8 +235,9 @@ */ #define ET_CHECK_CONTIGUOUS(a__) \ ({ \ - const ::exec_aten::ArrayRef strides = a__.strides(); \ - const ::exec_aten::ArrayRef sizes = a__.sizes(); \ + const ::exec_aten::ArrayRef strides = \ + a__.strides(); \ + const ::exec_aten::ArrayRef sizes = a__.sizes(); \ ET_CHECK_MSG( \ strides[strides.size() - 1] == 1, \ "The stride of the last dimension shall be 1 for contiguous tensor, " \ @@ -267,8 +268,10 @@ "Two tensors shall have same number of strides, but not %zu and %zu.", \ a__.dim(), \ b__.dim()); \ - const ::exec_aten::ArrayRef a_strides = a__.strides(); \ - const ::exec_aten::ArrayRef b_strides = b__.strides(); \ + const ::exec_aten::ArrayRef a_strides = \ + a__.strides(); \ + const ::exec_aten::ArrayRef b_strides = \ + b__.strides(); \ for (size_t i = 0; i < a__.dim(); i++) { \ ET_CHECK_MSG( \ a_strides[i] == b_strides[i], \ @@ -276,8 +279,8 @@ "but now is %d and %d.", \ i, \ i, \ - a_strides[i], \ - b_strides[i]); \ + (int32_t)a_strides[i], \ + (int32_t)b_strides[i]); \ } \ }) @@ -295,9 +298,12 @@ a__.dim(), \ b__.dim(), \ c__.dim()); \ - const ::exec_aten::ArrayRef a_strides = a__.strides(); \ - const ::exec_aten::ArrayRef b_strides = b__.strides(); \ - const ::exec_aten::ArrayRef c_strides = c__.strides(); \ + const ::exec_aten::ArrayRef a_strides = \ + a__.strides(); \ + const ::exec_aten::ArrayRef b_strides = \ + b__.strides(); \ + const ::exec_aten::ArrayRef c_strides = \ + c__.strides(); \ for (size_t i = 0; i < a__.dim(); i++) { \ ET_CHECK_MSG( \ a_strides[i] == b_strides[i] && b_strides[i] == c_strides[i], \ @@ -306,9 +312,9 @@ i, \ i, \ i, \ - a_strides[i], \ - b_strides[i], \ - c_strides[i]); \ + (int32_t)a_strides[i], \ + (int32_t)b_strides[i], \ + (int32_t)c_strides[i]); \ } \ }) @@ -848,11 +854,11 @@ inline bool tensor_is_scalar(exec_aten::Tensor t) { /** * The expected output size may not be the existing size of any inputs and - * outputs if the operator supports both broadcast and dynamic shape. Therefore - * such operators needs extra space to store the calculated expected output - * size. such dynamic allocation is troublesome in executorch so we can just - * hard code a static value of a relatively small value because users don't - * create high dimensional tensors. + * outputs if the operator supports both broadcast and dynamic shape. + * Therefore such operators needs extra space to store the calculated expected + * output size. such dynamic allocation is troublesome in executorch so we can + * just hard code a static value of a relatively small value because users + * don't create high dimensional tensors. */ constexpr size_t kTensorDimensionLimit = 16; @@ -893,8 +899,8 @@ inline size_t getTrailingDims(const exec_aten::Tensor& tensor, int64_t dim) { * @param[in] tensor The tensor that will be indexed * @param[in] coordinate A n-dimensional array representing the coordinate to * index. It is assumed that the array has kTensorDimensionLimit elements. - * @param[out] index The linear index to element at the specified coordinate in - * the tensor. 
+ * @param[out] index The linear index to element at the specified coordinate + * in the tensor. */ inline size_t coordinateToIndex( const exec_aten::Tensor& tensor, @@ -935,10 +941,10 @@ inline void indexToCoordinate( * * @param[in] tensor The source of the value to extract. * @param[out] out_val The extracted value, on success. - * @returns `true` if a value was extracted, and sets `*out_val` to that value. - * `false` if a value could not be extracted: either it was not an integer - * Scalar Tensor, or the value of that Scalar Tensor could not be represented - * by INT_T. + * @returns `true` if a value was extracted, and sets `*out_val` to that + * value. `false` if a value could not be extracted: either it was not an + * integer Scalar Tensor, or the value of that Scalar Tensor could not be + * represented by INT_T. */ template < typename INT_T, @@ -973,10 +979,10 @@ bool extract_scalar_tensor(exec_aten::Tensor tensor, INT_T* out_val) { * * @param[in] tensor The source of the value to extract. * @param[out] out_val The extracted value, on success. - * @returns `true` if a value was extracted, and sets `*out_val` to that value. - * `false` if a value could not be extracted: either it was not a floating - * point Scalar Tensor, or the value of that Scalar Tensor could not be - * represented by FLOAT_T. + * @returns `true` if a value was extracted, and sets `*out_val` to that + * value. `false` if a value could not be extracted: either it was not a + * floating point Scalar Tensor, or the value of that Scalar Tensor could not + * be represented by FLOAT_T. */ template < typename FLOAT_T, @@ -1076,9 +1082,9 @@ ET_NODISCARD Error resize_tensor_impl( * expand the tensor if new size exceeds the current capacity. Currently * fails an ET_CHECK if the tensor cannot be resized. * - * WARNING: Placeholder API until discussion around runtime context is settled, - * will likely move to be a class method on a TensorResizer object passed in - * through runtimeContext. + * WARNING: Placeholder API until discussion around runtime context is + * settled, will likely move to be a class method on a TensorResizer object + * passed in through runtimeContext. */ ET_NODISCARD inline Error resize_tensor( exec_aten::Tensor t, @@ -1091,9 +1097,9 @@ ET_NODISCARD inline Error resize_tensor( * expand the tensor if new size exceeds the current capacity. Currently * fails an ET_CHECK if the tensor cannot be resized. * - * WARNING: Placeholder API until discussion around runtime context is settled, - * will likely move to be a class method on a TensorResizer object passed in - * through runtimeContext. + * WARNING: Placeholder API until discussion around runtime context is + * settled, will likely move to be a class method on a TensorResizer object + * passed in through runtimeContext. */ template < typename T, @@ -1124,8 +1130,8 @@ ET_DEPRECATED inline void resize( /** * Get dim_order of a Tensor and write it to out_dim_order. * @param tensor The tensor where we want to get dim order from. - * @param out_dim_order Pointing to an array of DimOrderType where we write dim - * order into it. + * @param out_dim_order Pointing to an array of DimOrderType where we write + * dim order into it. * @param out_dim_order_size Size of the DimOrderType array. */ ET_NODISCARD Error get_dim_order( @@ -1134,18 +1140,47 @@ ET_NODISCARD Error get_dim_order( size_t out_dim_order_size); /** - * Checks whether a tensor has a valid dim order. If the dim order could not be - * determined, then this function returns false by default. 
+ * Checks whether a tensor has a valid dim order. If the dim order could not + * be determined, then this function returns false by default. */ bool tensor_has_valid_dim_order(exec_aten::Tensor t); /** - * Checks whether a tensor has either the default of channels last dim order. If - * the dim order could not be determined, then this function returns false by - * default. + * Checks whether a tensor has either the default of channels last dim order. + * If the dim order could not be determined, then this function returns false + * by default. */ bool tensor_is_default_or_channels_last_dim_order(exec_aten::Tensor t); +/** + * Asserts that two tensors have the same dim_order + * + * Note that this macro only tests dim order, but not others like actual data, + * sizes, etc. Also this macro does not support ATen mode since we do not + * support dim order in ATen mode. + * + * TODO(T183094318): Add dim order and related function support for ATen mode. + */ + +bool tensors_have_same_dim_order( + const exec_aten::Tensor& a, + const exec_aten::Tensor& b); + +/** + * Asserts that three tensors have the same dim_order + * + * Note that this macro only tests dim order, but not others like actual data, + * sizes, etc. Also this macro does not support ATen mode since we do not + * support dim order in ATen mode. + * + * TODO(T183094318): Add dim order and related function support for ATen mode. + */ + +bool tensors_have_same_dim_order( + const exec_aten::Tensor& a, + const exec_aten::Tensor& b, + const exec_aten::Tensor& c); + /** * Given an n-dimensional coordinate array and an array of tensor strides, * calculates the linear index that can be used to retrieve the value at the @@ -1205,6 +1240,7 @@ using ::executorch::runtime::tensor_is_real_type; using ::executorch::runtime::tensor_is_realh_type; using ::executorch::runtime::tensor_is_realhb_type; using ::executorch::runtime::tensor_is_scalar; +using ::executorch::runtime::tensors_have_same_dim_order; using ::executorch::runtime::tensors_have_same_dtype; using ::executorch::runtime::tensors_have_same_rank; using ::executorch::runtime::tensors_have_same_shape; diff --git a/runtime/core/exec_aten/util/tensor_util_aten.cpp b/runtime/core/exec_aten/util/tensor_util_aten.cpp index c5ff3b52234..91b75c06483 100644 --- a/runtime/core/exec_aten/util/tensor_util_aten.cpp +++ b/runtime/core/exec_aten/util/tensor_util_aten.cpp @@ -77,6 +77,64 @@ inline bool tensor_is_default_or_channels_last_dim_order(at::Tensor t) { return ret_val; } +bool tensors_have_same_dim_order( + const exec_aten::Tensor& a, + const exec_aten::Tensor& b) { + exec_aten::DimOrderType a_dim_order[kTensorDimensionLimit]; + exec_aten::DimOrderType b_dim_order[kTensorDimensionLimit]; + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + get_dim_order(a, a_dim_order, a.dim()) == Error::Ok, + "Failed to retrieve dim order from first input tensor!"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + get_dim_order(b, b_dim_order, b.dim()) == Error::Ok, + "Failed to retrieve dim order from second input tensor!"); + + bool all_contiguous = is_contiguous_dim_order(a_dim_order, a.dim()) && + is_contiguous_dim_order(b_dim_order, b.dim()); + + bool all_channels_last = is_channels_last_dim_order(a_dim_order, a.dim()) && + is_channels_last_dim_order(b_dim_order, b.dim()); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + all_contiguous || all_channels_last, + "Two input tensors have different dim orders"); + + return true; +} + +bool tensors_have_same_dim_order( + const exec_aten::Tensor& a, + const exec_aten::Tensor& b, + const 
exec_aten::Tensor& c) { + exec_aten::DimOrderType a_dim_order[kTensorDimensionLimit]; + exec_aten::DimOrderType b_dim_order[kTensorDimensionLimit]; + exec_aten::DimOrderType c_dim_order[kTensorDimensionLimit]; + ET_LOG_MSG_AND_RETURN_IF_FALSE( + get_dim_order(a, a_dim_order, a.dim()) == Error::Ok, + "Failed to retrieve dim order from first input tensor!"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + get_dim_order(b, b_dim_order, b.dim()) == Error::Ok, + "Failed to retrieve dim order from second input tensor!"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + get_dim_order(c, c_dim_order, c.dim()) == Error::Ok, + "Failed to retrieve dim order from third input tensor!"); + + bool all_contiguous = is_contiguous_dim_order(a_dim_order, a.dim()) && + is_contiguous_dim_order(b_dim_order, b.dim()) && + is_contiguous_dim_order(c_dim_order, c.dim()); + + bool all_channels_last = is_channels_last_dim_order(a_dim_order, a.dim()) && + is_channels_last_dim_order(b_dim_order, b.dim()) && + is_channels_last_dim_order(c_dim_order, c.dim()); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + all_contiguous || all_channels_last, + "Three input tensors have different dim orders"); + + return true; +} + namespace internal { Error share_tensor_data(const at::Tensor& t_dst, const at::Tensor& t_src) { diff --git a/runtime/core/exec_aten/util/tensor_util_portable.cpp b/runtime/core/exec_aten/util/tensor_util_portable.cpp index c7872d1499a..7e9a15f09a9 100644 --- a/runtime/core/exec_aten/util/tensor_util_portable.cpp +++ b/runtime/core/exec_aten/util/tensor_util_portable.cpp @@ -73,6 +73,40 @@ bool tensor_is_default_or_channels_last_dim_order(torch::executor::Tensor t) { return ret_val; } +bool tensors_have_same_dim_order( + const exec_aten::Tensor& a, + const exec_aten::Tensor& b) { + bool all_contiguous = + is_contiguous_dim_order(a.dim_order().data(), a.dim_order().size()) && + is_contiguous_dim_order(b.dim_order().data(), b.dim_order().size()); + bool all_channels_last = + is_channels_last_dim_order(a.dim_order().data(), a.dim_order().size()) && + is_channels_last_dim_order(b.dim_order().data(), b.dim_order().size()); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + all_contiguous || all_channels_last, + "Two input tensors have different dim orders"); + + return true; +} + +bool tensors_have_same_dim_order( + const exec_aten::Tensor& a, + const exec_aten::Tensor& b, + const exec_aten::Tensor& c) { + bool all_contiguous = + is_contiguous_dim_order(a.dim_order().data(), a.dim_order().size()) && + is_contiguous_dim_order(b.dim_order().data(), b.dim_order().size()) && + is_contiguous_dim_order(c.dim_order().data(), c.dim_order().size()); + bool all_channels_last = + is_channels_last_dim_order(a.dim_order().data(), a.dim_order().size()) && + is_channels_last_dim_order(b.dim_order().data(), b.dim_order().size()) && + is_channels_last_dim_order(c.dim_order().data(), c.dim_order().size()); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + all_contiguous || all_channels_last, + "Three input tensors have different dim orders"); + return true; +} namespace internal { Error share_tensor_data( diff --git a/runtime/core/exec_aten/util/test/targets.bzl b/runtime/core/exec_aten/util/test/targets.bzl index cbd31013b5b..615b7c99a44 100644 --- a/runtime/core/exec_aten/util/test/targets.bzl +++ b/runtime/core/exec_aten/util/test/targets.bzl @@ -16,16 +16,6 @@ def define_common_targets(): ], ) - runtime.cxx_test( - name = "tensor_util_test", - srcs = ["tensor_util_test.cpp"], - deps = [ - "//executorch/runtime/core/exec_aten/testing_util:tensor_util", - 
"//executorch/runtime/core/exec_aten/util:scalar_type_util", - "//executorch/runtime/core/exec_aten/util:tensor_util", - ], - ) - runtime.cxx_test( name = "operator_impl_example_test", srcs = ["operator_impl_example_test.cpp"], @@ -44,3 +34,15 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/util:tensor_util", ], ) + + for aten_mode in (True, False): + aten_suffix = "_aten" if aten_mode else "" + runtime.cxx_test( + name = "tensor_util_test" + aten_suffix, + srcs = ["tensor_util_test.cpp"], + deps = [ + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/runtime/core/exec_aten/util:scalar_type_util", + "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, + ], + ) diff --git a/runtime/core/exec_aten/util/test/tensor_util_test.cpp b/runtime/core/exec_aten/util/test/tensor_util_test.cpp index 53ff06966c2..88588dade68 100644 --- a/runtime/core/exec_aten/util/test/tensor_util_test.cpp +++ b/runtime/core/exec_aten/util/test/tensor_util_test.cpp @@ -14,8 +14,6 @@ #include #include -#include - using namespace ::testing; using exec_aten::ScalarType; using exec_aten::Tensor; @@ -553,3 +551,57 @@ TEST_F(TensorUtilTest, ResizeZeroDimTensor) { executorch::runtime::Error::Ok); EXPECT_EQ(a.dim(), 0); } + +TEST_F(TensorUtilTest, SameDimOrderContiguous) { + using namespace torch::executor; + // Three different tensors with the same shape and same dim order + // ([0, 1, 2, 3]), but different dtypes and contents. + std::vector sizes = {3, 5, 2, 1}; + Tensor a = tf_byte_.ones(sizes); + Tensor b = tf_int_.zeros(sizes); + Tensor c = tf_float_.full(sizes, 0.1); + + // The tensors have the same dim order, should pass the following checks. + EXPECT_TRUE(tensors_have_same_dim_order(a, b)); + EXPECT_TRUE(tensors_have_same_dim_order(b, a)); + EXPECT_TRUE(tensors_have_same_dim_order(a, b, c)); + EXPECT_TRUE(tensors_have_same_dim_order(b, c, a)); + EXPECT_TRUE(tensors_have_same_dim_order(c, a, b)); +} + +TEST_F(TensorUtilTest, SameDimOrderChannelsLast) { + using namespace torch::executor; + // Three different tensors with the same shape and same dim order + // ([0, 2, 3, 1]), but different dtypes and contents. + std::vector sizes = {3, 5, 2, 1}; + Tensor a = tf_byte_.full_channels_last(sizes, 1); + Tensor b = tf_int_.full_channels_last(sizes, 0); + Tensor c = tf_float_.full_channels_last(sizes, 0.1); + + // The tensors have the same dim order, should pass the following checks. + EXPECT_TRUE(tensors_have_same_dim_order(a, b)); + EXPECT_TRUE(tensors_have_same_dim_order(b, a)); + EXPECT_TRUE(tensors_have_same_dim_order(a, b, c)); + EXPECT_TRUE(tensors_have_same_dim_order(b, c, a)); + EXPECT_TRUE(tensors_have_same_dim_order(c, a, b)); +} + +TEST_F(TensorUtilTest, SameShapesDifferentDimOrder) { + using namespace torch::executor; + // Three different tensors with the same shape but different dtypes and + // contents, where b and c have the same dim order ([0, 2, 3, 1]) while a is + // different ([0, 1, 2, 3]). + std::vector sizes = {3, 5, 2, 1}; + Tensor a = tf_byte_.ones(sizes); + Tensor b = tf_int_.full_channels_last(sizes, 0); + Tensor c = tf_float_.full_channels_last(sizes, 0.1); + + // Not the same dim order. Chec + EXPECT_FALSE(tensors_have_same_dim_order(a, b)); + EXPECT_FALSE(tensors_have_same_dim_order(b, a)); + + // Test with a mismatching tensor in all positions, where the other two agree. 
+ EXPECT_FALSE(tensors_have_same_dim_order(a, b, c)); + EXPECT_FALSE(tensors_have_same_dim_order(a, c, b)); + EXPECT_FALSE(tensors_have_same_dim_order(c, b, a)); +} From 324864d3dfba0dca9cfe7da954cd7df6f293ddd6 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:27:45 -0400 Subject: [PATCH 146/531] Add op: topk Differential Revision: D59936967 Pull Request resolved: https://github.com/pytorch/executorch/pull/4307 --- kernels/portable/cpu/op_topk.cpp | 251 ++++++++++++++++++ kernels/portable/functions.yaml | 5 + kernels/test/op_topk_test.cpp | 138 ++++++++++ kernels/test/targets.bzl | 1 + .../kernels/portable/op_registration_util.bzl | 3 + 5 files changed, 398 insertions(+) create mode 100644 kernels/portable/cpu/op_topk.cpp create mode 100644 kernels/test/op_topk_test.cpp diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp new file mode 100644 index 00000000000..3cc0ccb9de4 --- /dev/null +++ b/kernels/portable/cpu/op_topk.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +namespace torch { +namespace executor { +namespace native { +namespace { + +bool check_topk_args( + const Tensor& in, + int64_t k, + int64_t dim, + Tensor& values, + Tensor& indices) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, values)); + ET_LOG_AND_RETURN_IF_FALSE(indices.scalar_type() == ScalarType::Long); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); + if (dim < 0) { + dim += nonzero_dim(in); + } + ET_LOG_MSG_AND_RETURN_IF_FALSE( + k >= 0 && k <= nonempty_size(in, dim), "selected index k out of range"); + return true; +} + +bool get_topk_target_size( + const Tensor& in, + int64_t k, + int64_t dim, + Tensor::SizesType* target_size, + size_t* target_dim) { + *target_dim = in.dim(); + for (size_t i = 0; i < *target_dim; ++i) { + if (i == dim) { + target_size[i] = k; + } else { + target_size[i] = in.size(i); + } + } + return true; +} + +template > +void perform_topk( + const Tensor& in, + int64_t k, + int64_t dim, + bool largest, + bool sorted, + Tensor& values, + Tensor& indices, + elem_t* queue) { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* values_data = values.mutable_data_ptr(); + long* indices_data = indices.mutable_data_ptr(); + + if (in.dim() == 0) { + values_data[0] = in_data[0]; + indices_data[0] = 0; + return; + } + + if (k == 0) { + return; + } + + const size_t outer_size = getLeadingDims(in, dim); + + const size_t dim_size = in.size(dim); + const size_t dim_stride = in.strides()[dim]; + + const size_t outer_stride_in = dim_size * dim_stride; + const size_t outer_stride_out = k * dim_stride; + + bool use_partial_sort = k * 64 <= dim_size; + + // Loop through all outer dimensions + for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + size_t outer_in = outer_idx * outer_stride_in; + size_t outer_out = outer_idx * outer_stride_out; + // Loop through all inner dimensions + for (size_t inner_idx = 0; inner_idx < dim_stride; ++inner_idx) { + size_t base_in = outer_in + inner_idx; + size_t base_out = outer_out + inner_idx; + + // Populate the queue with the values from the input tensor + for (size_t i = 0; i < dim_size; ++i) { + size_t in_ix = base_in + i * dim_stride; + queue[i].first = in_data[in_ix]; + queue[i].second = i; + } + + 
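+      // Strategy note: std::partial_sort is used when k is small relative to
+      // dim_size (k * 64 <= dim_size); otherwise std::nth_element partitions
+      // the queue and the leading elements are sorted afterwards only when
+      // `sorted` is true. In both paths the comparators treat NaN as larger
+      // than any other value.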
// Perform topk on the queue + if (use_partial_sort) { + if (largest) { + std::partial_sort( + queue, + queue + k, + queue + dim_size, + [](const elem_t& x, const elem_t& y) -> bool { + return ( + (std::isnan(x.first) && !std::isnan(y.first)) || + (x.first > y.first)); + }); + } else { + std::partial_sort( + queue, + queue + k, + queue + dim_size, + [](const elem_t& x, const elem_t& y) -> bool { + return ( + (!std::isnan(x.first) && std::isnan(y.first)) || + (x.first < y.first)); + }); + } + } else { + if (largest) { + std::nth_element( + queue, + queue + k - 1, + queue + dim_size, + [](const elem_t& x, const elem_t& y) -> bool { + return ( + (std::isnan(x.first) && !std::isnan(y.first)) || + (x.first > y.first)); + }); + if (sorted) { + std::sort( + queue, + queue + k - 1, + [](const elem_t& x, const elem_t& y) -> bool { + return ( + (std::isnan(x.first) && !std::isnan(y.first)) || + (x.first > y.first)); + }); + } + } else { + std::nth_element( + queue, + queue + k - 1, + queue + dim_size, + [](const elem_t& x, const elem_t& y) -> bool { + return ( + (!std::isnan(x.first) && std::isnan(y.first)) || + (x.first < y.first)); + }); + if (sorted) { + std::sort( + queue, + queue + k - 1, + [](const elem_t& x, const elem_t& y) -> bool { + return ( + (!std::isnan(x.first) && std::isnan(y.first)) || + (x.first < y.first)); + }); + } + } + } + + // Write the topk values and indices to the output tensors + for (size_t i = 0; i < k; ++i) { + size_t out_ix = base_out + i * dim_stride; + + values_data[out_ix] = queue[i].first; + indices_data[out_ix] = queue[i].second; + } + } + } +} + +void* allocate_temp_memory(RuntimeContext& ctx, size_t size) { + Result temp_mem_res = ctx.allocate_temp(size); + return temp_mem_res.ok() ? temp_mem_res.get() : nullptr; +} + +} // namespace + +std::tuple topk_values( + RuntimeContext& ctx, + const Tensor& in, + int64_t k, + int64_t dim, + bool largest, + bool sorted, + Tensor& values, + Tensor& indices) { + auto out = std::tuple({values, indices}); + + ET_KERNEL_CHECK( + ctx, check_topk_args(in, k, dim, values, indices), InvalidArgument, out); + + if (dim < 0) { + dim += nonzero_dim(in); + } + + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType target_size[kTensorDimensionLimit]; + size_t target_dim = 0; + get_topk_target_size(in, k, dim, target_size, &target_dim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(values, {target_size, target_dim}) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(indices, {target_size, target_dim}) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "topk.values"; + + if (in.numel() == 0 || (k == 0 && in.dim() > 0)) { + return out; + } + + bool temp_mem_allocated = false; + + ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { + using elem_t = std::pair; + size_t temp_mem_size = nonempty_size(in, dim) * sizeof(elem_t); + + elem_t* queue = (elem_t*)allocate_temp_memory(ctx, temp_mem_size); + if (queue == nullptr) { + return; + } + temp_mem_allocated = true; + + perform_topk(in, k, dim, largest, sorted, values, indices, queue); + }); + + ET_KERNEL_CHECK(ctx, temp_mem_allocated, MemoryAllocationFailed, out); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 21258329aa8..6b0b2466888 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -847,6 +847,11 @@ - arg_meta: null kernel_name: torch::executor::tanh_out 
+- op: topk.values + kernels: + - arg_meta: null + kernel_name: torch::executor::topk_values + - op: transpose_copy.int_out kernels: - arg_meta: null diff --git a/kernels/test/op_topk_test.cpp b/kernels/test/op_topk_test.cpp new file mode 100644 index 00000000000..9f57225ba4f --- /dev/null +++ b/kernels/test/op_topk_test.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::IntArrayRef; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::MemoryAllocator; +using torch::executor::testing::TensorFactory; + +class TempMemoryAllocator final : public MemoryAllocator { + private: + // We allocate a little more than requested and use that memory as a node in + // a linked list, pushing the allocated buffers onto a list that's iterated + // and freed when the KernelRuntimeContext is destroyed. + struct AllocationNode { + void* data; + AllocationNode* next; + }; + + AllocationNode* head_ = nullptr; + + public: + TempMemoryAllocator() : MemoryAllocator(0, nullptr) {} + + void* allocate(size_t size, size_t alignment = kDefaultAlignment) override { + if (!isPowerOf2(alignment)) { + ET_LOG(Error, "Alignment %zu is not a power of 2", alignment); + return nullptr; + } + + // Allocate enough memory for the node, the data and the alignment bump. + size_t alloc_size = sizeof(AllocationNode) + size + alignment; + void* node_memory = malloc(alloc_size); + + // If allocation failed, log message and return nullptr. + if (node_memory == nullptr) { + ET_LOG(Error, "Failed to allocate %zu bytes", alloc_size); + return nullptr; + } + + // Compute data pointer. + uint8_t* data_ptr = + reinterpret_cast(node_memory) + sizeof(AllocationNode); + + // Align the data pointer. + void* aligned_data_ptr = alignPointer(data_ptr, alignment); + + // Assert that the alignment didn't overflow the allocated memory. + ET_DCHECK_MSG( + reinterpret_cast(aligned_data_ptr) + size <= + reinterpret_cast(node_memory) + alloc_size, + "aligned_data_ptr %p + size %zu > node_memory %p + alloc_size %zu", + aligned_data_ptr, + size, + node_memory, + alloc_size); + + // Construct the node. + AllocationNode* new_node = reinterpret_cast(node_memory); + new_node->data = aligned_data_ptr; + new_node->next = head_; + head_ = new_node; + + // Return the aligned data pointer. + return head_->data; + } + + void reset() override { + AllocationNode* current = head_; + while (current != nullptr) { + AllocationNode* next = current->next; + free(current); + current = next; + } + head_ = nullptr; + } + + ~TempMemoryAllocator() override { + reset(); + } +}; + +std::tuple op_topk_values( + const Tensor& input, + int64_t k, + int64_t dim, + bool largest, + bool sorted, + Tensor& values, + Tensor& indices) { + TempMemoryAllocator allocator = TempMemoryAllocator(); + exec_aten::RuntimeContext context(nullptr, &allocator); + return torch::executor::aten::topk_outf( + context, input, k, dim, largest, sorted, values, indices); +} + +class OpTopkValuesTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. 
+ torch::executor::runtime_init(); + } +}; + +TEST_F(OpTopkValuesTest, SmokeTest) { + TensorFactory tfFloat; + TensorFactory tfLong; + + Tensor input = + tfFloat.make({3, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + int64_t k = 2; + int64_t dim = 0; + bool largest = true; + bool sorted = true; + Tensor values = tfFloat.zeros({2, 2, 2}); + Tensor indices = tfLong.zeros({2, 2, 2}); + Tensor values_expected = tfFloat.make({2, 2, 2}, {9, 10, 11, 12, 5, 6, 7, 8}); + Tensor indices_expected = tfLong.make({2, 2, 2}, {2, 2, 2, 2, 1, 1, 1, 1}); + op_topk_values(input, k, dim, largest, sorted, values, indices); + EXPECT_TENSOR_CLOSE(values, values_expected); + EXPECT_TENSOR_EQ(indices, indices_expected); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 69f4e176ff9..07421b25e51 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -295,6 +295,7 @@ def define_common_targets(): _common_op_test("op_tan_test", ["aten", "portable"]) _common_op_test("op_tanh_test", ["aten", "portable"]) _common_op_test("op_to_copy_test", ["aten", "portable"]) + _common_op_test("op_topk_test", ["aten", "portable"]) _common_op_test("op_transpose_copy_test", ["aten", "portable"]) _common_op_test("op_tril_test", ["aten", "portable"]) _common_op_test("op_trunc_test", ["aten", "portable"]) diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index 0cc9ab5fd0e..04e824db57c 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -1125,6 +1125,9 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op_topk", + ), op_target( name = "op_transpose_copy", deps = ["//executorch/kernels/portable/cpu/util:transpose_util"], From bc56a97d86608aeee5ae9baa19f17aaeccca3006 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:42:55 -0400 Subject: [PATCH 147/531] Add op: convolution_backward Differential Revision: D62028659 Pull Request resolved: https://github.com/pytorch/executorch/pull/5032 --- kernels/aten/functions.yaml | 2 + .../portable/cpu/op_convolution_backward.cpp | 312 ++++++++++++++++++ kernels/portable/cpu/util/kernel_ops_util.cpp | 2 +- kernels/portable/cpu/util/kernel_ops_util.h | 2 +- kernels/portable/functions.yaml | 5 + kernels/test/op_convolution_backward_test.cpp | 200 +++++++++++ kernels/test/targets.bzl | 1 + .../kernels/portable/op_registration_util.bzl | 6 + 8 files changed, 528 insertions(+), 2 deletions(-) create mode 100644 kernels/portable/cpu/op_convolution_backward.cpp create mode 100644 kernels/test/op_convolution_backward_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index f28cfb48b36..58f394eaa68 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -115,6 +115,8 @@ - op: convolution.out +- op: convolution_backward.out + - op: copy.out - op: cos.out diff --git a/kernels/portable/cpu/op_convolution_backward.cpp b/kernels/portable/cpu/op_convolution_backward.cpp new file mode 100644 index 00000000000..3a86f430d10 --- /dev/null +++ b/kernels/portable/cpu/op_convolution_backward.cpp @@ -0,0 +1,312 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; +using IntArrayRef = exec_aten::ArrayRef; +using OptIntArrayRef = exec_aten::OptionalArrayRef; + +namespace { + +bool check_convolution_backward_args( + const Tensor& grad_output, + const Tensor& input, + const Tensor& weight, + ET_UNUSED const OptIntArrayRef bias_sizes_opt, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool transposed, + IntArrayRef output_padding, + int64_t groups, + ET_UNUSED exec_aten::ArrayRef output_mask, + Tensor& grad_input, + Tensor& grad_weight, + Tensor& grad_bias) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + transposed == false, "Transposed Convolution Backward not supported yet"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + weight.dim() == 4, "Only 2D Convolution Backward supported for now"); + + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(weight, input)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(grad_output, input)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(grad_input, input)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(grad_weight, input)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(grad_bias, input)); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + check_convolution_args( + input, + weight, + exec_aten::optional(), + stride, + padding, + dilation, + transposed, + output_padding, + groups, + grad_output), + "Invalid convolution arguments"); + + size_t output_ndim = 0; + exec_aten::SizesType output_sizes[kTensorDimensionLimit]; + get_convolution_out_target_size( + input, + weight, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_sizes, + &output_ndim); + + ET_LOG_AND_RETURN_IF_FALSE( + output_size_is_valid({output_sizes, output_ndim}, input.dim() - 2)); + + ET_LOG_MSG_AND_RETURN_IF_FALSE( + grad_output.dim() == input.dim(), + "grad_output should have same number of dimensions as input"); + + ET_LOG_AND_RETURN_IF_FALSE( + tensor_has_expected_size(grad_output, {output_sizes, output_ndim})); + + return true; +} + +template +void conv2d_backward_impl( + const Tensor& grad_output, + const Tensor& input, + const Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + exec_aten::ArrayRef output_mask, + Tensor& grad_input, + Tensor& grad_weight, + Tensor& grad_bias) { + auto batch_size = input.size(0); + auto in_channels = input.size(1); + auto out_channels = weight.size(0); + auto in_height = input.size(2); + auto in_width = input.size(3); + auto out_height = grad_output.size(2); + auto out_width = grad_output.size(3); + auto kernel_height = weight.size(2); + auto kernel_width = weight.size(3); + + const int64_t stride_h = val_at(stride, 0); + const int64_t padding_h = val_at(padding, 0, /*default_value=*/0); + const int64_t dilation_h = val_at(dilation, 0); + const int64_t stride_w = val_at(stride, 1); + const int64_t padding_w = val_at(padding, 1, /*default_value=*/0); + const int64_t dilation_w = val_at(dilation, 1); + + auto in_channels_per_group = in_channels / groups; + auto out_channels_per_group = out_channels / groups; + + const CTYPE* grad_output_data = grad_output.const_data_ptr(); + const CTYPE* input_data = input.const_data_ptr(); + const CTYPE* weight_data = weight.const_data_ptr(); + + CTYPE* 
grad_input_data = nullptr; + CTYPE* grad_weight_data = nullptr; + CTYPE* grad_bias_data = nullptr; + + if (output_mask[0]) { + grad_input_data = grad_input.mutable_data_ptr(); + memset(grad_input_data, 0, grad_input.nbytes()); + } + + if (output_mask[1]) { + grad_weight_data = grad_weight.mutable_data_ptr(); + memset(grad_weight_data, 0, grad_weight.nbytes()); + } + + if (output_mask[2]) { + grad_bias_data = grad_bias.mutable_data_ptr(); + memset(grad_bias_data, 0, grad_bias.nbytes()); + } + + // @lint-ignore CLANGTIDY facebook-hte-CArray + exec_aten::SizesType out_coord[kTensorDimensionLimit]; + // @lint-ignore CLANGTIDY facebook-hte-CArray + exec_aten::SizesType in_coord[kTensorDimensionLimit]; + // @lint-ignore CLANGTIDY facebook-hte-CArray + exec_aten::SizesType weight_coord[kTensorDimensionLimit]; + + // Compute gradients + for (int64_t b = 0; b < batch_size; ++b) { // Loop over each batch + in_coord[0] = b; + out_coord[0] = b; + for (int64_t g = 0; g < groups; ++g) { // Loop over each group + for (int64_t h = 0; h < out_height; ++h) { // Loop over each output row + out_coord[2] = h; + for (int64_t w = 0; w < out_width; ++w) { // Loop over each output col + out_coord[3] = w; + + // Loop over each output channel in the group + for (int64_t oc = 0; oc < out_channels_per_group; ++oc) { + int64_t oc_global = oc + g * out_channels_per_group; + weight_coord[0] = oc_global; + out_coord[1] = oc_global; + + int64_t out_idx = calculate_linear_index( + out_coord, grad_output.strides().data(), 4); + + // Accumulate the gradient with respect to the bias if required + if (output_mask[2]) { + grad_bias_data[oc_global] += grad_output_data[out_idx]; + } + + // Loop over each input channel in the group + for (int64_t ic = 0; ic < in_channels_per_group; ++ic) { + int64_t ic_global = ic + g * in_channels_per_group; + in_coord[1] = ic_global; + weight_coord[1] = ic; + + // Loop over each element + for (int64_t kh = 0; kh < kernel_height; ++kh) { + int64_t in_h = h * stride_h - padding_h + kh * dilation_h; + if (in_h >= 0 && in_h < in_height) { + in_coord[2] = in_h; + weight_coord[2] = kh; + + for (int64_t kw = 0; kw < kernel_width; ++kw) { + int64_t in_w = w * stride_w - padding_w + kw * dilation_w; + if (in_w >= 0 && in_w < in_width) { + in_coord[3] = in_w; + weight_coord[3] = kw; + + int64_t in_idx = calculate_linear_index( + in_coord, input.strides().data(), 4); + + int64_t weight_idx = calculate_linear_index( + weight_coord, weight.strides().data(), 4); + + // Gradient with respect to the input if required + if (output_mask[0]) { + grad_input_data[in_idx] += + grad_output_data[out_idx] * weight_data[weight_idx]; + } + // Gradient with respect to the weight if required + if (output_mask[1]) { + grad_weight_data[weight_idx] += + grad_output_data[out_idx] * input_data[in_idx]; + } + } + } + } + } + } + } + } + } + } + } +} + +} // namespace + +std::tuple convolution_backward_out( + RuntimeContext& ctx, + const Tensor& grad_output, + const Tensor& input, + const Tensor& weight, + const OptIntArrayRef bias_sizes_opt, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool transposed, + IntArrayRef output_padding, + int64_t groups, + exec_aten::ArrayRef output_mask, + Tensor& grad_input, + Tensor& grad_weight, + Tensor& grad_bias) { + (void)ctx; + + std::tuple ret_val( + grad_input, grad_weight, grad_bias); + + ET_KERNEL_CHECK( + ctx, + check_convolution_backward_args( + grad_output, + input, + weight, + bias_sizes_opt, + stride, + padding, + dilation, + transposed, + output_padding, + 
groups, + output_mask, + grad_input, + grad_weight, + grad_bias), + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(grad_input, input.sizes()) == Error::Ok, + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(grad_weight, weight.sizes()) == Error::Ok, + InvalidArgument, + ret_val); + + if (bias_sizes_opt.has_value()) { + ET_KERNEL_CHECK( + ctx, + resize_tensor(grad_bias, bias_sizes_opt.value()) == Error::Ok, + InvalidArgument, + ret_val); + } + + constexpr auto name = "convolution_backward.out"; + + ET_SWITCH_FLOATH_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { + conv2d_backward_impl( + grad_output, + input, + weight, + stride, + padding, + dilation, + groups, + output_mask, + grad_input, + grad_weight, + grad_bias); + }); + + return ret_val; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/kernel_ops_util.cpp b/kernels/portable/cpu/util/kernel_ops_util.cpp index 6ac8e83d2d9..649526c94bf 100644 --- a/kernels/portable/cpu/util/kernel_ops_util.cpp +++ b/kernels/portable/cpu/util/kernel_ops_util.cpp @@ -326,7 +326,7 @@ bool check_convolution_args( bool transposed, IntArrayRef output_padding, int64_t groups, - Tensor& out) { + const Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, weight, out)); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(in)); diff --git a/kernels/portable/cpu/util/kernel_ops_util.h b/kernels/portable/cpu/util/kernel_ops_util.h index 22a09ef33d5..6b06e231f59 100644 --- a/kernels/portable/cpu/util/kernel_ops_util.h +++ b/kernels/portable/cpu/util/kernel_ops_util.h @@ -411,7 +411,7 @@ bool check_convolution_args( bool transposed, IntArrayRef output_padding, int64_t groups, - Tensor& out); + const Tensor& out); void get_convolution_out_target_size( const Tensor& in, diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 6b0b2466888..085869715c7 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -248,6 +248,11 @@ - arg_meta: null kernel_name: torch::executor::convolution_out +- op: convolution_backward.out + kernels: + - arg_meta: null + kernel_name: torch::executor::convolution_backward_out + - op: copy.out kernels: - arg_meta: null diff --git a/kernels/test/op_convolution_backward_test.cpp b/kernels/test/op_convolution_backward_test.cpp new file mode 100644 index 00000000000..4a4d0f883f4 --- /dev/null +++ b/kernels/test/op_convolution_backward_test.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::optional; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using IntArrayRef = exec_aten::ArrayRef; +using OptIntArrayRef = exec_aten::OptionalArrayRef; +using torch::executor::testing::TensorFactory; + +class OpConvolutionBackwardOutTest : public OperatorTest { + protected: + std::tuple op_convolution_backward_out( + const Tensor& grad_output, + const Tensor& input, + const Tensor& weight, + const OptIntArrayRef bias_sizes_opt, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool transposed, + IntArrayRef output_padding, + int64_t groups, + std::array output_mask_a, + Tensor& grad_input, + Tensor& grad_weight, + Tensor& grad_bias) { +#ifndef USE_ATEN_LIB + ArrayRef output_mask(output_mask_a.data(), output_mask_a.size()); +#else + std::array output_mask = output_mask_a; +#endif + return torch::executor::aten::convolution_backward_outf( + context_, + grad_output, + input, + weight, + bias_sizes_opt, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_mask, + grad_input, + grad_weight, + grad_bias); + } +}; + +TEST_F(OpConvolutionBackwardOutTest, SmokeTest) { + TensorFactory tf; + + std::vector grad_output_data = { + 10, 12, 87, 13, 34, 87, 55, 22, 48, 33, 29, 38, 60, 49, 88, 30, + 99, 19, 42, 37, 61, 31, 33, 58, 38, 23, 2, 33, 3, 21, 32, 2, + 30, 72, 10, 67, 92, 19, 11, 16, 65, 37, 60, 74, 4, 19, 45, 37}; + std::vector input_data = { + 9, 89, 45, 39, 25, 2, 97, 55, 80, 24, 18, 33, 28, 89, 19, 16, 19, 33, + 69, 61, 34, 84, 58, 30, 33, 18, 75, 30, 6, 33, 42, 10, 80, 41, 66, 64, + 47, 51, 67, 62, 58, 10, 97, 71, 24, 44, 84, 34, 33, 54, 8, 73, 90, 15, + 21, 92, 55, 22, 56, 12, 10, 63, 32, 76, 65, 38, 95, 92, 22, 15, 37, 12, + 67, 14, 60, 44, 73, 74, 23, 4, 56, 64, 88, 90, 82, 32, 91, 3, 6, 87, + 55, 95, 7, 14, 24, 69, 52, 44, 14, 37, 75, 52, 37, 40, 25, 54, 4, 15, + 97, 51, 46, 28, 65, 95, 50, 82, 23, 39, 50, 55, 97, 52, 91, 16, 19, 49, + 61, 50, 42, 47, 87, 99, 9, 60, 22, 71, 47, 17, 0, 80, 28, 88, 93, 43, + 65, 25, 88, 67, 21, 89, 24, 81, 3, 71, 20, 34, 17, 17, 94, 10, 82, 25, + 10, 11, 7, 28, 77, 39, 74, 79, 17, 40, 67, 54, 49, 54, 21, 89, 17, 7, + 52, 64, 68, 80, 7, 72, 44, 35, 92, 47, 4, 13, 10, 43, 64, 66, 83, 49, + 81, 78, 58, 22, 86, 48, 35, 64, 98, 79, 8, 52, 56, 23, 38, 74, 16, 63, + 51, 70, 44, 28, 43, 13, 51, 85, 42, 29, 64, 26, 54, 91, 9, 96, 41, 56, + 7, 52, 27, 22, 69, 13, 8, 20, 22, 49, 66, 98, 77, 42, 54, 38, 70, 83, + 13, 8, 21, 56, 78, 37, 28, 69, 42, 30, 91, 5, 28, 15, 20, 14, 16, 39, + 95, 66, 4, 72, 52, 35, 54, 93, 87, 77, 3, 49, 82, 70, 84, 3, 73, 99, + 32, 95, 58, 65, 32, 75, 34, 22, 12, 84, 63, 72, 85, 66, 63, 27, 3, 73, + 45, 37, 61, 52, 41, 16, 37, 14, 80, 17, 48, 8, 87, 98, 69, 63, 92, 68, + 42, 63, 5, 22, 66, 91, 74, 11, 17, 45, 45, 33, 40, 85, 26, 75, 73, 81, + 54, 27, 80, 1, 44, 66, 10, 21, 15, 10, 76, 96, 0, 43, 39, 3, 57, 79, + 45, 64, 58, 92, 44, 42, 7, 28, 94, 4, 8, 22, 22, 31, 75, 44, 3, 70, + 83, 72, 87, 12, 20, 55, 84, 31, 50, 34, 25, 49, 29, 71, 57, 97, 25, 82, + 84, 42, 86, 41, 54, 92, 34, 30, 52, 34, 84, 25, 54, 37, 38, 26, 76, 82, + 34, 14, 85, 28, 93, 9}; + std::vector weight_data = { + 2, 54, 9, 37, 0, 47, 70, 9, 84, 69, 56, 79, 25, 35, 54, 13, + 65, 46, 38, 28, 74, 27, 66, 61, 20, 60, 62, 58, 15, 44, 75, 55, + 7, 52, 13, 36, 39, 64, 62, 45, 100, 6, 79, 63, 63, 52, 37, 60, + 78, 12, 69, 2, 74, 56, 93, 39, 62, 22, 55, 67, 
68, 74, 12, 69, + 15, 73, 28, 70, 86, 20, 90, 49, 52, 26, 58, 2, 82, 17, 70, 55, + 54, 83, 70, 11, 27, 9, 5, 42, 34, 62, 29, 94, 69, 81, 54, 4}; + std::vector expected_grad_input_data = { + 1134, 7578, 686, 2682, 0, 4148, 7136, 2406, 8698, 0, + 3759, 6003, 2163, 2395, 0, 2929, 5830, 3469, 6955, 0, + 720, 6201, 495, 2063, 0, 5260, 5989, 3060, 7079, 0, + 9690, 3423, 3385, 1932, 0, 7644, 8499, 1323, 2613, 0, + 4334, 6624, 8532, 9719, 0, 5496, 8601, 1157, 2215, 0, + 4676, 7600, 6524, 10069, 0, 4047, 6117, 1612, 2567, 0, + 5931, 5651, 5669, 6623, 0, 7674, 3291, 2748, 1654, 0, + 10455, 4290, 4145, 796, 0, 9835, 5483, 11649, 5952, 0, + 7098, 5460, 3101, 2443, 0, 7788, 5909, 8582, 6298, 0, + 9462, 4845, 3041, 2067, 0, 7038, 6336, 10438, 6377, 0, + 7518, 8187, 2079, 2773, 0, 10036, 2642, 3952, 1166, 0, + 16014, 2250, 10025, 1908, 0, 9610, 298, 3868, 122, 0, + 16629, 4338, 11335, 3527, 0, 11514, 5965, 4762, 2207, 0, + 18552, 10755, 13309, 5996, 0, 12454, 6787, 4960, 2875, 0, + 8750, 6999, 3534, 3233, 0, 14160, 9399, 9595, 8922, 0, + 9110, 6567, 3820, 2351, 0, 12969, 11814, 9436, 5870, 0, + 7631, 7061, 2877, 2499, 0, 8553, 13527, 3631, 6863, 0, + 1361, 8634, 515, 3372, 0, 3394, 10206, 1504, 4112, 0, + 5505, 17421, 4702, 11891, 0, 4233, 11894, 1739, 5014, 0, + 11787, 14634, 8981, 10759, 0, 11777, 6701, 4719, 3111, 0, + 18459, 7761, 12044, 7627, 0, 11214, 4556, 4374, 1594, 0, + 604, 1908, 1506, 6102, 0, 2532, 4024, 1713, 6121, 0, + 1878, 1814, 4761, 5397, 0, 1127, 3885, 4373, 5832, 0, + 450, 1414, 1080, 4719, 0, 5210, 2683, 2765, 4252, 0, + 2390, 1668, 7710, 4257, 0, 378, 1698, 3276, 6021, 0, + 2866, 4881, 3547, 6822, 0, 502, 1238, 2784, 5199, 0, + 2496, 3975, 2700, 5004, 0, 1220, 1990, 3633, 5763, 0, + 4501, 2679, 4504, 5412, 0, 1968, 1376, 6246, 3669, 0, + 3130, 272, 9345, 1950, 0, 5167, 3278, 9097, 2138, 0, + 2446, 1946, 6942, 5460, 0, 5732, 3404, 7919, 5534, 0, + 2038, 1614, 6978, 4635, 0, 4544, 4839, 7367, 5574, 0, + 1242, 1922, 4842, 6333, 0, 1066, 236, 2236, 686, 0, + 17238, 2254, 10413, 1592, 0, 991, 30, 2206, 70, 0, + 18823, 6392, 12173, 2470, 0, 1142, 684, 2742, 1219, 0, + 21256, 11293, 12719, 7512, 0, 1303, 649, 2818, 1669, 0, + 898, 574, 2018, 1929, 0, 15720, 11989, 10517, 5972, 0, + 885, 781, 2210, 1281, 0, 14601, 12198, 7915, 4958, 0, + 856, 850, 1601, 1355, 0, 7039, 14083, 4113, 7490, 0, + 152, 927, 287, 1902, 0, 301, 1051, 886, 2346, 0, + 6821, 19615, 4491, 13281, 0, 424, 1146, 999, 2906, 0, + 15177, 15480, 8849, 12442, 0, 1222, 544, 2687, 1859, 0, + 20215, 9693, 11441, 4964, 0, 1206, 555, 2466, 860, 0}; + std::vector expected_grad_weight_data = { + 9246, 22073, 12431, 19714, 11179, 19032, 8458, 6495, 18707, 13830, + 20445, 17089, 17124, 18710, 11827, 17236, 16824, 9008, 14086, 18834, + 17419, 16759, 13152, 9339, 13801, 20888, 13976, 27277, 13010, 23949, + 9838, 11220, 17658, 15019, 25337, 17583, 13270, 21754, 16908, 20563, + 20732, 13413, 20868, 27521, 19537, 21170, 15888, 10034, 19195, 16370, + 40243, 25890, 40472, 30460, 21228, 21625, 13289, 24435, 19876, 29816, + 24188, 23619, 13752, 16251, 18741, 19368, 24517, 34261, 27054, 31257, + 21238, 18909, 15776, 16881, 34604, 22534, 28101, 23834, 18479, 16469, + 12852, 16551, 14204, 29983, 20167, 24150, 14281, 17501, 15897, 16019, + 21661, 32765, 23874, 26527, 20463, 18661}; + std::vector expected_grad_bias_data = {363, 438, 585, 501}; + + auto grad_output = tf.make({2, 4, 3, 2}, grad_output_data); + auto input = tf.make({2, 6, 7, 5}, input_data); + auto weight = tf.make({4, 3, 4, 2}, weight_data); + int64_t bias_sizes[1] = {4}; + int64_t 
stride[2] = {1, 2}; + int64_t padding[2] = {1, 0}; + int64_t dilation[2] = {2, 1}; + bool transposed = false; + int64_t output_padding[2] = {0, 0}; + int64_t groups = 2; + std::array output_mask_a = {true, true, true}; + auto grad_input = tf.zeros({2, 6, 7, 5}); + auto grad_weight = tf.zeros({4, 3, 4, 2}); + auto grad_bias = tf.zeros({4}); + + op_convolution_backward_out( + grad_output, + input, + weight, + IntArrayRef{bias_sizes, 1}, + IntArrayRef{stride, 2}, + IntArrayRef{padding, 2}, + IntArrayRef{dilation, 2}, + transposed, + IntArrayRef{output_padding, 2}, + groups, + output_mask_a, + grad_input, + grad_weight, + grad_bias); + + auto expected_grad_input = tf.make({2, 6, 7, 5}, expected_grad_input_data); + auto expected_grad_weight = tf.make({4, 3, 4, 2}, expected_grad_weight_data); + auto expected_grad_bias = tf.make({4}, expected_grad_bias_data); + + EXPECT_TENSOR_CLOSE(grad_input, expected_grad_input); + EXPECT_TENSOR_CLOSE(grad_weight, expected_grad_weight); + EXPECT_TENSOR_CLOSE(grad_bias, expected_grad_bias); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 07421b25e51..749a221f9c0 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -190,6 +190,7 @@ def define_common_targets(): _common_op_test("op_clone_test", ["aten", "portable"]) _common_op_test("op_constant_pad_nd_test", ["aten", "portable"]) _common_op_test("op_convolution_test", ["aten", "portable"]) + _common_op_test("op_convolution_backward_test", ["aten", "portable"]) _common_op_test("op_copy_test", ["aten", "portable"]) _common_op_test("op_cos_test", ["aten", "portable"]) _common_op_test("op_cosh_test", ["aten", "portable"]) diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index 04e824db57c..b56f40c0215 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -412,6 +412,12 @@ ATEN_OPS = ( ":vec_ops", ], ), + op_target( + name = "op_convolution_backward", + deps = [ + "//executorch/kernels/portable/cpu/util:kernel_ops_util", + ], + ), op_target( name = "op_copy", deps = [ From 6961eed174a59ee78e284cc647f6f119f4d53151 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Tue, 3 Sep 2024 17:59:18 -0400 Subject: [PATCH 148/531] [ET-VK] Add test to track sizes of various objects Differential Revision: D62144400 Pull Request resolved: https://github.com/pytorch/executorch/pull/5039 --- .../vulkan/test/vulkan_compute_api_test.cpp | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index f3c60a21376..2f9c3d22f57 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -992,6 +992,28 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { graph.get_tensor(name.value)->staging_buffer_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); +// The purpose of this test is simply to track the size of various classes over +// time, in the interest of making sure that they doesn't grow too large. 
+TEST_F(VulkanComputeAPITest, print_object_sizes) { +#define PRINT_SIZE(name) \ + std::cout << #name << " size: " << sizeof(name) << " B" << std::endl + PRINT_SIZE(vTensor); + PRINT_SIZE(Value); + PRINT_SIZE(StagingBuffer); + PRINT_SIZE(ComputeGraph); + PRINT_SIZE(ExecuteNode); +#undef PRINT_SIZE + + // The actual sizes of each object is dependent on the platform. However, we + // can alert ourselves to any significant changes in the sizes of these + // objects by checking the `sizeof()` the class against some loose thresholds. + EXPECT_TRUE(sizeof(vTensor) < 1800); + EXPECT_TRUE(sizeof(Value) < 2400); + EXPECT_TRUE(sizeof(StagingBuffer) < 500); + EXPECT_TRUE(sizeof(ComputeGraph) < 500); + EXPECT_TRUE(sizeof(ExecuteNode) < 500); +} + TEST(VulkanComputeGraphTest, test_values_scalars) { GraphConfig config; ComputeGraph graph(config); From f65531bc12182099835c2844442cf029fdfa2caf Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 3 Sep 2024 15:05:52 -0700 Subject: [PATCH 149/531] Swap to better default symshapeevalue pass Differential Revision: D62136390 Pull Request resolved: https://github.com/pytorch/executorch/pull/5033 --- examples/models/llava/export_llava.py | 7 ++++++- exir/capture/_config.py | 4 ++-- exir/passes/TARGETS | 1 + exir/passes/sym_shape_eval_pass.py | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 2823ca726e0..5cd8628c603 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -33,7 +33,10 @@ from executorch.exir.passes import MemoryPlanningPass from executorch.exir.passes.quant_fusion_pass import QuantFusionPass -from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass +from executorch.exir.passes.sym_shape_eval_pass import ( + ConstraintBasedSymShapeEvalPass, + HintBasedSymShapeEvalPass, +) from executorch.extension.llm.export.builder import DType, LLMEdgeManager from executorch.extension.llm.tokenizer.tokenizer import Tokenizer @@ -227,6 +230,8 @@ def export_all(llava_model: LlavaModel): memory_planning_pass=MemoryPlanningPass("greedy", alloc_graph_input=False), sym_shape_eval_pass={ "image_encoder": ConstraintBasedSymShapeEvalPass(), + "text_model": ConstraintBasedSymShapeEvalPass(), + "token_embedding": HintBasedSymShapeEvalPass(), }, ) ) diff --git a/exir/capture/_config.py b/exir/capture/_config.py index c0f7b71baf9..7b91464bdce 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -12,7 +12,7 @@ from executorch.exir.dynamic_shape import DynamicMemoryPlanningMode from executorch.exir.pass_manager import PassType from executorch.exir.passes import MemoryPlanningPass, ToOutVarPass -from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.tracer import ExirDynamoConfig from torch.fx._compatibility import compatibility @@ -86,7 +86,7 @@ class ExecutorchBackendConfig: # A single sym shape eval pass can be defined for all the programs in the # EdgeProgramManager or can be defined per program. 
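+    # e.g. sym_shape_eval_pass={"forward": ConstraintBasedSymShapeEvalPass()},
+    # keyed by method name, to override the pass for individual methods.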
sym_shape_eval_pass: Union[PassType, Dict[str, PassType]] = ( - HintBasedSymShapeEvalPass() + ConstraintBasedSymShapeEvalPass() ) # If set to true, view_copy operations will be converted to lightweight diff --git a/exir/passes/TARGETS b/exir/passes/TARGETS index 4e59af26eae..eeb1e5265b0 100644 --- a/exir/passes/TARGETS +++ b/exir/passes/TARGETS @@ -202,6 +202,7 @@ python_library( ], deps = [ "//caffe2:torch", + "//executorch/exir:_warnings", "//executorch/exir:pass_base", "//executorch/exir:sym_util", "//executorch/exir:tensor", diff --git a/exir/passes/sym_shape_eval_pass.py b/exir/passes/sym_shape_eval_pass.py index f4d11ed8143..ec61d4b3a6f 100644 --- a/exir/passes/sym_shape_eval_pass.py +++ b/exir/passes/sym_shape_eval_pass.py @@ -10,6 +10,8 @@ import torch import torch.utils._pytree as pytree + +from executorch.exir._warnings import deprecated from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassBase, PassResult from executorch.exir.sym_util import eval_expr, eval_shape, eval_upper_bound @@ -164,8 +166,21 @@ def index_Tensor(args, kwargs) -> List[Optional[int]]: # noqa: C901 return out_sizes +@deprecated( + "`HintBasedSymShapeEvalPass` is deprecated " + "and will be removed in a future version of ExecuTorch. " + "Please use `ConstraintBasedSymShapeEvalPass` instead.", + category=FutureWarning, +) class HintBasedSymShapeEvalPass(PassBase): """ + + .. warning:: + + `HintBasedSymShapeEvalPass` is deprecated + and will be removed in a future version of ExecuTorch. + Please use `ConstraintBasedSymShapeEvalPass` instead. + If we enable dynamic shape tracing, a tensor's shape may become a symbolic formula. We should convert those symbolic formula to concrete value for static/upperbound tensors so we can properly do memory planning for them.
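The change above flips the default sym_shape_eval_pass in ExecutorchBackendConfig to ConstraintBasedSymShapeEvalPass while keeping HintBasedSymShapeEvalPass available (now marked deprecated) for per-method overrides, as the export_llava.py hunk shows. A minimal usage sketch follows; it is illustrative rather than part of the patch, assumes ExecutorchBackendConfig is importable from executorch.exir, and borrows the method names from the Llava example:

from executorch.exir import ExecutorchBackendConfig
from executorch.exir.passes.sym_shape_eval_pass import (
    ConstraintBasedSymShapeEvalPass,
    HintBasedSymShapeEvalPass,  # deprecated by this patch; prefer the constraint-based pass
)

# Leaving sym_shape_eval_pass unset now applies the constraint-based pass to
# every method; passing a dict keyed by method name overrides it per program.
config = ExecutorchBackendConfig(
    sym_shape_eval_pass={
        "image_encoder": ConstraintBasedSymShapeEvalPass(),
        "token_embedding": HintBasedSymShapeEvalPass(),
    },
)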
From 35d0f59cd17825b1ec767ee52baf33d70a3d34e9 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Tue, 3 Sep 2024 18:28:53 -0400 Subject: [PATCH 150/531] [ET-VK] Add type for symbolic integers Differential Revision: D62144399 Pull Request resolved: https://github.com/pytorch/executorch/pull/5040 --- .../vulkan/runtime/graph/ComputeGraph.cpp | 24 ++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 18 +++++- .../runtime/graph/containers/SymInt.cpp | 24 ++++++++ .../vulkan/runtime/graph/containers/SymInt.h | 41 +++++++++++++ .../vulkan/runtime/graph/containers/Types.cpp | 1 + .../vulkan/runtime/graph/containers/Types.h | 1 + .../vulkan/runtime/graph/containers/Value.h | 9 +++ .../vulkan/test/glsl/scalar_add_texture.glsl | 29 ++++++++++ .../vulkan/test/vulkan_compute_api_test.cpp | 58 +++++++++++++++++++ 9 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 backends/vulkan/runtime/graph/containers/SymInt.cpp create mode 100644 backends/vulkan/runtime/graph/containers/SymInt.h create mode 100644 backends/vulkan/test/glsl/scalar_add_texture.glsl diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 6c3ec88eaa7..a8f57f57d2a 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -43,6 +43,7 @@ VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) +VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) #undef VALUE_PTR_CLASS_IMPL @@ -261,6 +262,13 @@ ValueRef ComputeGraph::add_string(std::string&& str) { return idx; } +ValueRef ComputeGraph::add_symint(const int32_t val) { + ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); + values_.emplace_back(SymInt(context(), val)); + return idx; +} + ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { @@ -300,6 +308,22 @@ ValueRef ComputeGraph::set_output_tensor( return idx; } +vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( + const ValueRef idx) { + if (values_.at(idx).isInt()) { + const int32_t val = extract_scalar(idx); + create_params_buffer(val); + } else if (values_.at(idx).isSymInt()) { + SymIntPtr symint = get_symint(idx); + return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); + } + VK_THROW("Cannot create a int param buffer for the given value"); +} + +void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { + get_symint(idx)->set(val); +} + SharedObject& ComputeGraph::get_shared_object(const int64_t idx) { if (idx >= shared_objects_.size()) { shared_objects_.resize(static_cast(idx + 1)); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 9b04b08a70e..ac5e0d6c9d1 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -63,6 +63,7 @@ DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) +DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); #undef DECL_VALUE_PTR_CLASS @@ -154,6 +155,7 @@ class ComputeGraph final { GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, 
value_list, ValueList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt); #undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS @@ -422,15 +424,28 @@ class ComputeGraph final { ValueRef add_string(std::string&& str); + ValueRef add_symint(const int32_t val); + ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); template - const vkapi::BufferBindInfo create_params_buffer(const Block& data) { + vkapi::BufferBindInfo create_params_buffer(const Block& data) { param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data)); return vkapi::BufferBindInfo(param_ubos_.back().buffer()); } + /* + * Given a ValueRef, do the following depending on the type of the Value: + * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object + * backing the SymInt. + * - If it is a regular Int, create a new ParamsBuffer using the integer value + * and return the BufferBindInfo of the created ParamsBuffer. + */ + vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); + + void set_symint(const ValueRef idx, const int32_t val); + /* * Convenience function to add an input tensor along with its staging buffer */ @@ -577,6 +592,7 @@ class ComputeGraph final { friend class DoubleListPtr; friend class BoolListPtr; friend class ValueListPtr; + friend class SymIntPtr; }; template diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp new file mode 100644 index 00000000000..c91db84b787 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +SymInt::SymInt(api::Context* context_p, const int32_t val) + : gpu_buffer(context_p, val){}; + +void SymInt::set(const int32_t val) { + gpu_buffer.update(val); +} + +void SymInt::operator=(const int32_t val) { + gpu_buffer.update(val); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h new file mode 100644 index 00000000000..0c9fbee5fe2 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace vkcompute { + +/* + * Represents a symbolic integer whose value can be variable. It is implemented + * as a thin wrapper around a `ParamsBuffer` object that holds the value of the + * integer. The `ParamsBuffer` object allows the value of the symbolic integer + * to be changed from the CPU and have those changes be visible to all shaders + * that use the symbolic integer; it also allows the value of the symbolic + * integer to be the result of a compute shader. + * + * Regular scalar types represented by `TypeTag::INT` cannot be used for + * symbolic integers because their value is assumed to be constant; therefore + * the `Value` instance holding the value of the scalar does not contain + * any reference to the GPU buffers used to pass its value into compute shaders. 
+ * Therefore, updating the value of the scalar does not impact the value seen + * by compute shaders. + */ +struct SymInt final { + api::ParamsBuffer gpu_buffer; + + explicit SymInt(api::Context* context_p, const int32_t val); + + void set(const int32_t val); + + void operator=(const int32_t val); +}; + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp index c5ffc65add1..e7a8951a552 100644 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ b/backends/vulkan/runtime/graph/containers/Types.cpp @@ -29,6 +29,7 @@ std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { PRINT_CASE(BOOLLIST) PRINT_CASE(VALUELIST) PRINT_CASE(STRING) + PRINT_CASE(SYMINT) } return out; } diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index 79edbd50d3a..5840d1695ee 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -36,6 +36,7 @@ enum class TypeTag : uint32_t { // Special Type VALUELIST, STRING, + SYMINT, }; std::ostream& operator<<(std::ostream& out, const TypeTag& tag); diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index 6e03bbd4a21..50a2b5e548c 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -13,6 +13,7 @@ #include #include +#include #include namespace vkcompute { @@ -67,6 +68,8 @@ struct Value final { std::string as_string; + SymInt as_symint; + Payload() : u() {} // NOLINTNEXTLINE ~Payload(){}; @@ -123,6 +126,7 @@ struct Value final { TypeTag::VALUELIST, std::vector, as_value_list, vector); CASE_MOVE_MOVEABLE_TYPE( TypeTag::STRING, std::string, as_string, basic_string); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::SYMINT, SymInt, as_symint, SymInt); case TypeTag::NONE: clearToNone(); @@ -172,6 +176,9 @@ struct Value final { case TypeTag::STRING: payload.as_string.~basic_string(); break; + case TypeTag::SYMINT: + payload.as_symint.~SymInt(); + break; // Manually list out the types so that if a type here is added later and // not handled the compiler can catch it. case TypeTag::NONE: @@ -288,6 +295,8 @@ struct Value final { TypeTag::STRING, as_string); + SUPPORT_TRIVIALLY_MOVEABLE_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint); + #undef SUPPORT_TRIVIALLY_COPYABLE_TYPE #undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl new file mode 100644 index 00000000000..aa2b22c81f9 --- /dev/null +++ b/backends/vulkan/test/glsl/scalar_add_texture.glsl @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} +${layout_declare_ubo(1, "uvec3", "extents")} +${layout_declare_ubo(2, "int", "scalar")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, extents))) { + return; + } + + vec4 in_tex = imageLoad(t_in, pos); + imageStore(t_in, pos, imageLoad(t_in, pos) + float(scalar)); +} diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 2f9c3d22f57..a0bfefafa02 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1268,6 +1268,64 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + + std::vector sizes = {8, 64, 124}; + + // Build graph + + ValueRef scalar = graph.add_symint(1); + IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat); + + IOValueRef out = {}; + out.value = a.value; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR("scalar_add_texture"), + graph.create_global_wg_size(a.value), + graph.create_local_wg_size(a.value), + // Inputs and Outputs + {{out.value, vkapi::MemoryAccessType::WRITE}}, + // Shader params buffers + {graph.texture_limits_ubo(a.value), + graph.get_or_create_int_param_buffer(scalar)}, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + int scalar_val = i - 3.0f; + graph.set_symint(scalar, scalar_val); + + float val_a = i + 2.0f; + float val_out = val_a + scalar_val; + + fill_vtensor(graph, a, val_a); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + #define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \ std::vector data_##name(utils::multiply_integers(sizes)); \ std::fill(data_##name.begin(), data_##name.end(), val); \ From e3cbeed355e2b26432fb3fb7a8822941d68ad7e1 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 3 Sep 2024 18:57:52 -0400 Subject: [PATCH 151/531] Add op: scatter.src_out Differential Revision: D62143589 Pull Request resolved: https://github.com/pytorch/executorch/pull/5037 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_scatter.cpp | 81 ++++++- kernels/portable/cpu/util/index_util.cpp | 9 + kernels/portable/cpu/util/index_util.h | 7 + kernels/portable/functions.yaml | 5 + kernels/test/op_scatter_test.cpp | 292 +++++++++++++++++++++++ 6 files changed, 389 insertions(+), 7 deletions(-) diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 58f394eaa68..b71585ef9dd 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -323,6 +323,8 @@ - op: scalar_tensor.out +- op: scatter.src_out + - op: scatter.value_out - op: scatter_add.out diff --git a/kernels/portable/cpu/op_scatter.cpp b/kernels/portable/cpu/op_scatter.cpp index 9696ab4f14d..0a2fee9a61e 100644 --- 
a/kernels/portable/cpu/op_scatter.cpp +++ b/kernels/portable/cpu/op_scatter.cpp @@ -23,6 +23,46 @@ using ScalarType = exec_aten::ScalarType; namespace { +template +void scatter_src_helper( + const Tensor& in, + int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& out) { + const CTYPE* in_data = in.const_data_ptr(); + const long* index_data = index.const_data_ptr(); + const CTYPE* src_data = src.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + memcpy(out_data, in_data, in.nbytes()); + + if (dim < 0) { + dim += nonzero_dim(in); + } + + for (size_t ix = 0; ix < index.numel(); ++ix) { + // @lint-ignore CLANGTIDY facebook-hte-CArray + size_t ix_coord[kTensorDimensionLimit]; + indexToCoordinate(index, ix, ix_coord); + + size_t src_ix = coordinateToIndex(src, ix_coord); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + size_t out_coord[kTensorDimensionLimit]; + for (size_t i = 0; i < out.dim(); ++i) { + if (i == dim) { + out_coord[i] = index_data[ix]; + } else { + out_coord[i] = ix_coord[i]; + } + } + size_t out_ix = coordinateToIndex(out, out_coord); + + out_data[out_ix] = src_data[src_ix]; + } +} + template void scatter_value_helper( const Tensor& in, @@ -36,15 +76,16 @@ void scatter_value_helper( memcpy(out_data, in_data, in.nbytes()); - if (index.dim() == 0) { - out_data[index_data[0]] = static_cast(val); - return; + if (dim < 0) { + dim += nonzero_dim(in); } for (size_t ix = 0; ix < index.numel(); ++ix) { + // @lint-ignore CLANGTIDY facebook-hte-CArray size_t ix_coord[kTensorDimensionLimit]; indexToCoordinate(index, ix, ix_coord); + // @lint-ignore CLANGTIDY facebook-hte-CArray size_t out_coord[kTensorDimensionLimit]; for (size_t i = 0; i < out.dim(); ++i) { if (i == dim) { @@ -61,6 +102,36 @@ void scatter_value_helper( } // namespace +Tensor& scatter_src_out( + RuntimeContext& context, + const Tensor& in, + int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& out) { + (void)context; + + ET_KERNEL_CHECK( + context, + check_scatter_src_args(in, dim, index, src, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + context, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "scatter.src_out"; + + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { + scatter_src_helper(in, dim, index, src, out); + }); + + return out; +} + Tensor& scatter_value_out( RuntimeContext& ctx, const Tensor& in, @@ -79,10 +150,6 @@ Tensor& scatter_value_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); - if (dim < 0) { - dim += nonzero_dim(in); - } - ScalarType val_type = utils::get_scalar_dtype(value); constexpr auto name = "scatter.value_out"; diff --git a/kernels/portable/cpu/util/index_util.cpp b/kernels/portable/cpu/util/index_util.cpp index ca9900773a1..b1c9696fd62 100644 --- a/kernels/portable/cpu/util/index_util.cpp +++ b/kernels/portable/cpu/util/index_util.cpp @@ -191,6 +191,15 @@ bool check_scatter_add_args( return true; } +bool check_scatter_src_args( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& out) { + return check_scatter_add_args(self, dim, index, src, out); +} + bool check_scatter_value_args( const Tensor& self, int64_t dim, diff --git a/kernels/portable/cpu/util/index_util.h b/kernels/portable/cpu/util/index_util.h index ae6654be52b..73d264a748c 100644 --- a/kernels/portable/cpu/util/index_util.h +++ b/kernels/portable/cpu/util/index_util.h @@ -43,6 +43,13 @@ bool check_scatter_add_args( const Tensor& 
src, Tensor& out); +bool check_scatter_src_args( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& out); + bool check_scatter_value_args( const Tensor& self, int64_t dim, diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 085869715c7..69e0334051c 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -742,6 +742,11 @@ - arg_meta: null kernel_name: torch::executor::scalar_tensor_out +- op: scatter.src_out + kernels: + - arg_meta: null + kernel_name: torch::executor::scatter_src_out + - op: scatter.value_out kernels: - arg_meta: null diff --git a/kernels/test/op_scatter_test.cpp b/kernels/test/op_scatter_test.cpp index 2335c839d00..83c112a8c34 100644 --- a/kernels/test/op_scatter_test.cpp +++ b/kernels/test/op_scatter_test.cpp @@ -22,6 +22,189 @@ using exec_aten::ScalarType; using exec_aten::Tensor; using torch::executor::testing::TensorFactory; +class OpScatterSrcOutTest : public OperatorTest { + protected: + Tensor& op_scatter_src_out( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& out) { + return torch::executor::aten::scatter_outf( + context_, self, dim, index, src, out); + } + + // Common testing for the operator + template + void test_scatter_src_out() { + TensorFactory tf_index; + TensorFactory tf_data; + const std::vector sizes = {3, 5}; + // clang-format off + Tensor src = tf_data.make( + /*sizes=*/{2, 5}, + { + 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10 + }); + // clang-format on + Tensor in = tf_data.zeros(sizes); + Tensor out = tf_data.zeros(sizes); + // clang-format off + Tensor index = tf_index.make( + /*sizes=*/{2, 3}, + { + 0, 1, 2, + 0, 1, 2 + }); + // clang-format on + + // Valid input should give the expected output + op_scatter_src_out(in, 0, index, src, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, tf_data.make( + sizes, + { + 6, 0, 0, 0, 0, + 0, 7, 0, 0, 0, + 0, 0, 8, 0, 0 + })); + // clang-format on + + // Valid input should give the expected output + op_scatter_src_out(in, 1, index, src, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, tf_data.make(sizes, + { + 1, 2, 3, 0, 0, + 6, 7, 8, 0, 0, + 0, 0, 0, 0, 0 + })); + + src = tf_data.make( + /*sizes=*/{2, 3, 3}, + { + // [0, :, :] + 1, 2, 3, + 4, 5, 6, + 7, 8, 9, + + // [1, :, :] + 10, 11, 12, + 13, 14, 15, + 16, 17, 18 + }); + // clang-format on + in = tf_data.ones(/*sizes=*/{2, 3, 3}); + out = tf_data.zeros(/*sizes=*/{2, 3, 3}); + // clang-format off + index = tf_index.make( + /*sizes=*/{1, 3, 2}, + { + 0, 1, + 1, 2, + 0, 2 + }); + // clang-format on + + op_scatter_src_out(in, 1, index, src, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, + tf_data.make( + /*sizes=*/{2, 3, 3}, + { + // [0, :, :] + 7, 1, 1, + 4, 2, 1, + 1, 8, 1, + + // [1, :, :] + 1, 1, 1, + 1, 1, 1, + 1, 1, 1 + })); + // clang-format on + + out = tf_data.zeros(/*sizes=*/{2, 3, 3}); + op_scatter_src_out(in, 2, index, src, out); + // clang-format off + EXPECT_TENSOR_EQ( + out, + tf_data.make( + /*sizes=*/{2, 3, 3}, + { + // [0, :, :] + 1, 2, 1, + 1, 4, 5, + 7, 1, 8, + + // [1, :, :] + 1, 1, 1, + 1, 1, 1, + 1, 1, 1 + })); + // clang-format on + } + + // Invalid dimensions + template + void test_scatter_src_out_invalid_dim() { + TensorFactory tf_index; + TensorFactory tf_data; + const std::vector sizes = {3, 5}; + // clang-format off + Tensor src = tf_data.make(/*sizes=*/{2, 5}, + { + 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10 + }); + Tensor index = tf_index.make(/*sizes=*/{2, 3}, + { + 0, 1, 2, + 0, 1, 2 + 
}); + // clang-format on + Tensor self = tf_data.zeros(sizes); + Tensor out = tf_data.zeros(sizes); + + // Invalid dim should die + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, -3, index, src, out)); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 2, index, src, out)); + + // Self, index and src should have same number of dimensions + src = tf_data.zeros(/*sizes=*/{2, 2, 2}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 0, index, src, out)); + + src = tf_data.zeros(/*sizes=*/{5, 5}); + index = tf_index.zeros(/*sizes=*/{2, 2, 2}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 0, index, src, out)); + + // Size of dimension of index should be smaller than the size of that + // dimension of src + index = tf_index.zeros(/*sizes=*/{4, 6}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 0, index, src, out)); + + // Size of dimension of index should be smaller than the size of that + // dimension of self if dimension != dim + index = tf_index.zeros(/*sizes=*/{4, 5}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 1, index, src, out)); + + // Index out of bound for self in dim + index = tf_index.make(/*sizes=*/{2, 3}, {0, 1, 3, 0, 1, 3}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 0, index, src, out)); + } +}; + class OpScatterValueOutTest : public OperatorTest { protected: Tensor& op_scatter_value_out( @@ -183,6 +366,19 @@ class OpScatterValueOutTest : public OperatorTest { } }; +TEST_F(OpScatterSrcOutTest, AllValidInputOutputSupport) { +#define TEST_ENTRY(CTYPE, DTYPE) test_scatter_src_out(); + ET_FORALL_REAL_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpScatterSrcOutTest, InvalidDimensionsDies) { +#define TEST_ENTRY(CTYPE, DTYPE) \ + test_scatter_src_out_invalid_dim(); + ET_FORALL_REAL_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + TEST_F(OpScatterValueOutTest, AllValidInputOutputSupport) { #define TEST_ENTRY(CTYPE, DTYPE) test_scatter_value_out(); ET_FORALL_REAL_TYPES(TEST_ENTRY); @@ -360,3 +556,99 @@ TEST_F(OpScatterValueOutTest, InvalidOneDimInputAndZeroDimIndex) { ET_EXPECT_KERNEL_FAILURE( context_, op_scatter_value_out(self, 0, index, value, out)); } + +TEST_F(OpScatterSrcOutTest, EmptyIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.ones({2, 5}); + Tensor index = tf_index.zeros({2, 0, 3}); + Tensor src = tf_data.ones({1, 1, 4}); + Tensor out = tf_data.zeros({2, 5}); + op_scatter_src_out(self, 0, index, src, out); + EXPECT_TENSOR_CLOSE(out, tf_data.ones({2, 5})); +} + +TEST_F(OpScatterSrcOutTest, ValidZeroDim) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + Tensor index = tf_index.zeros({}); + Tensor src = tf_data.make({}, {5}); + Tensor out = tf_data.zeros({}); + op_scatter_src_out(self, 0, index, src, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({}, {5})); +} + +TEST_F(OpScatterSrcOutTest, InvalidZeroDimInput) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.ones({}); + Tensor index = tf_index.make({2, 3}, {0, 0, 0, 0, 0, 0}); + Tensor src = tf_data.make({}, {5}); + Tensor out = tf_data.zeros({}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 0, index, src, out)); +} + +TEST_F(OpScatterSrcOutTest, InvalidZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({2, 3}, {1, 2, 3, 4, 5, 6}); + Tensor index = tf_index.make({}, {2}); + Tensor src = tf_data.make({}, {5}); + Tensor out =
tf_data.zeros({2, 3}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 1, index, src, out)); +} + +TEST_F(OpScatterSrcOutTest, ValidZeroDimInputAndOneDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + Tensor index = tf_index.make({3}, {0, 0, 0}); + Tensor src = tf_data.make({3}, {5, 5, 5}); + Tensor out = tf_data.make({}, {2.71}); + op_scatter_src_out(self, 0, index, src, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({}, {5})); +} + +TEST_F(OpScatterSrcOutTest, ValidOneDimInputAndZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({3}, {10, 20, 30}); + Tensor index = tf_index.make({}, {2}); + Tensor src = tf_data.make({}, {5}); + Tensor out = tf_data.make({3}, {1729, 1729, 1729}); + op_scatter_src_out(self, 0, index, src, out); + EXPECT_TENSOR_CLOSE(out, tf_data.make({3}, {10, 20, 5})); +} + +TEST_F(OpScatterSrcOutTest, InvalidZeroDimInputAndOneDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({}, {3.14}); + Tensor index = tf_index.make({3}, {10, 100, 1000}); + Tensor src = tf_data.make({}, {5}); + Tensor out = tf_data.make({}, {2.71}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 0, index, src, out)); +} + +TEST_F(OpScatterSrcOutTest, InvalidOneDimInputAndZeroDimIndex) { + TensorFactory tf_index; + TensorFactory tf_data; + + Tensor self = tf_data.make({3}, {10, 20, 30}); + Tensor index = tf_index.make({}, {100}); + Tensor src = tf_data.make({}, {5}); + Tensor out = tf_data.make({3}, {1729, 1729, 1729}); + ET_EXPECT_KERNEL_FAILURE( + context_, op_scatter_src_out(self, 0, index, src, out)); +} From 77df7b402807aa8d1abd3a365640be92b20f8e16 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:39:00 -0700 Subject: [PATCH 152/531] Fix MacOS CMake build Fix deps in extension_threadpool Pull Request resolved: https://github.com/pytorch/executorch/pull/5042 --- extension/llm/custom_ops/CMakeLists.txt | 4 +++- extension/threadpool/CMakeLists.txt | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 1d9cf1e1f24..41c8c0ee160 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -93,7 +93,9 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) target_link_libraries(custom_ops_aot_lib PUBLIC executorch_no_prim_ops) endif() - target_link_libraries(custom_ops_aot_lib PUBLIC cpublas torch) + target_link_libraries( + custom_ops_aot_lib PUBLIC cpublas torch extension_threadpool + ) if(WIN32) # There is no direct replacement for libpthread.so on Windows. For the # Windows build, link directly against pthreadpool and cpuinfo. diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 674d3136e1a..a82afc045a3 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,9 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library(extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp) -target_link_libraries(extension_threadpool PUBLIC executorch cpuinfo pthreadpool) +target_link_libraries( + extension_threadpool PUBLIC executorch_no_prim_ops cpuinfo pthreadpool +) target_include_directories(extension_threadpool PUBLIC ${EXECUTORCH_ROOT}/..) 
target_include_directories( extension_threadpool From 0293cac203879de478cbb6725c248c350e8142f9 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 3 Sep 2024 16:52:56 -0700 Subject: [PATCH 153/531] Fix wrong Android app artifact path after #5004 Differential Revision: D62158211 Pull Request resolved: https://github.com/pytorch/executorch/pull/5050 --- .github/workflows/android-perf.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 028cd4c5a22..a23f9487157 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -280,9 +280,8 @@ jobs: # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer. # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only # one app+flavor that could load and run the model. - # TODO: Hard code llm_demo_bpe for now in this job. - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug-androidTest.apk + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug-androidTest.apk # NB: Need to set the default spec here so that it works for periodic too test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }} # Uploaded to S3 from the previous job From 887abab5057998a2129bf3011cbde9bc0aa0b46e Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:55:43 -0700 Subject: [PATCH 154/531] Update pytorch pin for ET Differential Revision: D62081793 Pull Request resolved: https://github.com/pytorch/executorch/pull/5026 Co-authored-by: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 14422e45d7c..b291722c3f0 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -e4cd76cf8283c8ddbf95674b020fbfcff467cb4b +00e3eea170ce5db8ea9c62ce5e48f13886cd6d20 diff --git a/install_requirements.py b/install_requirements.py index 1f5982c80e0..64243ec6943 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -94,7 +94,7 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20240829" +NIGHTLY_VERSION = "dev20240901" # The pip repository that hosts nightly torch packages. 
TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" From ec89223022b8c2c7177eedf8dcb19e1ae569c166 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:56:13 -0700 Subject: [PATCH 155/531] Introduce ciflow/android and ciflow/apple label Pull Request resolved: https://github.com/pytorch/executorch/pull/4992 --- .github/pytorch-probot.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index f684d83fa51..2b66829ed0a 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,5 +1,7 @@ # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml ciflow_push_tags: +- ciflow/android +- ciflow/apple - ciflow/nightly - ciflow/trunk - ciflow/binaries From 83c8a165d50ed3cc2a5ade3c0a0d3d2f8ee1a3c0 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:56:37 -0700 Subject: [PATCH 156/531] Trigger apple.yml on examples/demo-apps/apple only So we won't trigger it on Android changes Pull Request resolved: https://github.com/pytorch/executorch/pull/5018 --- .github/workflows/apple.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 2c4c172ac1c..a74fbcdaf5f 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -15,7 +15,7 @@ on: - build/build_apple_frameworks.sh - build/create_frameworks.sh - build/test_ios_ci.sh - - examples/demo-apps/** + - examples/demo-apps/apple/** - extension/apple/** - extension/module/** workflow_dispatch: From 23f03b9e73635ab04b08111fa628cdec5eed104e Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Tue, 3 Sep 2024 17:04:32 -0700 Subject: [PATCH 157/531] Rename PyTorchBackendInterface to BackendInterface Differential Revision: D61925076 Pull Request resolved: https://github.com/pytorch/executorch/pull/5022 --- runtime/backend/interface.cpp | 6 ++-- runtime/backend/interface.h | 35 +++++++++---------- runtime/executor/method.cpp | 4 +-- .../test/backend_integration_test.cpp | 8 ++--- .../test/test_backend_compiler_lib.cpp | 4 +-- .../test_backend_with_delegate_mapping.cpp | 4 +-- 6 files changed, 30 insertions(+), 31 deletions(-) diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp index d7f0489db5e..84c0bb82d43 100644 --- a/runtime/backend/interface.cpp +++ b/runtime/backend/interface.cpp @@ -7,12 +7,12 @@ */ #include -#include namespace executorch { namespace runtime { -PyTorchBackendInterface::~PyTorchBackendInterface() {} +// Pure-virtual dtors still need an implementation. +BackendInterface::~BackendInterface() {} namespace { @@ -31,7 +31,7 @@ size_t num_registered_backends = 0; } // namespace -PyTorchBackendInterface* get_backend_class(const char* name) { +BackendInterface* get_backend_class(const char* name) { for (size_t i = 0; i < num_registered_backends; i++) { Backend backend = registered_backends[i]; if (strcmp(backend.name, name) == 0) { diff --git a/runtime/backend/interface.h b/runtime/backend/interface.h index 0b77283a358..c0305f68cd3 100644 --- a/runtime/backend/interface.h +++ b/runtime/backend/interface.h @@ -39,9 +39,9 @@ struct CompileSpec { */ using DelegateHandle = void; -class PyTorchBackendInterface { +class BackendInterface { public: - virtual ~PyTorchBackendInterface() = 0; + virtual ~BackendInterface() = 0; /** * Returns true if the backend is available to process delegation calls. 
@@ -52,19 +52,19 @@ class PyTorchBackendInterface { * Responsible to further process (compile/transform/optimize) the compiled * unit that was produced, ahead-of-time, as well as perform any backend * initialization to ready it for execution. This method is called every time - * the PyTorch program is initialized. Consequently, this is the place to + * the ExecuTorch program is initialized. Consequently, this is the place to * perform any backend initialization as well as transformations, * optimizations, and even compilation that depend on the target device. As * such, it is strongly encouraged to push as much processing as possible to * the ahead-of-time processing. * - * @param[in] processed An opaque (to PyTorch) compiled unit from the - * preprocessor. Can contain anything the backend needs to execute the - * equivalent semantics of the passed-in Module and its method. Often - * passed unmodified to `execute()` as a `DelegateHandle`, unless it needs - * further processing at init time to be fully executable. If the data is - * not needed after init(), calling processed->Free() can reclaim its - * memory. + * @param[in] processed An opaque (to ExecuTorch) backend-specific compiled + * unit from the preprocessor. Can contain anything the backend needs to + * execute the equivalent semantics of the passed-in Module and its + * method. Often passed unmodified to `execute()` as a `DelegateHandle`, + * unless it needs further processing at init time to be fully executable. + * If the data is not needed after init(), calling processed->Free() can + * reclaim its memory. * @param[in] compile_specs The exact same compiler specification that * was used ahead-of-time to produce `processed`. * @@ -115,11 +115,10 @@ class PyTorchBackendInterface { * The mapping is populated using register_backend method. * * @param[in] name Name of the user-defined backend delegate. - * @retval Pointer to the appropriate object that implements - * PyTorchBackendInterface. Nullptr if it can't find anything - * with the given name. + * @retval Pointer to the appropriate object that implements BackendInterface. + * Nullptr if it can't find anything with the given name. */ -PyTorchBackendInterface* get_backend_class(const char* name); +BackendInterface* get_backend_class(const char* name); /** * A named instance of a backend. @@ -128,12 +127,12 @@ struct Backend { /// The name of the backend. Must match the string used in the PTE file. const char* name; /// The instance of the backend to use when loading and executing programs. - PyTorchBackendInterface* backend; + BackendInterface* backend; }; /** - * Registers the Backend object (i.e. string name and PyTorchBackendInterface - * pair) so that it could be called via the name during the runtime. + * Registers the Backend object (i.e. string name and BackendInterface pair) so + * that it could be called via the name during the runtime. * * @param[in] backend Backend object * @retval Error code representing whether registration was successful. 
@@ -151,8 +150,8 @@ using ::executorch::runtime::Backend; using ::executorch::runtime::CompileSpec; using ::executorch::runtime::DelegateHandle; using ::executorch::runtime::get_backend_class; -using ::executorch::runtime::PyTorchBackendInterface; using ::executorch::runtime::register_backend; using ::executorch::runtime::SizedBuffer; +using PyTorchBackendInterface = ::executorch::runtime::BackendInterface; } // namespace executor } // namespace torch diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 717f6fb7f72..d39ba875531 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -58,7 +58,7 @@ class BackendDelegate final { ET_CHECK_OR_RETURN_ERROR( delegate.id() != nullptr, InvalidProgram, "Missing backend id"); const char* backend_id = delegate.id()->c_str(); - PyTorchBackendInterface* backend = get_backend_class(backend_id); + BackendInterface* backend = get_backend_class(backend_id); ET_CHECK_OR_RETURN_ERROR( backend != nullptr, NotFound, @@ -198,7 +198,7 @@ class BackendDelegate final { } FreeableBuffer segment_; - const PyTorchBackendInterface* backend_; + const BackendInterface* backend_; DelegateHandle* handle_; }; diff --git a/runtime/executor/test/backend_integration_test.cpp b/runtime/executor/test/backend_integration_test.cpp index e3902bb9bc4..9180d77aa35 100644 --- a/runtime/executor/test/backend_integration_test.cpp +++ b/runtime/executor/test/backend_integration_test.cpp @@ -31,6 +31,7 @@ using namespace ::testing; using exec_aten::ArrayRef; using executorch::runtime::BackendExecutionContext; using executorch::runtime::BackendInitContext; +using executorch::runtime::BackendInterface; using executorch::runtime::CompileSpec; using executorch::runtime::DataLoader; using executorch::runtime::DelegateHandle; @@ -40,7 +41,6 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; using executorch::runtime::Method; using executorch::runtime::Program; -using executorch::runtime::PyTorchBackendInterface; using executorch::runtime::Result; using executorch::runtime::testing::ManagedMemoryManager; using torch::executor::util::FileDataLoader; @@ -48,9 +48,9 @@ using torch::executor::util::FileDataLoader; /** * A backend class whose methods can be overridden individually. */ -class StubBackend final : public PyTorchBackendInterface { +class StubBackend final : public BackendInterface { public: - // Function signature types that match the PyTorchBackendInterface methods. + // Function signature types that match the BackendInterface methods. 
using IsAvailableFn = std::function; using InitFn = std::function( FreeableBuffer*, @@ -325,7 +325,7 @@ class BackendIntegrationTest : public ::testing::TestWithParam { }; TEST_P(BackendIntegrationTest, BackendIsPresent) { - PyTorchBackendInterface* backend = + BackendInterface* backend = executorch::runtime::get_backend_class(StubBackend::kName); ASSERT_EQ(backend, &StubBackend::singleton()); } diff --git a/runtime/executor/test/test_backend_compiler_lib.cpp b/runtime/executor/test/test_backend_compiler_lib.cpp index 20028b2dc5a..7bfd7689a47 100644 --- a/runtime/executor/test/test_backend_compiler_lib.cpp +++ b/runtime/executor/test/test_backend_compiler_lib.cpp @@ -17,13 +17,13 @@ using executorch::runtime::ArrayRef; using executorch::runtime::Backend; using executorch::runtime::BackendExecutionContext; using executorch::runtime::BackendInitContext; +using executorch::runtime::BackendInterface; using executorch::runtime::CompileSpec; using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; -using executorch::runtime::PyTorchBackendInterface; using executorch::runtime::Result; struct DemoOp { @@ -38,7 +38,7 @@ struct DemoOpList { size_t numops; }; -class BackendWithCompiler final : public PyTorchBackendInterface { +class BackendWithCompiler final : public BackendInterface { int max_shape = 4; public: diff --git a/runtime/executor/test/test_backend_with_delegate_mapping.cpp b/runtime/executor/test/test_backend_with_delegate_mapping.cpp index ba580c98d70..ead99c1305a 100644 --- a/runtime/executor/test/test_backend_with_delegate_mapping.cpp +++ b/runtime/executor/test/test_backend_with_delegate_mapping.cpp @@ -18,13 +18,13 @@ using executorch::runtime::ArrayRef; using executorch::runtime::Backend; using executorch::runtime::BackendExecutionContext; using executorch::runtime::BackendInitContext; +using executorch::runtime::BackendInterface; using executorch::runtime::CompileSpec; using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; -using executorch::runtime::PyTorchBackendInterface; using executorch::runtime::Result; struct DemoOp { @@ -37,7 +37,7 @@ struct DemoOpList { size_t numops; }; -class BackendWithDelegateMapping final : public PyTorchBackendInterface { +class BackendWithDelegateMapping final : public BackendInterface { public: ~BackendWithDelegateMapping() override = default; From d519b4d3a1ffdc81b45e2b1d4733423ce0577813 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 3 Sep 2024 17:12:54 -0700 Subject: [PATCH 158/531] [executorch] Add logs for helping debug address space overflow issue Differential Revision: D62142486 Pull Request resolved: https://github.com/pytorch/executorch/pull/5035 --- exir/emit/_emitter.py | 17 +++++++++++++++-- exir/passes/sym_shape_eval_pass.py | 4 ++-- exir/tensor.py | 8 +++++++- exir/tests/test_tensor.py | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index 2d2cc0f3f18..dea9cf6fd6a 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -79,6 +79,7 @@ TensorShapeDynamism, ) from executorch.exir.tensor import ( + AddressSpaceOverflowException, layout_enum, make_allocation_info, make_tensor_value, @@ -349,7 +350,20 @@ def _tensor_spec_to_evalue(self, spec: TensorSpec) -> EValue: self.node, 
f"Non-const tensor should be an activation tensor: mem_offset {spec.mem_offset}", ) - allocation_info = make_allocation_info(spec.mem_id, spec.mem_offset) + try: + allocation_info = make_allocation_info(spec.mem_id, spec.mem_offset) + except AddressSpaceOverflowException as e: + raise InternalError( + self._emit_node_specific_error( + self.node, + ( + f"{e}\nHint: If you are using a memory pass based on dynamic shape bounds, " + f"such as ConstraintBasedSymShapeEvalPass, this may be the cause of an " + f"unbacked SymInt with its upper bound lazily set to 2^64-1 (uint64 max) " + "during torch.export()." + ), + ) + ) if spec.const: # Tensor with a blob we need to serialize. May not actually be constant at runtime @@ -1527,7 +1541,6 @@ def placeholder( is_user_input = True if isinstance(target, str) and isinstance(spec, TensorSpec): - fqn, is_mutable_buffer = self._find_fqn_for_placeholder(target, spec) # From the fqn find the corresponding tensor diff --git a/exir/passes/sym_shape_eval_pass.py b/exir/passes/sym_shape_eval_pass.py index ec61d4b3a6f..4ba554c6a17 100644 --- a/exir/passes/sym_shape_eval_pass.py +++ b/exir/passes/sym_shape_eval_pass.py @@ -196,7 +196,7 @@ class HintBasedSymShapeEvalPass(PassBase): Warning: if you're using torch.export with constrain API, this method doesn't respect the input constraints. - Not inherit from ExportPass since we simply need a way to iterate thru + Not inherited from ExportPass since we simply need a way to iterate thru every node's output. PassBase is easier for that purpose. """ @@ -260,7 +260,7 @@ class ConstraintBasedSymShapeEvalPass(PassBase): formula. We should convert those symbolic formula to concrete value for static/upperbound tensors so we can properly do memory planning for them. - Not inherit from ExportPass since we simply need a way to iterate thru + Not inherited from ExportPass since we simply need a way to iterate through every node's output. PassBase is easier for that purpose. """ diff --git a/exir/tensor.py b/exir/tensor.py index 7380a96ebc7..d63ed5d2627 100644 --- a/exir/tensor.py +++ b/exir/tensor.py @@ -22,6 +22,10 @@ from executorch.exir.sym_util import eval_shape +class AddressSpaceOverflowException(Exception): + pass + + def num_bytes_from_shape_and_dtype(shape: torch.Size, dtype: torch.dtype) -> int: """ Assume the tensor is a contiguous one. @@ -297,7 +301,9 @@ def make_allocation_info(mem_id: int, mem_offset: int) -> schema.AllocationDetai memory_offset_low = mem_offset & ((1 << 32) - 1) memory_offset_high = mem_offset >> 32 if memory_offset_high >= 1 << 32: - raise ValueError(f"mem_offset {mem_offset} does not fit in 64 bits") + raise AddressSpaceOverflowException( + f"mem_offset {mem_offset} does not fit in 64 bits" + ) allocation_info = schema.AllocationDetails( memory_id=mem_id, diff --git a/exir/tests/test_tensor.py b/exir/tests/test_tensor.py index a5d197a85b7..c5383b0dac2 100644 --- a/exir/tests/test_tensor.py +++ b/exir/tests/test_tensor.py @@ -171,7 +171,7 @@ def test_allocation_info_fails(self) -> None: ) for test_case in test_cases: kwargs = test_case[0] - with self.assertRaisesRegex(ValueError, test_case[1], msg=f"{kwargs}"): + with self.assertRaisesRegex(Exception, test_case[1], msg=f"{kwargs}"): make_allocation_info(**kwargs) def test_contiguous_stride_from_shape(self) -> None: From 3c4e26f5782b1176bef14c8e5f62a60a49bb47de Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 17:44:51 -0700 Subject: [PATCH 159/531] TensorImpl managed by a smart pointer. 
Differential Revision: D61959528 Pull Request resolved: https://github.com/pytorch/executorch/pull/4986 --- extension/tensor/TARGETS | 8 + extension/tensor/targets.bzl | 31 +++ extension/tensor/tensor_impl_ptr.cpp | 120 ++++++++++++ extension/tensor/tensor_impl_ptr.h | 109 +++++++++++ extension/tensor/test/TARGETS | 8 + extension/tensor/test/targets.bzl | 22 +++ .../tensor/test/tensor_impl_ptr_test.cpp | 183 ++++++++++++++++++ 7 files changed, 481 insertions(+) create mode 100644 extension/tensor/TARGETS create mode 100644 extension/tensor/targets.bzl create mode 100644 extension/tensor/tensor_impl_ptr.cpp create mode 100644 extension/tensor/tensor_impl_ptr.h create mode 100644 extension/tensor/test/TARGETS create mode 100644 extension/tensor/test/targets.bzl create mode 100644 extension/tensor/test/tensor_impl_ptr_test.cpp diff --git a/extension/tensor/TARGETS b/extension/tensor/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/extension/tensor/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl new file mode 100644 index 00000000000..133e29fd68b --- /dev/null +++ b/extension/tensor/targets.bzl @@ -0,0 +1,31 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + for aten_mode in (True, False): + aten_suffix = ("_aten" if aten_mode else "") + + runtime.cxx_library( + name = "tensor" + aten_suffix, + srcs = [ + "tensor_impl_ptr.cpp", + ], + exported_headers = [ + "tensor_impl_ptr.h", + ], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, + ], + exported_deps = [ + "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + ], + ) diff --git a/extension/tensor/tensor_impl_ptr.cpp b/extension/tensor/tensor_impl_ptr.cpp new file mode 100644 index 00000000000..aa5f78e7f8d --- /dev/null +++ b/extension/tensor/tensor_impl_ptr.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include + +namespace executorch { +namespace extension { +namespace { +#ifndef USE_ATEN_LIB +// No-op deleter that does nothing when called. +static void noop_deleter(void*) {} + +/** + * Custom deleter for TensorImplPtr that ensures the memory associated with + * dynamic metadata (sizes, dim_order, and strides) is properly managed when the + * TensorImpl is destroyed. + * + * Since TensorImpl does not own the metadata arrays (sizes, dim_order, + * strides), this deleter is responsible for releasing that memory when the + * TensorImpl is destroyed. 
+ */ +struct TensorImplPtrDeleter final { + std::unique_ptr> data; + std::vector sizes; + std::vector dim_order; + std::vector strides; + + void operator()(exec_aten::TensorImpl* pointer) { + // Release all resources immediately since the data held by the + // TensorImplDeleter is tied to the managed object, not the smart pointer + // itself. We need to free this memory when the object is destroyed, not + // when the smart pointer (and deleter) are eventually destroyed or reset. + data.reset(); + sizes = {}; + dim_order = {}; + strides = {}; + delete pointer; + } +}; +#endif // USE_ATEN_LIB +} // namespace + +TensorImplPtr make_tensor_impl_ptr( + exec_aten::ScalarType type, + std::vector sizes, + void* data, + std::vector dim_order, + std::vector strides, + exec_aten::TensorShapeDynamism dynamism, + std::function deleter) { + const auto dim = sizes.size(); + if (dim_order.empty()) { + dim_order.resize(dim); + std::iota(dim_order.begin(), dim_order.end(), 0); + if (!strides.empty()) { + std::sort(dim_order.begin(), dim_order.end(), [&](size_t a, size_t b) { + return strides[a] > strides[b]; + }); + } + } + std::vector computed_strides(dim); + auto error = runtime::dim_order_to_stride( + sizes.data(), dim_order.data(), dim, computed_strides.data()); + ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides."); + + if (!strides.empty()) { + ET_CHECK_MSG(computed_strides == strides, "Invalid strides provided."); + } else { + strides = std::move(computed_strides); + } +#ifndef USE_ATEN_LIB + auto tensor_impl = std::make_unique( + type, + dim, + sizes.data(), + data, + dim_order.data(), + strides.data(), + dynamism); + return TensorImplPtr( + tensor_impl.release(), + TensorImplPtrDeleter{ + std::unique_ptr>( + data, std::move(deleter) ?: noop_deleter), + std::move(sizes), + std::move(dim_order), + std::move(strides)}); +#else + auto options = c10::TensorOptions() + .dtype(c10::scalarTypeToTypeMeta(type)) + .device(c10::kCPU); + auto storage = c10::Storage( + c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + sizes, strides, options.dtype().itemsize()), + c10::InefficientStdFunctionContext::makeDataPtr( + data, std::move(deleter), options.device()), + nullptr, + false); + auto tensor_impl = c10::make_intrusive( + std::move(storage), + c10::DispatchKeySet(c10::DispatchKey::CPU), + options.dtype()); + tensor_impl->set_sizes_and_strides(sizes, strides); + return tensor_impl; +#endif // USE_ATEN_LIB +} + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h new file mode 100644 index 00000000000..e8180d93e72 --- /dev/null +++ b/extension/tensor/tensor_impl_ptr.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace executorch { +namespace extension { + +#ifndef USE_ATEN_LIB +/** + * A smart pointer type for managing the lifecycle of a TensorImpl. + * + * TensorImplPtr uses a shared pointer because multiple Tensor objects might + * share the same underlying data and metadata. This shared ownership model + * ensures that the TensorImpl is only destroyed when all references to it are + * gone, providing a safe and efficient way to manage shared tensor + * implementations. 
This abstraction is designed to be a safer and more + * convenient alternative to the original TensorImpl, which does not + * manage metadata by design. + */ +using TensorImplPtr = std::shared_ptr; +#else +/** + * A smart pointer type for managing the lifecycle of a TensorImpl. + * + * TensorImplPtr uses an intrusive pointer when working with ATen, ensuring + * efficient reference counting and shared ownership of the underlying data and + * metadata. + */ +using TensorImplPtr = + c10::intrusive_ptr; +#endif // USE_ATEN_LIB + +/** + * Creates a TensorImplPtr that manages a newly created TensorImpl with the + * specified properties. + * + * @param type The scalar type of the tensor elements. + * @param sizes A vector specifying the size of each dimension. + * @param data A pointer to the data buffer. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of each dimension. + * @param dynamism Specifies the mutability of the tensor's shape. + * @param deleter A custom deleter function for managing the lifetime of the + * data buffer. If provided, this deleter will be called when the managed + * TensorImpl object is destroyed. + * @return A TensorImplPtr managing the newly created TensorImpl. + */ +TensorImplPtr make_tensor_impl_ptr( + exec_aten::ScalarType type, + std::vector sizes, + void* data, + std::vector dim_order = {}, + std::vector strides = {}, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC, + std::function deleter = nullptr); + +/** + * Creates a TensorImplPtr that manages a newly created TensorImpl with the + * specified properties. + * + * This template overload is specialized for cases where the tensor data is + * provided as a vector of a specific scalar type, rather than a raw pointer. + * The deleter ensures that the data vector is properly managed and its + * lifetime is tied to the TensorImpl. + * + * @tparam T The scalar type of the tensor elements. + * @param sizes A vector specifying the size of each dimension. + * @param data A vector containing the tensor's data. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of each dimension. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorImplPtr managing the newly created TensorImpl. + */ +template +TensorImplPtr make_tensor_impl_ptr( + std::vector sizes, + std::vector::type> data, + std::vector dim_order = {}, + std::vector strides = {}, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + const auto data_ptr = data.data(); + return make_tensor_impl_ptr( + T, + std::move(sizes), + data_ptr, + std::move(dim_order), + std::move(strides), + dynamism, + [data = std::move(data)](void*) {}); +} + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/test/TARGETS b/extension/tensor/test/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/extension/tensor/test/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
+ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl new file mode 100644 index 00000000000..04e231e0cdf --- /dev/null +++ b/extension/tensor/test/targets.bzl @@ -0,0 +1,22 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + for aten_mode in (True, False): + aten_suffix = ("_aten" if aten_mode else "") + + runtime.cxx_test( + name = "test" + aten_suffix, + srcs = [ + "tensor_impl_ptr_test.cpp", + ], + deps = [ + "//executorch/extension/tensor:tensor" + aten_suffix, + "//executorch/runtime/core/exec_aten/testing_util:tensor_util" + aten_suffix, + ], + ) diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp new file mode 100644 index 00000000000..a95f807a736 --- /dev/null +++ b/extension/tensor/test/tensor_impl_ptr_test.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +using namespace executorch::extension; +using namespace executorch::runtime; + +class TensorImplPtrTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + runtime_init(); + } +}; + +TEST_F(TensorImplPtrTest, TensorImplCreation) { + float data[20] = {2}; + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {4, 5}, data, {0, 1}, {5, 1}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 4); + EXPECT_EQ(tensor_impl->size(1), 5); + EXPECT_EQ(tensor_impl->strides()[0], 5); + EXPECT_EQ(tensor_impl->strides()[1], 1); + EXPECT_EQ(tensor_impl->data(), data); + EXPECT_EQ(tensor_impl->mutable_data(), data); + EXPECT_EQ(((float*)tensor_impl->mutable_data())[0], 2); +} + +TEST_F(TensorImplPtrTest, TensorImplSharedOwnership) { + float data[20] = {2}; + auto tensor_impl1 = + make_tensor_impl_ptr(exec_aten::ScalarType::Float, {4, 5}, data); + auto tensor_impl2 = tensor_impl1; + + EXPECT_EQ(tensor_impl1.get(), tensor_impl2.get()); + EXPECT_EQ(tensor_impl1.use_count(), tensor_impl2.use_count()); + + tensor_impl1.reset(); + EXPECT_EQ(tensor_impl2.use_count(), 1); + EXPECT_NE(tensor_impl2.get(), nullptr); +} + +TEST_F(TensorImplPtrTest, TensorImplInferredDimOrderAndStrides) { + float data[12] = {0}; + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {3, 4}, data, {}, {4, 1}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 3); + EXPECT_EQ(tensor_impl->size(1), 4); + EXPECT_EQ(tensor_impl->strides()[0], 4); + EXPECT_EQ(tensor_impl->strides()[1], 1); + EXPECT_EQ(tensor_impl->data(), data); +} + +TEST_F(TensorImplPtrTest, TensorImplInferredDimOrderCustomStrides) { + float data[12] = {0}; + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {3, 4}, data, {}, {1, 3}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 3); + EXPECT_EQ(tensor_impl->size(1), 4); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(tensor_impl->strides()[1], 3); +} + +TEST_F(TensorImplPtrTest, TensorImplDefaultDimOrderAndStrides) { 
+ float data[24] = {0}; + auto tensor_impl = + make_tensor_impl_ptr(exec_aten::ScalarType::Float, {2, 3, 4}, data); + + EXPECT_EQ(tensor_impl->dim(), 3); + EXPECT_EQ(tensor_impl->size(0), 2); + EXPECT_EQ(tensor_impl->size(1), 3); + EXPECT_EQ(tensor_impl->size(2), 4); + EXPECT_EQ(tensor_impl->strides()[0], 12); + EXPECT_EQ(tensor_impl->strides()[1], 4); + EXPECT_EQ(tensor_impl->strides()[2], 1); +} + +TEST_F(TensorImplPtrTest, TensorImplMismatchStridesAndDimOrder) { + float data[12] = {0}; + ET_EXPECT_DEATH( + { + auto _ = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {3, 4}, data, {1, 0}, {1, 4}); + }, + ""); +} + +TEST_F(TensorImplPtrTest, TensorImplCustomDimOrderAndStrides) { + float data[12] = {0}; + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {3, 4}, data, {1, 0}, {1, 3}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 3); + EXPECT_EQ(tensor_impl->size(1), 4); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(tensor_impl->strides()[1], 3); +} + +TEST_F(TensorImplPtrTest, TensorImplInvalidDimOrder) { + ET_EXPECT_DEATH( + { + float data[20] = {2}; + auto _ = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {4, 5}, data, {2, 1}); + }, + ""); +} + +TEST_F(TensorImplPtrTest, TensorImplCustomDeleter) { + float data[20] = {4}; + auto tensor_impl = + make_tensor_impl_ptr(exec_aten::ScalarType::Float, {4, 5}, data); + + TensorImplPtr copied_tensor_impl = tensor_impl; + EXPECT_EQ(tensor_impl.use_count(), copied_tensor_impl.use_count()); + + tensor_impl.reset(); + EXPECT_EQ(copied_tensor_impl.use_count(), 1); +} + +TEST_F(TensorImplPtrTest, TensorImplDataDeleterReleasesCapturedSharedPtr) { + auto deleter_called = false; + std::shared_ptr data_ptr( + new float[10], [](float* ptr) { delete[] ptr; }); + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, + {4, 5}, + data_ptr.get(), + {}, + {}, + exec_aten::TensorShapeDynamism::STATIC, + [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); + + EXPECT_EQ(data_ptr.use_count(), 2); + + tensor_impl.reset(); + EXPECT_TRUE(deleter_called); + EXPECT_EQ(data_ptr.use_count(), 1); +} + +TEST_F(TensorImplPtrTest, TensorImplOwningData) { + auto tensor_impl = make_tensor_impl_ptr( + {2, 5}, + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, + {1, 0}, + {1, 2}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 2); + EXPECT_EQ(tensor_impl->size(1), 5); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(tensor_impl->strides()[1], 2); + EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); + EXPECT_EQ(((float*)tensor_impl->data())[9], 10.0f); +} + +TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) { + auto tensor_impl = make_tensor_impl_ptr({0, 5}, {}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 0); + EXPECT_EQ(tensor_impl->size(1), 5); + EXPECT_EQ(tensor_impl->strides()[0], 5); + EXPECT_EQ(tensor_impl->strides()[1], 1); + EXPECT_EQ(tensor_impl->data(), nullptr); +} From d96bcca3f570b497f9ab7aa75ad71d8707be1686 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 17:59:10 -0700 Subject: [PATCH 160/531] Tensor managed by a smart pointer. 
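Introduces TensorPtr, a smart pointer that uniquely owns a Tensor and keeps the
underlying TensorImplPtr (and its sizes/dim_order/strides metadata) alive for
the lifetime of that Tensor, together with make_tensor_ptr() factories and
resize_tensor_ptr(). A minimal usage sketch, mirroring the new tests added
below; the include path, buffer, and shapes are illustrative only, and the
tensor does not take ownership of the wrapped data:

    #include <executorch/extension/tensor/tensor_ptr.h>

    using namespace ::executorch::extension;

    float data[20] = {};
    // Wrap an existing buffer in a tensor with a resizable (dynamic) shape.
    auto tensor = make_tensor_ptr(
        exec_aten::ScalarType::Float,
        {4, 5},
        data,
        {},  // dim_order is inferred
        {},  // strides are inferred
        exec_aten::TensorShapeDynamism::DYNAMIC_UNBOUND);
    // Another TensorPtr sharing the same TensorImpl (not a data copy).
    auto alias = make_tensor_ptr(tensor);
    // Reshape in place; returns Error::Ok on success for dynamic shapes.
    auto error = resize_tensor_ptr(tensor, {5, 4});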
Differential Revision: D61959529 Pull Request resolved: https://github.com/pytorch/executorch/pull/5055 --- extension/tensor/targets.bzl | 3 + extension/tensor/tensor_ptr.cpp | 25 +++ extension/tensor/tensor_ptr.h | 185 ++++++++++++++++++++++ extension/tensor/test/targets.bzl | 1 + extension/tensor/test/tensor_ptr_test.cpp | 178 +++++++++++++++++++++ 5 files changed, 392 insertions(+) create mode 100644 extension/tensor/tensor_ptr.cpp create mode 100644 extension/tensor/tensor_ptr.h create mode 100644 extension/tensor/test/tensor_ptr_test.cpp diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index 133e29fd68b..e0d7d95627f 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -14,9 +14,11 @@ def define_common_targets(): name = "tensor" + aten_suffix, srcs = [ "tensor_impl_ptr.cpp", + "tensor_ptr.cpp", ], exported_headers = [ "tensor_impl_ptr.h", + "tensor_ptr.h", ], visibility = [ "@EXECUTORCH_CLIENTS", @@ -24,6 +26,7 @@ def define_common_targets(): deps = [ "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, ], exported_deps = [ "//executorch/runtime/core/exec_aten:lib" + aten_suffix, diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp new file mode 100644 index 00000000000..7a0aa997f02 --- /dev/null +++ b/extension/tensor/tensor_ptr.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +namespace executorch { +namespace extension { + +runtime::Error resize_tensor_ptr( + TensorPtr& tensor, + const std::vector& sizes) { + return runtime::resize_tensor( + *tensor, + exec_aten::ArrayRef(sizes.data(), sizes.size())); +} + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h new file mode 100644 index 00000000000..4d20f46be71 --- /dev/null +++ b/extension/tensor/tensor_ptr.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace executorch { +namespace extension { + +#ifndef USE_ATEN_LIB +namespace internal { +/** + * Custom deleter for TensorPtr that ensures the associated TensorImplPtr + * is properly managed. + * + * Since Tensor does not own its TensorImpl, this deleter is responsible for + * managing the lifecycle of the TensorImplPtr, ensuring that the dynamic + * metadata (sizes, dim_order, strides) is properly released when the Tensor is + * destroyed. + */ +struct TensorPtrDeleter final { + TensorImplPtr tensor_impl; + + void operator()(exec_aten::Tensor* pointer) { + // Release all resources immediately since the data held by the + // TensorPtrDeleter is tied to the managed object, not the smart pointer + // itself. We need to free this memory when the object is destroyed, not + // when the smart pointer (and deleter) are eventually destroyed or reset. + tensor_impl.reset(); + delete pointer; + } +}; +} // namespace internal + +/** + * A smart pointer type for managing the lifecycle of a Tensor. 
+ * + * TensorPtr uses a unique pointer to enforce that each Tensor object has + * distinct ownership. This abstraction serves as a more convenient and safer + * replacement for the standard Tensor, which does not manage its + * metadata by design. Using TensorPtr simplifies memory management and ensures + * that the underlying TensorImpl is safely shared among tensors when needed. + */ +using TensorPtr = + std::unique_ptr; +#else +/** + * A smart pointer type for managing the lifecycle of a Tensor. + * + * When using ATen, this is a standard unique_ptr for exec_aten::Tensor. + * In ATen, the Tensor class owns its TensorImpl and associated metadata, + * so no custom deleter is required. + */ +using TensorPtr = std::unique_ptr; +#endif // USE_ATEN_LIB + +/** + * Creates a new TensorPtr that manages a newly created Tensor with the given + * TensorImplPtr. + * + * This function wraps the provided TensorImplPtr in a TensorPtr, ensuring that + * the Tensor object's lifecycle is properly managed. The TensorPtr will + * uniquely own the Tensor object, while the underlying TensorImplPtr may be + * shared with other Tensors. + * + * @param tensor_impl A TensorImplPtr to the TensorImpl to be managed. + * @return A TensorPtr that manages the newly created Tensor. + */ +inline TensorPtr make_tensor_ptr(TensorImplPtr tensor_impl) { +#ifndef USE_ATEN_LIB + auto tensor = std::make_unique(tensor_impl.get()); + return TensorPtr( + tensor.release(), internal::TensorPtrDeleter{std::move(tensor_impl)}); +#else + return std::make_unique(std::move(tensor_impl)); +#endif // USE_ATEN_LIB +} + +/** + * Creates a new TensorPtr that shares the same TensorImplPtr as an existing + * TensorPtr. + * + * This function creates a new TensorPtr that shares the + * underlying TensorImpl with the provided TensorPtr, ensuring that the + * underlying data and metadata are not duplicated but safely shared between the + * tensor objects. + * + * @param tensor A TensorPtr to the existing Tensor from which to create a copy. + * @return A new TensorPtr that shares the underlying TensorImplPtr with the + * original. + */ +inline TensorPtr make_tensor_ptr(const TensorPtr& tensor) { +#ifndef USE_ATEN_LIB + return make_tensor_ptr(tensor.get_deleter().tensor_impl); +#else + return make_tensor_ptr(tensor->getIntrusivePtr()); +#endif // USE_ATEN_LIB +} + +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * @param type The scalar type of the tensor elements. + * @param sizes A vector specifying the size of each dimension. + * @param data A pointer to the data buffer. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of the tensor. + * @param dynamism Specifies the mutability of the tensor's shape. + * @param deleter A custom deleter function for managing the lifetime of the + * data buffer. If provided, this deleter will be called when the managed Tensor + * object is destroyed. + * @return A TensorPtr that manages the newly created Tensor. 
+ */ +inline TensorPtr make_tensor_ptr( + const exec_aten::ScalarType type, + std::vector sizes, + void* data, + std::vector dim_order = {}, + std::vector strides = {}, + const exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC, + std::function deleter = nullptr) { + return make_tensor_ptr(make_tensor_impl_ptr( + type, + std::move(sizes), + data, + std::move(dim_order), + std::move(strides), + dynamism, + std::move(deleter))); +} + +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload is specialized for cases where the tensor data is + * provided as a vector of a specific scalar type, rather than a raw pointer. + * The deleter ensures that the data vector is properly managed and its + * lifetime is tied to the TensorImpl. + * + * @tparam T The scalar type of the tensor elements. + * @param sizes A vector specifying the size of each dimension. + * @param data A vector containing the tensor's data. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of each dimension. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorImplPtr managing the newly created TensorImpl. + */ +template +TensorImplPtr make_tensor_ptr( + std::vector sizes, + std::vector::type> data, + std::vector dim_order = {}, + std::vector strides = {}, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + return make_tensor_impl_ptr( + std::move(sizes), + std::move(data), + std::move(dim_order), + std::move(strides), + dynamism); +} + +/** + * Resizes the Tensor managed by the given TensorPtr to the new sizes provided. + * + * @param tensor A TensorPtr managing the Tensor to resize. + * @param sizes A vector representing the new sizes for each dimension. + * @return Error::Ok on success, or an appropriate error code otherwise. + */ +ET_NODISCARD +runtime::Error resize_tensor_ptr( + TensorPtr& tensor, + const std::vector& sizes); + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl index 04e231e0cdf..95ef27fcb62 100644 --- a/extension/tensor/test/targets.bzl +++ b/extension/tensor/test/targets.bzl @@ -14,6 +14,7 @@ def define_common_targets(): name = "test" + aten_suffix, srcs = [ "tensor_impl_ptr_test.cpp", + "tensor_ptr_test.cpp", ], deps = [ "//executorch/extension/tensor:tensor" + aten_suffix, diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp new file mode 100644 index 00000000000..0b58f10eeba --- /dev/null +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +class TensorPtrTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + runtime_init(); + } +}; + +TEST_F(TensorPtrTest, CreateTensorWithStridesAndDimOrder) { + float data[20] = {2}; + auto tensor = make_tensor_ptr( + exec_aten::ScalarType::Float, {4, 5}, data, {0, 1}, {5, 1}); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->strides()[0], 5); + EXPECT_EQ(tensor->strides()[1], 1); + EXPECT_EQ(tensor->const_data_ptr(), data); + EXPECT_EQ(tensor->const_data_ptr()[0], 2); +} + +TEST_F(TensorPtrTest, TensorSharingImpl) { + float data[20] = {2}; + auto tensor1 = make_tensor_ptr(exec_aten::ScalarType::Float, {4, 5}, data); + auto tensor2 = make_tensor_ptr(tensor1); + EXPECT_EQ(tensor1->unsafeGetTensorImpl(), tensor2->unsafeGetTensorImpl()); +} + +TEST_F(TensorPtrTest, TensorImplLifetime) { + TensorPtr tensor; + EXPECT_EQ(tensor, nullptr); + { + float data[20] = {2}; + auto tensor_impl = + make_tensor_impl_ptr(exec_aten::ScalarType::Float, {4, 5}, data); + tensor = make_tensor_ptr(tensor_impl); + } + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); +} + +TEST_F(TensorPtrTest, TensorWithZeroDimensionAndElements) { + float data[20] = {2}; + auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {}, data); + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {0, 5}, data); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->numel(), 0); +} + +TEST_F(TensorPtrTest, TensorResize) { + float data[20] = {2}; + auto tensor = make_tensor_ptr( + exec_aten::ScalarType::Float, + {4, 5}, + data, + {}, + {}, + exec_aten::TensorShapeDynamism::DYNAMIC_UNBOUND); + EXPECT_EQ(resize_tensor_ptr(tensor, {5, 4}), Error::Ok); + EXPECT_EQ(tensor->size(0), 5); + EXPECT_EQ(tensor->size(1), 4); +} + +TEST_F(TensorPtrTest, TensorDataAccess) { + float data[6] = {1, 2, 3, 4, 5, 6}; + auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {2, 3}, data); + EXPECT_EQ(tensor->const_data_ptr()[0], 1); + EXPECT_EQ(tensor->const_data_ptr()[5], 6); + tensor->mutable_data_ptr()[0] = 10; + EXPECT_EQ(tensor->const_data_ptr()[0], 10); +} + +TEST_F(TensorPtrTest, TensorWithCustomDataDeleter) { + auto deleter_called = false; + float* data = new float[20](); + auto tensor = make_tensor_ptr( + exec_aten::ScalarType::Float, + {4, 5}, + data, + {}, + {}, + exec_aten::TensorShapeDynamism::STATIC, + [&deleter_called](void* ptr) { + deleter_called = true; + delete[] static_cast(ptr); + }); + + tensor.reset(); + EXPECT_TRUE(deleter_called); +} + +TEST_F(TensorPtrTest, TensorManagesMovedVector) { + auto deleter_called = false; + std::vector data(20, 3.0f); + auto* data_ptr = data.data(); + auto tensor = make_tensor_ptr( + exec_aten::ScalarType::Float, + {4, 5}, + data_ptr, + {}, + {}, + exec_aten::TensorShapeDynamism::STATIC, + [moved_data = std::move(data), &deleter_called](void*) mutable { + deleter_called = true; + }); + + EXPECT_TRUE(data.empty()); // NOLINT(bugprone-use-after-move) + EXPECT_EQ(tensor->data_ptr(), data_ptr); + + tensor.reset(); + EXPECT_TRUE(deleter_called); +} + +TEST_F(TensorPtrTest, TensorDeleterReleasesCapturedSharedPtr) { + auto deleter_called = false; + std::shared_ptr data_ptr( + new float[10], [](float* ptr) { delete[] ptr; }); + auto tensor = make_tensor_ptr( + 
exec_aten::ScalarType::Float, + {4, 5}, + data_ptr.get(), + {}, + {}, + exec_aten::TensorShapeDynamism::STATIC, + [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); + + EXPECT_EQ(data_ptr.use_count(), 2); + + tensor.reset(); + EXPECT_TRUE(deleter_called); + EXPECT_EQ(data_ptr.use_count(), 1); +} + +TEST_F(TensorPtrTest, TensorOwningData) { + auto tensor_impl = make_tensor_ptr( + {2, 5}, + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, + {1, 0}, + {1, 2}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 2); + EXPECT_EQ(tensor_impl->size(1), 5); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(tensor_impl->strides()[1], 2); + EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); + EXPECT_EQ(((float*)tensor_impl->data())[9], 10.0f); +} + +TEST_F(TensorPtrTest, TensorOwningEmptyData) { + auto tensor_impl = make_tensor_ptr({0, 5}, {}); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 0); + EXPECT_EQ(tensor_impl->size(1), 5); + EXPECT_EQ(tensor_impl->strides()[0], 5); + EXPECT_EQ(tensor_impl->strides()[1], 1); + EXPECT_EQ(tensor_impl->data(), nullptr); +} From d0708c0f8e99f6a8d908ce83d6c43b9c4f8656e7 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 3 Sep 2024 18:20:25 -0700 Subject: [PATCH 161/531] Fix linter for minibench Differential Revision: D62153327 Pull Request resolved: https://github.com/pytorch/executorch/pull/5049 --- extension/android/benchmark/app/.gitignore | 2 +- .../android/benchmark/app/build.gradle.kts | 65 ++++++++++--------- .../minibench/ExampleInstrumentedTest.java | 28 ++++---- .../pytorch/minibench/BenchmarkActivity.java | 49 +++++++------- .../pytorch/minibench/ExampleUnitTest.java | 20 ++++-- extension/android/benchmark/build.gradle.kts | 12 +++- .../android/benchmark/settings.gradle.kts | 30 ++++++--- 7 files changed, 118 insertions(+), 88 deletions(-) diff --git a/extension/android/benchmark/app/.gitignore b/extension/android/benchmark/app/.gitignore index 42afabfd2ab..796b96d1c40 100644 --- a/extension/android/benchmark/app/.gitignore +++ b/extension/android/benchmark/app/.gitignore @@ -1 +1 @@ -/build \ No newline at end of file +/build diff --git a/extension/android/benchmark/app/build.gradle.kts b/extension/android/benchmark/app/build.gradle.kts index b48404f8ff7..b716f2e8bd0 100644 --- a/extension/android/benchmark/app/build.gradle.kts +++ b/extension/android/benchmark/app/build.gradle.kts @@ -1,41 +1,44 @@ -plugins { - id("com.android.application") -} +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +plugins { id("com.android.application") } android { - namespace = "org.pytorch.minibench" - compileSdk = 34 + namespace = "org.pytorch.minibench" + compileSdk = 34 - defaultConfig { - applicationId = "org.pytorch.minibench" - minSdk = 28 - targetSdk = 33 - versionCode = 1 - versionName = "1.0" + defaultConfig { + applicationId = "org.pytorch.minibench" + minSdk = 28 + targetSdk = 33 + versionCode = 1 + versionName = "1.0" - testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" - } + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + } - buildTypes { - release { - isMinifyEnabled = false - proguardFiles( - getDefaultProguardFile("proguard-android-optimize.txt"), - "proguard-rules.pro" - ) - } - } - compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 + buildTypes { + release { + isMinifyEnabled = false + proguardFiles(getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro") } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } } dependencies { - implementation(files("libs/executorch.aar")) - implementation("com.facebook.soloader:soloader:0.10.5") - implementation("com.facebook.fbjni:fbjni:0.5.1") - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.2.1") - androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") + implementation(files("libs/executorch.aar")) + implementation("com.facebook.soloader:soloader:0.10.5") + implementation("com.facebook.fbjni:fbjni:0.5.1") + testImplementation("junit:junit:4.13.2") + androidTestImplementation("androidx.test.ext:junit:1.2.1") + androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") } diff --git a/extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java b/extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java index c5887aebccf..9de66835885 100644 --- a/extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java +++ b/extension/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/ExampleInstrumentedTest.java @@ -1,15 +1,21 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + package org.pytorch.minibench; -import android.content.Context; +import static org.junit.Assert.*; -import androidx.test.platform.app.InstrumentationRegistry; +import android.content.Context; import androidx.test.ext.junit.runners.AndroidJUnit4; - +import androidx.test.platform.app.InstrumentationRegistry; import org.junit.Test; import org.junit.runner.RunWith; -import static org.junit.Assert.*; - /** * Instrumented test, which will execute on an Android device. * @@ -17,10 +23,10 @@ */ @RunWith(AndroidJUnit4.class) public class ExampleInstrumentedTest { - @Test - public void useAppContext() { - // Context of the app under test. - Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); - assertEquals("org.pytorch.minibench", appContext.getPackageName()); - } + @Test + public void useAppContext() { + // Context of the app under test. 
+ Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); + assertEquals("org.pytorch.minibench", appContext.getPackageName()); + } } diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java index 17897d0d36e..e9599dd3518 100644 --- a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -11,36 +11,33 @@ import android.app.Activity; import android.content.Intent; import android.os.Bundle; - -import org.pytorch.executorch.Module; - import java.io.FileWriter; import java.io.IOException; +import org.pytorch.executorch.Module; public class BenchmarkActivity extends Activity { - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - Intent intent = getIntent(); - String modelPath = intent.getStringExtra("model_path"); - int numIter = intent.getIntExtra("num_iter", 10); - - // TODO: Format the string with a parsable format - StringBuilder resultText = new StringBuilder(); - - Module module = Module.load(modelPath); - for (int i = 0; i < numIter; i++) { - long start = System.currentTimeMillis(); - module.forward(); - long forwardMs = System.currentTimeMillis() - start; - resultText.append(forwardMs).append(";"); - } - - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { - writer.write(resultText.toString()); - } catch (IOException e) { - e.printStackTrace(); - } + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + Intent intent = getIntent(); + String modelPath = intent.getStringExtra("model_path"); + int numIter = intent.getIntExtra("num_iter", 10); + + // TODO: Format the string with a parsable format + StringBuilder resultText = new StringBuilder(); + + Module module = Module.load(modelPath); + for (int i = 0; i < numIter; i++) { + long start = System.currentTimeMillis(); + module.forward(); + long forwardMs = System.currentTimeMillis() - start; + resultText.append(forwardMs).append(";"); + } + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { + writer.write(resultText.toString()); + } catch (IOException e) { + e.printStackTrace(); } + } } diff --git a/extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java b/extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java index 134410482b8..c6a6a76a4d8 100644 --- a/extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java +++ b/extension/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java @@ -1,17 +1,25 @@ -package org.pytorch.minibench; +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ -import org.junit.Test; +package org.pytorch.minibench; import static org.junit.Assert.*; +import org.junit.Test; + /** * Example local unit test, which will execute on the development machine (host). 
* * @see Testing documentation */ public class ExampleUnitTest { - @Test - public void addition_isCorrect() { - assertEquals(4, 2 + 2); - } + @Test + public void addition_isCorrect() { + assertEquals(4, 2 + 2); + } } diff --git a/extension/android/benchmark/build.gradle.kts b/extension/android/benchmark/build.gradle.kts index cc9db8a5cc0..ac625be8e02 100644 --- a/extension/android/benchmark/build.gradle.kts +++ b/extension/android/benchmark/build.gradle.kts @@ -1,4 +1,10 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + // Top-level build file where you can add configuration options common to all sub-projects/modules. -plugins { - id("com.android.application") version "8.1.0" apply false -} +plugins { id("com.android.application") version "8.1.0" apply false } diff --git a/extension/android/benchmark/settings.gradle.kts b/extension/android/benchmark/settings.gradle.kts index f2f5ac42a2c..4afd7e2d388 100644 --- a/extension/android/benchmark/settings.gradle.kts +++ b/extension/android/benchmark/settings.gradle.kts @@ -1,17 +1,27 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + pluginManagement { - repositories { - google() - mavenCentral() - gradlePluginPortal() - } + repositories { + google() + mavenCentral() + gradlePluginPortal() + } } + dependencyResolutionManagement { - repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) - repositories { - google() - mavenCentral() - } + repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) + repositories { + google() + mavenCentral() + } } rootProject.name = "MiniBench" + include(":app") From d76134642e65b7956177273f53bee8bb3a3ed2fe Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 18:20:28 -0700 Subject: [PATCH 162/531] Convenience API for creating Tensor with a data blob. Differential Revision: D61959527 Pull Request resolved: https://github.com/pytorch/executorch/pull/5056 --- extension/tensor/targets.bzl | 2 + extension/tensor/tensor.h | 14 + extension/tensor/tensor_ptr_maker.h | 282 ++++++++++++++++++ extension/tensor/test/targets.bzl | 1 + .../tensor/test/tensor_ptr_maker_test.cpp | 180 +++++++++++ 5 files changed, 479 insertions(+) create mode 100644 extension/tensor/tensor.h create mode 100644 extension/tensor/tensor_ptr_maker.h create mode 100644 extension/tensor/test/tensor_ptr_maker_test.cpp diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index e0d7d95627f..d00136f8d5b 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -17,8 +17,10 @@ def define_common_targets(): "tensor_ptr.cpp", ], exported_headers = [ + "tensor.h", "tensor_impl_ptr.h", "tensor_ptr.h", + "tensor_ptr_maker.h", ], visibility = [ "@EXECUTORCH_CLIENTS", diff --git a/extension/tensor/tensor.h b/extension/tensor/tensor.h new file mode 100644 index 00000000000..0de8c39b75d --- /dev/null +++ b/extension/tensor/tensor.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// Umbrella header for the Tensor extension. 
+#include +#include +#include diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h new file mode 100644 index 00000000000..a08f04c2101 --- /dev/null +++ b/extension/tensor/tensor_ptr_maker.h @@ -0,0 +1,282 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch { +namespace extension { + +/** + * A helper class for creating TensorPtr instances from raw data and tensor + * properties. Note the the TensorPtr created by this class will not own the + * data, so it must outlive the TensorPtr. + * + * TensorPtrMaker provides a fluent interface for specifying various properties + * of a tensor, such as its type, sizes, data pointer, dimension order, strides, + * and shape dynamism. The final tensor is created by invoking make_tensor_ptr() + * or converting TensorPtrMaker to TensorPtr. + */ +class TensorPtrMaker final { + public: + // This class may have non-copyable members in the future. + TensorPtrMaker(const TensorPtrMaker&) = delete; + TensorPtrMaker& operator=(const TensorPtrMaker&) = delete; + // But it is movable. + TensorPtrMaker(TensorPtrMaker&&) = default; + TensorPtrMaker& operator=(TensorPtrMaker&&) = default; + /** + * Sets the scalar type of the tensor elements. + * + * @param type The scalar type (e.g., float, int, bool). + * @return Rvalue to this TensorPtrMaker for method chaining. + */ + TensorPtrMaker&& type(exec_aten::ScalarType type) { + type_ = type; + return std::move(*this); + } + + /** + * Sets the order of dimensions in memory. + * + * @param dim_order A vector specifying the dimension order. + * @return Rvalue to this TensorPtrMaker for method chaining. + */ + TensorPtrMaker&& dim_order(std::vector dim_order) { + dim_order_ = std::move(dim_order); + return std::move(*this); + } + + /** + * Sets the strides for each dimension of the tensor. + * + * @param strides A vector specifying the stride for each dimension. + * @return Rvalue to this TensorPtrMaker for method chaining. + */ + TensorPtrMaker&& strides(std::vector strides) { + strides_ = std::move(strides); + return std::move(*this); + } + + /** + * Sets the shape dynamism of the tensor. + * + * @param dynamism Specifies whether the tensor's shape is static, dynamic, or + * bounded. + * @return Rvalue to this TensorPtrMaker for method chaining. + */ + TensorPtrMaker&& dynamism(exec_aten::TensorShapeDynamism dynamism) { + dynamism_ = dynamism; + return std::move(*this); + } + + /** + * Sets a custom deleter function to manage the lifetime of the data buffer. + * + * @param deleter A function that will be called to delete the data buffer + * when the Tensor object managed by the TensorPtr is destroyed. Explicitly + * consuming an rvalue to avoid unnecessary copies when the deleter is a + * lambda that has captured some state. + * @return Rvalue to this TensorPtrMaker for method chaining. + */ + TensorPtrMaker&& deleter(std::function&& deleter) { + deleter_ = std::move(deleter); + return std::move(*this); + } + + /** + * Creates and returns a TensorPtr instance using the properties set in this + * TensorPtrMaker. + * + * @return A TensorPtr instance that manages the newly created Tensor. 
+ */ + TensorPtr make_tensor_ptr() && { + return ::executorch::extension::make_tensor_ptr( + type_, + std::move(sizes_), + data_, + std::move(dim_order_), + std::move(strides_), + dynamism_, + std::move(deleter_)); + } + + /** + * Implicit conversion operator to create a TensorPtr. + * + * @return A TensorPtr instance that manages the newly created Tensor. + */ + operator TensorPtr() && { + return std::move(*this).make_tensor_ptr(); + } + + private: + TensorPtrMaker( + void* data, + std::vector sizes, + exec_aten::ScalarType type) + : sizes_(std::move(sizes)), data_(data), type_(type) {} + + private: + // The following properties are required to create a Tensor. + friend TensorPtrMaker for_blob( + void* data, + std::vector sizes, + exec_aten::ScalarType type); + + private: + std::vector sizes_; + std::vector strides_; + std::vector dim_order_; + std::function deleter_ = nullptr; + void* data_ = nullptr; + exec_aten::ScalarType type_ = exec_aten::ScalarType::Float; + exec_aten::TensorShapeDynamism dynamism_ = + exec_aten::TensorShapeDynamism::STATIC; +}; + +/** + * Creates a TensorPtrMaker instance for building a TensorPtr from a raw data + * pointer and tensor sizes. + * + * The TensorPtrMaker returned by this function allows for further customization + * of the tensor's properties, such as data type, dimension order, strides, and + * shape dynamism, before finalizing the TensorPtr creation. + * + * @param data A pointer to the raw data to be used by the tensor. It must + * outlive the TensorPtr created by this function. + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @return A TensorPtrMaker instance for creating a TensorPtr. + */ +inline TensorPtrMaker for_blob( + void* data, + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float) { + return TensorPtrMaker(data, std::move(sizes), type); +} + +/** + * Creates a TensorPtr from a raw data pointer and tensor sizes, with an + * optional dynamism setting. + * + * This function is a convenient way to create a tensor from existing data, with + * the option to specify whether the tensor's shape is static, dynamic, or + * bounded. + * + * @param data A pointer to the raw data to be used by the tensor. It must + * outlive the TensorPtr created by this function. + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance that manages the newly created Tensor. + */ +inline TensorPtr from_blob( + void* data, + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + return for_blob(data, std::move(sizes), type) + .dynamism(dynamism) + .make_tensor_ptr(); +} + +/** + * Creates a TensorPtr from a raw data pointer, tensor sizes, and strides, with + * an optional dynamism setting. + * + * This function allows for the creation of a tensor from existing data, with + * the option to specify custom strides for each dimension and whether the + * tensor's shape is static, dynamic, or bounded. + * + * @param data A pointer to the raw data to be used by the tensor. It must + * outlive the TensorPtr created by this function. + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. 
+ * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance that manages the newly created Tensor. + */ +inline TensorPtr from_blob( + void* data, + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + return for_blob(data, std::move(sizes), type) + .strides(std::move(strides)) + .dynamism(dynamism) + .make_tensor_ptr(); +} + +/** + * Creates a TensorPtr from a raw data pointer and tensor sizes, with an + * optional dynamism setting. + * + * This function is a convenient way to create a tensor from existing data, with + * the option to specify whether the tensor's shape is static, dynamic, or + * bounded. + * + * @param data A pointer to the raw data to be used by the tensor. It must + * outlive the TensorPtr created by this function. + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param deleter A function to delete the data when it's no longer needed. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance that manages the newly created Tensor. + */ +inline TensorPtr from_blob( + void* data, + std::vector sizes, + exec_aten::ScalarType type, + std::function&& deleter, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + return for_blob(data, std::move(sizes), type) + .deleter(std::move(deleter)) + .dynamism(dynamism) + .make_tensor_ptr(); +} + +/** + * Creates a TensorPtr from a raw data pointer, tensor sizes, and strides, with + * an optional dynamism setting. + * + * This function allows for the creation of a tensor from existing data, with + * the option to specify custom strides for each dimension and whether the + * tensor's shape is static, dynamic, or bounded. + * + * @param data A pointer to the raw data to be used by the tensor. It must + * outlive the TensorPtr created by this function. + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param type The scalar type of the tensor elements. + * @param deleter A function to delete the data when it's no longer needed. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance that manages the newly created Tensor. 
+ */ +inline TensorPtr from_blob( + void* data, + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + std::function&& deleter, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + return for_blob(data, std::move(sizes), type) + .strides(std::move(strides)) + .deleter(std::move(deleter)) + .dynamism(dynamism) + .make_tensor_ptr(); +} + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl index 95ef27fcb62..ad62031ec08 100644 --- a/extension/tensor/test/targets.bzl +++ b/extension/tensor/test/targets.bzl @@ -14,6 +14,7 @@ def define_common_targets(): name = "test" + aten_suffix, srcs = [ "tensor_impl_ptr_test.cpp", + "tensor_ptr_maker_test.cpp", "tensor_ptr_test.cpp", ], deps = [ diff --git a/extension/tensor/test/tensor_ptr_maker_test.cpp b/extension/tensor/test/tensor_ptr_maker_test.cpp new file mode 100644 index 00000000000..d1b4179a260 --- /dev/null +++ b/extension/tensor/test/tensor_ptr_maker_test.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +class TensorPtrMakerTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + runtime_init(); + } +}; + +TEST_F(TensorPtrMakerTest, CreateTensorUsingTensorMaker) { + float data[20] = {2}; + auto tensor = for_blob(data, {4, 5}) + .dim_order({0, 1}) + .strides({5, 1}) + .dynamism(exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) + .make_tensor_ptr(); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->strides()[0], 5); + EXPECT_EQ(tensor->strides()[1], 1); + EXPECT_EQ(tensor->const_data_ptr(), data); + EXPECT_EQ(tensor->const_data_ptr()[0], 2); +} + +TEST_F(TensorPtrMakerTest, PerfectForwardingLValue) { + float data[20] = {2}; + std::vector sizes = {4, 5}; + std::vector dim_order = {0, 1}; + std::vector strides = {5, 1}; + + auto tensor = for_blob(data, sizes) + .dim_order(dim_order) + .strides(strides) + .make_tensor_ptr(); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->strides()[0], 5); + EXPECT_EQ(tensor->strides()[1], 1); + + EXPECT_EQ(sizes.size(), 2); + EXPECT_EQ(dim_order.size(), 2); + EXPECT_EQ(strides.size(), 2); +} + +TEST_F(TensorPtrMakerTest, PerfectForwardingRValue) { + float data[20] = {2}; + std::vector sizes = {4, 5}; + std::vector dim_order = {0, 1}; + std::vector strides = {5, 1}; + + auto tensor = for_blob(data, std::move(sizes)) + .dim_order(std::move(dim_order)) + .strides(std::move(strides)) + .make_tensor_ptr(); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->strides()[0], 5); + EXPECT_EQ(tensor->strides()[1], 1); + // for_blob() moved the contents of the vectors, leaving these empty. 
+ EXPECT_EQ(sizes.size(), 0); // NOLINT(bugprone-use-after-move) + EXPECT_EQ(dim_order.size(), 0); // NOLINT(bugprone-use-after-move) + EXPECT_EQ(strides.size(), 0); // NOLINT(bugprone-use-after-move) +} + +TEST_F(TensorPtrMakerTest, CreateTensorFromBlob) { + float data[20] = {2}; + auto tensor = from_blob(data, {4, 5}); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->strides()[0], 5); + EXPECT_EQ(tensor->strides()[1], 1); + EXPECT_EQ(tensor->const_data_ptr(), data); + EXPECT_EQ(tensor->const_data_ptr()[0], 2); + EXPECT_EQ(tensor->const_data_ptr()[19], 0); +} + +TEST_F(TensorPtrMakerTest, CreateTensorUsingFromBlobWithStrides) { + float data[20] = {3}; + auto tensor = from_blob(data, {2, 2, 2}, {4, 2, 1}); + + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 2); + EXPECT_EQ(tensor->size(2), 2); + EXPECT_EQ(tensor->strides()[0], 4); + EXPECT_EQ(tensor->strides()[1], 2); + EXPECT_EQ(tensor->strides()[2], 1); + EXPECT_EQ(tensor->const_data_ptr(), data); + EXPECT_EQ(tensor->const_data_ptr()[0], 3); +} + +TEST_F(TensorPtrMakerTest, TensorMakerConversionOperator) { + float data[20] = {2}; + TensorPtr tensor = + for_blob(data, {4, 5}) + .dynamism(exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); +} + +TEST_F(TensorPtrMakerTest, CreateTensorWithZeroDimensions) { + float data[1] = {2}; + auto tensor = from_blob(data, {}); + + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 2); +} + +TEST_F(TensorPtrMakerTest, TensorWithCustomDataDeleter) { + auto deleter_called = false; + float* data = new float[20](); + auto tensor = for_blob(data, {4, 5}) + .deleter([&deleter_called](void* ptr) { + deleter_called = true; + delete[] static_cast(ptr); + }) + .make_tensor_ptr(); + + tensor.reset(); + EXPECT_TRUE(deleter_called); +} + +TEST_F(TensorPtrMakerTest, TensorManagesMovedVector) { + auto deleter_called = false; + std::vector data(20, 3.0f); + auto* data_ptr = data.data(); + auto tensor = for_blob(data_ptr, {4, 5}) + .deleter([moved_data = std::move(data), &deleter_called]( + void*) mutable { deleter_called = true; }) + .make_tensor_ptr(); + + EXPECT_TRUE(data.empty()); // NOLINT(bugprone-use-after-move) + EXPECT_EQ(tensor->data_ptr(), data_ptr); + + tensor.reset(); + EXPECT_TRUE(deleter_called); +} + +TEST_F(TensorPtrMakerTest, TensorDeleterReleasesCapturedSharedPtr) { + auto deleter_called = false; + std::shared_ptr data_ptr( + new float[10], [](float* ptr) { delete[] ptr; }); + auto tensor = from_blob( + data_ptr.get(), + {4, 5}, + exec_aten::ScalarType::Float, + [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); + + EXPECT_EQ(data_ptr.use_count(), 2); + + tensor.reset(); + EXPECT_TRUE(deleter_called); + EXPECT_EQ(data_ptr.use_count(), 1); +} From f271159ca382bcbff77d089685e6f17bd5fc25a9 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 18:32:48 -0700 Subject: [PATCH 163/531] Adopt the new tensor API for aten_util. 
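The ATen-to-ExecuTorch type_convert specialization now wraps the ATen tensor's
storage in a non-owning TensorPtr via from_blob() from the tensor extension,
instead of assembling a TensorImpl and its sizes/dim_order/strides buffers by
hand. A rough sketch of the same pattern in isolation; the at::Tensor and the
variable names are illustrative, and the view must not outlive the source
tensor:

    at::Tensor aten = at::ones({4, 5}, at::kFloat);
    // Non-owning ExecuTorch tensor view over the ATen tensor's data.
    auto et_view = ::executorch::extension::from_blob(
        aten.mutable_data_ptr(),
        {aten.sizes().begin(), aten.sizes().end()},
        ::torch::executor::ScalarType(aten.scalar_type()));
    // *et_view aliases aten's elements; mutations are visible to both.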
Differential Revision: D61959566 Pull Request resolved: https://github.com/pytorch/executorch/pull/5057 --- .../make_aten_functor_from_et_functor.h | 58 +++++-------------- extension/aten_util/targets.bzl | 1 + 2 files changed, 15 insertions(+), 44 deletions(-) diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h index 3b54254e8ed..d7f2906944c 100644 --- a/extension/aten_util/make_aten_functor_from_et_functor.h +++ b/extension/aten_util/make_aten_functor_from_et_functor.h @@ -20,8 +20,8 @@ #endif #include #include +#include #include -#include #include namespace executorch { @@ -105,37 +105,12 @@ struct type_convert< typename remove_const_ref::type, torch::executor::Tensor>>> final { - explicit type_convert(ATensor value) : value_(value) { - auto sizes = - std::make_shared>( - value_.sizes().begin(), value_.sizes().end()); - const ssize_t dim = sizes->size(); - auto dim_order = - std::make_shared>( - dim); - auto strides = - std::make_shared>( - dim); - - std::iota(dim_order->begin(), dim_order->end(), 0); - ::executorch::runtime::dim_order_to_stride_nocheck( - sizes->data(), dim_order->data(), dim, strides->data()); - - auto tensor_impl = std::make_shared( - static_cast(value_.scalar_type()), - sizes->size(), - sizes->data(), - value_.mutable_data_ptr(), - dim_order->data(), - strides->data()); - - converted_ = std::unique_ptr< - torch::executor::Tensor, - std::function>( - new torch::executor::Tensor(tensor_impl.get()), - [sizes, dim_order, strides, tensor_impl]( - torch::executor::Tensor* pointer) { delete pointer; }); - } + explicit type_convert(ATensor value) + : value_(value), + converted_(from_blob( + value_.mutable_data_ptr(), + {value_.sizes().begin(), value_.sizes().end()}, + ::torch::executor::ScalarType(value_.scalar_type()))) {} ETensor call() { return *converted_; @@ -143,10 +118,7 @@ struct type_convert< private: ATensor value_; - std::unique_ptr< - torch::executor::Tensor, - std::function> - converted_; + TensorPtr converted_; }; // Tensors: ETen to ATen. @@ -158,15 +130,14 @@ struct type_convert< std::is_same_v::type, at::Tensor> && std::is_same_v< typename remove_const_ref::type, - torch::executor::Tensor>>> + ::torch::executor::Tensor>>> final { explicit type_convert(ETensor value) - : value_(value), sizes_(value_.sizes().begin(), value_.sizes().end()) { - converted_ = at::from_blob( - value_.mutable_data_ptr(), - sizes_, - static_cast(value_.scalar_type())); - } + : value_(value), + converted_(at::from_blob( + value_.mutable_data_ptr(), + std::vector{value_.sizes().begin(), value_.sizes().end()}, + c10::ScalarType(value_.scalar_type()))) {} ATensor call() { return converted_; @@ -175,7 +146,6 @@ struct type_convert< private: ETensor value_; at::Tensor converted_; - std::vector sizes_; }; // Optionals: ATen to ETen. 
diff --git a/extension/aten_util/targets.bzl b/extension/aten_util/targets.bzl index b396cb78325..f219d6253f2 100644 --- a/extension/aten_util/targets.bzl +++ b/extension/aten_util/targets.bzl @@ -27,6 +27,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/extension/kernel_util:kernel_util", + "//executorch/extension/tensor:tensor", "//executorch/runtime/core:core", "//executorch/runtime/core:evalue", "//executorch/runtime/core/exec_aten:lib", From 3f5773ac8e4d7365a9cb2c37b74aebfe18771481 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 3 Sep 2024 18:42:59 -0700 Subject: [PATCH 164/531] [runner] Print prompt before call into text prefiller Differential Revision: D62053820 Pull Request resolved: https://github.com/pytorch/executorch/pull/5019 --- examples/models/llama2/runner/runner.cpp | 8 +++-- examples/models/llava/runner/llava_runner.cpp | 7 +++-- extension/llm/runner/text_prefiller.cpp | 29 ++----------------- extension/llm/runner/text_prefiller.h | 7 +---- 4 files changed, 13 insertions(+), 38 deletions(-) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 8b9e6865516..bceaaa3ed55 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -124,7 +124,6 @@ Error Runner::load() { metadata_.at(kVocabSize), temperature_); text_prefiller_ = std::make_unique( - tokenizer_.get(), text_decoder_runner_.get(), metadata_.at(kUseKVCache), metadata_.at(kEnableDynamicShape)); @@ -201,8 +200,11 @@ Error Runner::generate( // Prefill first // Here feed all tokens to the model and get the next predicted token // after the prompt. After that we will enter generate loop. - auto prefill_res = - text_prefiller_->prefill(prompt_tokens, 0, wrapped_callback); + + // print prompts + wrapped_callback(prompt); + + auto prefill_res = text_prefiller_->prefill(prompt_tokens, 0); stats_.first_token_ms = util::time_in_ms(); stats_.prompt_eval_end_ms = util::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index cb968ca88d8..0fc06da0c56 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -51,7 +51,6 @@ Error LlavaRunner::load() { // Load the text prefiller text_prefiller_ = std::make_unique( - tokenizer_.get(), text_decoder_runner_.get(), /*use_kv_cache=*/true, /*enable_parallel_prefill=*/true); @@ -111,12 +110,14 @@ Error LlavaRunner::generate( } // prefill user prompt. No BOS because preset prompt already has it. 
+ wrapped_callback(prompt); + std::vector user_prompt_tokens = ET_UNWRAP(tokenizer_->encode(prompt, /*bos=*/0, /*eos=*/0)); size_t num_user_tokens = user_prompt_tokens.size(); - uint64_t prefill_next_token = ET_UNWRAP( - text_prefiller_->prefill(user_prompt_tokens, pos, wrapped_callback)); + uint64_t prefill_next_token = + ET_UNWRAP(text_prefiller_->prefill(user_prompt_tokens, pos)); pos += num_user_tokens; // Generate tokens diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp index 4b9afb8326d..53a737e6afc 100644 --- a/extension/llm/runner/text_prefiller.cpp +++ b/extension/llm/runner/text_prefiller.cpp @@ -16,19 +16,16 @@ namespace extension { namespace llm { TextPrefiller::TextPrefiller( - Tokenizer* tokenizer, TextDecoderRunner* text_decoder_runner, bool use_kv_cache, bool enable_parallel_prefill) - : tokenizer_(tokenizer), - text_decoder_runner_(text_decoder_runner), + : text_decoder_runner_(text_decoder_runner), use_kv_cache_(use_kv_cache), enable_parallel_prefill_(enable_parallel_prefill) {} ::executorch::runtime::Result TextPrefiller::prefill( std::vector& prompt_tokens, - int64_t start_pos, - std::function token_callback) { + int64_t start_pos) { ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); if (!text_decoder_runner_->is_method_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load()); @@ -55,21 +52,10 @@ ::executorch::runtime::Result TextPrefiller::prefill( ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_LOG( Info, "Prefill token result numel(): %zu", outputs_res.get().numel()); - // insert new token into prompt_tokens - // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) - uint64_t prev = prompt_tokens[0]; - uint64_t cur; - for (int i = 0; i < prompt_tokens.size(); i++) { - cur = prompt_tokens[i]; - if (token_callback && cur != tokenizer_->bos_tok()) { - token_callback(ET_UNWRAP(tokenizer_->decode(prev, cur))); - } - prev = cur; - } + cur_token = text_decoder_runner_->logits_to_token(outputs_res.get()); } else { // sequential prefill int64_t pos = 0; // position in the sequence - int64_t prev_token; // token & pos int64_t pos_data = 0; // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) @@ -87,27 +73,18 @@ ::executorch::runtime::Result TextPrefiller::prefill( exec_aten::Tensor logits_tensor = ET_UNWRAP( text_decoder_runner_->step(managed_tokens, managed_start_pos)); - // if first token is not bos, we need to callback - if (cur_token != tokenizer_->bos_tok()) { - token_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); - } pos = 1; // start from index 1 while (pos < num_prompt_tokens) { // Run the model pos_data = start_pos + pos; - prev_token = cur_token; - // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[pos]; logits_tensor = ET_UNWRAP( text_decoder_runner_->step(managed_tokens, managed_start_pos)); - // print the token as string, decode it with the Tokenizer object - token_callback(ET_UNWRAP(tokenizer_->decode(prev_token, cur_token))); - pos++; } diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h index bcec2b895fe..a8ba77b860a 100644 --- a/extension/llm/runner/text_prefiller.h +++ b/extension/llm/runner/text_prefiller.h @@ -23,7 +23,6 @@ namespace llm { class TextPrefiller { public: TextPrefiller( - Tokenizer* tokenizer, TextDecoderRunner* text_decoder_runner, bool use_kv_cache_, bool enable_parallel_prefill); @@ -33,17 +32,13 @@ class TextPrefiller { * tokenizer. 
* @param start_pos The starting position in KV cache of the input in the LLM * Module. - * @param token_callback A callback function that will be called for each - * token in the prompt. * @return The next token of the LLM Module after prefill. */ ::executorch::runtime::Result prefill( std::vector& prompt_tokens, - int64_t start_pos = 0, - std::function token_callback = {}); + int64_t start_pos = 0); private: - Tokenizer* tokenizer_; TextDecoderRunner* text_decoder_runner_; bool use_kv_cache_; bool enable_parallel_prefill_; From 5b6cac65030eae536a1d8c43c9739158944b48d8 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 19:24:56 -0700 Subject: [PATCH 165/531] FIx typo in templated constructor. Differential Revision: D62164880 Pull Request resolved: https://github.com/pytorch/executorch/pull/5059 --- extension/tensor/tensor_ptr.h | 6 ++--- extension/tensor/test/tensor_ptr_test.cpp | 32 +++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index 4d20f46be71..76b5dc833ed 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -154,19 +154,19 @@ inline TensorPtr make_tensor_ptr( * @return A TensorImplPtr managing the newly created TensorImpl. */ template -TensorImplPtr make_tensor_ptr( +TensorPtr make_tensor_ptr( std::vector sizes, std::vector::type> data, std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::STATIC) { - return make_tensor_impl_ptr( + return make_tensor_ptr(make_tensor_impl_ptr( std::move(sizes), std::move(data), std::move(dim_order), std::move(strides), - dynamism); + dynamism)); } /** diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp index 0b58f10eeba..0d76600a666 100644 --- a/extension/tensor/test/tensor_ptr_test.cpp +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -151,28 +151,28 @@ TEST_F(TensorPtrTest, TensorDeleterReleasesCapturedSharedPtr) { } TEST_F(TensorPtrTest, TensorOwningData) { - auto tensor_impl = make_tensor_ptr( + auto tensor = make_tensor_ptr( {2, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, {1, 0}, {1, 2}); - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 2); - EXPECT_EQ(tensor_impl->size(1), 5); - EXPECT_EQ(tensor_impl->strides()[0], 1); - EXPECT_EQ(tensor_impl->strides()[1], 2); - EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); - EXPECT_EQ(((float*)tensor_impl->data())[9], 10.0f); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->strides()[1], 2); + EXPECT_EQ(tensor->const_data_ptr()[0], 1.0f); + EXPECT_EQ(tensor->const_data_ptr()[9], 10.0f); } TEST_F(TensorPtrTest, TensorOwningEmptyData) { - auto tensor_impl = make_tensor_ptr({0, 5}, {}); - - EXPECT_EQ(tensor_impl->dim(), 2); - EXPECT_EQ(tensor_impl->size(0), 0); - EXPECT_EQ(tensor_impl->size(1), 5); - EXPECT_EQ(tensor_impl->strides()[0], 5); - EXPECT_EQ(tensor_impl->strides()[1], 1); - EXPECT_EQ(tensor_impl->data(), nullptr); + auto tensor = make_tensor_ptr({0, 5}, {}); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->strides()[0], 5); + EXPECT_EQ(tensor->strides()[1], 1); + EXPECT_EQ(tensor->data_ptr(), nullptr); } From 46a1e6ca72e72a7d32234acf2b8fb8c70975a30f Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 
Sep 2024 20:27:58 -0700 Subject: [PATCH 166/531] Build for OSS. Differential Revision: D61959546 Pull Request resolved: https://github.com/pytorch/executorch/pull/5058 --- CMakeLists.txt | 6 ++++++ build/Utils.cmake | 3 +++ build/cmake_deps.toml | 11 ++++++++++ build/executorch-config.cmake | 1 + extension/tensor/CMakeLists.txt | 31 +++++++++++++++++++++++++++ extension/tensor/test/CMakeLists.txt | 32 ++++++++++++++++++++++++++++ test/utils/OSSTestConfig.json | 13 ++++++++++- 7 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 extension/tensor/CMakeLists.txt create mode 100644 extension/tensor/test/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 721e29f426d..3618bff7677 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,8 @@ option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension" OFF ) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" OFF) + option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF) option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF) @@ -637,6 +639,10 @@ if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) endif() +if(EXECUTORCH_BUILD_EXTENSION_TENSOR) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/tensor) +endif() + if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 diff --git a/build/Utils.cmake b/build/Utils.cmake index 55f5892a55e..bf04fa1b15c 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -65,6 +65,9 @@ function(executorch_print_configuration_summary) message(STATUS " EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL : " "${EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL}" ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TENSOR : " + "${EXECUTORCH_BUILD_EXTENSION_TENSOR}" + ) message( STATUS " EXECUTORCH_BUILD_FLATC : ${EXECUTORCH_BUILD_FLATC}" diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index a051dad027d..476a3e69fad 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -171,6 +171,17 @@ deps = [ "extension_module", "extension_runner_util", ] + +[targets.extension_tensor] +buck_targets = [ + "//extension/tensor:tensor", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", +] # ---------------------------------- extension end ---------------------------------- # ---------------------------------- binary start ---------------------------------- diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 695c8e455ba..4376c9e5e77 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -46,6 +46,7 @@ set(lib_list extension_module extension_module_static extension_runner_util + extension_tensor extension_threadpool xnnpack_backend XNNPACK diff --git a/extension/tensor/CMakeLists.txt b/extension/tensor/CMakeLists.txt new file mode 100644 index 00000000000..4a02965c647 --- /dev/null +++ b/extension/tensor/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +cmake_minimum_required(VERSION 3.19) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
+endif() + +list(TRANSFORM _extension_tensor__srcs PREPEND "${EXECUTORCH_ROOT}/") +add_library(extension_tensor ${_extension_tensor__srcs}) +target_link_libraries(extension_tensor executorch) +target_include_directories(extension_tensor PUBLIC ${EXECUTORCH_ROOT}/..) +target_compile_options(extension_tensor PUBLIC ${_common_compile_options}) + +# Install libraries +install( + TARGETS extension_tensor + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories} +) diff --git a/extension/tensor/test/CMakeLists.txt b/extension/tensor/test/CMakeLists.txt new file mode 100644 index 00000000000..132a40c31ba --- /dev/null +++ b/extension/tensor/test/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# @generated by test/utils/generate_gtest_cmakelists.py +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +cmake_minimum_required(VERSION 3.19) +project(extension_tensor_test) + +# Use C++17 for test. +set(CMAKE_CXX_STANDARD 17) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) + +include(${EXECUTORCH_ROOT}/build/Test.cmake) + +set(_test_srcs tensor_impl_ptr_test.cpp tensor_ptr_maker_test.cpp + tensor_ptr_test.cpp +) + +et_cxx_test( + extension_tensor_test SOURCES ${_test_srcs} EXTRA_LIBS extension_tensor +) diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index b7d7f1700b4..93ae82acc33 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -37,7 +37,7 @@ ], "additional_libs": [ "extension_data_loader", - "extension_module_static", + "extension_module", "portable_kernels", "portable_ops_lib" ] @@ -62,6 +62,17 @@ "portable_ops_lib" ] }, + { + "directory": "extension/tensor/test", + "sources": [ + "tensor_impl_ptr_test.cpp", + "tensor_ptr_maker_test.cpp", + "tensor_ptr_test.cpp" + ], + "additional_libs": [ + "extension_tensor" + ] + }, { "directory": "kernels/portable/cpu/util/test", "sources": [ From aca758d43b468ff36b3ca41b05235bb3541b07c9 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 3 Sep 2024 20:54:35 -0700 Subject: [PATCH 167/531] Add LLaMA perf benchmark workflow for Apple iOS Differential Revision: D61998563 Pull Request resolved: https://github.com/pytorch/executorch/pull/4953 --- .github/workflows/apple-perf.yml | 305 ++++++++++++ build/build_apple_llm_demo.sh | 75 +++ .../LLaMA/LLaMA.xcodeproj/project.pbxproj | 456 +++++++++++++++++- .../LLaMAPerfBenchmark.entitlements | 8 + .../LLaMAPerfBenchmarkApp.swift | 16 + .../LLaMAPerfBenchmarkTests.swift | 50 ++ 6 files changed, 909 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/apple-perf.yml create mode 100755 build/build_apple_llm_demo.sh create mode 100644 examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmark.entitlements create mode 100644 examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmarkApp.swift create mode 100644 examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmarkTests/LLaMAPerfBenchmarkTests.swift diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml new file mode 100644 index 00000000000..41e2868bfbb --- /dev/null +++ b/.github/workflows/apple-perf.yml @@ -0,0 +1,305 @@ +name: apple-perf + +on: + schedule: + - cron: 0 1 * * * + # Note: GitHub has an upper limit of 10 inputs + workflow_dispatch: + 
inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: linux.2xlarge + outputs: + models: ${{ steps.set-parameters.outputs.models }} + devices: ${{ steps.set-parameters.outputs.devices }} + delegates: ${{ steps.set-parameters.outputs.delegates }} + steps: + - name: Set parameters + id: set-parameters + shell: bash + env: + # Separate default values from the workflow dispatch. To ensure defaults are accessible + # during scheduled runs and to provide flexibility for different defaults between + # on-demand and periodic benchmarking. + CRON_DEFAULT_MODELS: "stories110M" + CRON_DEFAULT_DEVICES: "apple_iphone_15" + CRON_DEFAULT_DELEGATES: "xnnpack" + run: | + set -ex + MODELS="${{ inputs.models }}" + if [ -z "$MODELS" ]; then + MODELS="$CRON_DEFAULT_MODELS" + fi + DEVICES="${{ inputs.devices }}" + if [ -z "$DEVICES" ]; then + DEVICES="$CRON_DEFAULT_DEVICES" + fi + DELEGATES="${{ inputs.delegates }}" + if [ -z "$DELEGATES" ]; then + DELEGATES="$CRON_DEFAULT_DELEGATES" + fi + + # Mapping devices to their corresponding device-pool-arn + declare -A DEVICE_POOL_ARNS + DEVICE_POOL_ARNS[apple_iphone_15]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d" + + # Resolve device names with their corresponding ARNs + if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then + DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")') + fi + declare -a MAPPED_ARNS=() + for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do + if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then + echo "Error: No ARN found for device '$DEVICE'. Abort." >&2 + exit 1 + fi + MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}") + done + + echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .) 
+ echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT + echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + + export-models: + name: export-models + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: set-parameters + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + fail-fast: false + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + timeout: 60 + upload-artifact: ios-models + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + if [[ ${{ matrix.delegate }} == "coreml" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + fi + + if [[ ${{ matrix.delegate }} == "mps" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + fi + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + echo "::endgroup::" + + echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + BUILD_MODE="cmake" + DTYPE="fp32" + + if [[ ${{ matrix.model }} =~ ^stories* ]]; then + # Install requirements for export_llama + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash examples/models/llama2/install_requirements.sh + + # Test llama2 + if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then + DELEGATE_CONFIG="xnnpack+custom+qe" + fi + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" + else + # TODO (huydhn): Extend the export script here to support other backends such as coreml, mps + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + fi + echo "::endgroup::" + + upload-models: + needs: export-models + runs-on: linux.2xlarge + steps: + - name: Download the models from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-models + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the models + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the models to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 1 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + build-llm-demo: + name: build-llm-demo + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: + - set-parameters + secrets: inherit + strategy: + matrix: + tokenizer: [bpe] + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + upload-artifact: ios-apps + secrets-env: BUILD_CERTIFICATE_BASE64 BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + timeout: 90 + script: | + set -eux + + echo "::group::Setting up CI environment" + 
.ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + + # Setup Apple certificate for iOS development + BUILD_PROVISION_PROFILE_BASE64="${SECRET_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + + # Install CoreML Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + + # Install MPS Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + echo "::endgroup::" + + ${CONDA_RUN} --no-capture-output \ + build/build_apple_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} + + upload-ios-apps: + needs: build-llm-demo + runs-on: linux.2xlarge + steps: + - name: Download the apps from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the apps + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the apps to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + benchmark-on-device: + needs: + - set-parameters + - upload-ios-apps + - upload-models + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + with: + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + device-pool-arn: ${{ matrix.device }} + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/LLaMAPerfBenchmark.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/LLaMAPerfBenchmark.xctestrun.zip + test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/build/build_apple_llm_demo.sh b/build/build_apple_llm_demo.sh new file mode 100755 index 00000000000..08652f04718 --- /dev/null +++ b/build/build_apple_llm_demo.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
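+# Usage sketch (inferred from the positional parameters below and from the
+# apple-perf workflow above, which calls this script as
+# `build/build_apple_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME}`):
+# the first argument selects the tokenizer and defaults to "bpe"; the second
+# names the directory that receives the generated .ipa and .xctestrun.zip, e.g.
+#
+#   build/build_apple_llm_demo.sh bpe artifacts-to-be-uploaded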
+ +set -euo pipefail + +TOKENIZER="${1:-bpe}" +ARTIFACTS_DIR_NAME="$2" + +APP_PATH="examples/demo-apps/apple_ios/LLaMA/LLaMA" + +if [[ "${TOKENIZER}" = "bpe" ]]; then + xcodebuild build-for-testing \ + -project "${APP_PATH}.xcodeproj" \ + -scheme LLaMAPerfBenchmark \ + -destination platform="iOS" \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER=iLLaMA \ + CODE_SIGN_IDENTITY="iPhone Distribution" \ + CODE_SIGNING_REQUIRED=No \ + CODE_SIGNING_ALLOWED=No \ + GCC_PREPROCESSOR_DEFINITIONS="DEBUG=1 ET_USE_TIKTOKEN=0" +else + xcodebuild build-for-testing \ + -project "${APP_PATH}.xcodeproj" \ + -scheme LLaMAPerfBenchmark \ + -destination platform="iOS" \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER=iLLaMA \ + CODE_SIGN_IDENTITY="iPhone Distribution" \ + CODE_SIGNING_REQUIRED=No \ + CODE_SIGNING_ALLOWED=No +fi + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app, debug mode here is the default from xcodebuild and match +# with what we have in the test spec +# TODO (huydhn): See if we can switch to release mode here +MODE="Debug" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +APP_NAME=LLaMAPerfBenchmark + +ls -lah +cp -r "${APP_NAME}.app" Payload && zip -vr "${APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index 54ae7d33198..15228bbe0db 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -53,6 +53,24 @@ 03D03DAB2C7823830088D6A7 /* text_decoder_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */; }; 03D03DAC2C7823830088D6A7 /* text_decoder_runner.h in Headers */ = {isa = PBXBuildFile; fileRef = 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */; }; 03DDA0FB2BD6368100D234B3 /* base64.h in Headers */ = {isa = PBXBuildFile; fileRef = 03DDA0FA2BD6368100D234B3 /* base64.h */; }; + 84DD94742C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84DD94732C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift */; }; + 84DD94812C81060E00C765A6 /* backend_coreml in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94802C81060E00C765A6 /* backend_coreml */; }; + 84DD94832C81060E00C765A6 /* backend_coreml_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94822C81060E00C765A6 /* backend_coreml_debug */; }; + 84DD94852C81060E00C765A6 /* backend_mps in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94842C81060E00C765A6 /* backend_mps */; }; + 84DD94872C81060E00C765A6 /* backend_mps_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94862C81060E00C765A6 /* backend_mps_debug */; }; 
+ 84DD94892C81060E00C765A6 /* backend_xnnpack in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94882C81060E00C765A6 /* backend_xnnpack */; }; + 84DD948B2C81060E00C765A6 /* backend_xnnpack_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD948A2C81060E00C765A6 /* backend_xnnpack_debug */; }; + 84DD94912C81060E00C765A6 /* kernels_custom in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94902C81060E00C765A6 /* kernels_custom */; }; + 84DD94932C81060E00C765A6 /* kernels_custom_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94922C81060E00C765A6 /* kernels_custom_debug */; }; + 84DD94952C81060E00C765A6 /* kernels_optimized in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94942C81060E00C765A6 /* kernels_optimized */; }; + 84DD94972C81060E00C765A6 /* kernels_optimized_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94962C81060E00C765A6 /* kernels_optimized_debug */; }; + 84DD94992C81060E00C765A6 /* kernels_portable in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD94982C81060E00C765A6 /* kernels_portable */; }; + 84DD949B2C81060E00C765A6 /* kernels_portable_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD949A2C81060E00C765A6 /* kernels_portable_debug */; }; + 84DD949D2C81060E00C765A6 /* kernels_quantized in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD949C2C81060E00C765A6 /* kernels_quantized */; }; + 84DD949F2C81060E00C765A6 /* kernels_quantized_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 84DD949E2C81060E00C765A6 /* kernels_quantized_debug */; }; + 84DD94A02C81061100C765A6 /* LLaMARunner.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */; }; + 84DD94A12C81061100C765A6 /* LLaMARunner.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; + 84DD94AF2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 84DD94AE2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.swift */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -63,6 +81,20 @@ remoteGlobalIDString = 03729ED42BB1F8DE00152F2E; remoteInfo = LLaMARunner; }; + 84DD94A22C81061100C765A6 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 032C01672AC228E5002955E1 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 03729ED42BB1F8DE00152F2E; + remoteInfo = LLaMARunner; + }; + 84DD94B02C811E3E00C765A6 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 032C01672AC228E5002955E1 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 84DD94702C8105EB00C765A6; + remoteInfo = LLaMAPerfBenchmark; + }; /* End PBXContainerItemProxy section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -77,6 +109,17 @@ name = "Embed Frameworks"; runOnlyForDeploymentPostprocessing = 0; }; + 84DD94A42C81061100C765A6 /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + 84DD94A12C81061100C765A6 /* LLaMARunner.framework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ @@ -113,6 +156,11 @@ 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.cpp.cpp; path = text_decoder_runner.cpp; sourceTree = ""; }; 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = text_decoder_runner.h; sourceTree = ""; }; 03DDA0FA2BD6368100D234B3 /* base64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = base64.h; path = ../../../../extension/llm/tokenizer/base64.h; sourceTree = ""; }; + 84DD94712C8105EB00C765A6 /* LLaMAPerfBenchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LLaMAPerfBenchmark.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 84DD94732C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LLaMAPerfBenchmarkApp.swift; sourceTree = ""; }; + 84DD94A72C8107AB00C765A6 /* LLaMAPerfBenchmark.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = LLaMAPerfBenchmark.entitlements; sourceTree = ""; }; + 84DD94AC2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = LLaMAPerfBenchmarkTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + 84DD94AE2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LLaMAPerfBenchmarkTests.swift; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -146,6 +194,35 @@ ); runOnlyForDeploymentPostprocessing = 0; }; + 84DD946E2C8105EB00C765A6 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 84DD94932C81060E00C765A6 /* kernels_custom_debug in Frameworks */, + 84DD94972C81060E00C765A6 /* kernels_optimized_debug in Frameworks */, + 84DD94892C81060E00C765A6 /* backend_xnnpack in Frameworks */, + 84DD949B2C81060E00C765A6 /* kernels_portable_debug in Frameworks */, + 84DD949F2C81060E00C765A6 /* kernels_quantized_debug in Frameworks */, + 84DD94812C81060E00C765A6 /* backend_coreml in Frameworks */, + 84DD94912C81060E00C765A6 /* kernels_custom in Frameworks */, + 84DD94852C81060E00C765A6 /* backend_mps in Frameworks */, + 84DD94992C81060E00C765A6 /* kernels_portable in Frameworks */, + 84DD94A02C81061100C765A6 /* LLaMARunner.framework in Frameworks */, + 84DD94952C81060E00C765A6 /* kernels_optimized in Frameworks */, + 84DD948B2C81060E00C765A6 /* backend_xnnpack_debug in Frameworks */, + 84DD94872C81060E00C765A6 /* backend_mps_debug in Frameworks */, + 84DD949D2C81060E00C765A6 /* kernels_quantized in Frameworks */, + 84DD94832C81060E00C765A6 /* backend_coreml_debug in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 84DD94A92C811E3E00C765A6 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ @@ -227,6 +304,11 @@ 0324D69F2BAACB7C00DEF36F /* LLaMARunner */, 036CAF9D2BB1444500D6C2D5 /* LLaMA.app */, 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */, + 84DD94712C8105EB00C765A6 /* LLaMAPerfBenchmark.app */, + 84DD94722C8105EB00C765A6 /* LLaMAPerfBenchmark */, + 84DD94AD2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests */, + 84DD947F2C81060E00C765A6 /* Frameworks */, + 84DD94AC2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.xctest */, ); sourceTree = ""; }; @@ -279,6 +361,30 @@ path = 
../../../../../../extension/llm/sampler; sourceTree = ""; }; + 84DD94722C8105EB00C765A6 /* LLaMAPerfBenchmark */ = { + isa = PBXGroup; + children = ( + 84DD94A72C8107AB00C765A6 /* LLaMAPerfBenchmark.entitlements */, + 84DD94732C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift */, + ); + path = LLaMAPerfBenchmark; + sourceTree = ""; + }; + 84DD947F2C81060E00C765A6 /* Frameworks */ = { + isa = PBXGroup; + children = ( + ); + name = Frameworks; + sourceTree = ""; + }; + 84DD94AD2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests */ = { + isa = PBXGroup; + children = ( + 84DD94AE2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.swift */, + ); + path = LLaMAPerfBenchmarkTests; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXHeadersBuildPhase section */ @@ -360,6 +466,59 @@ productReference = 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */; productType = "com.apple.product-type.framework"; }; + 84DD94702C8105EB00C765A6 /* LLaMAPerfBenchmark */ = { + isa = PBXNativeTarget; + buildConfigurationList = 84DD947C2C8105EC00C765A6 /* Build configuration list for PBXNativeTarget "LLaMAPerfBenchmark" */; + buildPhases = ( + 84DD946D2C8105EB00C765A6 /* Sources */, + 84DD946E2C8105EB00C765A6 /* Frameworks */, + 84DD946F2C8105EB00C765A6 /* Resources */, + 84DD94A42C81061100C765A6 /* Embed Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + 84DD94A32C81061100C765A6 /* PBXTargetDependency */, + ); + name = LLaMAPerfBenchmark; + packageProductDependencies = ( + 84DD94802C81060E00C765A6 /* backend_coreml */, + 84DD94822C81060E00C765A6 /* backend_coreml_debug */, + 84DD94842C81060E00C765A6 /* backend_mps */, + 84DD94862C81060E00C765A6 /* backend_mps_debug */, + 84DD94882C81060E00C765A6 /* backend_xnnpack */, + 84DD948A2C81060E00C765A6 /* backend_xnnpack_debug */, + 84DD94902C81060E00C765A6 /* kernels_custom */, + 84DD94922C81060E00C765A6 /* kernels_custom_debug */, + 84DD94942C81060E00C765A6 /* kernels_optimized */, + 84DD94962C81060E00C765A6 /* kernels_optimized_debug */, + 84DD94982C81060E00C765A6 /* kernels_portable */, + 84DD949A2C81060E00C765A6 /* kernels_portable_debug */, + 84DD949C2C81060E00C765A6 /* kernels_quantized */, + 84DD949E2C81060E00C765A6 /* kernels_quantized_debug */, + ); + productName = LLaMAPerfBenchmark; + productReference = 84DD94712C8105EB00C765A6 /* LLaMAPerfBenchmark.app */; + productType = "com.apple.product-type.application"; + }; + 84DD94AB2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests */ = { + isa = PBXNativeTarget; + buildConfigurationList = 84DD94B22C811E3E00C765A6 /* Build configuration list for PBXNativeTarget "LLaMAPerfBenchmarkTests" */; + buildPhases = ( + 84DD94A82C811E3E00C765A6 /* Sources */, + 84DD94A92C811E3E00C765A6 /* Frameworks */, + 84DD94AA2C811E3E00C765A6 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + 84DD94B12C811E3E00C765A6 /* PBXTargetDependency */, + ); + name = LLaMAPerfBenchmarkTests; + productName = LLaMAPerfBenchmarkTests; + productReference = 84DD94AC2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ @@ -367,7 +526,7 @@ isa = PBXProject; attributes = { BuildIndependentTargetsInParallel = 1; - LastSwiftUpdateCheck = 1500; + LastSwiftUpdateCheck = 1540; LastUpgradeCheck = 1530; TargetAttributes = { 032C016E2AC228E6002955E1 = { @@ -376,6 +535,13 @@ 03729ED42BB1F8DE00152F2E = { CreatedOnToolsVersion = 15.3; }; + 84DD94702C8105EB00C765A6 = { + CreatedOnToolsVersion = 15.4; + }; + 
84DD94AB2C811E3E00C765A6 = { + CreatedOnToolsVersion = 15.4; + TestTargetID = 84DD94702C8105EB00C765A6; + }; }; }; buildConfigurationList = 032C016A2AC228E5002955E1 /* Build configuration list for PBXProject "LLaMA" */; @@ -396,6 +562,8 @@ targets = ( 032C016E2AC228E6002955E1 /* LLaMA */, 03729ED42BB1F8DE00152F2E /* LLaMARunner */, + 84DD94702C8105EB00C765A6 /* LLaMAPerfBenchmark */, + 84DD94AB2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests */, ); }; /* End PBXProject section */ @@ -416,6 +584,20 @@ ); runOnlyForDeploymentPostprocessing = 0; }; + 84DD946F2C8105EB00C765A6 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 84DD94AA2C811E3E00C765A6 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXResourcesBuildPhase section */ /* Begin PBXShellScriptBuildPhase section */ @@ -472,6 +654,22 @@ ); runOnlyForDeploymentPostprocessing = 0; }; + 84DD946D2C8105EB00C765A6 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 84DD94742C8105EB00C765A6 /* LLaMAPerfBenchmarkApp.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 84DD94A82C811E3E00C765A6 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 84DD94AF2C811E3E00C765A6 /* LLaMAPerfBenchmarkTests.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXSourcesBuildPhase section */ /* Begin PBXTargetDependency section */ @@ -480,6 +678,16 @@ target = 03729ED42BB1F8DE00152F2E /* LLaMARunner */; targetProxy = 03729ED92BB1F8DE00152F2E /* PBXContainerItemProxy */; }; + 84DD94A32C81061100C765A6 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 03729ED42BB1F8DE00152F2E /* LLaMARunner */; + targetProxy = 84DD94A22C81061100C765A6 /* PBXContainerItemProxy */; + }; + 84DD94B12C811E3E00C765A6 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 84DD94702C8105EB00C765A6 /* LLaMAPerfBenchmark */; + targetProxy = 84DD94B02C811E3E00C765A6 /* PBXContainerItemProxy */; + }; /* End PBXTargetDependency section */ /* Begin XCBuildConfiguration section */ @@ -847,6 +1055,164 @@ }; name = Release; }; + 84DD947D2C8105EC00C765A6 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CODE_SIGN_ENTITLEMENTS = LLaMAPerfBenchmark/LLaMAPerfBenchmark.entitlements; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = ""; + ENABLE_PREVIEWS = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + 
"OTHER_LDFLAGS[sdk=iphoneos*]" = ( + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_coreml-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-debug.a", + ); + "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-debug.a", + ); + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.illama; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 84DD947E2C8105EC00C765A6 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CODE_SIGN_ENTITLEMENTS = LLaMAPerfBenchmark/LLaMAPerfBenchmark.entitlements; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = ""; + ENABLE_PREVIEWS = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + "OTHER_LDFLAGS[sdk=iphoneos*]" = ( + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_coreml-ios-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-debug.a", + ); + "OTHER_LDFLAGS[sdk=iphonesimulator*]" = ( + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-debug.a", + "-force_load", + "$(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-debug.a", + ); + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.illama; + PRODUCT_NAME = 
"$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + 84DD94B32C811E3E00C765A6 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GCC_C_LANGUAGE_STANDARD = gnu17; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.illama; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/LLaMAPerfBenchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/LLaMAPerfBenchmark"; + }; + name = Debug; + }; + 84DD94B42C811E3E00C765A6 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + GCC_C_LANGUAGE_STANDARD = gnu17; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.illama; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/LLaMAPerfBenchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/LLaMAPerfBenchmark"; + }; + name = Release; + }; /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ @@ -877,6 +1243,24 @@ defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; + 84DD947C2C8105EC00C765A6 /* Build configuration list for PBXNativeTarget "LLaMAPerfBenchmark" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 84DD947D2C8105EC00C765A6 /* Debug */, + 84DD947E2C8105EC00C765A6 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 84DD94B22C811E3E00C765A6 /* Build configuration list for PBXNativeTarget "LLaMAPerfBenchmarkTests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 84DD94B32C811E3E00C765A6 /* Debug */, + 84DD94B42C811E3E00C765A6 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; /* End XCConfigurationList section */ /* Begin XCRemoteSwiftPackageReference section */ @@ -966,6 +1350,76 @@ package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; productName = executorch_debug; }; + 84DD94802C81060E00C765A6 /* backend_coreml */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = backend_coreml; + }; + 84DD94822C81060E00C765A6 /* backend_coreml_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = backend_coreml_debug; + }; + 84DD94842C81060E00C765A6 /* backend_mps */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = backend_mps; + }; + 84DD94862C81060E00C765A6 /* backend_mps_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = backend_mps_debug; + }; + 84DD94882C81060E00C765A6 /* backend_xnnpack */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference 
"executorch" */; + productName = backend_xnnpack; + }; + 84DD948A2C81060E00C765A6 /* backend_xnnpack_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = backend_xnnpack_debug; + }; + 84DD94902C81060E00C765A6 /* kernels_custom */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_custom; + }; + 84DD94922C81060E00C765A6 /* kernels_custom_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_custom_debug; + }; + 84DD94942C81060E00C765A6 /* kernels_optimized */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_optimized; + }; + 84DD94962C81060E00C765A6 /* kernels_optimized_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_optimized_debug; + }; + 84DD94982C81060E00C765A6 /* kernels_portable */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_portable; + }; + 84DD949A2C81060E00C765A6 /* kernels_portable_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_portable_debug; + }; + 84DD949C2C81060E00C765A6 /* kernels_quantized */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_quantized; + }; + 84DD949E2C81060E00C765A6 /* kernels_quantized_debug */ = { + isa = XCSwiftPackageProductDependency; + package = 03312C172BBFC940002106EF /* XCRemoteSwiftPackageReference "executorch" */; + productName = kernels_quantized_debug; + }; /* End XCSwiftPackageProductDependency section */ }; rootObject = 032C01672AC228E5002955E1 /* Project object */; diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmark.entitlements b/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmark.entitlements new file mode 100644 index 00000000000..99f471672d6 --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmark.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.developer.kernel.increased-memory-limit + + + diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmarkApp.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmarkApp.swift new file mode 100644 index 00000000000..b09321f1555 --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmark/LLaMAPerfBenchmarkApp.swift @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import SwiftUI + +@main +struct LLaMAPerfBenchmarkApp: SwiftUI.App { + var body: some Scene { + WindowGroup { } + } +} diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmarkTests/LLaMAPerfBenchmarkTests.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmarkTests/LLaMAPerfBenchmarkTests.swift new file mode 100644 index 00000000000..fa798f019de --- /dev/null +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMAPerfBenchmarkTests/LLaMAPerfBenchmarkTests.swift @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import LLaMARunner +import XCTest + +final class LLaMAPerfBenchmarkTests: XCTestCase { + func testLlama2() throws { + guard + let modelPath = Bundle.main.path( + forResource: "llama2", + ofType: "pte", + inDirectory: "aatp/data" + ) + else { + XCTFail("Failed to get model path") + return + } + + guard + let tokenizerPath = Bundle.main.path( + forResource: "tokenizer", + ofType: "bin", + inDirectory: "aatp/data" + ) + else { + XCTFail("Failed to get tokenizer path") + return + } + + let runner = Runner(modelPath: modelPath, tokenizerPath: tokenizerPath) + do { + try runner.load() + } catch let loadError { + XCTFail("Failed to load the model: \(loadError)") + } + XCTAssertTrue(runner.isloaded()) + + let seq_len = 128 + var tokens: [String] = [] + try runner.generate("How do you do! I'm testing llama2 on mobile device", sequenceLength: seq_len) { token in + tokens.append(token) + } + } +} From 5a4188fc0ad9bcfa1988f7d04083e2d16c9c6bdc Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 21:36:51 -0700 Subject: [PATCH 168/531] Revert D61959566 Differential Revision: D62166474 Pull Request resolved: https://github.com/pytorch/executorch/pull/5060 --- .../make_aten_functor_from_et_functor.h | 58 ++++++++++++++----- extension/aten_util/targets.bzl | 1 - 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h index d7f2906944c..3b54254e8ed 100644 --- a/extension/aten_util/make_aten_functor_from_et_functor.h +++ b/extension/aten_util/make_aten_functor_from_et_functor.h @@ -20,8 +20,8 @@ #endif #include #include -#include #include +#include #include namespace executorch { @@ -105,12 +105,37 @@ struct type_convert< typename remove_const_ref::type, torch::executor::Tensor>>> final { - explicit type_convert(ATensor value) - : value_(value), - converted_(from_blob( - value_.mutable_data_ptr(), - {value_.sizes().begin(), value_.sizes().end()}, - ::torch::executor::ScalarType(value_.scalar_type()))) {} + explicit type_convert(ATensor value) : value_(value) { + auto sizes = + std::make_shared>( + value_.sizes().begin(), value_.sizes().end()); + const ssize_t dim = sizes->size(); + auto dim_order = + std::make_shared>( + dim); + auto strides = + std::make_shared>( + dim); + + std::iota(dim_order->begin(), dim_order->end(), 0); + ::executorch::runtime::dim_order_to_stride_nocheck( + sizes->data(), dim_order->data(), dim, strides->data()); + + auto tensor_impl = std::make_shared( + static_cast(value_.scalar_type()), + sizes->size(), + sizes->data(), + value_.mutable_data_ptr(), + dim_order->data(), + strides->data()); + + converted_ = std::unique_ptr< + torch::executor::Tensor, + std::function>( + new torch::executor::Tensor(tensor_impl.get()), + [sizes, 
dim_order, strides, tensor_impl]( + torch::executor::Tensor* pointer) { delete pointer; }); + } ETensor call() { return *converted_; @@ -118,7 +143,10 @@ struct type_convert< private: ATensor value_; - TensorPtr converted_; + std::unique_ptr< + torch::executor::Tensor, + std::function> + converted_; }; // Tensors: ETen to ATen. @@ -130,14 +158,15 @@ struct type_convert< std::is_same_v::type, at::Tensor> && std::is_same_v< typename remove_const_ref::type, - ::torch::executor::Tensor>>> + torch::executor::Tensor>>> final { explicit type_convert(ETensor value) - : value_(value), - converted_(at::from_blob( - value_.mutable_data_ptr(), - std::vector{value_.sizes().begin(), value_.sizes().end()}, - c10::ScalarType(value_.scalar_type()))) {} + : value_(value), sizes_(value_.sizes().begin(), value_.sizes().end()) { + converted_ = at::from_blob( + value_.mutable_data_ptr(), + sizes_, + static_cast(value_.scalar_type())); + } ATensor call() { return converted_; @@ -146,6 +175,7 @@ struct type_convert< private: ETensor value_; at::Tensor converted_; + std::vector sizes_; }; // Optionals: ATen to ETen. diff --git a/extension/aten_util/targets.bzl b/extension/aten_util/targets.bzl index f219d6253f2..b396cb78325 100644 --- a/extension/aten_util/targets.bzl +++ b/extension/aten_util/targets.bzl @@ -27,7 +27,6 @@ def define_common_targets(): ], exported_deps = [ "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/tensor:tensor", "//executorch/runtime/core:core", "//executorch/runtime/core:evalue", "//executorch/runtime/core/exec_aten:lib", From 761688cdf20b7ac77150a195bb6c7f781246c897 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 22:45:06 -0700 Subject: [PATCH 169/531] Fixx OSS tests. Differential Revision: D62168928 Pull Request resolved: https://github.com/pytorch/executorch/pull/5065 --- test/run_oss_cpp_tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 3f17a9ead69..3693700e831 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -35,6 +35,7 @@ build_executorch() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_VULKAN=$BUILD_VULKAN \ -DEXECUTORCH_BUILD_XNNPACK=ON \ From 59d83ca615c940476486840278cc5d5e9ea0d048 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 23:33:42 -0700 Subject: [PATCH 170/531] Build for Apple frameworks. 
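With the tensor extension bundled into the prebuilt frameworks
(libextension_tensor.a plus the exported //extension/tensor headers), apps
linking the frameworks can construct Module inputs with the new tensor API.
A minimal consumer-side sketch, assuming the header layout of the exported
//extension/module and //extension/tensor targets and mirroring the call
pattern the demo app adopts in the following patch ("Adopt the new tensor
API"); the model path, input buffer, and shape are placeholders:

    #import <executorch/extension/module/module.h>
    #import <executorch/extension/tensor/tensor.h>

    using namespace ::executorch::extension;

    Module module("model.pte");                // placeholder model path
    float input[1 * 3 * 224 * 224] = {};       // placeholder input buffer
    // from_blob wraps the existing buffer without copying, as in the demo app.
    const auto result = module.forward(from_blob(input, {1, 3, 224, 224}));
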
Differential Revision: D61959554 Pull Request resolved: https://github.com/pytorch/executorch/pull/5063 --- build/build_apple_frameworks.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index a22fd4ecb9d..8bd9e0539ff 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -33,6 +33,7 @@ libexecutorch_no_prim_ops.a,\ libextension_apple.a,\ libextension_data_loader.a,\ libextension_module.a,\ +libextension_tensor.a,\ :$HEADERS_PATH" FRAMEWORK_BACKEND_COREML="backend_coreml:\ @@ -165,6 +166,7 @@ cmake_build() { -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=$CUSTOM \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=$OPTIMIZED \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=$QUANTIZED \ @@ -188,6 +190,7 @@ mkdir -p "$HEADERS_PATH" "$SOURCE_ROOT_DIR"/build/print_exported_headers.py --buck2="$BUCK2" --targets \ //extension/module: \ + //extension/tensor: \ | rsync -av --files-from=- "$SOURCE_ROOT_DIR" "$HEADERS_PATH/executorch" cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.h "$HEADERS_PATH/executorch" From 8c0f63e4871c1db6f5891c422889a15e8eeb044d Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 3 Sep 2024 23:43:27 -0700 Subject: [PATCH 171/531] Adopt the new tensor API. Differential Revision: D61959569 Pull Request resolved: https://github.com/pytorch/executorch/pull/5066 --- .../ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm index 733dcdc8b35..56a9cd66522 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm @@ -9,6 +9,7 @@ #import "MobileNetClassifier.h" #import +#import using namespace ::torch::executor; @@ -33,9 +34,8 @@ - (BOOL)classifyWithInput:(float*)input output:(float*)output outputSize:(NSInteger)outputSize error:(NSError**)error { - int32_t sizes[] = {1, kChannels, kSize, kSize}; - TensorImpl inputTensor(ScalarType::Float, std::size(sizes), sizes, input); - const auto result = _module->forward(Tensor(&inputTensor)); + const auto result = + _module->forward(from_blob(input, {1, kChannels, kSize, kSize})); if (!result.ok()) { if (error) { From 3cf9237af1b6743cfe5293ac333efc39951adf49 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Wed, 4 Sep 2024 02:02:40 -0700 Subject: [PATCH 172/531] a-start ops | add dim order regulation Differential Revision: D59824508 Pull Request resolved: https://github.com/pytorch/executorch/pull/4330 --- kernels/portable/cpu/op_add.cpp | 4 + kernels/portable/cpu/op_addmm.cpp | 8 ++ kernels/portable/cpu/op_alias_copy.cpp | 2 + kernels/portable/cpu/op_allclose.cpp | 3 + kernels/portable/cpu/op_amax.cpp | 3 + kernels/portable/cpu/op_amin.cpp | 3 + kernels/portable/cpu/op_any.cpp | 9 +++ kernels/portable/cpu/op_arange.cpp | 4 + kernels/portable/cpu/op_argmax.cpp | 3 + kernels/portable/cpu/op_argmin.cpp | 3 + kernels/portable/cpu/op_as_strided_copy.cpp | 5 ++ kernels/portable/cpu/op_atan2.cpp | 3 + kernels/portable/cpu/op_avg_pool2d.cpp | 5 ++ 
...ary_ufunc_realb_realb_to_realb_logical.cpp | 3 + .../cpu/pattern/unary_ufunc_realh.cpp | 3 + .../pattern/unary_ufunc_realhb_to_bool.cpp | 3 + .../pattern/unary_ufunc_realhb_to_floath.cpp | 3 + runtime/core/exec_aten/util/tensor_util.h | 70 ++++++++++++++--- .../core/exec_aten/util/tensor_util_aten.cpp | 78 +++++++------------ .../exec_aten/util/tensor_util_portable.cpp | 76 ++++++++++++------ 20 files changed, 207 insertions(+), 84 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 33662ecc55a..a435e4ee658 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -79,6 +79,8 @@ Tensor& add_out( out); ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); @@ -131,6 +133,8 @@ Tensor& add_scalar_out( "Failed to resize output tensor."); ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index 2f4745bb519..75ca1a3ad00 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -45,6 +45,14 @@ Tensor& addmm_out( ET_KERNEL_CHECK( ctx, tensor_is_broadcastable_to(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, mat1, mat2, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ScalarType alpha_dtype = utils::get_scalar_dtype(alpha); ScalarType beta_dtype = utils::get_scalar_dtype(beta); ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_alias_copy.cpp b/kernels/portable/cpu/op_alias_copy.cpp index 72fd945e984..a3ac11a725e 100644 --- a/kernels/portable/cpu/op_alias_copy.cpp +++ b/kernels/portable/cpu/op_alias_copy.cpp @@ -28,6 +28,8 @@ Tensor& alias_copy_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { "Failed to resize output tensor."); ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); if (in.nbytes() > 0) { // Note that this check is important. 
It's valid for a tensor with numel 0 diff --git a/kernels/portable/cpu/op_allclose.cpp b/kernels/portable/cpu/op_allclose.cpp index dc9fea082b6..0a2549bcaef 100644 --- a/kernels/portable/cpu/op_allclose.cpp +++ b/kernels/portable/cpu/op_allclose.cpp @@ -104,6 +104,9 @@ Tensor& allclose_out( out.scalar_type() == ScalarType::Bool, "Out tensor must be type Bool; saw type %" PRId8, static_cast(out.scalar_type())); + ET_CHECK_MSG( + tensors_have_same_dim_order(self, other, out), + "self, other and out tensors should have same dim order"); ET_CHECK_MSG( out.numel() == 1, "Out tensor must be a single element; saw %zu elements", diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp index d3994f1efd5..79e915ea422 100644 --- a/kernels/portable/cpu/op_amax.cpp +++ b/kernels/portable/cpu/op_amax.cpp @@ -39,6 +39,9 @@ Tensor& amax_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND( Bool, in.scalar_type(), ctx, "amax.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp index f150e0eb7d6..37a63cff20f 100644 --- a/kernels/portable/cpu/op_amin.cpp +++ b/kernels/portable/cpu/op_amin.cpp @@ -39,6 +39,9 @@ Tensor& amin_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND( Bool, in.scalar_type(), ctx, "amin.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp index ca942475514..6f4c3047b78 100644 --- a/kernels/portable/cpu/op_any.cpp +++ b/kernels/portable/cpu/op_any.cpp @@ -22,6 +22,9 @@ Tensor& any_all_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK( ctx, resize_tensor(out, {}) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); ScalarType out_type = out.scalar_type(); constexpr auto name = "any.all_out"; @@ -68,6 +71,9 @@ Tensor& any_dims_out( out); } + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); ScalarType out_type = out.scalar_type(); constexpr auto name = "any.dims_out"; @@ -122,6 +128,9 @@ Tensor& any_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); ScalarType out_type = out.scalar_type(); constexpr auto name = "any.out"; diff --git a/kernels/portable/cpu/op_arange.cpp b/kernels/portable/cpu/op_arange.cpp index a8fc3ce96ab..50acd6cfb97 100644 --- a/kernels/portable/cpu/op_arange.cpp +++ b/kernels/portable/cpu/op_arange.cpp @@ -27,6 +27,8 @@ Tensor& arange_out(RuntimeContext& ctx, const Scalar& end, Tensor& out) { ET_KERNEL_CHECK( ctx, check_arange_args(0.0, end_val, 1.0, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out); + size_t size = static_cast(std::ceil(end_val)); Tensor::SizesType out_length = static_cast(size); @@ -73,6 +75,8 @@ Tensor& arange_start_out( InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out); + double size_d = (d_end - d_start) / d_step; size_t size = static_cast(std::ceil(size_d)); diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp index 
c59be11bc08..1a50699d64d 100644 --- a/kernels/portable/cpu/op_argmax.cpp +++ b/kernels/portable/cpu/op_argmax.cpp @@ -40,6 +40,9 @@ Tensor& argmax_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "argmax.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index e0609c9e5ce..4d124650cf4 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -40,6 +40,9 @@ Tensor& argmin_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_as_strided_copy.cpp b/kernels/portable/cpu/op_as_strided_copy.cpp index 17ebc410878..fc4508746f9 100644 --- a/kernels/portable/cpu/op_as_strided_copy.cpp +++ b/kernels/portable/cpu/op_as_strided_copy.cpp @@ -37,6 +37,11 @@ Tensor& as_strided_copy_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + if (in.numel() == 0) { return out; } diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 36ba53830b5..32831b4ca83 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -26,6 +26,9 @@ atan2_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); diff --git a/kernels/portable/cpu/op_avg_pool2d.cpp b/kernels/portable/cpu/op_avg_pool2d.cpp index 42e3cd7fca8..0a9c8407ee5 100644 --- a/kernels/portable/cpu/op_avg_pool2d.cpp +++ b/kernels/portable/cpu/op_avg_pool2d.cpp @@ -44,6 +44,11 @@ Tensor& avg_pool2d_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + size_t output_ndim = 0; exec_aten::SizesType output_sizes[kTensorDimensionLimit]; get_avg_pool2d_out_target_size( diff --git a/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp b/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp index 22d30b39a47..ee4b5f4374b 100644 --- a/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp +++ b/kernels/portable/cpu/pattern/binary_ufunc_realb_realb_to_realb_logical.cpp @@ -27,6 +27,9 @@ Tensor& binary_ufunc_realb_realb_to_realb_logical( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp index cb29ac42594..c85a6af65af 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp @@ -33,6 +33,9 @@ Tensor& unary_ufunc_realh( ET_KERNEL_CHECK( ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, 
tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] { apply_unary_map_fn( [fn](const CTYPE val_in) { return static_cast(fn(val_in)); }, diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp index bf5e84710ef..85e48b1e4d9 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp @@ -38,6 +38,9 @@ Tensor& unary_ufunc_realhb_to_bool( "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.", static_cast(out.scalar_type())); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + const auto in_type = in.scalar_type(); ET_SWITCH_REALHB_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp index 47d442d8bd4..0637c70ee31 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp @@ -32,6 +32,9 @@ Tensor& unary_ufunc_realhb_to_floath( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + const auto in_type = in.scalar_type(); const auto out_type = out.scalar_type(); diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index 4dcb0ef9f69..cadb5ecd9ab 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -1153,33 +1153,80 @@ bool tensor_has_valid_dim_order(exec_aten::Tensor t); bool tensor_is_default_or_channels_last_dim_order(exec_aten::Tensor t); /** - * Asserts that two tensors have the same dim_order + * Checks whether a tensor has the default dimension order. + * Logs an error message if the tensor does not meet the expected criteria. * - * Note that this macro only tests dim order, but not others like actual data, - * sizes, etc. Also this macro does not support ATen mode since we do not - * support dim order in ATen mode. + * @param t The tensor to check the dimension order of. + * @return True if the tensor has the default dimension order, false otherwise. + */ +bool tensor_is_default_dim_order(exec_aten::Tensor t); + +/** + * Checks whether a tensor has the channels last dimension order. + * Logs an error message if the tensor does not meet the expected criteria. * - * TODO(T183094318): Add dim order and related function support for ATen mode. + * @param t The tensor to check the dimension order of. + * @return True if the tensor has the channels last dimension order, false + * otherwise. */ +bool tensor_is_channels_last_dim_order(exec_aten::Tensor t); +/** + * Asserts that four tensors have the same dim_order + * + * Note that this macro only tests dim order, but not others like actual data, + * sizes, etc. + * + */ bool tensors_have_same_dim_order( + const exec_aten::ArrayRef tensor_list); + +/** + * Asserts that two tensors have the same dim_order + * + * Note that this macro only tests dim order, but not others like actual data, + * sizes, etc. 
+ */ + +inline bool tensors_have_same_dim_order( const exec_aten::Tensor& a, - const exec_aten::Tensor& b); + const exec_aten::Tensor& b) { + exec_aten::Tensor tensor_list[2] = {a, b}; + return tensors_have_same_dim_order(tensor_list); +} /** * Asserts that three tensors have the same dim_order * * Note that this macro only tests dim order, but not others like actual data, - * sizes, etc. Also this macro does not support ATen mode since we do not - * support dim order in ATen mode. + * sizes, etc. * - * TODO(T183094318): Add dim order and related function support for ATen mode. */ -bool tensors_have_same_dim_order( +inline bool tensors_have_same_dim_order( const exec_aten::Tensor& a, const exec_aten::Tensor& b, - const exec_aten::Tensor& c); + const exec_aten::Tensor& c) { + exec_aten::Tensor tensor_list[3] = {a, b, c}; + return tensors_have_same_dim_order(tensor_list); +} + +/** + * Asserts that four tensors have the same dim_order + * + * Note that this macro only tests dim order, but not others like actual data, + * sizes, etc. + * + */ + +inline bool tensors_have_same_dim_order( + const exec_aten::Tensor& a, + const exec_aten::Tensor& b, + const exec_aten::Tensor& c, + const exec_aten::Tensor& d) { + exec_aten::Tensor tensor_list[4] = {a, b, c, d}; + return tensors_have_same_dim_order(tensor_list); +} /** * Given an n-dimensional coordinate array and an array of tensor strides, @@ -1232,6 +1279,7 @@ using ::executorch::runtime::tensor_is_bits_type; using ::executorch::runtime::tensor_is_bool_type; using ::executorch::runtime::tensor_is_complex_type; using ::executorch::runtime::tensor_is_contiguous; +using ::executorch::runtime::tensor_is_default_dim_order; using ::executorch::runtime::tensor_is_default_or_channels_last_dim_order; using ::executorch::runtime::tensor_is_floating_type; using ::executorch::runtime::tensor_is_integral_type; diff --git a/runtime/core/exec_aten/util/tensor_util_aten.cpp b/runtime/core/exec_aten/util/tensor_util_aten.cpp index 91b75c06483..84af8fcee42 100644 --- a/runtime/core/exec_aten/util/tensor_util_aten.cpp +++ b/runtime/core/exec_aten/util/tensor_util_aten.cpp @@ -78,61 +78,43 @@ inline bool tensor_is_default_or_channels_last_dim_order(at::Tensor t) { } bool tensors_have_same_dim_order( - const exec_aten::Tensor& a, - const exec_aten::Tensor& b) { - exec_aten::DimOrderType a_dim_order[kTensorDimensionLimit]; - exec_aten::DimOrderType b_dim_order[kTensorDimensionLimit]; - - ET_LOG_MSG_AND_RETURN_IF_FALSE( - get_dim_order(a, a_dim_order, a.dim()) == Error::Ok, - "Failed to retrieve dim order from first input tensor!"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - get_dim_order(b, b_dim_order, b.dim()) == Error::Ok, - "Failed to retrieve dim order from second input tensor!"); - - bool all_contiguous = is_contiguous_dim_order(a_dim_order, a.dim()) && - is_contiguous_dim_order(b_dim_order, b.dim()); - - bool all_channels_last = is_channels_last_dim_order(a_dim_order, a.dim()) && - is_channels_last_dim_order(b_dim_order, b.dim()); - - ET_LOG_MSG_AND_RETURN_IF_FALSE( - all_contiguous || all_channels_last, - "Two input tensors have different dim orders"); + const exec_aten::ArrayRef tensor_list) { + if (tensor_list.size() < 2) { + return true; + } - return true; -} + exec_aten::DimOrderType first_dim_order[kTensorDimensionLimit]; + exec_aten::DimOrderType other_dim_order[kTensorDimensionLimit]; -bool tensors_have_same_dim_order( - const exec_aten::Tensor& a, - const exec_aten::Tensor& b, - const exec_aten::Tensor& c) { - exec_aten::DimOrderType 
a_dim_order[kTensorDimensionLimit]; - exec_aten::DimOrderType b_dim_order[kTensorDimensionLimit]; - exec_aten::DimOrderType c_dim_order[kTensorDimensionLimit]; - ET_LOG_MSG_AND_RETURN_IF_FALSE( - get_dim_order(a, a_dim_order, a.dim()) == Error::Ok, - "Failed to retrieve dim order from first input tensor!"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - get_dim_order(b, b_dim_order, b.dim()) == Error::Ok, - "Failed to retrieve dim order from second input tensor!"); ET_LOG_MSG_AND_RETURN_IF_FALSE( - get_dim_order(c, c_dim_order, c.dim()) == Error::Ok, - "Failed to retrieve dim order from third input tensor!"); - - bool all_contiguous = is_contiguous_dim_order(a_dim_order, a.dim()) && - is_contiguous_dim_order(b_dim_order, b.dim()) && - is_contiguous_dim_order(c_dim_order, c.dim()); - - bool all_channels_last = is_channels_last_dim_order(a_dim_order, a.dim()) && - is_channels_last_dim_order(b_dim_order, b.dim()) && - is_channels_last_dim_order(c_dim_order, c.dim()); + get_dim_order(tensor_list[0], first_dim_order, tensor_list[0].dim()) == + Error::Ok, + "Failed to retrieve dim order from 1st input tensor!"); + + bool all_contiguous = + is_contiguous_dim_order(first_dim_order, tensor_list[0].dim()); + bool all_channels_last = + is_channels_last_dim_order(first_dim_order, tensor_list[0].dim()); + + for (size_t i = 1; i < tensor_list.size(); ++i) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + get_dim_order(tensor_list[i], other_dim_order, tensor_list[i].dim()) == + Error::Ok, + "Failed to retrieve dim order from %zd-th input tensor!", + i); + + all_contiguous = all_contiguous && + is_contiguous_dim_order(other_dim_order, tensor_list[i].dim()); + all_channels_last = all_channels_last && + is_channels_last_dim_order(other_dim_order, tensor_list[i].dim()); + } ET_LOG_MSG_AND_RETURN_IF_FALSE( all_contiguous || all_channels_last, - "Three input tensors have different dim orders"); + "%zd input tensors have different dim orders", + tensor_list.size()); - return true; + return all_contiguous || all_channels_last; } namespace internal { diff --git a/runtime/core/exec_aten/util/tensor_util_portable.cpp b/runtime/core/exec_aten/util/tensor_util_portable.cpp index 7e9a15f09a9..b7ed92f3c97 100644 --- a/runtime/core/exec_aten/util/tensor_util_portable.cpp +++ b/runtime/core/exec_aten/util/tensor_util_portable.cpp @@ -73,40 +73,66 @@ bool tensor_is_default_or_channels_last_dim_order(torch::executor::Tensor t) { return ret_val; } -bool tensors_have_same_dim_order( - const exec_aten::Tensor& a, - const exec_aten::Tensor& b) { - bool all_contiguous = - is_contiguous_dim_order(a.dim_order().data(), a.dim_order().size()) && - is_contiguous_dim_order(b.dim_order().data(), b.dim_order().size()); - bool all_channels_last = - is_channels_last_dim_order(a.dim_order().data(), a.dim_order().size()) && - is_channels_last_dim_order(b.dim_order().data(), b.dim_order().size()); +bool tensor_is_default_dim_order(torch::executor::Tensor t) { + bool ret_val = + is_contiguous_dim_order(t.dim_order().data(), t.dim_order().size()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - all_contiguous || all_channels_last, - "Two input tensors have different dim orders"); + if (!ret_val) { + ET_LOG(Error, "Expected tensor to have default dim order, but got"); + for (size_t d = 0; d < t.dim(); ++d) { + ET_LOG( + Error, + " dim_order(%zu): %zu", + static_cast(d), + static_cast(t.dim_order()[d])); + } + } + return ret_val; +} - return true; +bool tensor_is_channels_last_dim_order(torch::executor::Tensor t) { + bool ret_val = + 
is_channels_last_dim_order(t.dim_order().data(), t.dim_order().size()); + + if (!ret_val) { + ET_LOG(Error, "Expected tensor to have channels last dim order, but got"); + for (size_t d = 0; d < t.dim(); ++d) { + ET_LOG( + Error, + " dim_order(%zu): %zu", + static_cast(d), + static_cast(t.dim_order()[d])); + } + } + return ret_val; } bool tensors_have_same_dim_order( - const exec_aten::Tensor& a, - const exec_aten::Tensor& b, - const exec_aten::Tensor& c) { - bool all_contiguous = - is_contiguous_dim_order(a.dim_order().data(), a.dim_order().size()) && - is_contiguous_dim_order(b.dim_order().data(), b.dim_order().size()) && - is_contiguous_dim_order(c.dim_order().data(), c.dim_order().size()); - bool all_channels_last = - is_channels_last_dim_order(a.dim_order().data(), a.dim_order().size()) && - is_channels_last_dim_order(b.dim_order().data(), b.dim_order().size()) && - is_channels_last_dim_order(c.dim_order().data(), c.dim_order().size()); + const exec_aten::ArrayRef tensor_list) { + if (tensor_list.size() < 2) { + return true; + } + bool all_contiguous = true; + bool all_channels_last = true; + for (size_t i = 0; i < tensor_list.size(); ++i) { + all_contiguous = all_contiguous && + is_contiguous_dim_order( + tensor_list[i].dim_order().data(), + tensor_list[i].dim_order().size()); + all_channels_last = all_channels_last && + is_channels_last_dim_order( + tensor_list[i].dim_order().data(), + tensor_list[i].dim_order().size()); + } + ET_LOG_MSG_AND_RETURN_IF_FALSE( all_contiguous || all_channels_last, - "Three input tensors have different dim orders"); + "%zd input tensors have different dim orders", + tensor_list.size()); + return true; } + namespace internal { Error share_tensor_data( From 8781604cd99b305ccf2db461467fab4a77907926 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 4 Sep 2024 10:02:05 -0700 Subject: [PATCH 173/531] Fix CI by correcting namespaces. 
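The previous commit moved the demo classifier onto Module::forward() with from_blob(), both of which live under the ::executorch::extension namespace, while the file still only opened ::torch::executor, which broke the CI build. The change below swaps the using-directive. A rough sketch of the resulting call pattern, with the input shape and header paths as assumed placeholder values:

    #include <executorch/extension/module/module.h>
    #include <executorch/extension/tensor/tensor.h>

    using namespace ::executorch::extension;

    // Wrap a caller-owned float buffer in a tensor without copying it, then
    // run the model's forward method on it and report success.
    bool classify(Module& module, float* input) {
      const auto result = module.forward(from_blob(input, {1, 3, 224, 224}));
      return result.ok();
    }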
Differential Revision: D62189964 Pull Request resolved: https://github.com/pytorch/executorch/pull/5073 --- .../ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm index 56a9cd66522..59b66e510bd 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo/Sources/MobileNet/MobileNetClassifier.mm @@ -11,7 +11,7 @@ #import #import -using namespace ::torch::executor; +using namespace ::executorch::extension; NSErrorDomain const ETMobileNetClassifierErrorDomain = @"MobileNetClassifierErrorDomain"; From 0c78a9db8a1194482639a793b4c774764db4502a Mon Sep 17 00:00:00 2001 From: Riley Dulin Date: Wed, 4 Sep 2024 11:27:33 -0700 Subject: [PATCH 174/531] Remove expensive print from lowered_backend_module.py Differential Revision: D62155526 Pull Request resolved: https://github.com/pytorch/executorch/pull/5046 --- exir/lowered_backend_module.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 6ba3b6bcb05..e50d3038dac 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -696,9 +696,6 @@ def create_exported_program_from_submodule( in_spec = pytree.tree_flatten((tuple(subgraph_signature.user_inputs), {}))[1] out_spec = pytree.tree_flatten(subgraph_signature.user_outputs)[1] - print(submodule.graph) - print(subgraph_signature) - return ( ExportedProgram( root=submodule, From d32bf995526a40c6c07f5c9b364ec36648134632 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:38:49 -0400 Subject: [PATCH 175/531] Add op: narrow_copy.out Differential Revision: D62156680 Pull Request resolved: https://github.com/pytorch/executorch/pull/5047 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_narrow_copy.cpp | 57 +++++ kernels/portable/cpu/op_slice_copy.cpp | 31 +-- kernels/portable/cpu/op_slice_scatter.cpp | 2 +- kernels/portable/cpu/util/copy_ops_util.cpp | 27 --- kernels/portable/cpu/util/copy_ops_util.h | 13 -- kernels/portable/cpu/util/index_util.cpp | 77 ------- kernels/portable/cpu/util/index_util.h | 14 -- kernels/portable/cpu/util/slice_util.cpp | 182 ++++++++++++++++ kernels/portable/cpu/util/slice_util.h | 66 ++++++ kernels/portable/cpu/util/targets.bzl | 11 + kernels/portable/functions.yaml | 5 + kernels/test/op_narrow_copy_test.cpp | 197 ++++++++++++++++++ kernels/test/op_slice_copy_test.cpp | 19 ++ kernels/test/targets.bzl | 1 + .../kernels/portable/op_registration_util.bzl | 11 +- 16 files changed, 554 insertions(+), 161 deletions(-) create mode 100644 kernels/portable/cpu/op_narrow_copy.cpp create mode 100644 kernels/portable/cpu/util/slice_util.cpp create mode 100644 kernels/portable/cpu/util/slice_util.h create mode 100644 kernels/test/op_narrow_copy_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index b71585ef9dd..1350fc090b0 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -259,6 +259,8 @@ - op: mul.Scalar_out +- op: narrow_copy.out + - op: native_batch_norm.out - op: native_group_norm.out diff --git a/kernels/portable/cpu/op_narrow_copy.cpp 
b/kernels/portable/cpu/op_narrow_copy.cpp new file mode 100644 index 00000000000..0c21ec5b901 --- /dev/null +++ b/kernels/portable/cpu/op_narrow_copy.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& narrow_copy_out( + RuntimeContext& ctx, + const Tensor& in, + int64_t dim, + int64_t start, + int64_t length, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + check_narrow_copy_args(in, dim, start, length, out), + InvalidArgument, + out); + + if (dim < 0) { + dim += in.dim(); + } + + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType target_sizes[kTensorDimensionLimit]; + size_t target_ndim = 0; + get_narrow_copy_out_target_size(in, dim, length, target_sizes, &target_ndim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {target_sizes, target_ndim}) == Error::Ok, + InvalidArgument, + out); + + if (length != 0) { + compute_slice(in, dim, start, length, 1, out); + } + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/op_slice_copy.cpp b/kernels/portable/cpu/op_slice_copy.cpp index d56bdcd864f..41a76567906 100644 --- a/kernels/portable/cpu/op_slice_copy.cpp +++ b/kernels/portable/cpu/op_slice_copy.cpp @@ -6,8 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#include #include #include @@ -41,40 +40,20 @@ Tensor& slice_copy_Tensor_out( // available) int64_t start = start_val.has_value() ? 
start_val.value() : 0; - int64_t num_values = adjust_slice_indices(in.size(dim), &start, &end, step); + int64_t length = adjust_slice_indices(in.size(dim), &start, &end, step); + // @lint-ignore CLANGTIDY facebook-hte-CArray Tensor::SizesType target_sizes[kTensorDimensionLimit]; size_t target_ndim = 0; - get_slice_copy_out_target_size( - in, dim, num_values, target_sizes, &target_ndim); + get_slice_copy_out_target_size(in, dim, length, target_sizes, &target_ndim); ET_KERNEL_CHECK( ctx, resize_tensor(out, {target_sizes, target_ndim}) == Error::Ok, InvalidArgument, out); - size_t dim_length = in.size(dim); + compute_slice(in, dim, start, length, step, out); - size_t leading_dims = getLeadingDims(in, dim); - size_t trailing_dims = getTrailingDims(in, dim); - - if (trailing_dims == 0) { - return out; - } - - size_t length_per_step = trailing_dims * in.element_size(); - - const char* input_data = in.const_data_ptr(); - char* dest = out.mutable_data_ptr(); - - for (int i = 0; i < leading_dims; i++) { - const char* src = input_data + (i * dim_length + start) * length_per_step; - for (int j = 0; j < num_values; j++) { - memcpy(dest, src, length_per_step); - src += step * length_per_step; - dest += length_per_step; - } - } return out; } diff --git a/kernels/portable/cpu/op_slice_scatter.cpp b/kernels/portable/cpu/op_slice_scatter.cpp index 367b626696f..a1f9ce4d921 100644 --- a/kernels/portable/cpu/op_slice_scatter.cpp +++ b/kernels/portable/cpu/op_slice_scatter.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include namespace torch { diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index bcd72d96a3b..61c07d71a4b 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -411,33 +411,6 @@ void get_select_copy_out_target_size( } } -bool check_slice_copy_args( - const Tensor& in, - int64_t dim, - int64_t step, - Tensor& out) { - ET_LOG_AND_RETURN_IF_FALSE(in.dim() > 0); - ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); - ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - step > 0, "slice step must be greater than zero"); - return true; -} - -void get_slice_copy_out_target_size( - const Tensor& in, - int64_t dim, - int64_t num_values, - exec_aten::SizesType* out_sizes, - size_t* out_ndim) { - *out_ndim = in.dim(); - - for (size_t d = 0; d < in.dim(); ++d) { - out_sizes[d] = in.size(d); - } - out_sizes[dim] = num_values; -} - bool check_split_with_sizes_copy_args( const Tensor& in, exec_aten::ArrayRef split_sizes, diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index ef0fc9579bd..91c62e707e9 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -136,19 +136,6 @@ void get_select_copy_out_target_size( exec_aten::SizesType* out_sizes, size_t* out_ndim); -bool check_slice_copy_args( - const Tensor& in, - int64_t dim, - int64_t step, - Tensor& out); - -void get_slice_copy_out_target_size( - const Tensor& in, - int64_t dim, - int64_t num_values, - exec_aten::SizesType* out_sizes, - size_t* out_ndim); - bool check_split_with_sizes_copy_args( const Tensor& in, exec_aten::ArrayRef split_sizes, diff --git a/kernels/portable/cpu/util/index_util.cpp b/kernels/portable/cpu/util/index_util.cpp index b1c9696fd62..39c556fa01c 100644 --- a/kernels/portable/cpu/util/index_util.cpp +++ b/kernels/portable/cpu/util/index_util.cpp @@ -261,82 +261,5 @@ bool 
check_select_scatter_args( return true; } -bool check_slice_scatter_args( - const Tensor& input, - const Tensor& src, - int64_t dim, - int64_t num_values, - int64_t step, - Tensor output) { - ET_LOG_AND_RETURN_IF_FALSE(input.dim() > 0); - - // Check dim. The dim planed to be selected on shall exist in input - ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, input.dim())); - - // Input and output tensors should be the same shape and dtype - ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_shape_and_dtype(input, output)); - - // The input.dim() shall equal to src.dim() - ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_rank(input, src)); - - // Check step. Step must be greater than zero - ET_LOG_MSG_AND_RETURN_IF_FALSE( - step > 0, "slice step must be greater than zero"); - - // The size of src tensor should follow these rules: - // - src.size(i) shall equal to input.size(i) if i != dim, - // - src.size(dim) shall equal to num_values - for (size_t d = 0; d < input.dim() - 1; d++) { - if (d != dim) { - ET_LOG_AND_RETURN_IF_FALSE( - tensors_have_same_size_at_dims(input, d, src, d)); - } else { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - src.size(d) == num_values, - "input.size(%zu) %zd != num_values %" PRId64 " | dim = %" PRId64 ")", - d, - input.size(d), - num_values, - dim); - } - } - - return true; -} - -int64_t adjust_slice_indices( - int64_t dim_length, - int64_t* start, - int64_t* end, - int64_t step) { - int64_t num_values = 0; - - // Update start and end index - // First convert it to c++ style from python style if needed. - // The start index is using python style E.g., for the shape {2, 3, 4}, - // dim = -1 would refer to dim[2], dim = -2 would refer to dim[1], and so on. - *start = *start < 0 ? *start + dim_length : *start; - *end = *end < 0 ? *end + dim_length : *end; - // Second, if start or end still negative, which means user want to start or - // end slicing from very beginning, so set it to zero - *start = *start < 0 ? 0 : *start; - *end = *end < 0 ? 0 : *end; - // Last, if start or end larger than maximum value (dim_length - 1), indicates - // user want to start slicing after end or slicing until the end, so update it - // to dim_length - *start = *start > dim_length ? dim_length : *start; - *end = *end > dim_length ? dim_length : *end; - - if (*start >= dim_length || *end <= 0 || *start >= *end) { - // Set num_values to 0 if interval [start, end) is non-exist or do not - // overlap with [0, dim_length) - num_values = 0; - } else { - // Update num_values to min(max_num_values, num_values) - num_values = (*end - 1 - *start) / step + 1; - } - return num_values; -} - } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/index_util.h b/kernels/portable/cpu/util/index_util.h index 73d264a748c..0ee430c9726 100644 --- a/kernels/portable/cpu/util/index_util.h +++ b/kernels/portable/cpu/util/index_util.h @@ -64,19 +64,5 @@ bool check_select_scatter_args( int64_t index, Tensor& output); -bool check_slice_scatter_args( - const Tensor& input, - const Tensor& src, - int64_t dim, - int64_t num_values, - int64_t step, - Tensor output); - -int64_t adjust_slice_indices( - int64_t dim_length, - int64_t* start, - int64_t* end, - int64_t step); - } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/slice_util.cpp b/kernels/portable/cpu/util/slice_util.cpp new file mode 100644 index 00000000000..b9f5260e626 --- /dev/null +++ b/kernels/portable/cpu/util/slice_util.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { + +using Tensor = exec_aten::Tensor; + +bool check_narrow_copy_args( + const Tensor& in, + int64_t dim, + int64_t start, + int64_t lenth, + Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(in.dim() > 0); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); + ET_LOG_MSG_AND_RETURN_IF_FALSE(lenth >= 0, "lenth must be non-negative"); + ET_LOG_AND_RETURN_IF_FALSE(start >= -in.size(dim)); + ET_LOG_AND_RETURN_IF_FALSE(start <= in.size(dim)); + if (start < 0) { + start += in.size(dim); + } + ET_LOG_AND_RETURN_IF_FALSE(start + lenth <= in.size(dim)); + return true; +} + +void get_narrow_copy_out_target_size( + const Tensor& in, + int64_t dim, + int64_t length, + exec_aten::SizesType* out_sizes, + size_t* out_ndim) { + *out_ndim = in.dim(); + + for (size_t d = 0; d < in.dim(); ++d) { + out_sizes[d] = in.size(d); + } + out_sizes[dim] = length; +} + +bool check_slice_copy_args( + const Tensor& in, + int64_t dim, + int64_t step, + Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(in.dim() > 0); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + step > 0, "slice step must be greater than zero"); + return true; +} + +void get_slice_copy_out_target_size( + const Tensor& in, + int64_t dim, + int64_t length, + exec_aten::SizesType* out_sizes, + size_t* out_ndim) { + get_narrow_copy_out_target_size(in, dim, length, out_sizes, out_ndim); +} + +bool check_slice_scatter_args( + const Tensor& input, + const Tensor& src, + int64_t dim, + int64_t num_values, + int64_t step, + Tensor output) { + ET_LOG_AND_RETURN_IF_FALSE(input.dim() > 0); + + // Check dim. The dim planed to be selected on shall exist in input + ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, input.dim())); + + // Input and output tensors should be the same shape and dtype + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_shape_and_dtype(input, output)); + + // The input.dim() shall equal to src.dim() + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_rank(input, src)); + + // Check step. Step must be greater than zero + ET_LOG_MSG_AND_RETURN_IF_FALSE( + step > 0, "slice step must be greater than zero"); + + // The size of src tensor should follow these rules: + // - src.size(i) shall equal to input.size(i) if i != dim, + // - src.size(dim) shall equal to num_values + for (size_t d = 0; d < input.dim() - 1; d++) { + if (d != dim) { + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_size_at_dims(input, d, src, d)); + } else { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + src.size(d) == num_values, + "input.size(%zu) %zd != num_values %" PRId64 " | dim = %" PRId64 ")", + d, + input.size(d), + num_values, + dim); + } + } + + return true; +} + +int64_t adjust_slice_indices( + int64_t dim_length, + int64_t* start, + int64_t* end, + int64_t step) { + int64_t num_values = 0; + + // Update start and end index + // First convert it to c++ style from python style if needed. + // The start index is using python style E.g., for the shape {2, 3, 4}, + // dim = -1 would refer to dim[2], dim = -2 would refer to dim[1], and so on. + *start = *start < 0 ? *start + dim_length : *start; + *end = *end < 0 ? 
*end + dim_length : *end; + // Second, if start or end still negative, which means user want to start or + // end slicing from very beginning, so set it to zero + *start = *start < 0 ? 0 : *start; + *end = *end < 0 ? 0 : *end; + // Last, if start or end larger than maximum value (dim_length - 1), indicates + // user want to start slicing after end or slicing until the end, so update it + // to dim_length + *start = *start > dim_length ? dim_length : *start; + *end = *end > dim_length ? dim_length : *end; + + if (*start >= dim_length || *end <= 0 || *start >= *end) { + // Set num_values to 0 if interval [start, end) is non-exist or do not + // overlap with [0, dim_length) + num_values = 0; + } else { + // Update num_values to min(max_num_values, num_values) + num_values = (*end - 1 - *start) / step + 1; + } + return num_values; +} + +void compute_slice( + const Tensor& in, + int64_t dim, + int64_t start, + int64_t length, + int64_t step, + Tensor& out) { + size_t dim_length = in.size(dim); + + size_t leading_dims = getLeadingDims(in, dim); + size_t trailing_dims = getTrailingDims(in, dim); + + if (trailing_dims == 0) { + return; + } + + size_t length_per_step = trailing_dims * in.element_size(); + + const char* input_data = in.const_data_ptr(); + char* dest = out.mutable_data_ptr(); + + for (int i = 0; i < leading_dims; i++) { + const char* src = input_data + (i * dim_length + start) * length_per_step; + for (int j = 0; j < length; j++) { + memcpy(dest, src, length_per_step); + src += step * length_per_step; + dest += length_per_step; + } + } +} + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/slice_util.h b/kernels/portable/cpu/util/slice_util.h new file mode 100644 index 00000000000..734f0dd3c6d --- /dev/null +++ b/kernels/portable/cpu/util/slice_util.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace torch { +namespace executor { + +bool check_narrow_copy_args( + const Tensor& in, + int64_t dim, + int64_t start, + int64_t length, + Tensor& out); + +void get_narrow_copy_out_target_size( + const Tensor& in, + int64_t dim, + int64_t length, + exec_aten::SizesType* out_sizes, + size_t* out_ndim); + +bool check_slice_copy_args( + const Tensor& in, + int64_t dim, + int64_t step, + Tensor& out); + +void get_slice_copy_out_target_size( + const Tensor& in, + int64_t dim, + int64_t num_values, + exec_aten::SizesType* out_sizes, + size_t* out_ndim); + +bool check_slice_scatter_args( + const Tensor& input, + const Tensor& src, + int64_t dim, + int64_t num_values, + int64_t step, + Tensor output); + +int64_t adjust_slice_indices( + int64_t dim_length, + int64_t* start, + int64_t* end, + int64_t step); + +void compute_slice( + const Tensor& in, + int64_t dim, + int64_t start, + int64_t length, + int64_t step, + Tensor& out); + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index bd55b4da304..3961add0fd7 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -29,6 +29,7 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:distance_util", "//executorch/kernels/portable/cpu/util:select_copy_util", "//executorch/kernels/portable/cpu/util:advanced_index_util", + "//executorch/kernels/portable/cpu/util:slice_util", ], visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], ) @@ -226,6 +227,16 @@ def define_common_targets(): visibility = ["//executorch/kernels/portable/cpu/..."], ) + runtime.cxx_library( + name = "slice_util", + srcs = ["slice_util.cpp"], + exported_headers = ["slice_util.h"], + deps = [ + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["//executorch/kernels/portable/cpu/..."], + ) + # Utility functions that can be used by operators that perform reduction for aten_mode in [True, False]: suffix = "_aten" if aten_mode else "" diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 69e0334051c..5136ea0a12f 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -587,6 +587,11 @@ - arg_meta: null kernel_name: torch::executor::mul_scalar_out +- op: narrow_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::narrow_copy_out + - op: native_group_norm.out kernels: - arg_meta: null diff --git a/kernels/test/op_narrow_copy_test.cpp b/kernels/test/op_narrow_copy_test.cpp new file mode 100644 index 00000000000..e453e46500a --- /dev/null +++ b/kernels/test/op_narrow_copy_test.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::optional; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpNarrowCopyOutTest : public OperatorTest { + protected: + Tensor& op_narrow_copy_out( + const Tensor& in, + int64_t dim, + int64_t start, + int64_t length, + Tensor& out) { + return torch::executor::aten::narrow_copy_outf( + context_, in, dim, start, length, out); + } + + template + void test_dtype() { + TensorFactory tf; + + // clang-format off + Tensor input = tf.make( + /*sizes=*/{3, 4}, + /*data=*/{ + 1, 2, 3, 4, // [0, :] + 5, 6, 7, 8, // [1, :] + 9, 10, 11, 12, // [2, :] + }); + + Tensor expected = tf.make( + /*sizes=*/{2, 4}, + /*data=*/{ + 1, 2, 3, 4, // [0, :] + 5, 6, 7, 8, // [1, :] + }); + // clang-format on + + Tensor out = tf.zeros({2, 4}); + Tensor ret = + op_narrow_copy_out(input, /*dim=*/0, /*start=*/0, /*length=*/2, out); + + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); + } +}; + +TEST_F(OpNarrowCopyOutTest, AllDtypesSupported) { +#define TEST_ENTRY(ctype, dtype) test_dtype(); + ET_FORALL_REAL_TYPES_AND(Bool, TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpNarrowCopyOutTest, EmptyInputSupported) { + TensorFactory tf; + + Tensor input = tf.ones({1, 0, 1}); + Tensor out = tf.zeros({1, 0, 1}); + + Tensor expect = tf.ones({1, 0, 1}); + + Tensor ret = + op_narrow_copy_out(input, /*dim=*/0, /*start=*/0, /*length=*/1, out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expect); + + ret = op_narrow_copy_out(input, /*dim=*/1, /*start=*/0, /*length=*/0, out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expect); + + ret = op_narrow_copy_out(input, /*dim=*/2, /*start=*/0, /*length=*/1, out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expect); +} + +TEST_F(OpNarrowCopyOutTest, ZeroLengthSupported) { + TensorFactory tf; + + Tensor input = tf.ones({2, 3}); + Tensor out = tf.ones({2, 0}); + + Tensor expect = tf.ones({2, 0}); + + Tensor ret = + op_narrow_copy_out(input, /*dim=*/1, /*start=*/1, /*length=*/0, out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expect); + + ret = op_narrow_copy_out(input, /*dim=*/1, /*start=*/-1, /*length=*/0, out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expect); +} + +TEST_F(OpNarrowCopyOutTest, ZeroDimInputDies) { + TensorFactory tf; + + Tensor input = tf.ones({}); + Tensor out = tf.ones({}); + + // The operation shall die whatever the end is. 
+ ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, /*dim=*/0, /*start=*/0, /*length=*/0, out)); + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, /*dim=*/0, /*start=*/1, /*length=*/1, out)); +} + +TEST_F(OpNarrowCopyOutTest, InvalidStart) { + TensorFactory tf; + + Tensor input = tf.ones({2, 3}); + Tensor out = tf.ones({2, 3}); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, /*dim=*/0, /*start=*/-3, /*length=*/0, out)); + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, /*dim=*/1, /*start=*/4, /*length=*/0, out)); +} + +TEST_F(OpNarrowCopyOutTest, InvalidStartLengthCombination) { + TensorFactory tf; + + Tensor input = tf.ones({2, 3}); + Tensor out = tf.ones({2, 3}); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, /*dim=*/0, /*start=*/0, /*length=*/3, out)); + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, /*dim=*/1, /*start=*/-1, /*length=*/2, out)); +} + +TEST_F(OpNarrowCopyOutTest, NegativeLengthDies) { + TensorFactory tf; + + Tensor input = tf.ones({1, 1, 1}); + Tensor out = tf.zeros({1, 1, 1}); + + // Some invalid length values. + const std::vector invalid_lengths = {-3, -2, -1}; + for (int64_t length : invalid_lengths) { + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out( + input, /*dim=*/0, /*start=*/0, /*length=*/length, out)); + } +} + +TEST_F(OpNarrowCopyOutTest, DimOutOfBoundDies) { + TensorFactory tf; + + Tensor input = tf.ones({1, 1, 1}); + Tensor out = tf.zeros({1, 1, 1}); + + // Some invalid dim values. + const std::vector invalid_dims = {3, 4, 5, -4, -5, -6}; + for (int64_t dim : invalid_dims) { + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, dim, /*start=*/0, /*length=*/1, out)); + } +} + +TEST_F(OpNarrowCopyOutTest, MismatchedDtypesDies) { + TensorFactory tf_int; + TensorFactory tf_float; + Tensor input = tf_int.zeros({1, 2, 2}); + + // Size is compatible to the output, but a mismatched dtype. 
+ Tensor out = tf_float.ones({1, 2, 2}); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op_narrow_copy_out(input, /*dim=*/0, /*start=*/0, /*length=*/1, out)); +} diff --git a/kernels/test/op_slice_copy_test.cpp b/kernels/test/op_slice_copy_test.cpp index 4c04e4bf51c..9aaf6f18dbc 100644 --- a/kernels/test/op_slice_copy_test.cpp +++ b/kernels/test/op_slice_copy_test.cpp @@ -475,6 +475,25 @@ TEST_F(OpSliceCopyTensorOutTest, EmptySizeInputDies) { input, /*dim=*/0, /*start=*/0, /*end=*/1, /*step=*/1, out)); } +TEST_F(OpSliceCopyTensorOutTest, ZeroLengthSupported) { + TensorFactory tf; + + Tensor input = tf.ones({2, 3}); + Tensor out = tf.ones({2, 0}); + + Tensor expect = tf.ones({2, 0}); + + Tensor ret = op_slice_copy_tensor_out( + input, /*dim=*/1, /*start=*/1, /*end=*/1, /*step=*/1, out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expect); + + ret = op_slice_copy_tensor_out( + input, /*dim=*/1, /*start=*/-1, /*end=*/-1, /*step=*/1, out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expect); +} + TEST_F(OpSliceCopyTensorOutTest, NonPostiveStepsDies) { TensorFactory tf; diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 749a221f9c0..7ae17c5237a 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -246,6 +246,7 @@ def define_common_targets(): _common_op_test("op_minimum_test", ["aten", "portable"]) _common_op_test("op_mm_test", ["aten", "portable"]) _common_op_test("op_mul_test", ["aten", "portable", "optimized"]) + _common_op_test("op_narrow_copy_test", ["aten", "portable"]) _common_op_test("op_native_batch_norm_test", ["aten", "portable"]) _common_op_test("op_native_group_norm_test", ["aten", "portable"]) _common_op_test("op_native_layer_norm_test", ["aten", "portable", "optimized"]) diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index b56f40c0215..ef8f936571c 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -808,6 +808,12 @@ ATEN_OPS = ( ":scalar_utils", ], ), + op_target( + name = "op_narrow_copy", + deps = [ + "//executorch/kernels/portable/cpu/util:slice_util", + ], + ), op_target( name = "op_native_batch_norm", deps = [ @@ -1042,14 +1048,13 @@ ATEN_OPS = ( op_target( name = "op_slice_copy", deps = [ - "//executorch/kernels/portable/cpu/util:copy_ops_util", - "//executorch/kernels/portable/cpu/util:index_util", + "//executorch/kernels/portable/cpu/util:slice_util", ], ), op_target( name = "op_slice_scatter", deps = [ - "//executorch/kernels/portable/cpu/util:index_util", + "//executorch/kernels/portable/cpu/util:slice_util", ], ), op_target( From 550cd99545e0c724f7b63edd96dadf787d5a0185 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 4 Sep 2024 12:14:09 -0700 Subject: [PATCH 176/531] Rephrase namespaces for brevity. 
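The module sources already sit inside namespace executorch { namespace extension { ... } }, so the fully qualified ::executorch::runtime:: spellings are redundant: unqualified lookup walks outward through the enclosing namespaces, and runtime::Error, runtime::Result, and the rest resolve to the same types. The rewrite below leans on that lookup rule; a self-contained illustration (the type and function names here are invented for the example):

    namespace executorch {
    namespace runtime {
    struct Error {};
    } // namespace runtime

    namespace extension {
    // Inside executorch::extension, the name "runtime" is found in the
    // enclosing executorch namespace, so runtime::Error names
    // ::executorch::runtime::Error without the full qualification.
    inline runtime::Error do_nothing() {
      return runtime::Error{};
    }
    } // namespace extension
    } // namespace executorch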
Differential Revision: D62189433 Pull Request resolved: https://github.com/pytorch/executorch/pull/5080 --- extension/module/module.cpp | 84 ++++++++++------------- extension/module/module.h | 128 +++++++++++++++--------------------- 2 files changed, 90 insertions(+), 122 deletions(-) diff --git a/extension/module/module.cpp b/extension/module/module.cpp index f9f1d6be0f7..725fe707bb7 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -33,42 +33,26 @@ std::move(*et_result__)); \ }) -using ::exec_aten::Tensor; -using ::executorch::extension::FileDataLoader; -using ::executorch::extension::MallocMemoryAllocator; -using ::executorch::extension::MmapDataLoader; -using ::executorch::runtime::DataLoader; -using ::executorch::runtime::Error; -using ::executorch::runtime::EValue; -using ::executorch::runtime::EventTracer; -using ::executorch::runtime::HierarchicalAllocator; -using ::executorch::runtime::MemoryAllocator; -using ::executorch::runtime::MemoryManager; -using ::executorch::runtime::MethodMeta; -using ::executorch::runtime::Program; -using ::executorch::runtime::Result; -using ::executorch::runtime::Span; - namespace executorch { namespace extension { Module::Module( const std::string& file_path, - const Module::LoadMode load_mode, - std::unique_ptr event_tracer) + const LoadMode load_mode, + std::unique_ptr event_tracer) : file_path_(file_path), load_mode_(load_mode), memory_allocator_(std::make_unique()), temp_allocator_(std::make_unique()), event_tracer_(std::move(event_tracer)) { - ::executorch::runtime::runtime_init(); + runtime::runtime_init(); } Module::Module( - std::unique_ptr data_loader, - std::unique_ptr memory_allocator, - std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr data_loader, + std::unique_ptr memory_allocator, + std::unique_ptr temp_allocator, + std::unique_ptr event_tracer) : data_loader_(std::move(data_loader)), memory_allocator_( memory_allocator ? std::move(memory_allocator) @@ -77,14 +61,14 @@ Module::Module( temp_allocator ? std::move(temp_allocator) : std::make_unique()), event_tracer_(std::move(event_tracer)) { - ::executorch::runtime::runtime_init(); + runtime::runtime_init(); } Module::Module( - std::shared_ptr program, - std::unique_ptr memory_allocator, - std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::shared_ptr program, + std::unique_ptr memory_allocator, + std::unique_ptr temp_allocator, + std::unique_ptr event_tracer) : program_(std::move(program)), memory_allocator_( memory_allocator ? std::move(memory_allocator) @@ -93,10 +77,10 @@ Module::Module( temp_allocator ? 
std::move(temp_allocator) : std::make_unique()), event_tracer_(std::move(event_tracer)) { - ::executorch::runtime::runtime_init(); + runtime::runtime_init(); } -Error Module::load(const Program::Verification verification) { +runtime::Error Module::load(const runtime::Program::Verification verification) { if (!is_loaded()) { if (!data_loader_) { switch (load_mode_) { @@ -119,15 +103,15 @@ Error Module::load(const Program::Verification verification) { break; } }; - auto program = - ET_UNWRAP_UNIQUE(Program::load(data_loader_.get(), verification)); - program_ = std::shared_ptr( - program.release(), [](Program* pointer) { delete pointer; }); + auto program = ET_UNWRAP_UNIQUE( + runtime::Program::load(data_loader_.get(), verification)); + program_ = std::shared_ptr( + program.release(), [](runtime::Program* pointer) { delete pointer; }); } - return Error::Ok; + return runtime::Error::Ok; } -Result> Module::method_names() { +runtime::Result> Module::method_names() { ET_CHECK_OK_OR_RETURN_ERROR(load()); const auto method_count = program_->num_methods(); std::unordered_set result; @@ -139,7 +123,7 @@ Result> Module::method_names() { return result; } -Error Module::load_method(const std::string& method_name) { +runtime::Error Module::load_method(const std::string& method_name) { if (!is_method_loaded(method_name)) { ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -158,10 +142,11 @@ Error Module::load_method(const std::string& method_name) { method_holder.planned_spans.emplace_back( method_holder.planned_buffers.back().data(), buffer_size); } - method_holder.planned_memory = std::make_unique(Span( - method_holder.planned_spans.data(), - method_holder.planned_spans.size())); - method_holder.memory_manager = std::make_unique( + method_holder.planned_memory = + std::make_unique(runtime::Span( + method_holder.planned_spans.data(), + method_holder.planned_spans.size())); + method_holder.memory_manager = std::make_unique( memory_allocator_.get(), method_holder.planned_memory.get(), temp_allocator_.get()); @@ -171,33 +156,36 @@ Error Module::load_method(const std::string& method_name) { event_tracer_.get())); methods_.emplace(method_name, std::move(method_holder)); } - return Error::Ok; + return runtime::Error::Ok; } -Result Module::method_meta(const std::string& method_name) { +runtime::Result Module::method_meta( + const std::string& method_name) { ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); return methods_.at(method_name).method->method_meta(); } -Result> Module::execute( +runtime::Result> Module::execute( const std::string& method_name, - const std::vector& input) { + const std::vector& input) { ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name)); auto& method = methods_.at(method_name).method; ET_CHECK_OK_OR_RETURN_ERROR(method->set_inputs( - exec_aten::ArrayRef(input.data(), input.size()))); + exec_aten::ArrayRef(input.data(), input.size()))); ET_CHECK_OK_OR_RETURN_ERROR(method->execute()); const auto outputs_size = method->outputs_size(); - std::vector outputs(outputs_size); + std::vector outputs(outputs_size); ET_CHECK_OK_OR_RETURN_ERROR( method->get_outputs(outputs.data(), outputs_size)); return outputs; } -Error Module::set_output_data_ptr(EValue output_value, size_t output_index) { +runtime::Error Module::set_output_data_ptr( + runtime::EValue output_value, + size_t output_index) { ET_CHECK_OK_OR_RETURN_ERROR(load_method("forward")); auto& output_tensor = output_value.toTensor(); auto& method = methods_.at("forward").method; diff --git a/extension/module/module.h b/extension/module/module.h 
index 8ae7e436556..052489fb331 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -48,8 +48,7 @@ class Module final { explicit Module( const std::string& file_path, const LoadMode load_mode = LoadMode::MmapUseMlock, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = - nullptr); + std::unique_ptr event_tracer = nullptr); /** * Constructs an instance with the provided data loader and memory allocator. @@ -61,13 +60,10 @@ class Module final { * @param[in] event_tracer A EventTracer used for tracking and logging events. */ explicit Module( - std::unique_ptr<::executorch::runtime::DataLoader> data_loader, - std::unique_ptr<::executorch::runtime::MemoryAllocator> memory_allocator = - nullptr, - std::unique_ptr<::executorch::runtime::MemoryAllocator> temp_allocator = - nullptr, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = - nullptr); + std::unique_ptr data_loader, + std::unique_ptr memory_allocator = nullptr, + std::unique_ptr temp_allocator = nullptr, + std::unique_ptr event_tracer = nullptr); /** * Constructs an instance using an existing shared program. @@ -80,13 +76,10 @@ class Module final { * @param[in] event_tracer A EventTracer used for tracking and logging events. */ explicit Module( - std::shared_ptr<::executorch::runtime::Program> program, - std::unique_ptr<::executorch::runtime::MemoryAllocator> memory_allocator = - nullptr, - std::unique_ptr<::executorch::runtime::MemoryAllocator> temp_allocator = - nullptr, - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = - nullptr); + std::shared_ptr program, + std::unique_ptr memory_allocator = nullptr, + std::unique_ptr temp_allocator = nullptr, + std::unique_ptr event_tracer = nullptr); Module(const Module&) = delete; Module& operator=(const Module&) = delete; @@ -102,9 +95,9 @@ class Module final { * @returns An Error to indicate success or failure of the loading process. */ ET_NODISCARD - ::executorch::runtime::Error load( - const ::executorch::runtime::Program::Verification verification = - ::executorch::runtime::Program::Verification::Minimal); + runtime::Error load( + const runtime::Program::Verification verification = + runtime::Program::Verification::Minimal); /** * Checks if the program is loaded. @@ -121,7 +114,7 @@ class Module final { * * @returns Shared pointer to the program or nullptr if it's not yet loaded. */ - inline std::shared_ptr<::executorch::runtime::Program> program() const { + inline std::shared_ptr program() const { return program_; } @@ -132,7 +125,7 @@ class Module final { * @returns A set of strings containing the names of the methods, or an error * if the program or method failed to load. */ - ::executorch::runtime::Result> method_names(); + runtime::Result> method_names(); /** * Load a specific method from the program and set up memory management if @@ -143,7 +136,7 @@ class Module final { * @returns An Error to indicate success or failure. */ ET_NODISCARD - ::executorch::runtime::Error load_method(const std::string& method_name); + runtime::Error load_method(const std::string& method_name); /** * Checks if a specific method is loaded. @@ -166,7 +159,7 @@ class Module final { * @returns A method metadata, or an error if the program or method failed to * load. */ - ::executorch::runtime::Result<::executorch::runtime::MethodMeta> method_meta( + runtime::Result method_meta( const std::string& method_name); /** @@ -180,10 +173,9 @@ class Module final { * from the method or an error to indicate failure. 
*/ ET_NODISCARD - ::executorch::runtime::Result> - execute( + runtime::Result> execute( const std::string& method_name, - const std::vector<::executorch::runtime::EValue>& input); + const std::vector& input); /** * Execute a specific method with a single input value. @@ -195,13 +187,10 @@ class Module final { * @returns A Result object containing either a vector of output values * from the method or an error to indicate failure. */ - ET_NODISCARD inline ::executorch::runtime::Result< - std::vector<::executorch::runtime::EValue>> - execute( + ET_NODISCARD inline runtime::Result> execute( const std::string& method_name, - const ::executorch::runtime::EValue& input) { - return execute( - method_name, std::vector<::executorch::runtime::EValue>{input}); + const runtime::EValue& input) { + return execute(method_name, std::vector{input}); } /** @@ -213,10 +202,9 @@ class Module final { * @returns A Result object containing either a vector of output values * from the method or an error to indicate failure. */ - ET_NODISCARD inline ::executorch::runtime::Result< - std::vector<::executorch::runtime::EValue>> - execute(const std::string& method_name) { - return execute(method_name, std::vector<::executorch::runtime::EValue>{}); + ET_NODISCARD inline runtime::Result> execute( + const std::string& method_name) { + return execute(method_name, std::vector{}); } /** @@ -229,13 +217,12 @@ class Module final { * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ - ET_NODISCARD inline ::executorch::runtime::Result< - ::executorch::runtime::EValue> - get(const std::string& method_name, - const std::vector<::executorch::runtime::EValue>& input) { + ET_NODISCARD inline runtime::Result get( + const std::string& method_name, + const std::vector& input) { auto result = ET_UNWRAP(execute(method_name, input)); if (result.empty()) { - return ::executorch::runtime::Error::InvalidArgument; + return runtime::Error::InvalidArgument; } return result[0]; } @@ -250,11 +237,10 @@ class Module final { * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ - ET_NODISCARD inline ::executorch::runtime::Result< - ::executorch::runtime::EValue> - get(const std::string& method_name, - const ::executorch::runtime::EValue& input) { - return get(method_name, std::vector<::executorch::runtime::EValue>{input}); + ET_NODISCARD inline runtime::Result get( + const std::string& method_name, + const runtime::EValue& input) { + return get(method_name, std::vector{input}); } /** @@ -266,10 +252,9 @@ class Module final { * @returns A Result object containing either the first output value from the * method or an error to indicate failure. */ - ET_NODISCARD inline ::executorch::runtime::Result< - ::executorch::runtime::EValue> - get(const std::string& method_name) { - return get(method_name, std::vector<::executorch::runtime::EValue>{}); + ET_NODISCARD inline runtime::Result get( + const std::string& method_name) { + return get(method_name, std::vector{}); } /** @@ -281,9 +266,8 @@ class Module final { * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. 
*/ - ET_NODISCARD inline ::executorch::runtime::Result< - std::vector<::executorch::runtime::EValue>> - forward(const std::vector<::executorch::runtime::EValue>& input) { + ET_NODISCARD inline runtime::Result> forward( + const std::vector& input) { return execute("forward", input); } @@ -296,10 +280,9 @@ class Module final { * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. */ - ET_NODISCARD inline ::executorch::runtime::Result< - std::vector<::executorch::runtime::EValue>> - forward(const ::executorch::runtime::EValue& input) { - return forward(std::vector<::executorch::runtime::EValue>{input}); + ET_NODISCARD inline runtime::Result> forward( + const runtime::EValue& input) { + return forward(std::vector{input}); } /** @@ -309,10 +292,8 @@ class Module final { * @returns A Result object containing either a vector of output values * from the 'forward' method or an error to indicate failure. */ - ET_NODISCARD inline ::executorch::runtime::Result< - std::vector<::executorch::runtime::EValue>> - forward() { - return forward(std::vector<::executorch::runtime::EValue>{}); + ET_NODISCARD inline runtime::Result> forward() { + return forward(std::vector{}); } /** @@ -323,7 +304,7 @@ class Module final { * @returns A pointer to the EventTracer instance. Returns nullptr if no * EventTracer is set. */ - inline ::executorch::runtime::EventTracer* event_tracer() const { + inline runtime::EventTracer* event_tracer() const { return event_tracer_.get(); } @@ -335,28 +316,27 @@ class Module final { * * @returns An Error to indicate success or failure of the loading process. */ - ::executorch::runtime::Error set_output_data_ptr( - ::executorch::runtime::EValue output_value, + runtime::Error set_output_data_ptr( + runtime::EValue output_value, size_t output_index); private: struct MethodHolder { std::vector> planned_buffers; - std::vector<::executorch::runtime::Span> planned_spans; - std::unique_ptr<::executorch::runtime::HierarchicalAllocator> - planned_memory; - std::unique_ptr<::executorch::runtime::MemoryManager> memory_manager; - std::unique_ptr<::executorch::runtime::Method> method; + std::vector> planned_spans; + std::unique_ptr planned_memory; + std::unique_ptr memory_manager; + std::unique_ptr method; }; private: std::string file_path_; LoadMode load_mode_{LoadMode::MmapUseMlock}; - std::shared_ptr<::executorch::runtime::Program> program_; - std::unique_ptr<::executorch::runtime::DataLoader> data_loader_; - std::unique_ptr<::executorch::runtime::MemoryAllocator> memory_allocator_; - std::unique_ptr<::executorch::runtime::MemoryAllocator> temp_allocator_; - std::unique_ptr<::executorch::runtime::EventTracer> event_tracer_; + std::shared_ptr program_; + std::unique_ptr data_loader_; + std::unique_ptr memory_allocator_; + std::unique_ptr temp_allocator_; + std::unique_ptr event_tracer_; std::unordered_map methods_; friend class ExecuTorchJni; From e1dfb14fe453f96d8a6168f70b526ea27317022f Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 4 Sep 2024 12:21:27 -0700 Subject: [PATCH 177/531] Exclude some extra duplicated runtime symbols. 
Differential Revision: D62198181 Pull Request resolved: https://github.com/pytorch/executorch/pull/5079 --- build/cmake_deps.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 476a3e69fad..60d3ac71451 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -181,6 +181,7 @@ filters = [ ] deps = [ "executorch", + "executorch_no_prim_ops", ] # ---------------------------------- extension end ---------------------------------- # ---------------------------------- binary start ---------------------------------- From a0b8bf059de7bccee1de4409f85e9f6f5089c957 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 4 Sep 2024 12:37:49 -0700 Subject: [PATCH 178/531] Format cmake files. Differential Revision: D62168513 Pull Request resolved: https://github.com/pytorch/executorch/pull/5081 --- examples/arm/executor_runner/CMakeLists.txt | 4 +++- extension/parallel/test/CMakeLists.txt | 12 ++++++++---- extension/threadpool/CMakeLists.txt | 9 +++++---- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index b32b2d8d2bf..68c5435dffe 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -133,7 +133,9 @@ endif() # The arm_executor_runner executable add_executable(arm_executor_runner) -target_sources(arm_executor_runner PRIVATE arm_executor_runner.cpp arm_perf_monitor.cpp) +target_sources( + arm_executor_runner PRIVATE arm_executor_runner.cpp arm_perf_monitor.cpp +) # Include the target's bare-metal linker script ethosu_eval_link_options(arm_executor_runner) diff --git a/extension/parallel/test/CMakeLists.txt b/extension/parallel/test/CMakeLists.txt index 9f1ff1871a2..ab37f66c17d 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/parallel/test/CMakeLists.txt @@ -21,12 +21,16 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs - thread_parallel_test.cpp ../thread_parallel.cpp -) +set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) et_cxx_test( - extension_parallel_test SOURCES ${_test_srcs} EXTRA_LIBS pthreadpool cpuinfo extension_threadpool + extension_parallel_test + SOURCES + ${_test_srcs} + EXTRA_LIBS + pthreadpool + cpuinfo + extension_threadpool ) target_include_directories( extension_parallel_test diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index a82afc045a3..281b63b8592 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -20,16 +20,17 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -add_library(extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp) +add_library( + extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp +) target_link_libraries( extension_threadpool PUBLIC executorch_no_prim_ops cpuinfo pthreadpool ) target_include_directories(extension_threadpool PUBLIC ${EXECUTORCH_ROOT}/..) 
target_include_directories( extension_threadpool - PUBLIC - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include + PUBLIC ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include + ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) From 03a8f34f29574e7a7d782e0ec1050875f8d8ea92 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Wed, 4 Sep 2024 13:17:27 -0700 Subject: [PATCH 179/531] b&c start ops | add dim order sanity check Differential Revision: D59824515 Pull Request resolved: https://github.com/pytorch/executorch/pull/4334 --- kernels/portable/cpu/op_bitwise_and.cpp | 6 ++++++ kernels/portable/cpu/op_bitwise_not.cpp | 2 ++ kernels/portable/cpu/op_bitwise_or.cpp | 6 ++++++ kernels/portable/cpu/op_bitwise_xor.cpp | 6 ++++++ kernels/portable/cpu/op_bmm.cpp | 5 +++++ kernels/portable/cpu/op_cdist_forward.cpp | 5 +++++ kernels/portable/cpu/op_clamp.cpp | 9 +++++++++ kernels/portable/cpu/op_clone.cpp | 3 +++ kernels/portable/cpu/op_constant_pad_nd.cpp | 3 +++ kernels/portable/cpu/op_convolution.cpp | 3 +++ kernels/portable/cpu/op_copy.cpp | 6 ++++++ kernels/portable/cpu/op_cumsum.cpp | 3 +++ kernels/portable/cpu/util/copy_ops_util.cpp | 2 ++ 13 files changed, 59 insertions(+) diff --git a/kernels/portable/cpu/op_bitwise_and.cpp b/kernels/portable/cpu/op_bitwise_and.cpp index de137afbec2..79c0e939f78 100644 --- a/kernels/portable/cpu/op_bitwise_and.cpp +++ b/kernels/portable/cpu/op_bitwise_and.cpp @@ -32,6 +32,9 @@ Tensor& bitwise_and_Tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); @@ -82,6 +85,9 @@ Tensor& bitwise_and_Scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); diff --git a/kernels/portable/cpu/op_bitwise_not.cpp b/kernels/portable/cpu/op_bitwise_not.cpp index 157d2fd173c..a6993a76563 100644 --- a/kernels/portable/cpu/op_bitwise_not.cpp +++ b/kernels/portable/cpu/op_bitwise_not.cpp @@ -33,6 +33,8 @@ Tensor& bitwise_not_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { "Failed to resize output tensor."); ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); if (in.scalar_type() == exec_aten::ScalarType::Bool) { apply_unary_map_fn( diff --git a/kernels/portable/cpu/op_bitwise_or.cpp b/kernels/portable/cpu/op_bitwise_or.cpp index 39707de07ce..7c53b687222 100644 --- a/kernels/portable/cpu/op_bitwise_or.cpp +++ b/kernels/portable/cpu/op_bitwise_or.cpp @@ -32,6 +32,9 @@ Tensor& bitwise_or_Tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); @@ -74,6 +77,9 @@ Tensor& bitwise_or_Scalar_out( Tensor& out) { (void)ctx; + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + // Resize for dynamic shape ET_KERNEL_CHECK_MSG( ctx, 
diff --git a/kernels/portable/cpu/op_bitwise_xor.cpp b/kernels/portable/cpu/op_bitwise_xor.cpp index 1855485ee52..2d1bb410345 100644 --- a/kernels/portable/cpu/op_bitwise_xor.cpp +++ b/kernels/portable/cpu/op_bitwise_xor.cpp @@ -32,6 +32,9 @@ Tensor& bitwise_xor_Tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); @@ -82,6 +85,9 @@ Tensor& bitwise_xor_Scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); diff --git a/kernels/portable/cpu/op_bmm.cpp b/kernels/portable/cpu/op_bmm.cpp index e36a4c2e413..6d9bad64236 100644 --- a/kernels/portable/cpu/op_bmm.cpp +++ b/kernels/portable/cpu/op_bmm.cpp @@ -23,6 +23,11 @@ Tensor& bmm_out( Tensor& out) { ET_KERNEL_CHECK(ctx, check_bmm_args(in, mat2, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, mat2, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + size_t output_ndim = 0; exec_aten::SizesType output_sizes[kTensorDimensionLimit]; get_bmm_out_target_size(in, mat2, output_sizes, &output_ndim); diff --git a/kernels/portable/cpu/op_cdist_forward.cpp b/kernels/portable/cpu/op_cdist_forward.cpp index 5f435806926..2f217562847 100644 --- a/kernels/portable/cpu/op_cdist_forward.cpp +++ b/kernels/portable/cpu/op_cdist_forward.cpp @@ -124,6 +124,11 @@ Tensor& _cdist_forward_out( Tensor& out) { (void)ctx; + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(x1, x2, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(x1), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, check_cdist_args(x1, x2, p, compute_mode, out), diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index 33a9fe899db..5d9a0166e8d 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -83,6 +83,9 @@ Tensor& clamp_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); ScalarType min_type = in_type; ScalarType max_type = in_type; @@ -182,6 +185,12 @@ Tensor& clamp_tensor_out( const Tensor& min = has_min ? min_opt.value() : in; const Tensor& max = has_max ? 
max_opt.value() : in; + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, min, max, out), + InvalidArgument, + out); + ET_KERNEL_CHECK( ctx, resize_to_broadcast_target_size(in, min, max, out) == Error::Ok, diff --git a/kernels/portable/cpu/op_clone.cpp b/kernels/portable/cpu/op_clone.cpp index a49f4169dbf..df37cfec82a 100644 --- a/kernels/portable/cpu/op_clone.cpp +++ b/kernels/portable/cpu/op_clone.cpp @@ -38,6 +38,9 @@ Tensor& clone_out( InvalidArgument, out); + ET_KERNEL_CHECK( + context, tensors_have_same_dim_order(self, out), InvalidArgument, out); + // Right now we only focus on contiguous memory, memory_format shall always // either a nullopt or exec::aten::MemoryFormat::Contiguous ET_KERNEL_CHECK( diff --git a/kernels/portable/cpu/op_constant_pad_nd.cpp b/kernels/portable/cpu/op_constant_pad_nd.cpp index 32a3985b29e..09470cdfc76 100644 --- a/kernels/portable/cpu/op_constant_pad_nd.cpp +++ b/kernels/portable/cpu/op_constant_pad_nd.cpp @@ -170,6 +170,9 @@ Tensor& constant_pad_nd_out( ET_KERNEL_CHECK( ctx, check_constant_pad_args(in, pad, value, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + // resize out tensor for dynamic shapes ET_KERNEL_CHECK_MSG( ctx, diff --git a/kernels/portable/cpu/op_convolution.cpp b/kernels/portable/cpu/op_convolution.cpp index 81a0747e454..33ad177f5da 100644 --- a/kernels/portable/cpu/op_convolution.cpp +++ b/kernels/portable/cpu/op_convolution.cpp @@ -365,6 +365,9 @@ Tensor& convolution_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + size_t output_ndim = 0; exec_aten::SizesType output_sizes[kTensorDimensionLimit]; get_convolution_out_target_size( diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 900b6e39d34..764a50a5d20 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -39,6 +39,9 @@ Tensor& copy_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); ScalarType src_type = src.scalar_type(); @@ -66,6 +69,9 @@ copy_(RuntimeContext& ctx, Tensor& in, const Tensor& src, bool non_blocking) { ET_KERNEL_CHECK( ctx, tensor_is_broadcastable_to(src, in), InvalidArgument, in); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, src), InvalidArgument, in); + ScalarType in_type = in.scalar_type(); ScalarType src_type = src.scalar_type(); diff --git a/kernels/portable/cpu/op_cumsum.cpp b/kernels/portable/cpu/op_cumsum.cpp index fffc2d46392..6dadd27ad86 100644 --- a/kernels/portable/cpu/op_cumsum.cpp +++ b/kernels/portable/cpu/op_cumsum.cpp @@ -93,6 +93,9 @@ Tensor& cumsum_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, resize_tensor(out, self.sizes()) == Error::Ok, InvalidArgument, out); diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index 61c07d71a4b..a46ccbf241a 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -95,6 +95,8 @@ bool check_cat_args( ET_LOG_AND_RETURN_IF_FALSE( canCast(tensors[i].scalar_type(), out.scalar_type())); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dim_order(tensors[i], out)); + // Empty tensors have no shape constraints. 
if (tensors[i].numel() == 0) { continue; From ae05ed80403fa37cda487a5d3d30c20f09fa2640 Mon Sep 17 00:00:00 2001 From: lucylq Date: Wed, 4 Sep 2024 13:38:34 -0700 Subject: [PATCH 180/531] Remove usages of extract_constant_segment=False Differential Revision: D61945429 Pull Request resolved: https://github.com/pytorch/executorch/pull/5074 --- backends/apple/mps/test/test_mps_utils.py | 8 ++------ examples/apple/coreml/scripts/export.py | 10 ++-------- examples/apple/coreml/scripts/inspector_utils.py | 1 - examples/apple/mps/scripts/mps_example.py | 10 ++-------- examples/arm/aot_arm_compiler.py | 4 +--- examples/models/llava/export_llava.py | 1 - examples/portable/scripts/export_and_delegate.py | 10 ++-------- examples/qualcomm/oss_scripts/llama2/llama.py | 1 - examples/qualcomm/scripts/export_example.py | 4 +--- examples/qualcomm/utils.py | 1 - examples/sdk/scripts/gen_sample_etrecord.py | 7 +++---- examples/xnnpack/aot_compiler.py | 6 +++--- examples/xnnpack/quantization/example.py | 6 +++--- 13 files changed, 19 insertions(+), 50 deletions(-) diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index 199a7fe1782..6f7d00d7b09 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -239,9 +239,7 @@ def lower_module_and_test_output( ) executorch_program = delegated_program.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: delegated_program = to_backend( @@ -258,9 +256,7 @@ def lower_module_and_test_output( _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. ), ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) if bundled_program: diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index 5a8c9b227f6..e906c0704cb 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -104,11 +104,7 @@ def export_lowered_module_to_executorch_program(lowered_module, example_inputs): lowered_module(*example_inputs) exec_prog = to_edge( export(lowered_module, example_inputs), compile_config=_EDGE_COMPILE_CONFIG - ).to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) - ) + ).to_executorch(config=exir.ExecutorchBackendConfig(extract_delegate_segments=True)) return exec_prog @@ -178,9 +174,7 @@ def generate_compile_specs_from_args(args): ) delegated_program_manager = edge_program_manager.to_backend(partitioner) exec_program = delegated_program_manager.to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) ) else: lowered_module, edge_copy = lower_module_to_coreml( diff --git a/examples/apple/coreml/scripts/inspector_utils.py b/examples/apple/coreml/scripts/inspector_utils.py index c5674ec520b..9d7420a920a 100644 --- a/examples/apple/coreml/scripts/inspector_utils.py +++ b/examples/apple/coreml/scripts/inspector_utils.py @@ -79,7 +79,6 @@ def build_sdk_runner_including_coreml( ) _EDGE_BACKEND_CONFIG = exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True, ) diff --git 
a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index 636444e2b78..d6416e0ffc8 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -183,9 +183,7 @@ def get_model_config(args): logging.info(f"Lowered graph:\n{edge.exported_program().graph}") executorch_program = edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: lowered_module = to_backend( @@ -195,11 +193,7 @@ def get_model_config(args): lowered_module, example_inputs, edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False), - ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) - ) + ).to_executorch(config=ExecutorchBackendConfig(extract_delegate_segments=False)) model_name = f"{args.model_name}_mps" diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index f854a081fa6..4d77e819089 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -226,9 +226,7 @@ def forward(self, x): try: exec_prog = edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) except RuntimeError as e: if "Missing out variants" in str(e.args[0]): diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 5cd8628c603..4f8a403bb34 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -222,7 +222,6 @@ def export_all(llava_model: LlavaModel): executorch_program = lowered_and_edge.to_executorch( ExecutorchBackendConfig( - extract_constant_segment=True, extract_delegate_segments=True, passes=[ QuantFusionPass(), diff --git a/examples/portable/scripts/export_and_delegate.py b/examples/portable/scripts/export_and_delegate.py index 8d394537188..8df476f3dfc 100644 --- a/examples/portable/scripts/export_and_delegate.py +++ b/examples/portable/scripts/export_and_delegate.py @@ -15,8 +15,6 @@ BackendWithCompilerDemo, ) from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo -from executorch.exir.capture._config import ExecutorchBackendConfig - from executorch.extension.export_util import export_to_edge from ...models import MODEL_NAME_TO_MODEL @@ -94,9 +92,7 @@ def forward(self, *args): logging.info(f"Lowered graph:\n{composited_edge.exported_program().graph}") - exec_prog = composited_edge.to_executorch( - config=ExecutorchBackendConfig(extract_constant_segment=False) - ) + exec_prog = composited_edge.to_executorch() buffer = exec_prog.buffer model_name = "composite_model" @@ -147,9 +143,7 @@ def get_example_inputs(self): edge = edge.to_backend(AddMulPartitionerDemo()) logging.info(f"Lowered graph:\n{edge.exported_program().graph}") - exec_prog = edge.to_executorch( - config=ExecutorchBackendConfig(extract_constant_segment=False) - ) + exec_prog = edge.to_executorch() buffer = exec_prog.buffer model_name = "partition_lowered_model" diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py index 087296b15bc..f7fda3b9849 100644 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -315,7 +315,6 @@ def lowering_modules( passes=[ BuildQuantIo(), ], - 
extract_constant_segment=False, # For shared buffer, user must pass the memory address # which is allocated by RPC memory to executor runner. # Therefore, won't want to pre-allocate diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py index 8339b9f5b58..08f18d6ac6a 100644 --- a/examples/qualcomm/scripts/export_example.py +++ b/examples/qualcomm/scripts/export_example.py @@ -96,9 +96,7 @@ ) executorch_program = delegated_program.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) if args.generate_etrecord: diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index ef21892f96c..1a748bb45e1 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -254,7 +254,6 @@ def build_executorch_binary( ) executorch_config = ExecutorchBackendConfig( - extract_constant_segment=False, # For shared buffer, user must pass the memory address # which is allocated by RPC memory to executor runner. # Therefore, won't want to pre-allocate diff --git a/examples/sdk/scripts/gen_sample_etrecord.py b/examples/sdk/scripts/gen_sample_etrecord.py index d2c4913b035..9194b7caa23 100644 --- a/examples/sdk/scripts/gen_sample_etrecord.py +++ b/examples/sdk/scripts/gen_sample_etrecord.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # Generate fixture files import argparse import copy @@ -18,7 +20,6 @@ ExportedProgram, to_edge, ) -from executorch.exir.capture._config import ExecutorchBackendConfig from torch.export import export from ...models import MODEL_NAME_TO_MODEL @@ -38,9 +39,7 @@ def gen_etrecord(model: torch.nn.Module, inputs: Any, output_path=None): aten_dialect, compile_config=EdgeCompileConfig(_check_ir_validity=True) ) edge_program_copy = copy.deepcopy(edge_program) - et_program: ExecutorchProgramManager = edge_program_copy.to_executorch( - config=ExecutorchBackendConfig(extract_constant_segment=False) - ) + et_program: ExecutorchProgramManager = edge_program_copy.to_executorch() generate_etrecord( (DEFAULT_OUTPUT_PATH if not output_path else output_path), edge_dialect_program=edge_program, diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py index 32d67e0cd4a..0ae84c0197a 100644 --- a/examples/xnnpack/aot_compiler.py +++ b/examples/xnnpack/aot_compiler.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # Example script for exporting simple models to flatbuffer import argparse @@ -103,9 +105,7 @@ logging.info(f"Lowered graph:\n{edge.exported_program().graph}") exec_prog = edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) if args.etrecord is not None: diff --git a/examples/xnnpack/quantization/example.py b/examples/xnnpack/quantization/example.py index e64c171f6d4..bd23f7f383e 100644 --- a/examples/xnnpack/quantization/example.py +++ b/examples/xnnpack/quantization/example.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import argparse import copy import logging @@ -191,9 +193,7 @@ def main() -> None: start = time.perf_counter() prog = edge_m.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) save_pte_program(prog, f"{args.model_name}_quantized") end = time.perf_counter() From f326ee1d51b229273734df9b444b79ca9442e1b8 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 4 Sep 2024 14:35:38 -0700 Subject: [PATCH 181/531] Adopt the new tensor API for aten_util. Differential Revision: D62168422 Pull Request resolved: https://github.com/pytorch/executorch/pull/5062 --- CMakeLists.txt | 1 + .../make_aten_functor_from_et_functor.h | 58 +++++-------------- extension/aten_util/targets.bzl | 1 + extension/llm/custom_ops/CMakeLists.txt | 3 +- extension/tensor/CMakeLists.txt | 2 +- extension/tensor/tensor_impl_ptr.cpp | 2 +- 6 files changed, 20 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3618bff7677..a19f405e80c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -228,6 +228,7 @@ cmake_dependent_option( ) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) + set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON) endif() diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h index 3b54254e8ed..d7f2906944c 100644 --- a/extension/aten_util/make_aten_functor_from_et_functor.h +++ b/extension/aten_util/make_aten_functor_from_et_functor.h @@ -20,8 +20,8 @@ #endif #include #include +#include #include -#include #include namespace executorch { @@ -105,37 +105,12 @@ struct type_convert< typename remove_const_ref::type, torch::executor::Tensor>>> final { - explicit type_convert(ATensor value) : value_(value) { - auto sizes = - std::make_shared>( - value_.sizes().begin(), value_.sizes().end()); - const ssize_t dim = sizes->size(); - auto dim_order = - std::make_shared>( - dim); - auto strides = - std::make_shared>( - dim); - - std::iota(dim_order->begin(), dim_order->end(), 0); - ::executorch::runtime::dim_order_to_stride_nocheck( - sizes->data(), dim_order->data(), dim, strides->data()); - - auto tensor_impl = std::make_shared( - static_cast(value_.scalar_type()), - sizes->size(), - sizes->data(), - value_.mutable_data_ptr(), - dim_order->data(), - strides->data()); - - converted_ = std::unique_ptr< - torch::executor::Tensor, - std::function>( - new torch::executor::Tensor(tensor_impl.get()), - [sizes, dim_order, strides, tensor_impl]( - torch::executor::Tensor* pointer) { delete pointer; }); - } + explicit type_convert(ATensor value) + : value_(value), + converted_(from_blob( + value_.mutable_data_ptr(), + {value_.sizes().begin(), value_.sizes().end()}, + ::torch::executor::ScalarType(value_.scalar_type()))) {} ETensor call() { return *converted_; @@ -143,10 +118,7 @@ struct type_convert< private: ATensor value_; - std::unique_ptr< - torch::executor::Tensor, - std::function> - converted_; + TensorPtr converted_; }; // Tensors: ETen to ATen. 
@@ -158,15 +130,14 @@ struct type_convert< std::is_same_v::type, at::Tensor> && std::is_same_v< typename remove_const_ref::type, - torch::executor::Tensor>>> + ::torch::executor::Tensor>>> final { explicit type_convert(ETensor value) - : value_(value), sizes_(value_.sizes().begin(), value_.sizes().end()) { - converted_ = at::from_blob( - value_.mutable_data_ptr(), - sizes_, - static_cast(value_.scalar_type())); - } + : value_(value), + converted_(at::from_blob( + value_.mutable_data_ptr(), + std::vector{value_.sizes().begin(), value_.sizes().end()}, + c10::ScalarType(value_.scalar_type()))) {} ATensor call() { return converted_; @@ -175,7 +146,6 @@ struct type_convert< private: ETensor value_; at::Tensor converted_; - std::vector sizes_; }; // Optionals: ATen to ETen. diff --git a/extension/aten_util/targets.bzl b/extension/aten_util/targets.bzl index b396cb78325..f219d6253f2 100644 --- a/extension/aten_util/targets.bzl +++ b/extension/aten_util/targets.bzl @@ -27,6 +27,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/extension/kernel_util:kernel_util", + "//executorch/extension/tensor:tensor", "//executorch/runtime/core:core", "//executorch/runtime/core:evalue", "//executorch/runtime/core/exec_aten:lib", diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 41c8c0ee160..723444498a4 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -94,7 +94,8 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) endif() target_link_libraries( - custom_ops_aot_lib PUBLIC cpublas torch extension_threadpool + custom_ops_aot_lib PUBLIC cpublas torch extension_tensor + extension_threadpool ) if(WIN32) # There is no direct replacement for libpthread.so on Windows. For the diff --git a/extension/tensor/CMakeLists.txt b/extension/tensor/CMakeLists.txt index 4a02965c647..2cf1bf2956f 100644 --- a/extension/tensor/CMakeLists.txt +++ b/extension/tensor/CMakeLists.txt @@ -18,7 +18,7 @@ endif() list(TRANSFORM _extension_tensor__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(extension_tensor ${_extension_tensor__srcs}) -target_link_libraries(extension_tensor executorch) +target_link_libraries(extension_tensor executorch_no_prim_ops) target_include_directories(extension_tensor PUBLIC ${EXECUTORCH_ROOT}/..) target_compile_options(extension_tensor PUBLIC ${_common_compile_options}) diff --git a/extension/tensor/tensor_impl_ptr.cpp b/extension/tensor/tensor_impl_ptr.cpp index aa5f78e7f8d..ea4d83f5afd 100644 --- a/extension/tensor/tensor_impl_ptr.cpp +++ b/extension/tensor/tensor_impl_ptr.cpp @@ -91,7 +91,7 @@ TensorImplPtr make_tensor_impl_ptr( tensor_impl.release(), TensorImplPtrDeleter{ std::unique_ptr>( - data, std::move(deleter) ?: noop_deleter), + data, deleter ? 
std::move(deleter) : noop_deleter), std::move(sizes), std::move(dim_order), std::move(strides)}); From 3716680ecb6886ddb59a7c72e61c67a958f536d3 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Wed, 4 Sep 2024 15:19:19 -0700 Subject: [PATCH 182/531] [ET-VK] Persistently map staging buffers Differential Revision: D59706627 Pull Request resolved: https://github.com/pytorch/executorch/pull/5021 --- .../runtime/api/containers/StagingBuffer.h | 30 +++++++ .../vulkan/runtime/graph/ComputeGraph.cpp | 4 +- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 5 +- .../runtime/graph/ops/utils/StagingUtils.cpp | 80 ------------------- .../runtime/graph/ops/utils/StagingUtils.h | 19 ----- .../runtime/vk_api/memory/Allocation.cpp | 7 ++ .../vulkan/runtime/vk_api/memory/Allocation.h | 2 + .../runtime/vk_api/memory/Allocator.cpp | 3 +- .../vulkan/runtime/vk_api/memory/Buffer.cpp | 2 +- .../vulkan/runtime/vk_api/memory/Buffer.h | 4 + backends/vulkan/test/utils/test_utils.cpp | 34 ++++---- backends/vulkan/test/utils/test_utils.h | 4 +- .../vulkan/test/vulkan_compute_api_test.cpp | 18 ++--- 13 files changed, 76 insertions(+), 136 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h index a24728470b0..66c607e178c 100644 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -14,6 +14,8 @@ #include +#include + namespace vkcompute { namespace api { @@ -55,6 +57,10 @@ class StagingBuffer final { return vulkan_buffer_; } + inline void* data() { + return vulkan_buffer_.allocation_info().pMappedData; + } + inline size_t numel() { return numel_; } @@ -62,6 +68,30 @@ class StagingBuffer final { inline size_t nbytes() { return nbytes_; } + + inline void copy_from(const void* src, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + memcpy(data(), src, nbytes); + vmaFlushAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + } + + inline void copy_to(void* dst, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + vmaInvalidateAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + memcpy(dst, data(), nbytes); + } + + inline void set_staging_zeros() { + memset(data(), 0, nbytes_); + } }; } // namespace api diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index a8f57f57d2a..c22241940f8 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -401,7 +401,7 @@ void ComputeGraph::copy_into_staging( const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_ptr_to_staging(data, *staging, nbytes); + staging->copy_from(data, nbytes); } void ComputeGraph::copy_from_staging( @@ -410,7 +410,7 @@ void ComputeGraph::copy_from_staging( const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_staging_to_ptr(*staging, data, nbytes); + staging->copy_to(data, nbytes); } void ComputeGraph::prepare() { diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index a9c2f6c9b6a..61b24cd409b 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -53,8 
+53,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); api::StagingBuffer staging(graph->context(), packed->dtype(), numel); - size_t nbytes = numel * vkapi::element_size(packed->dtype()); - set_staging_zeros(staging, nbytes); + staging.set_staging_zeros(); return staging; } @@ -62,7 +61,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { size_t numel = utils::multiply_integers(tref->sizes); api::StagingBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); - copy_ptr_to_staging(tref->data, staging, nbytes); + staging.copy_from(tref->data, nbytes); return staging; } diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 9cb715e202a..8804bcf2ef6 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -13,88 +13,8 @@ #include -#include - namespace vkcompute { -template -void memcpy_to_mapping_impl( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes) { - T* data_ptr = dst_mapping.template data(); - memcpy(data_ptr, reinterpret_cast(src), nbytes); -} - -template -void memcpy_from_mapping_impl( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes) { - T* data_ptr = src_mapping.template data(); - memcpy(reinterpret_cast(dst), data_ptr, nbytes); -} - -void memcpy_to_mapping( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void memcpy_from_mapping( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void copy_ptr_to_staging( - const void* src, - api::StagingBuffer& staging, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); -} - -void copy_staging_to_ptr( - api::StagingBuffer& staging, - void* dst, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); -} - -void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - uint8_t* data_ptr = mapping.template data(); - memset(data_ptr, 0, staging.nbytes()); -} - vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, const bool int8_buffer_enabled) { diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index f16c52ecf33..8d63958a738 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ 
b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -12,25 +12,6 @@ namespace vkcompute { -// -// Functions to copy data into and out of a staging buffer -// - -void copy_ptr_to_staging( - const void* src, - api::StagingBuffer& staging, - const size_t nbytes); -void copy_staging_to_ptr( - api::StagingBuffer& staging, - void* dst, - const size_t nbytes); - -void set_staging_zeros(api::StagingBuffer& staging, const size_t nbytes); - -// -// Functions to get shaders -// - vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, bool int8_buffer_enabled = true); diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp index b07bb2862d3..d4e0fc9702e 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp @@ -30,6 +30,7 @@ Allocation::Allocation() create_info{}, allocator(VK_NULL_HANDLE), allocation(VK_NULL_HANDLE), + allocation_info({}), is_copy_(false) {} Allocation::Allocation( @@ -40,6 +41,7 @@ Allocation::Allocation( create_info(create_info), allocator(vma_allocator), allocation(VK_NULL_HANDLE), + allocation_info({}), is_copy_(false) { VK_CHECK(vmaAllocateMemory( allocator, &memory_requirements, &create_info, &allocation, nullptr)); @@ -50,6 +52,7 @@ Allocation::Allocation(const Allocation& other) noexcept create_info(other.create_info), allocator(other.allocator), allocation(other.allocation), + allocation_info(other.allocation_info), is_copy_(true) {} Allocation::Allocation(Allocation&& other) noexcept @@ -57,8 +60,10 @@ Allocation::Allocation(Allocation&& other) noexcept create_info(other.create_info), allocator(other.allocator), allocation(other.allocation), + allocation_info(other.allocation_info), is_copy_(other.is_copy_) { other.allocation = VK_NULL_HANDLE; + other.allocation_info = {}; } Allocation& Allocation::operator=(Allocation&& other) noexcept { @@ -68,9 +73,11 @@ Allocation& Allocation::operator=(Allocation&& other) noexcept { create_info = other.create_info; allocator = other.allocator; allocation = other.allocation; + allocation_info = other.allocation_info; is_copy_ = other.is_copy_; other.allocation = tmp_allocation; + other.allocation_info = {}; return *this; } diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h index cec6f61e766..44e8277a35c 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.h @@ -62,6 +62,8 @@ struct Allocation final { VmaAllocator allocator; // Handles to the allocated memory VmaAllocation allocation; + // Information about the allocated memory + VmaAllocationInfo allocation_info; private: // Indicates whether this class instance is a copy of another class instance, diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index b990cf6a119..e814063fa90 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -142,7 +142,8 @@ VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { // Staging buffers are accessed by both the CPU and GPU, so set the // appropriate flags to indicate that the host device will be accessing // the data from this buffer. 
- alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + VMA_ALLOCATION_CREATE_MAPPED_BIT; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; alloc_create_info.preferredFlags = diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp index 366b45a5e41..5a78dab764d 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp @@ -67,7 +67,7 @@ VulkanBuffer::VulkanBuffer( &allocation_create_info, &handle_, &(memory_.allocation), - nullptr)); + &(memory_.allocation_info))); } else { VmaAllocatorInfo allocator_info{}; vmaGetAllocatorInfo(allocator_, &allocator_info); diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 9302048f861..af32ffffa84 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -114,6 +114,10 @@ class VulkanBuffer final { return memory_.allocation; } + inline VmaAllocationInfo allocation_info() const { + return memory_.allocation_info; + } + inline VmaAllocationCreateInfo allocation_create_info() const { return VmaAllocationCreateInfo(memory_.create_info); } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 4c2972419d0..a469a44dc1a 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -326,15 +326,15 @@ void record_reference_matmul( void fill_vtensor(api::vTensor& vten, std::vector& data) { api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted; \ - data_converted.resize(data.size()); \ - for (int i = 0; i < data.size(); ++i) { \ - data_converted[i] = ctype(data[i]); \ - } \ - copy_ptr_to_staging( \ - data_converted.data(), staging_buffer, vten.staging_buffer_nbytes()); \ +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted; \ + data_converted.resize(data.size()); \ + for (int i = 0; i < data.size(); ++i) { \ + data_converted[i] = ctype(data[i]); \ + } \ + staging_buffer.copy_from( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ } break; switch (vten.dtype()) { @@ -424,14 +424,14 @@ void extract_vtensor(api::vTensor& vten, std::vector& data) { api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); fence.wait(); -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted(data.size()); \ - copy_staging_to_ptr( \ - staging_buffer, data_converted.data(), vten.staging_buffer_nbytes()); \ - for (int i = 0; i < data.size(); ++i) { \ - data[i] = float(data_converted[i]); \ - } \ +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted(data.size()); \ + staging_buffer.copy_to( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ + for (int i = 0; i < data.size(); ++i) { \ + data[i] = float(data_converted[i]); \ + } \ } break; switch (vten.dtype()) { diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index 3bc12c472db..25163e664bf 100644 --- a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -132,7 +132,7 @@ fill_staging(api::StagingBuffer& 
staging, float val, int numel = -1) { } std::vector data(numel); std::fill(data.begin(), data.end(), val); - copy_ptr_to_staging(data.data(), staging, sizeof(float) * numel); + staging.copy_from(data.data(), sizeof(float) * numel); } void fill_vtensor(api::vTensor& vten, std::vector& data); @@ -169,7 +169,7 @@ check_staging_buffer(api::StagingBuffer& staging, float val, int numel = -1) { numel = staging.numel(); } std::vector data(numel); - copy_staging_to_ptr(staging, data.data(), sizeof(float) * numel); + staging.copy_to(data.data(), sizeof(float) * numel); for (size_t i = 0; i < data.size(); ++i) { CHECK_VALUE(data, i, val); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index a0bfefafa02..c035d5f8b85 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -360,7 +360,7 @@ TEST_F(VulkanComputeAPITest, spec_var_shader_test) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, scale * i + offset); @@ -470,7 +470,7 @@ void test_storage_buffer_type(const size_t len) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, T(i)); @@ -2132,7 +2132,7 @@ void run_from_gpu_test( submit_to_gpu(); std::vector data_out(staging_buffer.numel()); - copy_staging_to_ptr(staging_buffer, data_out.data(), staging_buffer.nbytes()); + staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes()); for (int i = 0; i < vten.numel(); i++) { CHECK_VALUE(data_out, i, i + offset); @@ -2160,8 +2160,7 @@ void round_trip_test( for (int i = 0; i < staging_buffer_in.numel(); i++) { data_in[i] = T(i * -1); } - copy_ptr_to_staging( - data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); + staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes()); // Output staging buffer StagingBuffer staging_buffer_out( @@ -2182,8 +2181,7 @@ void round_trip_test( // Extract data from output staging buffer std::vector data_out(staging_buffer_out.numel()); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), staging_buffer_out.nbytes()); + staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes()); // All indices should be equal to the input data for (int i = 0; i < vten.numel(); i++) { @@ -2624,8 +2622,7 @@ void test_conv2d( for (int i = 0; i < in_numel; i++) { data_in[i] = i + 1; } - copy_ptr_to_staging( - data_in.data(), staging_buffer_in, sizeof(float) * in_numel); + staging_buffer_in.copy_from(data_in.data(), sizeof(float) * in_numel); // Output staging buffer const int64_t out_numel = @@ -2642,8 +2639,7 @@ void test_conv2d( // Extract data from output staging buffer std::vector data_out(out_numel); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), sizeof(float) * out_numel); + staging_buffer_out.copy_to(data_out.data(), sizeof(float) * out_numel); // Check data matches results copied from ATen-VK for (int i = 0; i < vten.numel(); i++) { From 79b97e45009c58f09010977db6ee5d5d853a50ae Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Wed, 4 Sep 2024 15:23:10 -0700 Subject: [PATCH 183/531] [tokenizer] Consolidate how runner decide which tokenizer to use Differential Revision: D62160344 Pull Request resolved: 
https://github.com/pytorch/executorch/pull/5052 --- examples/models/llama2/runner/runner.cpp | 10 ++- extension/llm/tokenizer/base64.h | 76 ++++++++++++------- .../test_tiktoken_invalid_base64.model | 1 + .../test_tiktoken_invalid_rank.model | 1 + .../resources/test_tiktoken_no_space.model | 1 + .../llm/tokenizer/test/test_tiktoken.cpp | 46 +++++++++++ extension/llm/tokenizer/tiktoken.cpp | 46 +++++++---- 7 files changed, 134 insertions(+), 47 deletions(-) create mode 100644 extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model create mode 100644 extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model create mode 100644 extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index bceaaa3ed55..0a5d773092e 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -69,17 +69,19 @@ Error Runner::load() { return Error::Ok; } ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - // load tokenizer + // load tokenizer. Assuming tiktoken is the default tokenizer tokenizer_ = nullptr; - tokenizer_ = std::make_unique(); + tokenizer_ = get_tiktoken_for_llama(); Error err = tokenizer_->load(tokenizer_path_); + // Rely on tiktoken to throw error if the artifact is incompatible. Then we + // fallback to BPE tokenizer. if (err == Error::InvalidArgument) { ET_LOG( Info, - "Failed to load %s as a BPETokenizer artifact, trying Tiktoken", + "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer", tokenizer_path_.c_str()); tokenizer_.reset(); - tokenizer_ = get_tiktoken_for_llama(); + tokenizer_ = std::make_unique(); tokenizer_->load(tokenizer_path_); } diff --git a/extension/llm/tokenizer/base64.h b/extension/llm/tokenizer/base64.h index 7337ecead4e..83ef9e0696b 100644 --- a/extension/llm/tokenizer/base64.h +++ b/extension/llm/tokenizer/base64.h @@ -24,6 +24,8 @@ #pragma once +#include +#include #include #include #include @@ -32,10 +34,13 @@ namespace executorch { namespace extension { namespace llm { +using Error = executorch::runtime::Error; +template +using Result = executorch::runtime::Result; namespace base64 { -std::string decode(const std::string_view& input); +Result decode(const std::string_view& input); namespace detail { @@ -59,96 +64,111 @@ constexpr uint32_t DECODE_TABLE[] = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}; -inline void validate(uint32_t v) { - ET_CHECK_MSG(v != 255, "invalid char"); +inline Error validate(uint32_t v) { + ET_CHECK_OR_RETURN_ERROR(v != 255, InvalidArgument, "invalid char"); + return Error::Ok; } -inline void decode(const std::string_view& input, std::string& output) { - ET_CHECK_MSG( - input.size() == 4, "input length must be 4, got %zu", input.size()); +inline Error decode(const std::string_view& input, std::string& output) { + ET_CHECK_OR_RETURN_ERROR( + input.size() == 4, + InvalidArgument, + "input length must be 4, got %zu", + input.size()); uint32_t val = 0; uint8_t c = input[0]; auto v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = v; c = input[1]; v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = (val << 6) | v; c = input[2]; v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = (val << 6) | v; c = input[3]; v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = (val << 6) | v; 
output.push_back(static_cast((val >> 16) & 0xFF)); output.push_back(static_cast((val >> 8) & 0xFF)); output.push_back(static_cast(val & 0xFF)); + return Error::Ok; } -inline void decode_1_padding( +inline Error decode_1_padding( const std::string_view& input, std::string& output) { - ET_CHECK_MSG( - input.size() == 3, "input length must be 3, got %zu", input.size()); + ET_CHECK_OR_RETURN_ERROR( + input.size() == 3, + InvalidArgument, + "input length must be 3, got %zu", + input.size()); uint32_t val = 0; uint8_t c = input[0]; auto v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = v; c = input[1]; v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = (val << 6) | v; c = input[2]; v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = (val << 6) | v; output.push_back(static_cast((val >> 10) & 0xFF)); output.push_back(static_cast((val >> 2) & 0xFF)); + return Error::Ok; } -inline void decode_2_padding( +inline Error decode_2_padding( const std::string_view& input, std::string& output) { - assert(input.size() == 2); + ET_CHECK_OR_RETURN_ERROR( + input.size() == 2, + InvalidArgument, + "input length must be 2, got %zu", + input.size()); uint32_t val = 0; uint8_t c = input[0]; auto v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = v; c = input[1]; v = DECODE_TABLE[c]; - validate(v); + ET_CHECK_OK_OR_RETURN_ERROR(validate(v)); val = (val << 6) | v; output.push_back(static_cast((val >> 4) & 0xFF)); + return Error::Ok; } } // namespace detail -inline std::string decode(const std::string_view& input) { - ET_CHECK_MSG(!input.empty(), "empty input"); +inline Result decode(const std::string_view& input) { + ET_CHECK_OR_RETURN_ERROR(!input.empty(), InvalidArgument, "empty input"); // Faster than `input.size() % 4`. - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( (input.size() & 3) == 0 && input.size() >= 4, + InvalidArgument, "input length must be larger than 4 and is multiple of 4, got %zu", input.size()); @@ -156,21 +176,23 @@ inline std::string decode(const std::string_view& input) { output.reserve(input.size() / 4 * 3); auto idx = 0U; for (; idx < input.size() - 4; idx += 4) { - detail::decode(input.substr(idx, 4), output); + ET_CHECK_OK_OR_RETURN_ERROR(detail::decode(input.substr(idx, 4), output)); } // Last 4 bytes. Might contain paddings. if (input[idx + 3] == '=') { if (input[idx + 2] == '=') { // Tow paddings. - detail::decode_2_padding(input.substr(idx, 2), output); + ET_CHECK_OK_OR_RETURN_ERROR( + detail::decode_2_padding(input.substr(idx, 2), output)); } else { // One padding. - detail::decode_1_padding(input.substr(idx, 3), output); + ET_CHECK_OK_OR_RETURN_ERROR( + detail::decode_1_padding(input.substr(idx, 3), output)); } } else { // No padding. 
- detail::decode(input.substr(idx, 4), output); + ET_CHECK_OK_OR_RETURN_ERROR(detail::decode(input.substr(idx, 4), output)); } return output; diff --git a/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model b/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model new file mode 100644 index 00000000000..2d9c39f19d6 --- /dev/null +++ b/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model @@ -0,0 +1 @@ +tet 0 diff --git a/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model b/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model new file mode 100644 index 00000000000..07d43b1e439 --- /dev/null +++ b/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model @@ -0,0 +1 @@ +ICAgICAgIA== 18446744073709551616 diff --git a/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model b/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model new file mode 100644 index 00000000000..c025dddd3ba --- /dev/null +++ b/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model @@ -0,0 +1 @@ +ICAgICAgIA==10 diff --git a/extension/llm/tokenizer/test/test_tiktoken.cpp b/extension/llm/tokenizer/test/test_tiktoken.cpp index a81b20bcf88..ce2a781aa1c 100644 --- a/extension/llm/tokenizer/test/test_tiktoken.cpp +++ b/extension/llm/tokenizer/test/test_tiktoken.cpp @@ -8,7 +8,9 @@ #include #include +#include #include +#include #include using namespace ::testing; @@ -140,3 +142,47 @@ TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) { ""); #endif } + +TEST_F(TiktokenExtensionTest, LoadWithInvalidPath) { + auto invalidModelPath = + std::getenv("RESOURCES_PATH") + std::string("/nonexistent.model"); + + Error res = tokenizer_->load(invalidModelPath.c_str()); + EXPECT_EQ(res, Error::InvalidArgument); +} + +TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidRank) { + auto invalidModelPath = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_invalid_rank.model"); + + Error res = tokenizer_->load(invalidModelPath.c_str()); + + EXPECT_EQ(res, Error::InvalidArgument); +} + +TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidBase64) { + auto invalidModelPath = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_invalid_base64.model"); + + Error res = tokenizer_->load(invalidModelPath.c_str()); + + EXPECT_EQ(res, Error::InvalidArgument); +} + +TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithNoSpace) { + auto invalidModelPath = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_no_space.model"); + + Error res = tokenizer_->load(invalidModelPath.c_str()); + + EXPECT_EQ(res, Error::InvalidArgument); +} + +TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithBPEFile) { + auto invalidModelPath = + std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin"); + + Error res = tokenizer_->load(invalidModelPath.c_str()); + + EXPECT_EQ(res, Error::InvalidArgument); +} diff --git a/extension/llm/tokenizer/tiktoken.cpp b/extension/llm/tokenizer/tiktoken.cpp index 7b15d25f0da..f8ccf74fd6b 100644 --- a/extension/llm/tokenizer/tiktoken.cpp +++ b/extension/llm/tokenizer/tiktoken.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -65,33 +66,43 @@ static Re2UPtr _build_special_token_regex(const Encoder& special_encoder) { return _create_regex(special_pattern); } -static std::pair _parse(const std::string& line) { +static Result> _parse( + const std::string& line) { + // Tiktoken format + // 
https://github.com/openai/tiktoken/blob/main/tiktoken/load.py#L140 auto pos = line.find(" "); - ET_CHECK_MSG( - pos != std::string::npos, "invalid encoder line: %s", line.c_str()); + ET_CHECK_OR_RETURN_ERROR( + pos != std::string::npos, + InvalidArgument, + "invalid tiktoken line: %s", + line.c_str()); - auto token = base64::decode({line.data(), pos}); + auto token = ET_UNWRAP(base64::decode({line.data(), pos})); uint64_t rank = 0; try { rank = std::stoul(line.substr(pos + 1)); } catch (const std::exception&) { - ET_CHECK_MSG(false, "invalid encoder rank: %s", line.c_str()); + ET_CHECK_OR_RETURN_ERROR( + false, InvalidArgument, "invalid encoder rank: %s", line.c_str()); } - return {std::move(token), rank}; + return std::pair{std::move(token), rank}; } -static Encoder _load_encoder(const std::string& path) { +static Result _load_encoder(const std::string& path) { std::ifstream file(path); - ET_CHECK_MSG(file, "failed to open encoder file: %s", path.c_str()); + ET_CHECK_OR_RETURN_ERROR( + file, InvalidArgument, "failed to open encoder file: %s", path.c_str()); Encoder encoder; std::string line; while (std::getline(file, line)) { - auto [token, rank] = _parse(line); + auto [token, rank] = ET_UNWRAP(_parse(line)); - ET_CHECK_MSG( + ET_CHECK_OR_RETURN_ERROR( encoder.emplace(std::move(token), rank).second, + InvalidArgument, "duplicate item: %s", line.c_str()); } @@ -99,13 +110,16 @@ static Encoder _load_encoder(const std::string& path) { return encoder; } -static Decoder _build_decoder(const Encoder& encoder) { +static Result _build_decoder(const Encoder& encoder) { Decoder decoder; for (const auto& [k, v] : encoder) { decoder.emplace(v, k); } - ET_CHECK_MSG(encoder.size() == decoder.size(), "duplicate items in encoder"); + ET_CHECK_OR_RETURN_ERROR( + encoder.size() == decoder.size(), + InvalidArgument, + "duplicate items in encoder"); return decoder; } @@ -356,11 +370,11 @@ Tiktoken::Tiktoken( } Error Tiktoken::load(const std::string& path) { - _encoder = _load_encoder(path); + _encoder = ET_UNWRAP(_load_encoder(path)); _special_token_encoder = _build_special_token_encoder(_encoder.size()); - _decoder = _build_decoder(_encoder); - _special_token_decoder = _build_decoder(_special_token_encoder); + _decoder = ET_UNWRAP(_build_decoder(_encoder)); + _special_token_decoder = ET_UNWRAP(_build_decoder(_special_token_encoder)); _regex = _create_regex(_pattern); // Warmup re2 as it is slow on the first run, void the return value as it's @@ -393,7 +407,7 @@ Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) const { for (auto i = 0; i < eos; ++i) { res.push_back(eos_tok_); } - return Result(res); + return Result>(std::move(res)); } Result Tiktoken::decode(uint64_t prev, uint64_t cur) const { From e73dce29359499007343604d6d8b6c01e612c3d7 Mon Sep 17 00:00:00 2001 From: Esteb37 <35089867+Esteb37@users.noreply.github.com> Date: Wed, 4 Sep 2024 18:52:20 -0400 Subject: [PATCH 184/531] [INT4-MM| Add Texture3D storage type Differential Revision: D62148863 Pull Request resolved: https://github.com/pytorch/executorch/pull/5044 --- .../runtime/graph/ops/glsl/q_4w_linear.glsl | 127 +++++++++++++----- .../runtime/graph/ops/glsl/q_4w_linear.yaml | 3 + .../graph/ops/impl/QuantizedMatMul.cpp | 42 ++++-- .../vulkan/test/vulkan_compute_api_test.cpp | 46 ++++--- 4 files changed, 154 insertions(+), 64 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl index 71ecf162362..751d513d59d 100644 --- 
a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl @@ -12,6 +12,9 @@ #define PRECISION ${PRECISION} +#define FOUR 4 + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} #define FLOAT_T ${buffer_scalar_type(DTYPE)} ${define_active_storage_type(STORAGE)} @@ -26,12 +29,17 @@ ${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} ${layout_declare_tensor(2, "r", "t_mat2", "int8", STORAGE)} ${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)} -${layout_declare_ubo(4, "ivec4", "out_sizes")} -${layout_declare_ubo(5, "ivec4", "out_strides")} -${layout_declare_ubo(6, "ivec4", "mat1_strides")} -${layout_declare_ubo(7, "ivec4", "mat2_sizes")} -${layout_declare_ubo(8, "ivec4", "mat2_strides")} -${layout_declare_ubo(9, "ivec4", "scales_strides")} +$if STORAGE == "texture3d": + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(6, "ivec4", "scales_strides")} +$else: + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "out_strides")} + ${layout_declare_ubo(6, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(7, "ivec4", "mat1_strides")} + ${layout_declare_ubo(8, "ivec4", "mat2_strides")} + ${layout_declare_ubo(9, "ivec4", "scales_strides")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -49,45 +57,90 @@ void main() { return; } - const uint K = mat2_sizes.x * 2; - const uint N = mat2_sizes.y; + const uint K = mat1_sizes.x; const uint n = out_pos.x; const uint m = out_pos.y; - const uint k_block = (K + group_size - 1) / group_size; const uint mask = uint(0x0f); - ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); - ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); - ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); - ivec4 zero_pos = ivec4(0, n, 1, out_pos.w); float rc = 0.0; int k = 0; - for (int kb = 0; kb < k_block; kb++) { - scale_pos.x = kb; - const int scale_id = to_buffer_id(scale_pos, scales_strides); - const float scale = float(t_scales_and_zeros[scale_id]); - - zero_pos.x = kb; - const int zero_id = to_buffer_id(zero_pos, scales_strides); - const float zero = float(t_scales_and_zeros[zero_id]) - scale * 8.0; - - for(uint idx = 0; idx < group_size && k < K; idx++, k++) { - mat1_pos.x = k; - const int mat1_id = to_buffer_id(mat1_pos, mat1_strides); - const float mat1_val = float(t_mat1[mat1_id]); - - mat2_pos.x = k / 2; - const int mat2_id = to_buffer_id(mat2_pos, mat2_strides); - // Bitwise op treats sign bit from int8 as a value bit instead, - // since there is no uint8_t datatype - uint mat2_val = (t_mat2[mat2_id] & 0xFF); - mat2_val = (k & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); + #ifdef USING_BUFFER + const uint k_block = (K + group_size - 1) / group_size; + ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); + ivec4 zero_pos = ivec4(0, n, 1, out_pos.w); + + for (int kb = 0; kb < k_block; kb++) { + scale_pos.x = kb; + const int scale_id = to_buffer_id(scale_pos, scales_strides); + const float scale = float(t_scales_and_zeros[scale_id]); + + zero_pos.x = kb; + const int zero_id = to_buffer_id(zero_pos, scales_strides); + const float zero = float(t_scales_and_zeros[zero_id]) - scale * 8.0; + + for(uint idx = 0; idx < group_size && k < K; idx++, k++) { + mat1_pos.x = k; + const int mat1_id = to_buffer_id(mat1_pos, mat1_strides); + const float mat1_val = float(t_mat1[mat1_id]); + + mat2_pos.x = k / 2; + const int mat2_id = to_buffer_id(mat2_pos, mat2_strides); + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (t_mat2[mat2_id] & 0xFF); + mat2_val = (k & 1) == 0 ? mat2_val & mask : (mat2_val >> 4); + + rc += mat1_val * (scale * float(mat2_val) + zero); + } + } - rc += mat1_val * (scale * float(mat2_val) + zero); + const int out_id = to_buffer_id(out_pos, out_strides); + t_out[out_id] = FLOAT_T(rc); + + #else // Using texture + const uint texel_group_size = group_size / FOUR; + const uint k_block = (K + texel_group_size - 1) / texel_group_size; + ivec3 mat1_pos = ivec3(0, m, out_pos.z); + ivec3 mat2_pos = ivec3(0, n, out_pos.z); + ivec3 scale_pos = ivec3(0, n, 0); + ivec3 zero_pos = ivec3(0, n, 1); + + for (int kb = 0; kb < k_block; kb++) { + const int texel_kb = kb / FOUR; + const int kb_offset = kb % FOUR; + + scale_pos.x = texel_kb; + const VEC4_T scale_texel = load_texel(t_scales_and_zeros, scale_pos); + const float scale = float(scale_texel[kb_offset]); + + zero_pos.x = texel_kb; + const VEC4_T zero_texel = load_texel(t_scales_and_zeros, zero_pos); + const float zero = float(zero_texel[kb_offset]) - scale * 8.0; + + for(uint idx = 0; idx < texel_group_size && k < K; idx++, k++) { + mat1_pos.x = k; + const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); + + mat2_pos.x = k / 2; + const i8vec4 mat2_tex = i8vec4(load_texel(t_mat2, mat2_pos)); + + // Every two texels of mat1 correspond to one texel of mat2 + // Even mat1 indeces correspond to first half of mat2 texel and + // odd indeces correspond to second half + const int mat2_offset = (k & 1) == 0 ? 0 : 2; + for (int texel_idx = 0; texel_idx < FOUR; texel_idx++){ + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (mat2_tex[mat2_offset + texel_idx / 2] & 0xFF); + mat2_val = (texel_idx & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); + rc += mat1_tex[texel_idx] * (scale * float(mat2_val) + zero); + } + } } - } + write_texel(t_out, out_pos.xyz, vec4(rc, 0, 0, 0)); - const int out_id = to_buffer_id(out_pos, out_strides); - t_out[out_id] = FLOAT_T(rc); + #endif } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml index a3585c998e8..fd65068080a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml @@ -12,5 +12,8 @@ q_4w_linear: DTYPE: - VALUE: float - VALUE: half + STORAGE: + - VALUE: buffer + - VALUE: texture3d shader_variants: - NAME: q_4w_linear diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp index b2796d26dd1..d478b7c253e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp @@ -30,7 +30,16 @@ void check_q_matmul_args( VK_CHECK_COND(mat1_sizes.size() == 2); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.memory_layout_of(mat1) == utils::kWidthPacked); + VK_CHECK_COND(graph.memory_layout_of(mat2_data) == utils::kWidthPacked); + VK_CHECK_COND( + graph.memory_layout_of(scales_and_zeros) == utils::kWidthPacked); + + if (graph.storage_type_of(out) == utils::kBuffer) { + VK_CHECK_COND(graph.memory_layout_of(out) == utils::kWidthPacked); + } else { + VK_CHECK_COND(graph.memory_layout_of(out) == utils::kChannelsPacked); + } const int mat1_K = utils::val_at(-1, mat1_sizes); const int mat2_K = utils::val_at(-1, mat2_sizes) * 2; @@ -95,24 +104,39 @@ void add_q_matmul_node( const ValueRef group_size, const ValueRef scales_and_zeros_data, const ValueRef out) { - ValueRef mat2 = - prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + auto storage_type = graph.storage_type_of(out); + + ValueRef mat2; + + if (storage_type == utils::kBuffer) { + mat2 = prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + } else { + mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + } + ValueRef scales_and_zeros = prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); std::string kernel_name = "q_4w_linear"; add_dtype_suffix(kernel_name, graph.dtype_of(out)); + add_storage_type_suffix(kernel_name, storage_type); const uint32_t group_size_val = graph.extract_scalar(group_size); vkapi::ParamsBindList ubos({}); - ubos.append(graph.sizes_ubo(out)); - ubos.append(graph.strides_ubo(out)); - ubos.append(graph.strides_ubo(mat1)); - ubos.append(graph.sizes_ubo(mat2)); - ubos.append(graph.strides_ubo(mat2)); - ubos.append(graph.strides_ubo(scales_and_zeros)); + if (storage_type == utils::kBuffer) { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.strides_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } else { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } auto out_sizes = graph.sizes_of(out); uint32_t N = utils::val_at(-1, out_sizes); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index c035d5f8b85..fc5d1f8214b 100644 --- 
a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2738,7 +2738,10 @@ TEST(VulkanComputeGraphOpsTest, grid_priors_test) { /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); } -void test_int4pack_mm(std::vector MKN, uint32_t group_size) { +void test_int4pack_mm( + std::vector MKN, + uint32_t group_size, + utils::StorageType storage_type) { GraphConfig config; ComputeGraph graph(config); @@ -2752,8 +2755,7 @@ void test_int4pack_mm(std::vector MKN, uint32_t group_size) { const std::vector out_size = {M, N}; std::vector A_data = create_random_float_buffer(M * K); - IOValueRef A = - graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kBuffer); + IOValueRef A = graph.add_input_tensor(mat1_size, vkapi::kFloat, storage_type); graph.copy_into_staging(A.staging, A_data.data(), A_data.size()); // Quantized but un-packed weights @@ -2764,7 +2766,7 @@ void test_int4pack_mm(std::vector MKN, uint32_t group_size) { int4mm_pack_weights(mat2_size, B_quant_data.data()); IOValueRef B_int4 = - graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer); + graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, storage_type); graph.copy_into_staging( B_int4.staging, B_int4_data.data(), B_int4_data.size()); @@ -2773,7 +2775,7 @@ void test_int4pack_mm(std::vector MKN, uint32_t group_size) { // Random scales and zeroes. Keep scales small to avoid overflow and zeroes in // int4 range IOValueRef scales_and_zeros = - graph.add_input_tensor({2, N, k_groups}, vkapi::kFloat, utils::kBuffer); + graph.add_input_tensor({2, N, k_groups}, vkapi::kFloat, storage_type); std::vector s_data(graph.numel_of(scales_and_zeros.value)); const int zeros_stride = s_data.size() / 2; for (size_t i = 0; i < zeros_stride; i++) { @@ -2785,7 +2787,13 @@ void test_int4pack_mm(std::vector MKN, uint32_t group_size) { scales_and_zeros.staging, s_data.data(), s_data.size()); IOValueRef out_int4; - out_int4.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + + if (storage_type == utils::kBuffer) { + out_int4.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + } else { + out_int4.value = + graph.add_tensor(out_size, vkapi::kFloat, utils::kChannelsPacked); + } VK_GET_OP_FN("aten._weight_int4pack_mm.default") (graph, @@ -2799,13 +2807,13 @@ void test_int4pack_mm(std::vector MKN, uint32_t group_size) { // Dequantized matmul for comparison IOValueRef B_deq = - graph.add_input_tensor(mat2_size, vkapi::kFloat, utils::kBuffer); + graph.add_input_tensor(mat2_size, vkapi::kFloat, storage_type); std::vector B_deq_data = int4mm_dequantize_weights( mat2_size, B_quant_data.data(), group_size, s_data.data()); graph.copy_into_staging(B_deq.staging, B_deq_data.data(), B_deq_data.size()); IOValueRef out_deq; - out_deq.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + out_deq.value = graph.add_tensor(out_size, vkapi::kFloat, storage_type); VK_GET_OP_FN("aten.mm.default") (graph, {A.value, B_deq.value, out_deq.value}); @@ -2838,18 +2846,20 @@ TEST(VulkanComputeGraphOpsTest, int4pack_mm_test) { GTEST_SKIP(); } - // Vector multiplication, single group per row - test_int4pack_mm({1, 32, 1}, 32); + for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { + // Vector multiplication, single group per row + test_int4pack_mm({1, 32, 1}, 32, storage_type); - // Vector multiplication, multiple groups per row - test_int4pack_mm({1, 256, 1}, 64); + // Vector multiplication, multiple groups per row + test_int4pack_mm({1, 256, 1}, 64, 
storage_type); - // Square matrices, single group per row - test_int4pack_mm({32, 32, 32}, 32); + // Square matrices, single group per row + test_int4pack_mm({32, 32, 32}, 32, storage_type); - // Irregular matrices, single group per row - test_int4pack_mm({37, 32, 19}, 32); + // Irregular matrices, single group per row + test_int4pack_mm({37, 32, 19}, 32, storage_type); - // Irregular matrices, multiple groups per row - test_int4pack_mm({37, 256, 19}, 64); + // Irregular matrices, multiple groups per row + test_int4pack_mm({37, 256, 19}, 64, storage_type); + } } From 2806554ac133b8fc6e3dc3ae995631ec459540c5 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Wed, 4 Sep 2024 16:14:57 -0700 Subject: [PATCH 185/531] d to g start ops | add dim order sanity check Differential Revision: D59846689 Pull Request resolved: https://github.com/pytorch/executorch/pull/4329 --- kernels/portable/cpu/op_detach_copy.cpp | 3 +++ kernels/portable/cpu/op_diagonal_copy.cpp | 5 +++++ kernels/portable/cpu/op_div.cpp | 9 +++++++++ kernels/portable/cpu/op_embedding.cpp | 9 +++++++++ kernels/portable/cpu/op_eq.cpp | 6 ++++++ kernels/portable/cpu/op_expand_copy.cpp | 4 ++++ kernels/portable/cpu/op_fill.cpp | 6 ++++++ kernels/portable/cpu/op_flip.cpp | 3 +++ kernels/portable/cpu/op_floor_divide.cpp | 3 +++ kernels/portable/cpu/op_fmod.cpp | 6 ++++++ kernels/portable/cpu/op_full_like.cpp | 5 +++++ kernels/portable/cpu/op_ge.cpp | 6 ++++++ kernels/portable/cpu/op_gelu.cpp | 3 +++ kernels/portable/cpu/op_glu.cpp | 3 +++ kernels/portable/cpu/op_gt.cpp | 6 ++++++ 15 files changed, 77 insertions(+) diff --git a/kernels/portable/cpu/op_detach_copy.cpp b/kernels/portable/cpu/op_detach_copy.cpp index 844f259f6de..a8db4b0804b 100644 --- a/kernels/portable/cpu/op_detach_copy.cpp +++ b/kernels/portable/cpu/op_detach_copy.cpp @@ -33,6 +33,9 @@ Tensor& detach_copy_out(RuntimeContext& ctx, const Tensor& self, Tensor& out) { out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, tensors_have_same_shape_and_dtype(self, out), InvalidArgument, out); diff --git a/kernels/portable/cpu/op_diagonal_copy.cpp b/kernels/portable/cpu/op_diagonal_copy.cpp index 67b14c3f792..0de86ea0a64 100644 --- a/kernels/portable/cpu/op_diagonal_copy.cpp +++ b/kernels/portable/cpu/op_diagonal_copy.cpp @@ -73,6 +73,11 @@ Tensor& diagonal_copy_out( ET_KERNEL_CHECK( ctx, check_diagonal_copy_args(in, dim1, dim2, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + if (dim1 < 0) { dim1 += nonzero_dim(in); } diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 84591cb0ebd..db2079c2ff2 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -41,6 +41,9 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); @@ -97,6 +100,9 @@ Tensor& div_out_mode( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = get_compute_type(a_type, b_type); @@ -159,6 +165,9 @@ Tensor& div_scalar_out( ScalarType common_type = 
isFloatingType(a_type) ? a_type : ScalarType::Float; ScalarType out_type = out.scalar_type(); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.Scalar_out", CTYPE_A, [&]() { diff --git a/kernels/portable/cpu/op_embedding.cpp b/kernels/portable/cpu/op_embedding.cpp index ffa43da7395..1b493435af5 100644 --- a/kernels/portable/cpu/op_embedding.cpp +++ b/kernels/portable/cpu/op_embedding.cpp @@ -102,6 +102,15 @@ Tensor& embedding_out( out.size(1), weight.size(1)); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(weight, indices, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, tensor_is_default_dim_order(weight), InvalidArgument, out); + ScalarType ix_type = indices.scalar_type(); ET_CHECK_MSG( ix_type == ScalarType::Long || ix_type == ScalarType::Int, diff --git a/kernels/portable/cpu/op_eq.cpp b/kernels/portable/cpu/op_eq.cpp index 8a4e4656f08..8fef70e564f 100644 --- a/kernels/portable/cpu/op_eq.cpp +++ b/kernels/portable/cpu/op_eq.cpp @@ -34,6 +34,9 @@ Tensor& eq_tensor_out( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "eq.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_REAL_TYPES_AND( Bool, b_type, ctx, "eq.Scalar_out", CTYPE_B, [&]() { @@ -80,6 +83,9 @@ Tensor& eq_scalar_out( ScalarType b_type = utils::get_scalar_dtype(b); ScalarType out_type = out.scalar_type(); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "eq.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "eq.Scalar_out", CTYPE_B, [&]() { using CTYPE_IN = diff --git a/kernels/portable/cpu/op_expand_copy.cpp b/kernels/portable/cpu/op_expand_copy.cpp index 5f0d19adc59..67e9149ff9f 100644 --- a/kernels/portable/cpu/op_expand_copy.cpp +++ b/kernels/portable/cpu/op_expand_copy.cpp @@ -85,6 +85,10 @@ Tensor& expand_copy_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out); + // Holds the result of expand_sizes converted to repeat sizes int64_t repeats[kTensorDimensionLimit]; const auto repeats_size{map_expand_to_repeats( diff --git a/kernels/portable/cpu/op_fill.cpp b/kernels/portable/cpu/op_fill.cpp index 60ebd5de5ab..d908c53e0c7 100644 --- a/kernels/portable/cpu/op_fill.cpp +++ b/kernels/portable/cpu/op_fill.cpp @@ -31,6 +31,9 @@ Tensor& fill_scalar_out( ET_KERNEL_CHECK(ctx, a_type == out_type, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + // Resize for dynamic shape ET_KERNEL_CHECK_MSG( ctx, @@ -67,6 +70,9 @@ Tensor& fill_tensor_out( // Assert `b` must be a scalar tensor. 
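Every kernel touched by this commit gains the same guard; distilled here as a sketch, with op_example_out as a made-up name:

// Sketch of the dim-order sanity check added near the top of each op.
Tensor& op_example_out(
    RuntimeContext& ctx,
    const Tensor& a,
    const Tensor& b,
    Tensor& out) {
  // All tensors must agree on dim order; otherwise flag InvalidArgument on
  // the kernel context and return `out` unchanged.
  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
  // ... dtype/shape checks and the actual computation follow ...
  return out;
}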
ET_KERNEL_CHECK(ctx, tensor_is_scalar(b), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); diff --git a/kernels/portable/cpu/op_flip.cpp b/kernels/portable/cpu/op_flip.cpp index 10c52439d11..c88585f88a5 100644 --- a/kernels/portable/cpu/op_flip.cpp +++ b/kernels/portable/cpu/op_flip.cpp @@ -45,6 +45,9 @@ flip_out(RuntimeContext& ctx, const Tensor& in, IntArrayRef dims, Tensor& out) { ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, check_flip_args(in, dims, out), InvalidArgument, out); bool flip_dim_data[kTensorDimensionLimit]; diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 0514df0ca25..88c6d5e7e79 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -87,6 +87,9 @@ Tensor& floor_divide_out( ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 42f83731199..6743eb8cf8a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -85,6 +85,9 @@ Tensor& fmod_Tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); @@ -139,6 +142,9 @@ Tensor& fmod_Scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); diff --git a/kernels/portable/cpu/op_full_like.cpp b/kernels/portable/cpu/op_full_like.cpp index 880e02efe66..0ce8923ccdf 100644 --- a/kernels/portable/cpu/op_full_like.cpp +++ b/kernels/portable/cpu/op_full_like.cpp @@ -34,6 +34,11 @@ Tensor& full_like_out( "memory_format must be contiguous"); } + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + // Resize for dynamic shape ET_KERNEL_CHECK_MSG( ctx, diff --git a/kernels/portable/cpu/op_ge.cpp b/kernels/portable/cpu/op_ge.cpp index 88e056e7362..d89c45cca45 100644 --- a/kernels/portable/cpu/op_ge.cpp +++ b/kernels/portable/cpu/op_ge.cpp @@ -31,6 +31,9 @@ Tensor& ge_tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); @@ -77,6 +80,9 @@ Tensor& ge_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); diff --git 
a/kernels/portable/cpu/op_gelu.cpp b/kernels/portable/cpu/op_gelu.cpp index 0432c028141..4fadd2aff58 100644 --- a/kernels/portable/cpu/op_gelu.cpp +++ b/kernels/portable/cpu/op_gelu.cpp @@ -34,6 +34,9 @@ Tensor& gelu_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_FLOAT_TYPES(in.scalar_type(), ctx, "gelu.out", CTYPE, [&]() { if (approximate == "tanh") { apply_unary_map_fn( diff --git a/kernels/portable/cpu/op_glu.cpp b/kernels/portable/cpu/op_glu.cpp index 5a075ff35ca..4ecdbbc1caf 100644 --- a/kernels/portable/cpu/op_glu.cpp +++ b/kernels/portable/cpu/op_glu.cpp @@ -144,6 +144,9 @@ glu_out(RuntimeContext& ctx, const Tensor& self, int64_t dim, Tensor& out) { ET_KERNEL_CHECK( ctx, resize_glu_out(self, dim, out) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, check_glu_args(self, dim, out), InvalidArgument, out); const size_t non_negative_dim = dim < 0 ? dim + self.dim() : dim; diff --git a/kernels/portable/cpu/op_gt.cpp b/kernels/portable/cpu/op_gt.cpp index 56d8657c9b5..4c5df64cb69 100644 --- a/kernels/portable/cpu/op_gt.cpp +++ b/kernels/portable/cpu/op_gt.cpp @@ -31,6 +31,9 @@ Tensor& gt_tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); @@ -77,6 +80,9 @@ Tensor& gt_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); From 1a4cf514f84a5b480548c51edcd86cd264fac59a Mon Sep 17 00:00:00 2001 From: lucylq Date: Wed, 4 Sep 2024 16:19:05 -0700 Subject: [PATCH 186/531] Remove usages of extract_constant_segment=True Differential Revision: D61995885 Pull Request resolved: https://github.com/pytorch/executorch/pull/5082 --- examples/mediatek/model_export_scripts/llama.py | 1 - examples/portable/scripts/export.py | 2 +- exir/_serialize/_program.py | 2 +- exir/_serialize/test/test_program.py | 15 ++++----------- extension/llm/export/builder.py | 1 - 5 files changed, 6 insertions(+), 15 deletions(-) diff --git a/examples/mediatek/model_export_scripts/llama.py b/examples/mediatek/model_export_scripts/llama.py index 9c371f46ca6..980a502c5ae 100644 --- a/examples/mediatek/model_export_scripts/llama.py +++ b/examples/mediatek/model_export_scripts/llama.py @@ -369,7 +369,6 @@ def export_to_et_ir( alloc_graph_input=False, alloc_graph_output=False, ), - extract_constant_segment=True, extract_delegate_segments=True, ) ) diff --git a/examples/portable/scripts/export.py b/examples/portable/scripts/export.py index 7849fa06ccd..6055ecef0f3 100644 --- a/examples/portable/scripts/export.py +++ b/examples/portable/scripts/export.py @@ -62,7 +62,7 @@ def main() -> None: *MODEL_NAME_TO_MODEL[args.model_name] ) - backend_config = ExecutorchBackendConfig(extract_constant_segment=True) + backend_config = ExecutorchBackendConfig() if args.segment_alignment is not None: backend_config.segment_alignment = int(args.segment_alignment, 16) if ( diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index d22de71d1c2..24ee6bd21a3 100644 --- 
a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -347,7 +347,7 @@ def serialize_pte_binary( *, mutable_data: Optional[List[Buffer]] = None, extract_delegate_segments: bool = False, - extract_constant_segment: bool = False, + extract_constant_segment: bool = True, segment_alignment: int = 128, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index 09927ad9648..c4f4df0d0b2 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -169,7 +169,6 @@ def constant_segment_with_tensor_alignment( pte_data = bytes( serialize_pte_binary( program, - extract_constant_segment=True, segment_alignment=SEGMENT_ALIGNMENT, constant_tensor_alignment=constant_tensor_alignment, ) @@ -427,16 +426,12 @@ def test_round_trip_large_buffer_sizes(self) -> None: def test_round_trip_no_segments_and_no_header(self) -> None: """Tests that a Program serialized with extract_delegate_segments=True - or extract_constant_segment=True, when there are no segments, does not - contain an extended header, constant segment, or delegate segments. Confirm - that a Program remains the same after serializing and deserializing. + when there are no segments does not contain an extended header, + constant segment, or delegate segments. Confirm that a Program remains + the same after serializing and deserializing. """ program = get_test_program() - pte_data = bytes( - serialize_pte_binary( - program, extract_delegate_segments=True, extract_constant_segment=True - ) - ) + pte_data = bytes(serialize_pte_binary(program, extract_delegate_segments=True)) self.assertGreater(len(pte_data), 16) # File magic should be present at the expected offset. 
@@ -637,7 +632,6 @@ def test_constant_segment_tensor_alignment_non_power_of_2_fails(self) -> None: with self.assertRaises(ValueError): serialize_pte_binary( program, - extract_constant_segment=True, segment_alignment=SEGMENT_ALIGNMENT, constant_tensor_alignment=constant_tensor_alignment, ) @@ -662,7 +656,6 @@ def test_constant_segment_and_delegate_segment(self) -> None: serialize_pte_binary( program, extract_delegate_segments=True, - extract_constant_segment=True, segment_alignment=SEGMENT_ALIGNMENT, constant_tensor_alignment=CONSTANT_TENSOR_ALIGNMENT, ) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index eccb3317e7f..4f5bab7bc02 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -261,7 +261,6 @@ def to_executorch(self) -> "LLMEdgeManager": assert self.edge_manager, "Need to run export_to_edge() first" self.export_program = self.edge_manager.to_executorch( ExecutorchBackendConfig( - extract_constant_segment=True, extract_delegate_segments=True, passes=[ QuantFusionPass(), From 6ec534230bf943b4ed8ad26a9c71cf39ea633ecc Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 4 Sep 2024 16:22:44 -0700 Subject: [PATCH 187/531] TrainingModule Differential Revision: D62140852 Pull Request resolved: https://github.com/pytorch/executorch/pull/5077 --- extension/module/module.h | 4 +- extension/training/module/TARGETS | 8 ++ extension/training/module/targets.bzl | 28 ++++ extension/training/module/test/TARGETS | 8 ++ extension/training/module/test/targets.bzl | 34 +++++ .../module/test/training_module_test.cpp | 107 ++++++++++++++ extension/training/module/training_module.cpp | 135 ++++++++++++++++++ extension/training/module/training_module.h | 102 +++++++++++++ 8 files changed, 425 insertions(+), 1 deletion(-) create mode 100644 extension/training/module/TARGETS create mode 100644 extension/training/module/targets.bzl create mode 100644 extension/training/module/test/TARGETS create mode 100644 extension/training/module/test/targets.bzl create mode 100644 extension/training/module/test/training_module_test.cpp create mode 100644 extension/training/module/training_module.cpp create mode 100644 extension/training/module/training_module.h diff --git a/extension/module/module.h b/extension/module/module.h index 052489fb331..c1fe11147f7 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -22,7 +22,7 @@ namespace extension { /** * A facade class for loading programs and executing methods within them. */ -class Module final { +class Module { public: /** * Enum to define loading behavior. @@ -337,6 +337,8 @@ class Module final { std::unique_ptr memory_allocator_; std::unique_ptr temp_allocator_; std::unique_ptr event_tracer_; + + protected: std::unordered_map methods_; friend class ExecuTorchJni; diff --git a/extension/training/module/TARGETS b/extension/training/module/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/extension/training/module/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
+ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/training/module/targets.bzl b/extension/training/module/targets.bzl new file mode 100644 index 00000000000..88da84ed131 --- /dev/null +++ b/extension/training/module/targets.bzl @@ -0,0 +1,28 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + for aten_mode in (True, False): + aten_suffix = ("_aten" if aten_mode else "") + + runtime.cxx_library( + name = "training_module" + aten_suffix, + srcs = [ + "training_module.cpp", + ], + exported_headers = [ + "training_module.h", + ], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/extension/module:module" + aten_suffix, + "//executorch/runtime/core:evalue" + aten_suffix, + ], + ) diff --git a/extension/training/module/test/TARGETS b/extension/training/module/test/TARGETS new file mode 100644 index 00000000000..a6c52d105f6 --- /dev/null +++ b/extension/training/module/test/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets(is_fbcode = True) diff --git a/extension/training/module/test/targets.bzl b/extension/training/module/test/targets.bzl new file mode 100644 index 00000000000..8b260e2a7e8 --- /dev/null +++ b/extension/training/module/test/targets.bzl @@ -0,0 +1,34 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(is_fbcode = False): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # TODO(dbort): Find a way to make these run for ANDROID/APPLE in xplat. The + # android and ios test determinators don't like the reference to the model + # file in fbcode. See https://fburl.com/9esapdmd + if not runtime.is_oss and is_fbcode: + modules_env = { + # The tests use this var to find the program file to load. This uses + # an fbcode target path because the authoring/export tools + # intentionally don't work in xplat (since they're host-only tools). + "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", + "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", + } + + runtime.cxx_test( + name = "training_module_test", + srcs = [ + "training_module_test.cpp", + ], + deps = [ + "//executorch/extension/training/module:training_module", + "//executorch/extension/data_loader:file_data_loader", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/kernels/portable:generated_lib", + ], + env = modules_env, + ) diff --git a/extension/training/module/test/training_module_test.cpp b/extension/training/module/test/training_module_test.cpp new file mode 100644 index 00000000000..58631c4cf44 --- /dev/null +++ b/extension/training/module/test/training_module_test.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include + +// @lint-ignore-every CLANGTIDY facebook-hte-CArray + +using namespace ::testing; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::Error; +using torch::executor::Span; +using torch::executor::testing::TensorFactory; + +class TrainingModuleTest : public ::testing::Test { + protected: + void SetUp() override { + torch::executor::runtime_init(); + } +}; + +TEST_F(TrainingModuleTest, JointGraphTest) { + // Create a loader for the serialized ModuleAdd program. + const char* path = std::getenv("ET_MODULE_SIMPLE_TRAIN_PATH"); + executorch::runtime::Result + loader_res = torch::executor::util::FileDataLoader::from(path); + ASSERT_EQ(loader_res.error(), Error::Ok); + auto loader = std::make_unique( + std::move(loader_res.get())); + + auto mod = executorch::extension::training::TrainingModule(std::move(loader)); + + TensorFactory tf; + Tensor input = tf.make({3}, {1.0, 1.0, 1.0}); + Tensor label = tf.make({3}, {1.0, 0.0, 0.0}); + + std::vector inputs; + inputs.push_back(input); + inputs.push_back(label); + + auto res = mod.execute_forward_backward("forward", inputs); + ASSERT_EQ(res.error(), Error::Ok); + ASSERT_EQ(res.get().size(), 1); + + // Test Gradients + auto grad_res = mod.named_gradients("forward"); + ASSERT_EQ(grad_res.error(), Error::Ok); + auto& grad = grad_res.get(); + ASSERT_EQ(grad.size(), 2); + ASSERT_NE(grad.find("linear.weight"), grad.end()); + ASSERT_NE(grad.find("linear.bias"), grad.end()); + + ASSERT_EQ(grad.find("linear.weight")->second.sizes()[0], 3); + ASSERT_EQ(grad.find("linear.weight")->second.sizes()[1], 3); + ASSERT_EQ(grad.find("linear.weight")->second.dim(), 2); + ASSERT_EQ(grad.find("linear.bias")->second.sizes()[0], 3); + ASSERT_EQ(grad.find("linear.bias")->second.dim(), 1); + + // Test Parameters + auto param_res = mod.named_parameters("forward"); + ASSERT_EQ(param_res.error(), Error::Ok); + auto& param = grad_res.get(); + ASSERT_EQ(param.size(), 2); + ASSERT_NE(param.find("linear.weight"), grad.end()); + ASSERT_NE(param.find("linear.bias"), grad.end()); + + ASSERT_EQ(param.find("linear.weight")->second.sizes()[0], 3); + ASSERT_EQ(param.find("linear.weight")->second.sizes()[1], 3); + ASSERT_EQ(param.find("linear.weight")->second.dim(), 2); + ASSERT_EQ(param.find("linear.bias")->second.sizes()[0], 3); + ASSERT_EQ(param.find("linear.bias")->second.dim(), 1); +} + +TEST_F(TrainingModuleTest, NonTrainingModuleTest) { + // Create a loader for the serialized ModuleAdd program. + const char* path = std::getenv("ET_MODULE_ADD_PATH"); + executorch::runtime::Result + loader_res = torch::executor::util::FileDataLoader::from(path); + ASSERT_EQ(loader_res.error(), Error::Ok); + auto loader = std::make_unique( + std::move(loader_res.get())); + + auto mod = executorch::extension::training::TrainingModule(std::move(loader)); + + TensorFactory tf; + Tensor input = tf.make({2, 2}, {1.0, 1.0, 1.0, 1.0}); + Tensor input2 = tf.make({2, 2}, {1.0, 0.0, 0.0, 0.0}); + + std::vector inputs; + inputs.push_back(input); + inputs.push_back(input2); + + // Non-training module should fail to execute forward/backward as it cant find + // the gradients or params. 
+ auto res = mod.execute_forward_backward("forward", inputs); + ASSERT_EQ(res.error(), Error::InvalidArgument); +} diff --git a/extension/training/module/training_module.cpp b/extension/training/module/training_module.cpp new file mode 100644 index 00000000000..7b38292fd1f --- /dev/null +++ b/extension/training/module/training_module.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace extension { +namespace training { + +namespace { +std::string gradients_method_prefix = "__et_training_gradients_index_"; +std::string parameters_method_prefix = "__et_training_parameters_index_"; +std::string fqn_method_prefix = "__et_training_fqn_"; +} // namespace + +runtime::Result> +TrainingModule::execute_forward_backward( + const std::string& method_name, + const std::vector& input) { + // Find where the user outputs end. + const std::string gradients_method_name = + gradients_method_prefix + method_name; + auto res = executorch::extension::Module::execute(gradients_method_name); + if (!res.ok()) { + return res.error(); + } + uint64_t grad_start = res.get()[0].toInt(); + + const std::string parameters_method_name = + parameters_method_prefix + method_name; + // get params start. + auto param_res = + executorch::extension::Module::execute(parameters_method_name); + if (!param_res.ok()) { + return param_res.error(); + } + + uint64_t param_start = param_res.get()[0].toInt(); + + // Execute the forward and backward pass. + + auto outputs = torch::executor::Module::execute(method_name, input); + if (!outputs.ok()) { + return outputs.error(); + } + + // Extract the user outputs. + std::vector user_outputs; + user_outputs.reserve(grad_start); + for (size_t i = 0; i < grad_start; ++i) { + user_outputs.push_back(outputs.get().at(i)); + } + + // Extract and store the gradients. + if (method_named_gradients_.find(method_name) == + method_named_gradients_.end()) { + method_named_gradients_.insert({method_name, {}}); + + auto& gradients_map = method_named_gradients_.at(method_name); + // Get names. + const std::string fqn_method_name = fqn_method_prefix + method_name; + auto fqn_res = executorch::extension::Module::execute(fqn_method_name); + if (!fqn_res.ok()) { + return fqn_res.error(); + } + const auto& fqn_list = fqn_res.get(); + + // Only have to initialize the dict once because the tensors in the dict and + // the tensors in the method alias the same TensorImpl, so updating one will + // update the other. + size_t name_index = 0; + for (size_t grad_index = grad_start; grad_index < param_start; + ++grad_index, ++name_index) { + exec_aten::string_view fqn = fqn_list.at(name_index).toString(); + gradients_map.insert({fqn, outputs.get().at(grad_index).toTensor()}); + } + } + + return user_outputs; +} + +runtime::Result> +TrainingModule::named_parameters(const std::string& method_name) { + std::map named_parameters; + const std::string fqn_method_name = fqn_method_prefix + method_name; + const std::string parameters_method_name = + parameters_method_prefix + method_name; + + // get names. + auto fqn_res = executorch::extension::Module::execute(fqn_method_name); + if (!fqn_res.ok()) { + return fqn_res.error(); + } + const auto& fqn_list = fqn_res.get(); + + // get params start. 
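A rough end-to-end sketch of how the new module is meant to be driven, mirroring the flow in training_module_test.cpp; train_one_step and the parameter-update step are illustrative, not part of this patch:

#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/training/module/training_module.h>

executorch::runtime::Error train_one_step(
    const char* model_path,
    exec_aten::Tensor input,
    exec_aten::Tensor label) {
  auto loader_res = torch::executor::util::FileDataLoader::from(model_path);
  if (!loader_res.ok()) {
    return loader_res.error();
  }
  auto loader = std::make_unique<torch::executor::util::FileDataLoader>(
      std::move(loader_res.get()));
  auto mod =
      executorch::extension::training::TrainingModule(std::move(loader));

  std::vector<executorch::runtime::EValue> inputs = {input, label};
  // Runs the joint graph; user outputs (e.g. the loss) come back here and the
  // gradients are cached inside the module.
  auto outputs = mod.execute_forward_backward("forward", inputs);
  if (!outputs.ok()) {
    return outputs.error();
  }
  auto grads = mod.named_gradients("forward");    // fqn -> gradient tensor
  auto params = mod.named_parameters("forward");  // fqn -> parameter tensor
  // A caller would then update each parameter in `params` from the matching
  // gradient in `grads`, e.g. with the SGD optimizer under
  // extension/training/optimizer.
  return executorch::runtime::Error::Ok;
}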
+ auto param_res = + executorch::extension::Module::execute(parameters_method_name); + if (!param_res.ok()) { + return param_res.error(); + } + + uint64_t param_start = param_res.get()[0].toInt(); + + auto& method = methods_.at(method_name).method; + + // create dict + size_t name_index = 0; + for (size_t param_index = param_start; param_index < method->outputs_size(); + ++param_index, ++name_index) { + exec_aten::string_view fqn = fqn_list.at(name_index).toString(); + exec_aten::Tensor param = method->get_output(param_index).toTensor(); + named_parameters.insert({fqn, param}); + } + return named_parameters; +} + +runtime::Result> +TrainingModule::named_gradients(const std::string& method_name) { + if (method_named_gradients_.find(method_name) == + method_named_gradients_.end()) { + ET_LOG(Error, "No gradients found for method %s", method_name.c_str()); + return executorch::runtime::Error::InvalidArgument; + } + return method_named_gradients_.at(method_name); +} + +} // namespace training +} // namespace extension +} // namespace executorch diff --git a/extension/training/module/training_module.h b/extension/training/module/training_module.h new file mode 100644 index 00000000000..7571aacecf6 --- /dev/null +++ b/extension/training/module/training_module.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace executorch { +namespace extension { +namespace training { + +/** + * A facade class for loading programs for on-device training and executing + * methods within them. + */ +class ET_EXPERIMENTAL TrainingModule final : executorch::extension::Module { + public: + explicit TrainingModule( + std::unique_ptr data_loader, + std::unique_ptr memory_allocator = nullptr, + std::unique_ptr temp_allocator = nullptr, + std::unique_ptr event_tracer = nullptr) + : executorch::extension::Module( + std::move(data_loader), + std::move(memory_allocator), + std::move(temp_allocator), + std::move(event_tracer)), + method_named_gradients_({}) {} + + explicit TrainingModule(const Module&) = delete; + TrainingModule& operator=(const Module&) = delete; + explicit TrainingModule(Module&&) = delete; + TrainingModule& operator=(Module&&) = delete; + + /** + * Execute a specific method with the given input and retrieve output. Only + * valid if the specified method is a joint graph. Loads the program and + * method before executing if needed. + * + * @param[in] method_name The name of the joint graph method to execute. + * @param[in] input A vector of input values to be passed to the method. + * + * @returns A Result object containing the output values from the method or an + * error to indicate failure. + */ + ET_EXPERIMENTAL runtime::Result> + execute_forward_backward( + const std::string& method_name, + const std::vector& input); + + /** + * Retrieve the trainable parameters for a joint graph method. + * + * @param[in] method_name The name of the joint graph method to get the + * parameters for. + * + * @returns A Result object containing a map of the fully qualified name to + * parameter tensor, or an error if the method is not a joint graph or has not + * been executed yet. 
+ */ + ET_EXPERIMENTAL + runtime::Result> + named_parameters(const std::string& method_name); + + /** + * Retrieve the latest gradients for a joint graph method. + * + * @param[in] method_name The name of the joint graph method to get the + * gradients for. + * + * @returns A Result object containing a map of the fully qualified name to + * gradient tensor associated with that parameter from the latest + * forward_backward execution, or an error if the method is not a joint graph + * or has not been executed yet. + */ + ET_EXPERIMENTAL + runtime::Result> + named_gradients(const std::string& method_name); + + private: + std::unordered_map< + std::string, + std::map> + method_named_gradients_; +}; + +} // namespace training +} // namespace extension +} // namespace executorch From e793795d88f6f3ee15494227dbb7d1560899d9da Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Wed, 4 Sep 2024 19:22:48 -0400 Subject: [PATCH 188/531] [ET-VK] Add `TmpTensorVRef` struct to recycle temporary tensor memory Differential Revision: D62144398 Pull Request resolved: https://github.com/pytorch/executorch/pull/5041 --- .../vulkan/runtime/graph/ComputeGraph.cpp | 66 +++++++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 81 +++++++++++++++ .../vulkan/runtime/graph/containers/Value.h | 5 + .../vulkan/test/vulkan_compute_api_test.cpp | 99 +++++++++++++++++++ 4 files changed, 251 insertions(+) diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index c22241940f8..8a9ec370f6d 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -47,6 +47,72 @@ VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) #undef VALUE_PTR_CLASS_IMPL +// +// TmpTensor +// + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor( + sizes, + dtype, + storage_type, + memory_layout, + sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, storage_type, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, memory_layout, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, sobj_idx)) {} + +TmpTensor::~TmpTensor() { + // Lifetime of this temporary tensor is expired; return the shared object to + // the pool, as long as the sobj index is valid + if (sobj_idx >= 0) { + graph_p->tmp_shared_object_idxs_.emplace(sobj_idx); + } +} + +int64_t TmpTensor::get_sobj_idx() { + int64_t sobj_idx; + // If no available temporary shared objects, request a new one to be created + if (graph_p->tmp_shared_object_idxs_.empty()) { + sobj_idx = graph_p->shared_objects_.size(); + } else { + // Get the first available shared object idx + sobj_idx = graph_p->tmp_shared_object_idxs_.top(); + graph_p->tmp_shared_object_idxs_.pop(); + } + 
return sobj_idx; +} + // // ComputeGraph // diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index ac5e0d6c9d1..210b03c4cad 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -11,6 +11,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName #include +#include #include @@ -67,6 +68,79 @@ DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); #undef DECL_VALUE_PTR_CLASS +// +// TmpTensor +// + +/* + * This struct is used to recycle the memory of temporary tensors that are + * created during the execution of a node. Upon construction, this struct will + * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance + * if any shared objects are available; if not, then a new one is created. A + * tensor value is then added to the `ComputeGraph` instance with the requested + * specifications. Upon destruction, the shared object index of the temporary + * tensor is returned to `tmp_shared_object_idxs_`. + * + * Note that instances of this struct can be used as if they were `ValueRef` due + * to implementation of a custom casting operator. + * + * This class should only be used to create tensors whose lifetimes exist only + * in a well defined scope (i.e. within a function). + */ +struct TmpTensor { + ComputeGraph* graph_p; + int64_t sobj_idx; + ValueRef vref; + + // + // Match all available overloads of `add_tensor` + // + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype); + + // No copy construction or assignment + TmpTensor(TmpTensor& other) = delete; + TmpTensor& operator=(TmpTensor& other) = delete; + + // No move construction or assignment + TmpTensor(TmpTensor&& other) = delete; + TmpTensor& operator=(TmpTensor&& other) = delete; + + // Custom cast to ValueRef + operator ValueRef() const { + return vref; + }; + + ~TmpTensor(); + + private: + // Helper function to get first available shared object index or request a new + // one to be created. + int64_t get_sobj_idx(); +}; + // // ComputeGraph // @@ -94,7 +168,12 @@ class ComputeGraph final { vkapi::DescriptorPoolConfig execute_descriptor_counts_; std::unique_ptr context_; + std::vector shared_objects_; + // This stack is used by `TmpTensor` instances to recycle shared objects + // for temporary tensors. 
See the comments of `TmpTensor` for more details + std::stack tmp_shared_object_idxs_; + std::vector values_; std::vector param_ubos_; @@ -593,6 +672,8 @@ class ComputeGraph final { friend class BoolListPtr; friend class ValueListPtr; friend class SymIntPtr; + + friend struct TmpTensor; }; template diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index 50a2b5e548c..8773f0c0b04 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -29,6 +29,11 @@ inline bool is_valid(ValueRef value_ref) { struct IOValueRef { ValueRef value; ValueRef staging; + + // Custom cast to ValueRef + operator ValueRef() const { + return value; + }; }; /* diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index fc5d1f8214b..1112548b855 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1518,6 +1518,105 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) { + GraphConfig config; + ComputeGraph graph(config); + + std::vector size_big = {8, 64, 124}; + std::vector size_small = {8, 1, 124}; + + // Build graph + + IOValueRef a = graph.add_input_tensor( + size_big, vkapi::kFloat, /*shared_object_idx = */ 0); + IOValueRef b = graph.add_input_tensor( + size_small, vkapi::kFloat, /*shared_object_idx = */ 1); + + IOValueRef out = {}; + + out.value = + graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2); + + // Perform the following compute + // + // a, b, out; + // { + // inter; + // { + // tmp = a + b + // tmp2 = tmp + a + // inter = tmp2 + b + // } + // { + // tmp = inter + b; + // tmp2 = tmp + a + // out = tmp2 + b; + // } + // } + { + TmpTensor inter(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(inter.sobj_idx == 3); + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {a, b, kDummyValueRef, tmp}); + + TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, inter}); + } + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {inter, b, kDummyValueRef, tmp}); + + TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, out}); + } + } + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + float val_a = i + 2.0f; + float val_b = i + 1.5f; + float val_tmp = val_a + val_b; + float val_tmp2 = val_tmp + val_a; + float val_inter = val_tmp2 + val_b; + float val_tmp_2 = val_inter + val_b; + float val_tmp2_2 = val_tmp_2 + val_a; + float val_out = val_tmp2_2 + val_b; + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + 
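// A minimal usage sketch of the intended TmpTensor pattern inside an op
// implementation (the helper name `add_twice_example` is hypothetical and not
// taken from the patch; the operator lookup mirrors the test above). The
// shared object borrowed by `tmp` is pushed back onto
// `tmp_shared_object_idxs_` when `tmp` goes out of scope, so later
// temporaries of compatible size can reuse the same memory.
static void add_twice_example(
    ComputeGraph& graph,
    const std::vector<int64_t>& sizes,
    ValueRef a,
    ValueRef b,
    ValueRef out) {
  // Borrows (or creates) a shared object index for the temporary tensor.
  TmpTensor tmp(&graph, sizes, vkapi::kFloat);
  // TmpTensor converts implicitly to ValueRef via its custom cast operator.
  VK_GET_OP_FN("aten.add.Tensor")(graph, {a, b, kDummyValueRef, tmp});
  VK_GET_OP_FN("aten.add.Tensor")(graph, {tmp, b, kDummyValueRef, out});
}  // ~TmpTensor() runs here and returns tmp's shared object index to the pool.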
TEST(VulkanComputeGraphTest, test_large_graph) { auto build_start_time = std::chrono::system_clock::now(); GraphConfig config; From 084659e0d4a4a47c53277f34df0bdb606e6bf7e1 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:00:02 -0700 Subject: [PATCH 189/531] Update script to build and upload MiniBench artifacts Build this app during CI. We can start trying it with ``` adb shell am start -n org.pytorch.minibench/org.pytorch.minibench.BenchmarkActivity --es model_path /data/local/tmp/model.pte adb shell run-as org.pytorch.minibench cat files/benchmark_results.txt ``` Pull Request resolved: https://github.com/pytorch/executorch/pull/5017 --- build/build_android_llm_demo.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 7b7150de210..38efa05b745 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -91,12 +91,18 @@ build_aar() { popd } -build_android_llm_demo_app() { +build_android_demo_apps() { mkdir -p examples/demo-apps/android/LlamaDemo/app/libs cp ${BUILD_AAR_DIR}/executorch-llama.aar examples/demo-apps/android/LlamaDemo/app/libs pushd examples/demo-apps/android/LlamaDemo ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest popd + + mkdir -p extension/android/benchmark/app/libs + cp ${BUILD_AAR_DIR}/executorch.aar extension/android/benchmark/app/libs + pushd extension/android/benchmark + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build + popd } collect_artifacts_to_be_uploaded() { @@ -116,6 +122,10 @@ collect_artifacts_to_be_uploaded() { # Collect JAR and AAR cp extension/android/build/libs/executorch.jar "${DEMO_APP_DIR}" find "${BUILD_AAR_DIR}/" -name 'executorch*.aar' -exec cp {} "${DEMO_APP_DIR}" \; + # Collect MiniBench APK + MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench" + mkdir -p "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" } BUILD_AAR_DIR="$(mktemp -d)" @@ -130,5 +140,5 @@ for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do build_android_native_library ${ANDROID_ABI} done build_aar -build_android_llm_demo_app +build_android_demo_apps collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME} From cd1c833b079adb324871dcbbe75b43d42ffc0ade Mon Sep 17 00:00:00 2001 From: Riandy Date: Wed, 4 Sep 2024 17:03:06 -0700 Subject: [PATCH 190/531] Unified Android aar support for llava and llama models Differential Revision: D61406255 Pull Request resolved: https://github.com/pytorch/executorch/pull/5086 --- .../example/executorchllamademo/ETImage.java | 16 ++++++-- .../executorchllamademo/MainActivity.java | 38 +++++++++++++++++-- .../executorchllamademo/ModelUtils.java | 28 ++++++++++++++ .../executorchllamademo/PromptFormat.java | 2 + 4 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java index cf3c3e5f0a5..e68c8472626 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java @@ -46,6 +46,16 @@ public 
byte[] getBytes() { return bytes; } + public int[] getInts() { + // We need to convert the byte array to an int array because + // the runner expects an int array as input. + int[] intArray = new int[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + intArray[i] = (bytes[i++] & 0xFF); + } + return intArray; + } + private byte[] getBytesFromImageURI(Uri uri) { try { int RESIZED_IMAGE_WIDTH = 336; @@ -72,9 +82,9 @@ private byte[] getBytesFromImageURI(Uri uri) { int blue = Color.blue(color); // Store the RGB values in the byte array - rgbValues[(y * width + x) * 3] = (byte) red; - rgbValues[(y * width + x) * 3 + 1] = (byte) green; - rgbValues[(y * width + x) * 3 + 2] = (byte) blue; + rgbValues[y * width + x] = (byte) red; + rgbValues[(y * width + x) + height * width] = (byte) green; + rgbValues[(y * width + x) + 2 * height * width] = (byte) blue; } } return rgbValues; diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 70936e17d84..f24254efb31 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -102,7 +102,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera mMessageAdapter.notifyDataSetChanged(); }); long runStartTime = System.currentTimeMillis(); - mModule = new LlamaModule(modelPath, tokenizerPath, temperature); + mModule = + new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); int loadResult = mModule.load(); long loadDuration = System.currentTimeMillis() - runStartTime; String modelLoadError = ""; @@ -552,8 +557,6 @@ private void onModelRunStopped() { mSendButton.setOnClickListener( view -> { addSelectedImagesToChatThread(mSelectedImageUri); - // TODO: When ET supports multimodal, this is where we will add the images as part of the - // prompt. List processedImageList = getProcessedImagesForModel(mSelectedImageUri); processedImageList.forEach( image -> { @@ -599,7 +602,34 @@ public void run() { }); ETLogging.getInstance().log("Running inference.. prompt=" + prompt); long generateStartTime = System.currentTimeMillis(); - mModule.generate(prompt, MainActivity.this); + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + == ModelUtils.VISION_MODEL) { + if (!processedImageList.isEmpty()) { + // For now, Llava only support 1 image. 
+ ETImage img = processedImageList.get(0); + mModule.generate( + processedImageList.get(0).getInts(), + img.getWidth(), + img.getHeight(), + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + prompt, + ModelUtils.VISION_MODEL_SEQ_LEN, + MainActivity.this); + } else { + // no image selected, we pass in empty int array + mModule.generate( + new int[0], + 0, + 0, + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + prompt, + ModelUtils.VISION_MODEL_SEQ_LEN, + MainActivity.this); + } + } else { + mModule.generate(prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, MainActivity.this); + } + long generateDuration = System.currentTimeMillis() - generateStartTime; mResultMessage.setTotalGenerationTime(generateDuration); runOnUiThread( diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java new file mode 100644 index 00000000000..ab1f1bc92fc --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public class ModelUtils { + static final int TEXT_MODEL = 1; + static final int VISION_MODEL = 2; + static final int VISION_MODEL_IMAGE_CHANNELS = 3; + static final int VISION_MODEL_SEQ_LEN = 768; + static final int TEXT_MODEL_SEQ_LEN = 256; + + public static int getModelCategory(ModelType modelType) { + switch (modelType) { + case LLAVA_1_5: + return VISION_MODEL; + case LLAMA_3: + case LLAMA_3_1: + default: + return TEXT_MODEL; + } + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 72990f4ea8b..a077f4d677f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -21,6 +21,7 @@ public static String getSystemPromptTemplate(ModelType modelType) { + SYSTEM_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: + return "USER: "; default: return SYSTEM_PLACEHOLDER; } @@ -35,6 +36,7 @@ public static String getUserPromptTemplate(ModelType modelType) { + "<|eot_id|>\n" + "<|start_header_id|>assistant<|end_header_id|>"; case LLAVA_1_5: + return USER_PLACEHOLDER + " ASSISTANT:"; default: return USER_PLACEHOLDER; } From a8c592e98a363ea3e52a3d3424c9f16abd722bf9 Mon Sep 17 00:00:00 2001 From: hsharma35 Date: Wed, 4 Sep 2024 17:59:24 -0700 Subject: [PATCH 191/531] Buckify backends/arm for meta internal use. 
Differential Revision: D62062674 Pull Request resolved: https://github.com/pytorch/executorch/pull/5023 --- backends/arm/TARGETS | 83 +++++++++++++++++++ backends/arm/arm_backend.py | 2 +- backends/arm/arm_vela.py | 22 ++--- backends/arm/operators/TARGETS | 34 ++++++++ backends/arm/operators/op_bmm.py | 1 + backends/arm/operators/op_conv2d.py | 7 +- backends/arm/operators/op_mm.py | 1 + backends/arm/operators/op_mul.py | 10 ++- backends/arm/operators/op_output.py | 4 +- backends/arm/operators/op_view.py | 2 +- backends/arm/passes/TARGETS | 12 +++ .../annotate_channels_last_dim_order_pass.py | 4 +- backends/arm/passes/arm_pass_manager.py | 4 +- .../passes/convert_expand_copy_to_repeat.py | 4 +- .../arm/passes/size_adjust_conv2d_pass.py | 6 +- backends/arm/quantizer/TARGETS | 31 +++++++ backends/arm/quantizer/arm_quantizer_utils.py | 10 ++- .../quantizer/quantization_annotation/TARGETS | 12 +++ .../quantization_annotation/cat_annotator.py | 4 +- backends/arm/tosa_quant_utils.py | 10 ++- backends/arm/tosa_utils.py | 11 ++- 21 files changed, 232 insertions(+), 42 deletions(-) create mode 100644 backends/arm/TARGETS create mode 100644 backends/arm/operators/TARGETS create mode 100644 backends/arm/passes/TARGETS create mode 100644 backends/arm/quantizer/TARGETS create mode 100644 backends/arm/quantizer/quantization_annotation/TARGETS diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS new file mode 100644 index 00000000000..220db373710 --- /dev/null +++ b/backends/arm/TARGETS @@ -0,0 +1,83 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_partitioner", + srcs = [ + "arm_partitioner.py", + ], + typing = True, + deps = [ + ":arm_backend", + "//executorch/backends/arm/passes:passes", + "//executorch/exir:lib", + ], +) + +python_library( + name = "arm_backend", + srcs = [ + "arm_backend.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/flatbuffers:flatbuffers", + "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":arm_vela", + "//executorch/backends/arm/operators:lib", + "//executorch/backends/arm/operators:node_visitor", + "//executorch/backends/arm/passes:passes", + ], +) + +python_library( + name = "arm_vela", + srcs = [ + "arm_vela.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/ethos-u-vela:ethos-u-vela", + ], +) + +python_library( + name = "tosa_mapping", + srcs = [ + "tosa_mapping.py", + ], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "//caffe2:torch", + ], +) + +python_library( + name = "tosa_quant_utils", + srcs = [ + "tosa_quant_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/numpy:numpy", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":tosa_mapping", + "//executorch/exir/dialects:lib", + ], +) + +python_library( + name = "tosa_utils", + srcs = [ + "tosa_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + ":tosa_quant_utils", + "//executorch/backends/arm/operators:node_visitor", + ], +) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index f187191fee0..27fd36ca0e1 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -159,7 +159,7 @@ def is_tosa(compile_spec: 
List[CompileSpec]) -> bool: return False -def get_intermediate_path(compile_spec: List[CompileSpec]) -> str: +def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]: for spec in compile_spec: if spec.key == "debug_artifact_path": return spec.value.decode() diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index f387672b7b4..53533947c49 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -5,12 +5,12 @@ import os import struct -import subprocess import tempfile from typing import List import numpy as np +from ethosu.vela import vela # Pack either input or output tensor block, compose the related arrays into @@ -38,21 +38,17 @@ def vela_compile(tosa_graph, args: List[str]): with tempfile.TemporaryDirectory() as tmpdir: tosaname = "out.tosa" flatbuffer = tosa_graph.serialize() - with open(os.path.join(tmpdir, tosaname), "wb") as f: + tosa_path = os.path.join(tmpdir, tosaname) + with open(tosa_path, "wb") as f: f.write(flatbuffer) # invoke vela - vela_command = f"cd {tmpdir}; vela {' '.join(args)} {tosaname}" - try: - subprocess.run([vela_command], shell=True, check=True, capture_output=True) - except subprocess.CalledProcessError as process_error: - raise RuntimeError( - f"Vela compiler ('{vela_command}') failed with error:\n \ - {process_error.stderr.decode()}\n \ - Stdout:\n{process_error.stdout.decode()}" - ) - - np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + output_dir = os.path.join(tmpdir, "output") + args.append(f"--output-dir={output_dir}") + args.append(tosa_path) + vela.main(" ".join(args).split(" ")) + + np_path = os.path.join(output_dir, "out_sg0_vela.npz") blocks = b"" with np.load(np_path, allow_pickle=False) as data: diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS new file mode 100644 index 00000000000..fd04d5fb847 --- /dev/null +++ b/backends/arm/operators/TARGETS @@ -0,0 +1,34 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "node_visitor", + srcs = ["node_visitor.py"], + typing = True, + deps = [ + "//executorch/backends/arm:tosa_mapping", + ], +) + +python_library( + name = "ops", + srcs = glob(["op_*.py"]), + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":node_visitor", + "//executorch/backends/arm:tosa_mapping", + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) + +python_library( + name = "lib", + srcs = ["__init__.py"], + typing = True, + deps = [ + ":node_visitor", + ":ops", + ], +) diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py index 8d0235ebe73..59f28d3bad8 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_bmm.py @@ -72,6 +72,7 @@ def define_node( build_rescale( tosa_fb=tosa_graph, scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. input_node=bmm_result, output_name=output.name, output_type=ts.DType.INT8, diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 9437e96f5e9..935c923ba42 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -2,7 +2,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import List +from typing import cast, List import serializer.tosa_serializer as ts import torch @@ -156,11 +156,12 @@ def define_node( # integer value domain of the next op. Otherwise return float32 output. if is_quant_node: # Get scale_factor from input, weight, and output. - _, input_scale, _, _, _, _ = getNodeArgs(node.args[0]) - _, weight_scale, _, _, _, _ = getNodeArgs(node.args[1]) + _, input_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[0])) + _, weight_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[1])) _, output_scale, output_zp, _, _, _ = getNodeArgs(list(node.users)[0]) build_rescale_conv_output( tosa_graph, + # pyre-fixme[61]: Uninitialized local [61]: Local variable `conv2d_res` is undefined, or not always defined. conv2d_res, output.name, actual_out_type, diff --git a/backends/arm/operators/op_mm.py b/backends/arm/operators/op_mm.py index f7097022f12..98152215035 100644 --- a/backends/arm/operators/op_mm.py +++ b/backends/arm/operators/op_mm.py @@ -96,6 +96,7 @@ def define_node( build_rescale( tosa_fb=tosa_graph, scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `reshape_intermediate` is undefined, or not always defined. input_node=reshape_intermediate, output_name=output.name, output_type=ts.DType.INT8, diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py index e9cbfcbd7cc..f7c593e9fe3 100644 --- a/backends/arm/operators/op_mul.py +++ b/backends/arm/operators/op_mul.py @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import List +from typing import cast, List import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils @@ -35,8 +35,12 @@ def define_node( if is_quant_node: input_A = inputs[0] input_B = inputs[1] - input_A_qargs = tqutils.get_quant_node_args(node.args[0]) - input_B_qargs = tqutils.get_quant_node_args(node.args[1]) + input_A_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[0]) + ) + input_B_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[1]) + ) input_A.shape = tutils.tosa_shape(input_A.shape, input_A.dim_order) input_B.shape = tutils.tosa_shape(input_B.shape, input_B.dim_order) diff --git a/backends/arm/operators/op_output.py b/backends/arm/operators/op_output.py index 7d163114aa8..89654ed2d48 100644 --- a/backends/arm/operators/op_output.py +++ b/backends/arm/operators/op_output.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import cast + import serializer.tosa_serializer as ts import torch @@ -11,7 +13,7 @@ def process_output( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, ): - for output in node.args[0]: + for output in cast(tuple[torch.fx.Node, ...], node.args[0]): tosa_graph.addOutputTensor( tosa_graph.currRegion.currBasicBlock.tensors[output.name] ) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 682eacd5e38..5baedfc9627 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -6,6 +6,7 @@ import serializer.tosa_serializer as ts import torch +import tosa.Op as TosaOp from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -13,7 +14,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape -from serializer.tosa_serializer import TosaOp @register_node_visitor diff --git a/backends/arm/passes/TARGETS b/backends/arm/passes/TARGETS new file mode 100644 index 00000000000..ca20b03fccd --- /dev/null +++ b/backends/arm/passes/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "passes", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py index ea3c171c580..8ba02c2f7e3 100644 --- a/backends/arm/passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/passes/annotate_channels_last_dim_order_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import cast + import torch from executorch.backends.arm.tosa_quant_utils import dq_op from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d @@ -28,7 +30,7 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): if node.target != dq_op: return False prev_node = node.args[0] - if prev_node.op != "placeholder": + if cast(torch.fx.Node, prev_node).op != "placeholder": return False return is_consumer_node_depthwise_conv2d(node) elif node.op == "placeholder": diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 8cac53b1347..914bf57aabc 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -23,11 +23,11 @@ class ArmPassManager(PassManager): - def _transform(self, graph_module: torch.fx.Graph): + def _transform(self, graph_module: torch.fx.GraphModule): return self(graph_module).graph_module def transform_to_backend_pipeline( - self, graph_module: torch.fx.Graph, compile_spec: CompileSpec + self, graph_module: torch.fx.GraphModule, compile_spec: list[CompileSpec] ): """Apply passes before transforming program to backend""" self.add_pass(SizeAdjustConv2DPass()) diff --git a/backends/arm/passes/convert_expand_copy_to_repeat.py b/backends/arm/passes/convert_expand_copy_to_repeat.py index 53138682d56..5f409e1ae5f 100644 --- a/backends/arm/passes/convert_expand_copy_to_repeat.py +++ b/backends/arm/passes/convert_expand_copy_to_repeat.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import cast + import torch.fx from executorch.backends.arm.tosa_mapping import extract_tensor_meta from executorch.exir.dialects._ops import ops as exir_ops @@ -31,7 +33,7 @@ def call(self, graph_module: torch.fx.GraphModule): expand_node = src_partition.nodes[0] _, shape, _ = extract_tensor_meta(expand_node.all_input_nodes[0].meta) - multiples = expand_node.args[1] + multiples = cast(tuple[int], expand_node.args[1]) expanded_rank = len(multiples) # Expanded shape is 'shape' front-padded with ones. diff --git a/backends/arm/passes/size_adjust_conv2d_pass.py b/backends/arm/passes/size_adjust_conv2d_pass.py index 25d27e7f40f..ea161b74928 100644 --- a/backends/arm/passes/size_adjust_conv2d_pass.py +++ b/backends/arm/passes/size_adjust_conv2d_pass.py @@ -85,8 +85,8 @@ def call(self, graph_module: torch.fx.GraphModule): input_node, weight, _, stride_hw, pad_hw, dilation_hw, _, _, _ = ( conv_node.args ) - weight_shape = weight.meta["val"].shape - input_shape = input_node.meta["val"].shape + weight_shape = cast(torch.fx.Node, weight).meta["val"].shape + input_shape = cast(torch.fx.Node, input_node).meta["val"].shape slice_args = [] for stride, pad, dilation, dim in zip( @@ -119,7 +119,7 @@ def call(self, graph_module: torch.fx.GraphModule): last_node = dq_node else: last_node = slice_node - conv_node.replace_input_with(input_node, last_node) + conv_node.replace_input_with(cast(torch.fx.Node, input_node), last_node) modified_graph = True if modified_graph: diff --git a/backends/arm/quantizer/TARGETS b/backends/arm/quantizer/TARGETS new file mode 100644 index 00000000000..840586488bf --- /dev/null +++ b/backends/arm/quantizer/TARGETS @@ -0,0 +1,31 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_quantizer", + srcs = ["arm_quantizer.py"], + typing = True, + deps = [ + ":arm_quantizer_utils", + "//caffe2:torch", + "//executorch/backends/arm/quantizer/quantization_annotation:quantization_annotation", + "//executorch/exir:lib", + ], +) + +python_library( + name = "quantization_config", + srcs = ["quantization_config.py"], + typing = True, + deps = [ + "//caffe2:torch", + ], +) + +python_library( + name = "arm_quantizer_utils", + srcs = ["arm_quantizer_utils.py"], + typing = True, + deps = [ + ":quantization_config", + ], +) diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index 417aa454a8e..1cac297bc92 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -10,7 +10,7 @@ # import operator -from typing import Callable, cast, List +from typing import Callable, cast, List, Union import torch from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig @@ -72,7 +72,7 @@ def get_shared_qspec( Both outputs are None if one of the inputs is a node that can't be quantized. 
""" - input_act0 = node.args[0] + input_act0 = cast(Node, node.args[0]) input_act1 = node.args[1] input_act_qspec = quantization_config.get_input_act_qspec() @@ -169,7 +169,9 @@ def propagate_annotation(model: GraphModule) -> None: n = cast(Node, n) if is_annotated(n): continue - if n.op != "call_function" or not is_share_obs_or_fq_op(n.target): + if n.op != "call_function" or not is_share_obs_or_fq_op( + cast(Callable, n.target) + ): continue prev_node = n.args[0] @@ -217,7 +219,7 @@ def convert_scalars_to_attrs(model: GraphModule) -> GraphModule: prefix = "_tensor_constant_" get_new_attr_name = get_new_attr_name_with_prefix(prefix) tensor_constant_name = get_new_attr_name(model) - float_tensor = torch.tensor(float(args[i])) + float_tensor = torch.tensor(float(cast(Union[int, float], args[i]))) model.register_buffer(tensor_constant_name, float_tensor) fake_mode = n.meta["val"].fake_mode with model.graph.inserting_before(n): diff --git a/backends/arm/quantizer/quantization_annotation/TARGETS b/backends/arm/quantizer/quantization_annotation/TARGETS new file mode 100644 index 00000000000..4ce8b5cad2c --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "quantization_annotation", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//caffe2:torch", + "//executorch/backends/arm/quantizer:arm_quantizer_utils", + "//executorch/backends/arm/quantizer:quantization_config", + ], +) diff --git a/backends/arm/quantizer/quantization_annotation/cat_annotator.py b/backends/arm/quantizer/quantization_annotation/cat_annotator.py index 40dd19526b3..992070ac172 100644 --- a/backends/arm/quantizer/quantization_annotation/cat_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/cat_annotator.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. import itertools -from typing import Callable, List, Optional +from typing import Callable, cast, List, Optional import torch.fx from executorch.backends.arm.quantizer import arm_quantizer_utils @@ -34,7 +34,7 @@ def _annotate_cat( if arm_quantizer_utils.is_annotated(cat_node): continue - input_acts = cat_node.args[0] + input_acts = cast(list[torch.fx.Node], cat_node.args[0]) input_act0 = input_acts[0] input_act_qspec = quantization_config.get_input_act_qspec() diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index c0d16d51b25..d93f2544070 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -6,15 +6,16 @@ # Utiliy functions for TOSA quantized lowerings import math -from typing import NamedTuple +from typing import NamedTuple, Sequence import numpy as np import serializer.tosa_serializer as ts import torch.fx +import tosa.Op as TosaOp from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg from executorch.exir.dialects._ops import ops as exir_ops -from serializer.tosa_serializer import TosaOp, TosaSerializerTensor +from serializer.tosa_serializer import TosaSerializerTensor from torch.fx import Node q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default @@ -65,6 +66,7 @@ def is_quant_node(node: torch.fx.Node): def get_quant_node_dtype(node: torch.fx.Node): + # pyre-ignore[16]: Undefined attribute. 
if "tosa" in node.target.__name__: return node.meta["val"].dtype @@ -231,7 +233,7 @@ def build_rescale_from_int32( rescale_scale, is_scale32=True, is_double_round=False, -) -> TosaSerializerTensor: +) -> None: multiplier, shift = compute_multiplier_and_shift(rescale_scale) attr_rescale_output = ts.TosaSerializerAttribute() attr_rescale_output.RescaleAttribute( @@ -254,7 +256,7 @@ def build_rescale_from_int32( def rescale_nodes_to_int32( - nodes: list[Node], tosa_graph: ts.TosaSerializer + nodes: Sequence[Node], tosa_graph: ts.TosaSerializer ) -> tuple[list[TosaSerializerTensor], float]: """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. The scales are adjusted using the smallest scale of all 'nodes'. diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index f84e371279b..5353dd49fae 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -5,7 +5,7 @@ import logging import os -from typing import Any, Dict +from typing import Any, cast, Dict import numpy as np import serializer.tosa_serializer as ts @@ -235,7 +235,7 @@ def build_avg_pool_2d_common( output_zp = 0 if is_quant_node: - input_zp = get_quant_node_args(node.args[0]).zp + input_zp = get_quant_node_args(cast(torch.fx.Node, node.args[0])).zp output_zp = get_quant_node_args(list(node.users)[0]).zp attr = ts.TosaSerializerAttribute() @@ -306,7 +306,9 @@ def process_call_function( ) # Visiting each Node + # pyre-ignore[16]: Undefined attribute. if node.target.__name__ in node_visitors: + # pyre-ignore[16]: Undefined attribute. node_visitors[node.target.__name__].define_node( node, tosa_graph, @@ -319,7 +321,10 @@ def process_call_function( def expand_dims( - tosa_graph: ts.TosaSerializer, input_node: TosaArg, dtype: ts.DType, dim: int + tosa_graph: ts.TosaSerializer, + input_node: TosaArg, + dtype: int, + dim: int, ) -> Any: """Inserts TOSA operators into the tosa_graph, that perform the equivalent of the expand_dims (a.k.a unsqueeze) operation. 
A new axis is created at the From 5156615fabee31a3e9c932170d53ae013a8d3b6d Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 4 Sep 2024 18:09:57 -0700 Subject: [PATCH 192/531] Resync Android related BUCK file Pull Request resolved: https://github.com/pytorch/executorch/pull/5099 --- .../android/ExecuTorchDemo/app/src/main/BUCK | 67 ++++++++++++++ .../android/LlamaDemo/app/src/main/BUCK | 64 ++++++++++++++ extension/android/BUCK | 34 ++++++++ extension/android/jni/BUCK | 87 +++++++++++++++++++ 4 files changed, 252 insertions(+) create mode 100644 examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/BUCK create mode 100644 extension/android/BUCK create mode 100644 extension/android/jni/BUCK diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK new file mode 100644 index 00000000000..2b33cef732a --- /dev/null +++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK @@ -0,0 +1,67 @@ +load("@fbsource//tools/build_defs:manifold.bzl", "manifold_get") +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +manifold_get( + name = "dl3_xnnpack_fp32", + out = "dl3_xnnpack_fp32.pte", + api_key = "executorch-key", + artifact_path = "tree/models/benchmarking/executorch/dl3_xnnpack_fp32.pte", + bucket_name = "executorch", + sha1 = "3e7af1d8f5ec4acb6de156d361715e16e5f53783", + timeout_msec = 120000, +) + +fb_android_resource( + name = "app_res", + assets = "assets", + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_resource( + name = "model_res", + assets = {"dl3_xnnpack_fp32.pte": ":dl3_xnnpack_fp32"}, + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchdemo/ClassificationActivity.java", + "java/com/example/executorchdemo/ImageNetClasses.java", + "java/com/example/executorchdemo/MainActivity.java", + "java/com/example/executorchdemo/TensorImageUtils.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//xplat/executorch/extension/android:executorch", + ], +) + +fb_android_binary( + name = "ExecuTorchDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 19, # Android supports 19 for minimum + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + ":model_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//xplat/executorch/extension/android:executorch", + "//xplat/executorch/extension/android/jni:executorch_jni_full", + ], +) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK new file mode 100644 index 00000000000..1fd656317ea --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK @@ -0,0 +1,64 @@ +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", 
"fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +oncall("executorch") + +fb_android_resource( + name = "app_res", + package = "com.example.executorchllamademo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchllamademo/AppLog.java", + "java/com/example/executorchllamademo/DemoSharedPreferences.java", + "java/com/example/executorchllamademo/ETImage.java", + "java/com/example/executorchllamademo/ETLogging.java", + "java/com/example/executorchllamademo/LlmBenchmarkRunner.java", + "java/com/example/executorchllamademo/LogsActivity.java", + "java/com/example/executorchllamademo/LogsAdapter.java", + "java/com/example/executorchllamademo/MainActivity.java", + "java/com/example/executorchllamademo/Message.java", + "java/com/example/executorchllamademo/MessageAdapter.java", + "java/com/example/executorchllamademo/MessageType.java", + "java/com/example/executorchllamademo/ModelRunner.java", + "java/com/example/executorchllamademo/ModelRunnerCallback.java", + "java/com/example/executorchllamademo/ModelType.java", + "java/com/example/executorchllamademo/PromptFormat.java", + "java/com/example/executorchllamademo/SettingsActivity.java", + "java/com/example/executorchllamademo/SettingsFields.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + ], +) + +fb_android_binary( + name = "ExecuTorchLlamaDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 21, + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + "//xplat/executorch/extension/android/jni:executorch_llama_jni", + ], +) diff --git a/extension/android/BUCK b/extension/android/BUCK new file mode 100644 index 00000000000..dfc5db18137 --- /dev/null +++ b/extension/android/BUCK @@ -0,0 +1,34 @@ +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") + +oncall("executorch") + +fb_android_library( + name = "executorch", + srcs = [ + "src/main/java/org/pytorch/executorch/DType.java", + "src/main/java/org/pytorch/executorch/EValue.java", + "src/main/java/org/pytorch/executorch/Module.java", + "src/main/java/org/pytorch/executorch/NativePeer.java", + "src/main/java/org/pytorch/executorch/Tensor.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + "//fbandroid/java/com/facebook/jni:jni", + "//fbandroid/libraries/soloader/java/com/facebook/soloader/nativeloader:nativeloader", + ], +) + +fb_android_library( + name = "executorch_llama", + srcs = [ + "src/main/java/org/pytorch/executorch/LlamaCallback.java", + "src/main/java/org/pytorch/executorch/LlamaModule.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + "//fbandroid/java/com/facebook/jni:jni", + "//fbandroid/libraries/soloader/java/com/facebook/soloader/nativeloader:nativeloader", + ], +) diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK new file mode 100644 index 00000000000..c32f4ab0a95 --- /dev/null +++ 
b/extension/android/jni/BUCK @@ -0,0 +1,87 @@ +load("@fbsource//tools/build_defs/android:fb_android_cxx_library.bzl", "fb_android_cxx_library") +load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib") + +oncall("executorch") + +executorch_generated_lib( + name = "generated_op_lib_optimized", + custom_ops_aten_kernel_deps = [ + "//executorch/kernels/portable:operators_aten", + ], + custom_ops_yaml_target = "//executorch/kernels/portable:custom_ops.yaml", + define_static_targets = True, + fallback_yaml_target = "//executorch/kernels/portable:functions.yaml", + functions_yaml_target = "//executorch/kernels/optimized:optimized.yaml", + visibility = ["PUBLIC"], + deps = [ + "//executorch/kernels/optimized:optimized_operators", + "//executorch/kernels/optimized:optimized_oplist", + "//executorch/kernels/portable:executorch_aten_ops", + "//executorch/kernels/portable:executorch_custom_ops", + "//executorch/kernels/portable:operators", + ], +) + +fb_android_cxx_library( + name = "executorch_jni", + srcs = ["jni_layer.cpp"], + headers = ["jni_layer_constants.h"], + allow_jni_merging = False, + compiler_flags = [ + "-frtti", + "-fexceptions", + ], + soname = "libexecutorch.$(ext)", + visibility = ["PUBLIC"], + deps = [ + "//fbandroid/libraries/fbjni:fbjni", + "//fbandroid/native/fb:fb", + "//third-party/glog:glog", + "//xplat/executorch/extension/module:module_static", + "//xplat/executorch/extension/runner_util:managed_tensor_static", + ], +) + +fb_android_cxx_library( + name = "executorch_jni_full", + srcs = ["jni_layer.cpp"], + headers = ["jni_layer_constants.h"], + allow_jni_merging = False, + compiler_flags = [ + "-frtti", + "-fexceptions", + ], + soname = "libexecutorch.$(ext)", + visibility = ["PUBLIC"], + deps = [ + ":generated_op_lib_optimized_static", + "//fbandroid/libraries/fbjni:fbjni", + "//fbandroid/native/fb:fb", + "//third-party/glog:glog", + "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", + "//xplat/executorch/extension/module:module_static", + "//xplat/executorch/extension/runner_util:managed_tensor_static", + ], +) + +fb_android_cxx_library( + name = "executorch_llama_jni", + srcs = ["jni_layer_llama.cpp"], + allow_jni_merging = False, + compiler_flags = [ + "-frtti", + "-fexceptions", + "-Wno-format", + ], + soname = "libexecutorch_llama_jni.$(ext)", + visibility = ["PUBLIC"], + deps = [ + "//fbandroid/libraries/fbjni:fbjni", + "//fbandroid/native/fb:fb", + "//third-party/glog:glog", + "//xplat/executorch/examples/models/llama2/runner:runner_static", + "//xplat/executorch/examples/models/llava/runner:runner_static", + "//xplat/executorch/extension/threadpool:cpuinfo_utils_static", + "//xplat/executorch/extension/threadpool:threadpool_static", + ], +) From caf48b706fa57ccfbf03ec3f584b9d66a934c4be Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Wed, 4 Sep 2024 18:35:40 -0700 Subject: [PATCH 193/531] Migrate //executorch/... 
from PyTorchBackendInterface to BackendInterface Differential Revision: D61927046 Pull Request resolved: https://github.com/pytorch/executorch/pull/5054 --- .../apple/coreml/runtime/include/coreml_backend/delegate.h | 2 +- backends/apple/mps/runtime/MPSBackend.mm | 2 +- backends/arm/README.md | 2 +- backends/arm/runtime/ArmBackendEthosU.cpp | 2 +- backends/mediatek/runtime/include/NeuronBackend.h | 2 +- backends/qualcomm/runtime/QnnExecuTorchBackend.h | 3 ++- backends/vulkan/runtime/VulkanBackend.cpp | 2 +- backends/xnnpack/runtime/XNNPACKBackend.cpp | 2 +- exir/backend/test/demos/rpc/ExecutorBackend.cpp | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) diff --git a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h index a11d41bf7f4..1943e0f05b0 100644 --- a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h +++ b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h @@ -20,7 +20,7 @@ class BackendDelegate; namespace torch { namespace executor { -class CoreMLBackendDelegate final : public PyTorchBackendInterface { +class CoreMLBackendDelegate final : public ::executorch::runtime::BackendInterface { public: CoreMLBackendDelegate() noexcept; ~CoreMLBackendDelegate() = default; diff --git a/backends/apple/mps/runtime/MPSBackend.mm b/backends/apple/mps/runtime/MPSBackend.mm index b94bdc9319b..cb96edbeb2e 100644 --- a/backends/apple/mps/runtime/MPSBackend.mm +++ b/backends/apple/mps/runtime/MPSBackend.mm @@ -19,7 +19,7 @@ namespace torch { namespace executor { -class MPSBackend final : public PyTorchBackendInterface { +class MPSBackend final : public ::executorch::runtime::BackendInterface { public: ~MPSBackend() = default; diff --git a/backends/arm/README.md b/backends/arm/README.md index 7167aa853b6..375259c62ab 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -33,7 +33,7 @@ Quantization: - `arm_quantizer_utils.py` - Utilities for quantization Runtime: -- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (PyTorchBackendInterface) for Ethos-U +- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U Other: - `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 9f9ea8ec9fa..26ffb0b9700 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -46,7 +46,7 @@ class ArmBackendExecuteCallbacks { } }; -class ArmBackend final : public PyTorchBackendInterface { +class ArmBackend final : public ::executorch::runtime::BackendInterface { public: ArmBackend() {} diff --git a/backends/mediatek/runtime/include/NeuronBackend.h b/backends/mediatek/runtime/include/NeuronBackend.h index 2cfcb311b93..7a22956de63 100644 --- a/backends/mediatek/runtime/include/NeuronBackend.h +++ b/backends/mediatek/runtime/include/NeuronBackend.h @@ -26,7 +26,7 @@ namespace torch { namespace executor { -class NeuronBackend final : public PyTorchBackendInterface { +class NeuronBackend final : public ::executorch::runtime::BackendInterface { public: Result init( BackendInitContext& context, diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index 
ed4d35068dc..fbcc7058894 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -14,7 +14,8 @@ namespace torch { namespace executor { -class QnnExecuTorchBackend final : public PyTorchBackendInterface { +class QnnExecuTorchBackend final + : public ::executorch::runtime::BackendInterface { public: ~QnnExecuTorchBackend(){}; diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index fd06841beca..7ed9469f77f 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -412,7 +412,7 @@ void maybe_resize_output( // VulkanBackend class // -class VulkanBackend final : public PyTorchBackendInterface { +class VulkanBackend final : public ::executorch::runtime::BackendInterface { public: ~VulkanBackend() override = default; diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 264dc838720..c817c010e29 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -20,7 +20,7 @@ namespace torch { namespace executor { -class XnnpackBackend final : public PyTorchBackendInterface { +class XnnpackBackend final : public ::executorch::runtime::BackendInterface { public: ~XnnpackBackend() = default; diff --git a/exir/backend/test/demos/rpc/ExecutorBackend.cpp b/exir/backend/test/demos/rpc/ExecutorBackend.cpp index 0bc85a685e9..aeef621a271 100644 --- a/exir/backend/test/demos/rpc/ExecutorBackend.cpp +++ b/exir/backend/test/demos/rpc/ExecutorBackend.cpp @@ -35,7 +35,7 @@ namespace executor { * front-end before having the actual backend ready. */ -class ExecutorBackend final : public PyTorchBackendInterface { +class ExecutorBackend final : public ::executorch::runtime::BackendInterface { public: ~ExecutorBackend() = default; From e4a23229287fc25a8c191c028c15a20aa80e1780 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Wed, 4 Sep 2024 18:52:14 -0700 Subject: [PATCH 194/531] Make QNN chipset with the device pool (#5098) Co-authored-by: Guang Yang --- .ci/scripts/test.sh | 5 ++++- .github/workflows/android-perf.yml | 8 ++++---- extension/llm/export/partitioner_lib.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh index 1dbf4a8ce9e..04398c5a483 100755 --- a/.ci/scripts/test.sh +++ b/.ci/scripts/test.sh @@ -175,7 +175,10 @@ test_model_with_qnn() { EXPORTED_MODEL_NAME=vit_qnn.pte fi - "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only + # Use SM8450 for S22, SM8550 for S23, and SM8560 for S24 + QNN_CHIPSET=SM8450 + + "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME} } diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index a23f9487157..473dad08c14 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -15,7 +15,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s2x + default: samsung_galaxy_s22 delegates: description: Backend delegates required: false @@ -45,7 +45,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s2x + default: samsung_galaxy_s22 delegates: description: Backend 
delegates required: false @@ -85,7 +85,7 @@ jobs: # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit" - CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x" + CRON_DEFAULT_DEVICES: "samsung_galaxy_s22" CRON_DEFAULT_DELEGATES: "xnnpack,qnn" run: | set -ex @@ -104,7 +104,7 @@ jobs: # Mapping devices to their corresponding device-pool-arn declare -A DEVICE_POOL_ARNS - DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" + DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" # Resolve device names with their corresponding ARNs if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index e75d5bef3fb..0d9f7c6cfd9 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -140,7 +140,7 @@ def get_qnn_partitioner( return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] + soc_model=QcomChipset.SM8450, # default to SM8450 # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, From e119d51aff7dff19ac536200d7f262742aa4eec4 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 4 Sep 2024 18:53:27 -0700 Subject: [PATCH 195/531] Add more docs and give API warning in ET Java Differential Revision: D62216121 Pull Request resolved: https://github.com/pytorch/executorch/pull/5097 --- .../android/src/main/java/org/pytorch/executorch/DType.java | 6 +++++- .../src/main/java/org/pytorch/executorch/EValue.java | 2 ++ .../src/main/java/org/pytorch/executorch/LlamaCallback.java | 6 ++++++ .../src/main/java/org/pytorch/executorch/LlamaModule.java | 6 ++++++ .../src/main/java/org/pytorch/executorch/Module.java | 6 +++++- .../src/main/java/org/pytorch/executorch/NativePeer.java | 6 +++++- .../src/main/java/org/pytorch/executorch/Tensor.java | 2 ++ 7 files changed, 31 insertions(+), 3 deletions(-) diff --git a/extension/android/src/main/java/org/pytorch/executorch/DType.java b/extension/android/src/main/java/org/pytorch/executorch/DType.java index 8b3fb42a6ad..97da05a0af1 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/DType.java +++ b/extension/android/src/main/java/org/pytorch/executorch/DType.java @@ -8,7 +8,11 @@ package org.pytorch.executorch; -/** Codes representing tensor data types. */ +/** + * Codes representing tensor data types. + * + *

Warning: These APIs are experimental and subject to change without notice + */ public enum DType { // NOTE: "jniCode" must be kept in sync with scalar_type.h. // NOTE: Never serialize "jniCode", because it can change between releases. diff --git a/extension/android/src/main/java/org/pytorch/executorch/EValue.java b/extension/android/src/main/java/org/pytorch/executorch/EValue.java index d9fa2e8b833..971545d62fb 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/EValue.java +++ b/extension/android/src/main/java/org/pytorch/executorch/EValue.java @@ -27,6 +27,8 @@ * *

{@code EValue} objects may retain references to objects passed into their constructors, and * may return references to their internal state from {@code toX()}. + * + *

Warning: These APIs are experimental and subject to change without notice */ @DoNotStrip public class EValue { diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java index 33ab928bae0..2d327925d17 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java @@ -10,6 +10,12 @@ import com.facebook.jni.annotations.DoNotStrip; +/** + * Callback interface for Llama model. Users can implement this interface to receive the generated + * tokens and statistics. + * + *

Warning: These APIs are experimental and subject to change without notice + */ public interface LlamaCallback { /** * Called when a new result is available from JNI. Users will keep getting onResult() invocations diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index 3e4b091cfc8..bdc8506aa9c 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -13,6 +13,12 @@ import com.facebook.soloader.nativeloader.NativeLoader; import com.facebook.soloader.nativeloader.SystemDelegate; +/** + * LlamaModule is a wrapper around the Executorch Llama model. It provides a simple interface to + * generate text from the model. + * + *

Warning: These APIs are experimental and subject to change without notice + */ public class LlamaModule { public static final int MODEL_TYPE_TEXT = 1; diff --git a/extension/android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/src/main/java/org/pytorch/executorch/Module.java index 981cfcd8c62..dc4bf710d9b 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Module.java +++ b/extension/android/src/main/java/org/pytorch/executorch/Module.java @@ -12,7 +12,11 @@ import com.facebook.soloader.nativeloader.SystemDelegate; import java.util.Map; -/** Java wrapper for ExecuTorch Module. */ +/** + * Java wrapper for ExecuTorch Module. + * + *
<p>
Warning: These APIs are experimental and subject to change without notice + */ public class Module { /** Load mode for the module. Load the whole file as a buffer. */ diff --git a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java index 6eadbf05097..0e6c0a231cb 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java +++ b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java @@ -13,7 +13,11 @@ import com.facebook.soloader.nativeloader.NativeLoader; import java.util.Map; -/** Interface for the native peer object for entry points to the Module */ +/** + * Interface for the native peer object for entry points to the Module + * + *
<p>
Warning: These APIs are experimental and subject to change without notice + */ class NativePeer { static { // Loads libexecutorch.so from jniLibs diff --git a/extension/android/src/main/java/org/pytorch/executorch/Tensor.java b/extension/android/src/main/java/org/pytorch/executorch/Tensor.java index 0c478b89b38..8a1639703d3 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Tensor.java +++ b/extension/android/src/main/java/org/pytorch/executorch/Tensor.java @@ -36,6 +36,8 @@ * between {@link Module} calls to avoid reallocation. Data retrieved from {@code Tensor} objects * may be copied or may be a reference to the {@code Tensor}'s internal data buffer. {@code shape} * is always copied. + * + *
<p>
Warning: These APIs are experimental and subject to change without notice */ public abstract class Tensor { private static final String ERROR_MSG_DATA_BUFFER_NOT_NULL = "Data buffer must be not null"; From d23548be72ec727c0792a07a96e3d30b0fdf012e Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Wed, 4 Sep 2024 21:53:33 -0400 Subject: [PATCH 196/531] [ET-VK][BE][ez] Enable automatic layout slot index incrementing Differential Revision: D62210119 Pull Request resolved: https://github.com/pytorch/executorch/pull/5091 --- backends/vulkan/runtime/gen_vulkan_spv.py | 45 ++++++++++++++++++----- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index f4ba98b31fd..6ee29d45f18 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -38,6 +38,10 @@ # Basic configuration settings for shaders DEFAULT_ENV: Dict[str, Any] = { "PRECISION": "highp", + # B is shorthand for "binding". This is used to automatically increment the + # layout binding index when declaring layout bindings. Note that a container + # type is used because integers are immutable in Python. + "B": [0], } # Establishes relationships between different tensor types and different GLSL types @@ -179,8 +183,14 @@ def get_access_qualifier(access_type: Optional[str]) -> str: raise AssertionError(f"Invalid access type: {access_type}") +def get_slot_val(slot: Union[int, List[int]]) -> int: + if isinstance(slot, list): + return slot[0] + return slot + + def layout_declare_buffer( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -192,15 +202,18 @@ def layout_declare_buffer( array_type = buffer_scalar_type(dtype) out_str = f""" -layout(set = 0, binding = {slot}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ +layout(set = 0, binding = {get_slot_val(slot)}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ {array_type} {var_name}[]; }}; """ + + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str def layout_declare_image( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -209,11 +222,16 @@ def layout_declare_image( ) -> str: image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] image_type = TYPE_MAPPINGS["IMAGE_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_sampler( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -222,11 +240,16 @@ def layout_declare_sampler( image_ndim: int = 3, ) -> str: sampler_type = TYPE_MAPPINGS["SAMPLER_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}) uniform {precision} {sampler_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} {sampler_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_tensor( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -262,7 +285,9 @@ def layout_declare_tensor( ) -def 
layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: +def layout_declare_ubo( + slot: Union[int, List[int]], *args, precision: str = "PRECISION" +) -> str: assert len(args) % 2 == 0 var_list = list(zip(args[::2], args[1::2])) @@ -272,12 +297,14 @@ def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: ubo_name += var_name + "_" out_str = f""" -layout(set = 0, binding = {slot}) uniform {precision} restrict readonly {ubo_name}UBO {{ +layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{ """ for type_name, var_name in var_list: out_str += f"{type_name} {var_name};\n" out_str += "};" + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str From 9ae7c0da21b160c8082d72dc4e52724f249cd3b4 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Wed, 4 Sep 2024 19:14:24 -0700 Subject: [PATCH 197/531] h to l start ops | add dim order sanity check Differential Revision: D59877605 Pull Request resolved: https://github.com/pytorch/executorch/pull/4333 --- kernels/portable/cpu/op_hardtanh.cpp | 3 +++ kernels/portable/cpu/op_index.cpp | 5 +++++ kernels/portable/cpu/op_index_put.cpp | 5 +++++ kernels/portable/cpu/op_index_select.cpp | 5 +++++ kernels/portable/cpu/op_le.cpp | 6 ++++++ kernels/portable/cpu/op_leaky_relu.cpp | 3 +++ kernels/portable/cpu/op_lift_fresh_copy.cpp | 3 +++ kernels/portable/cpu/op_log_softmax.cpp | 3 +++ kernels/portable/cpu/op_logical_not.cpp | 3 +++ kernels/portable/cpu/op_logit.cpp | 3 +++ kernels/portable/cpu/op_lt.cpp | 6 ++++++ 11 files changed, 45 insertions(+) diff --git a/kernels/portable/cpu/op_hardtanh.cpp b/kernels/portable/cpu/op_hardtanh.cpp index d61b932e06f..735fe9dc29b 100644 --- a/kernels/portable/cpu/op_hardtanh.cpp +++ b/kernels/portable/cpu/op_hardtanh.cpp @@ -36,6 +36,9 @@ Tensor& hardtanh_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); ScalarType min_type = utils::get_scalar_dtype(min); ScalarType max_type = utils::get_scalar_dtype(max); diff --git a/kernels/portable/cpu/op_index.cpp b/kernels/portable/cpu/op_index.cpp index d70ceaa859b..3feb75919b0 100644 --- a/kernels/portable/cpu/op_index.cpp +++ b/kernels/portable/cpu/op_index.cpp @@ -32,6 +32,11 @@ Tensor& index_Tensor_out( ET_KERNEL_CHECK( ctx, check_index_args(in, indices, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); size_t block_count = count_index_blocks(indices); diff --git a/kernels/portable/cpu/op_index_put.cpp b/kernels/portable/cpu/op_index_put.cpp index 88cffe1bce7..71f941e6187 100644 --- a/kernels/portable/cpu/op_index_put.cpp +++ b/kernels/portable/cpu/op_index_put.cpp @@ -33,6 +33,11 @@ Tensor& index_put_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dtype(in, values), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); size_t block_count = count_index_blocks(indices); diff --git a/kernels/portable/cpu/op_index_select.cpp b/kernels/portable/cpu/op_index_select.cpp index 59cbb40b83f..2b1cfa9141d 100644 --- a/kernels/portable/cpu/op_index_select.cpp +++ b/kernels/portable/cpu/op_index_select.cpp @@ -28,6 +28,11 
@@ Tensor& index_select_out( ET_KERNEL_CHECK( ctx, check_index_select_args(in, dim, index, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + if (dim < 0) { dim += nonzero_dim(in); } diff --git a/kernels/portable/cpu/op_le.cpp b/kernels/portable/cpu/op_le.cpp index aa2c85d17d4..29ee1ee1261 100644 --- a/kernels/portable/cpu/op_le.cpp +++ b/kernels/portable/cpu/op_le.cpp @@ -35,6 +35,9 @@ Tensor& le_tensor_out( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "le.Tensor_out", CTYPE_A, [&]() { ET_SWITCH_REAL_TYPES_AND( Bool, b_type, ctx, "le.Tensor_out", CTYPE_B, [&]() { @@ -77,6 +80,9 @@ Tensor& le_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); diff --git a/kernels/portable/cpu/op_leaky_relu.cpp b/kernels/portable/cpu/op_leaky_relu.cpp index 1cb0a6a9401..07b7328b3b4 100644 --- a/kernels/portable/cpu/op_leaky_relu.cpp +++ b/kernels/portable/cpu/op_leaky_relu.cpp @@ -35,6 +35,9 @@ Tensor& leaky_relu_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); ScalarType sc_type = utils::get_scalar_dtype(negative_slope); ScalarType out_type = out.scalar_type(); diff --git a/kernels/portable/cpu/op_lift_fresh_copy.cpp b/kernels/portable/cpu/op_lift_fresh_copy.cpp index 2341d437d6c..c646eef03fb 100644 --- a/kernels/portable/cpu/op_lift_fresh_copy.cpp +++ b/kernels/portable/cpu/op_lift_fresh_copy.cpp @@ -25,6 +25,9 @@ lift_fresh_copy_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + if (in.nbytes() > 0) { // Note that this check is important. It's valid for a tensor with numel 0 // to have a null data pointer, but in some environments it's invalid to diff --git a/kernels/portable/cpu/op_log_softmax.cpp b/kernels/portable/cpu/op_log_softmax.cpp index 34f43c48065..97e14a0cad8 100644 --- a/kernels/portable/cpu/op_log_softmax.cpp +++ b/kernels/portable/cpu/op_log_softmax.cpp @@ -36,6 +36,9 @@ Tensor& log_softmax_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + // Adjust for negative dim dim = dim < 0 ? 
dim + nonzero_dim(in) : dim; diff --git a/kernels/portable/cpu/op_logical_not.cpp b/kernels/portable/cpu/op_logical_not.cpp index c607fce8e21..e3d967e1a3f 100644 --- a/kernels/portable/cpu/op_logical_not.cpp +++ b/kernels/portable/cpu/op_logical_not.cpp @@ -27,6 +27,9 @@ Tensor& logical_not_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensors_have_same_shape(in, out), InvalidArgument, out); ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_logit.cpp b/kernels/portable/cpu/op_logit.cpp index 7a54d91d0e3..a343dc91561 100644 --- a/kernels/portable/cpu/op_logit.cpp +++ b/kernels/portable/cpu/op_logit.cpp @@ -28,6 +28,9 @@ Tensor& logit_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out); ScalarType in_type = in.scalar_type(); diff --git a/kernels/portable/cpu/op_lt.cpp b/kernels/portable/cpu/op_lt.cpp index 7fd9dd9bf6b..aecdf0af756 100644 --- a/kernels/portable/cpu/op_lt.cpp +++ b/kernels/portable/cpu/op_lt.cpp @@ -31,6 +31,9 @@ Tensor& lt_tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); @@ -77,6 +80,9 @@ Tensor& lt_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); From b8a2cbd5a322aab7f58f34484db935d377692800 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 4 Sep 2024 19:26:51 -0700 Subject: [PATCH 198/531] Add LLaVa runner. 
Differential Revision: D62142005 Pull Request resolved: https://github.com/pytorch/executorch/pull/5053 --- .../LLaMA/LLaMA.xcodeproj/project.pbxproj | 14 ++- .../LLaMA/LLaMA/Application/ContentView.swift | 1 + .../LLaMARunner/Exported/LLaMARunner.h | 27 ++++- .../LLaMARunner/Exported/LLaMARunner.mm | 112 ++++++++++++++++++ 4 files changed, 152 insertions(+), 2 deletions(-) diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index 15228bbe0db..e8cb47091a5 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -43,6 +43,9 @@ 03729F132BB2042B00152F2E /* sampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F112BB2042B00152F2E /* sampler.cpp */; }; 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */; }; 03729F172BB2043600152F2E /* tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F152BB2043600152F2E /* tokenizer.h */; }; + 0372C3112C893FE900CD942A /* CoreGraphics.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0372C3102C893FE900CD942A /* CoreGraphics.framework */; }; + 0372C3142C89418E00CD942A /* llava_runner.h in Headers */ = {isa = PBXBuildFile; fileRef = 0372C3122C89418E00CD942A /* llava_runner.h */; }; + 0372C3152C89418E00CD942A /* llava_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0372C3132C89418E00CD942A /* llava_runner.cpp */; }; 038D678C2C482C1E00B88CF2 /* llama_tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 038D678A2C482C1D00B88CF2 /* llama_tiktoken.cpp */; }; 038D678D2C482C1E00B88CF2 /* llama_tiktoken.h in Headers */ = {isa = PBXBuildFile; fileRef = 038D678B2C482C1E00B88CF2 /* llama_tiktoken.h */; }; 03BADE202BD2E88600DDFDC2 /* bpe_tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */; }; @@ -141,11 +144,14 @@ 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LLaMARunner.framework; sourceTree = BUILT_PRODUCTS_DIR; }; 03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../examples/models/llama2/runner/runner.cpp; sourceTree = ""; }; 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../examples/models/llama2/runner/runner.h; sourceTree = ""; }; - 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = util.h; path = ../../../../extension/llm/runner/util.h; sourceTree = ""; }; + 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = ""; }; 03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer.cpp; path = 
../../../../extension/llm/tokenizer/bpe_tokenizer.cpp; sourceTree = ""; }; 03729F152BB2043600152F2E /* tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = tokenizer.h; path = ../../../../extension/llm/tokenizer/tokenizer.h; sourceTree = ""; }; + 0372C3102C893FE900CD942A /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; }; + 0372C3122C89418E00CD942A /* llava_runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llava_runner.h; path = ../../../examples/models/llava/runner/llava_runner.h; sourceTree = ""; }; + 0372C3132C89418E00CD942A /* llava_runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llava_runner.cpp; path = ../../../examples/models/llava/runner/llava_runner.cpp; sourceTree = ""; }; 038D678A2C482C1D00B88CF2 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = llama_tiktoken.cpp; sourceTree = ""; }; 038D678B2C482C1E00B88CF2 /* llama_tiktoken.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = llama_tiktoken.h; sourceTree = ""; }; 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = bpe_tokenizer.h; path = ../../../../extension/llm/tokenizer/bpe_tokenizer.h; sourceTree = ""; }; @@ -190,6 +196,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + 0372C3112C893FE900CD942A /* CoreGraphics.framework in Frameworks */, 03312C3E2BBFD076002106EF /* executorch_debug in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; @@ -323,6 +330,8 @@ 03729F062BB2035900152F2E /* runner */ = { isa = PBXGroup; children = ( + 0372C3132C89418E00CD942A /* llava_runner.cpp */, + 0372C3122C89418E00CD942A /* llava_runner.h */, 03729F072BB203B300152F2E /* runner.cpp */, 03729F082BB203B300152F2E /* runner.h */, 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */, @@ -373,6 +382,7 @@ 84DD947F2C81060E00C765A6 /* Frameworks */ = { isa = PBXGroup; children = ( + 0372C3102C893FE900CD942A /* CoreGraphics.framework */, ); name = Frameworks; sourceTree = ""; @@ -403,6 +413,7 @@ 038D678D2C482C1E00B88CF2 /* llama_tiktoken.h in Headers */, 03729F0C2BB203B300152F2E /* util.h in Headers */, 03729F0B2BB203B300152F2E /* runner.h in Headers */, + 0372C3142C89418E00CD942A /* llava_runner.h in Headers */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -646,6 +657,7 @@ 03729EE12BB1F93800152F2E /* LLaMARunner.mm in Sources */, 03BADE232BD2EB6700DDFDC2 /* tiktoken.cpp in Sources */, 038D678C2C482C1E00B88CF2 /* llama_tiktoken.cpp in Sources */, + 0372C3152C89418E00CD942A /* llava_runner.cpp in Sources */, 03D03DAB2C7823830088D6A7 /* text_decoder_runner.cpp in Sources */, 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */, 03729F0A2BB203B300152F2E /* runner.cpp in Sources */, diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift index 02f28b88b81..d64314e5349 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift @@ -13,6 +13,7 @@ import LLaMARunner class RunnerHolder: ObservableObject { var runner: 
Runner? + var llavaRunner: LLaVARunner? } struct ContentView: View { diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h index b34fa0fe605..158824b67ee 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. */ -#import +#import NS_ASSUME_NONNULL_BEGIN FOUNDATION_EXPORT NSErrorDomain const LLaMARunnerErrorDomain; +FOUNDATION_EXPORT NSErrorDomain const LLaVARunnerErrorDomain; NS_SWIFT_NAME(Runner) @interface LLaMARunner : NSObject @@ -23,6 +24,30 @@ NS_SWIFT_NAME(Runner) sequenceLength:(NSInteger)seq_len withTokenCallback:(nullable void (^)(NSString*))callback error:(NSError**)error; +- (BOOL)generate:(NSArray*)images + prompt:(NSString*)prompt + sequenceLength:(NSInteger)seq_len + withTokenCallback:(nullable void (^)(NSString*))callback + error:(NSError**)error; +- (void)stop; + ++ (instancetype)new NS_UNAVAILABLE; +- (instancetype)init NS_UNAVAILABLE; + +@end + +NS_SWIFT_NAME(LLaVARunner) +@interface LLaVARunner : NSObject + +- (instancetype)initWithModelPath:(NSString*)filePath + tokenizerPath:(NSString*)tokenizerPath; +- (BOOL)isloaded; +- (BOOL)loadWithError:(NSError**)error; +- (BOOL)generate:(NSArray*)images + prompt:(NSString*)prompt + sequenceLength:(NSInteger)seq_len + withTokenCallback:(nullable void (^)(NSString*))callback + error:(NSError**)error; - (void)stop; + (instancetype)new NS_UNAVAILABLE; diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm index 07b6dce0aa8..9b169c33890 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm @@ -10,10 +10,12 @@ #import #import +#import using namespace ::torch::executor; NSErrorDomain const LLaMARunnerErrorDomain = @"LLaMARunnerErrorDomain"; +NSErrorDomain const LLaVARunnerErrorDomain = @"LLaVARunnerErrorDomain"; @interface LLaMARunner () @end @@ -102,3 +104,113 @@ - (void)logWithLevel:(ExecuTorchLogLevel)level } @end + +@interface LLaVARunner () +@end + +@implementation LLaVARunner { + std::unique_ptr _runner; +} + +- (instancetype)initWithModelPath:(NSString*)modelPath + tokenizerPath:(NSString*)tokenizerPath { + self = [super init]; + if (self) { + [ExecuTorchLog.sharedLog addSink:self]; + _runner = std::make_unique( + modelPath.UTF8String, tokenizerPath.UTF8String); + } + return self; +} + +- (void)dealloc { + [ExecuTorchLog.sharedLog removeSink:self]; +} + +- (BOOL)isloaded { + return _runner->is_loaded(); +} + +- (BOOL)loadWithError:(NSError**)error { + const auto status = _runner->load(); + if (status != Error::Ok) { + if (error) { + *error = [NSError errorWithDomain:LLaVARunnerErrorDomain + code:(NSInteger)status + userInfo:nil]; + } + return NO; + } + return YES; +} + +- (BOOL)generate:(NSArray*)images + prompt:(NSString*)prompt + sequenceLength:(NSInteger)seq_len + withTokenCallback:(nullable void (^)(NSString*))callback + error:(NSError**)error { + std::vector rawImages; + rawImages.reserve(images.count); + + for (UIImage* image in images) { + CGImageRef cgImage = image.CGImage; + const int32_t width = CGImageGetWidth(cgImage); + const int32_t height 
= CGImageGetHeight(cgImage); + std::vector buffer(height * width * 4); + CGContextRef context = CGBitmapContextCreate( + buffer.data(), + width, + height, + 8, + width * 4, + CGColorSpaceCreateDeviceRGB(), + kCGImageAlphaPremultipliedLast); + CGContextDrawImage(context, CGRectMake(0, 0, width, height), cgImage); + CGContextRelease(context); + rawImages.push_back({std::move(buffer), width, height, 4}); + } + const auto status = _runner->generate( + std::move(rawImages), + prompt.UTF8String, + seq_len, + [callback](const std::string& token) { callback(@(token.c_str())); }); + if (status != Error::Ok) { + if (error) { + *error = [NSError errorWithDomain:LLaVARunnerErrorDomain + code:(NSInteger)status + userInfo:nil]; + return NO; + } + } + return YES; +} + +- (void)stop { + _runner->stop(); +} + +#pragma mark - ExecuTorchLogSink + +- (void)logWithLevel:(ExecuTorchLogLevel)level + timestamp:(NSTimeInterval)timestamp + filename:(NSString*)filename + line:(NSUInteger)line + message:(NSString*)message { + NSUInteger totalSeconds = (NSUInteger)timestamp; + NSUInteger hours = (totalSeconds / 3600) % 24; + NSUInteger minutes = (totalSeconds / 60) % 60; + NSUInteger seconds = totalSeconds % 60; + NSUInteger microseconds = (timestamp - totalSeconds) * 1000000; + NSLog( + @"%c %02lu:%02lu:%02lu.%06lu executorch:%s:%zu] %s", + (char)level, + hours, + minutes, + seconds, + microseconds, + filename.UTF8String, + line, + message.UTF8String); +} + +@end From 3d4904bb4303ed78ca4b8cddcdd1c1615a69a030 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 4 Sep 2024 19:51:16 -0700 Subject: [PATCH 199/531] Resync Android BUCK part 2 --- examples/demo-apps/android/LlamaDemo/app/src/main/BUCK | 1 + extension/android/jni/BUCK | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK index 1fd656317ea..80315c4104b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK @@ -27,6 +27,7 @@ fb_android_library( "java/com/example/executorchllamademo/ModelRunner.java", "java/com/example/executorchllamademo/ModelRunnerCallback.java", "java/com/example/executorchllamademo/ModelType.java", + "java/com/example/executorchllamademo/ModelUtils.java", "java/com/example/executorchllamademo/PromptFormat.java", "java/com/example/executorchllamademo/SettingsActivity.java", "java/com/example/executorchllamademo/SettingsFields.java", diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index c32f4ab0a95..7afd9f8a941 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -30,6 +30,7 @@ fb_android_cxx_library( compiler_flags = [ "-frtti", "-fexceptions", + "-Wno-unused-variable", ], soname = "libexecutorch.$(ext)", visibility = ["PUBLIC"], @@ -38,6 +39,7 @@ fb_android_cxx_library( "//fbandroid/native/fb:fb", "//third-party/glog:glog", "//xplat/executorch/extension/module:module_static", + "//xplat/executorch/extension/runner_util:inputs_static", "//xplat/executorch/extension/runner_util:managed_tensor_static", ], ) @@ -50,6 +52,7 @@ fb_android_cxx_library( compiler_flags = [ "-frtti", "-fexceptions", + "-Wno-unused-variable", ], soname = "libexecutorch.$(ext)", visibility = ["PUBLIC"], @@ -60,6 +63,7 @@ fb_android_cxx_library( "//third-party/glog:glog", "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", "//xplat/executorch/extension/module:module_static", + 
"//xplat/executorch/extension/runner_util:inputs_static", "//xplat/executorch/extension/runner_util:managed_tensor_static", ], ) From 83d92ff1b84d5b2093c386f24d3d88d8eea949f6 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 4 Sep 2024 19:56:50 -0700 Subject: [PATCH 200/531] Fix typo in runner interface. Differential Revision: D62222658 Pull Request resolved: https://github.com/pytorch/executorch/pull/5104 --- .../LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h index 158824b67ee..5f8b3c8449a 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h @@ -24,11 +24,6 @@ NS_SWIFT_NAME(Runner) sequenceLength:(NSInteger)seq_len withTokenCallback:(nullable void (^)(NSString*))callback error:(NSError**)error; -- (BOOL)generate:(NSArray*)images - prompt:(NSString*)prompt - sequenceLength:(NSInteger)seq_len - withTokenCallback:(nullable void (^)(NSString*))callback - error:(NSError**)error; - (void)stop; + (instancetype)new NS_UNAVAILABLE; From ee752f0be294951fe0650306a37f981ccf72a57c Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 4 Sep 2024 20:54:30 -0700 Subject: [PATCH 201/531] Update Android lint path Pull Request resolved: https://github.com/pytorch/executorch/pull/5102 --- .github/workflows/lint.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ea068f65e1a..c2f5ed31c16 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -65,7 +65,8 @@ jobs: script: | FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \ examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \ - examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java) + examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \ + extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java) if [ -n "$FILES_NEEDS_FORMAT" ]; then echo "Warning: The following files need formatting. Please use google-java-format." 
echo "$FILES_NEEDS_FORMAT" From e8549670ba06e198128790b4980abc653cb33c53 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:28:31 -0700 Subject: [PATCH 202/531] Build QNN in android_llm_demo.sh and perf Pull Request resolved: https://github.com/pytorch/executorch/pull/5105 Co-authored-by: Huy Do --- .github/workflows/android-perf.yml | 9 +++++++++ build/build_android_llm_demo.sh | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 473dad08c14..c44de955335 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -206,6 +206,10 @@ jobs: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main needs: set-parameters + strategy: + matrix: + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + fail-fast: false with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -222,6 +226,11 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + if [[ ${{ matrix.delegate }} == "qnn" ]]; then + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + fi + # TODO: This needs to be replaced with a generic loader .apk # Build LLM Demo for Android bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 38efa05b745..5a17c8745dc 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -19,6 +19,13 @@ build_android_native_library() { ANDROID_ABI="$1" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" + QNN_SDK_ROOT="${QNN_SDK_ROOT:-}" + if [ -n "$QNN_SDK_ROOT" ]; then + EXECUTORCH_BUILD_QNN=ON + else + EXECUTORCH_BUILD_QNN=OFF + fi + cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ @@ -34,6 +41,8 @@ build_android_native_library() { -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" From 866e9b84d46bc83ae184ee627893f3bb55839260 Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:35:39 -0700 Subject: [PATCH 203/531] Switch XNNPack tests to use export_for_training Differential Revision: D61684468 Pull Request resolved: https://github.com/pytorch/executorch/pull/4867 --- backends/xnnpack/test/test_xnnpack_utils.py | 6 ++++-- backends/xnnpack/test/tester/tester.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/backends/xnnpack/test/test_xnnpack_utils.py b/backends/xnnpack/test/test_xnnpack_utils.py index 3f5359a3f45..ea9217e04ab 100644 --- a/backends/xnnpack/test/test_xnnpack_utils.py +++ b/backends/xnnpack/test/test_xnnpack_utils.py @@ -72,6 +72,7 @@ get_symmetric_quantization_config, XNNPACKQuantizer, ) +from torch.export import export_for_training from torch.testing import FileCheck @@ -315,10 +316,11 @@ def quantize_and_test_model_with_quantizer( ): module.eval() # program capture - m = torch._export.capture_pre_autograd_graph( + + m = export_for_training( module, example_inputs, - ) + ).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config() diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index eb25a14cfea..7586c4f2313 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch -import torch.export._trace as export_trace from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.passes import XNNPACKPassManager from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config @@ -31,6 +30,7 @@ from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.print_program import pretty_print, print_program +from torch.export import export_for_training logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -157,10 +157,10 @@ def __init__( def run( self, artifact: torch.nn.Module, inputs: Optional[Tuple[torch.Tensor]] ) -> None: - captured_graph = export_trace._export( - artifact, inputs, pre_dispatch=True - ).module() + assert inputs is not None + captured_graph = export_for_training(artifact, inputs).module() + assert isinstance(captured_graph, torch.fx.GraphModule) prepared = prepare_pt2e(captured_graph, self.quantizer) if self.calibrate: From 91089db7739354514d26c15d8c9dc85964c28387 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Thu, 5 Sep 2024 18:52:44 +0200 Subject: [PATCH 204/531] Implement dumping operator distribution for TOSA graph Differential Revision: D62035062 Pull Request resolved: https://github.com/pytorch/executorch/pull/4970 --- backends/arm/test/misc/test_debug_feats.py | 85 ++++++++++++++++------ backends/arm/test/tester/arm_tester.py | 47 ++++++++++-- 2 files changed, 102 insertions(+), 30 deletions(-) diff --git a/backends/arm/test/misc/test_debug_feats.py 
b/backends/arm/test/misc/test_debug_feats.py index aa9703f9eba..dd59fddbd47 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -126,26 +126,67 @@ def test_numerical_diff_prints(self): self.fail() -class TestDumpOperatorsAndDtypes(unittest.TestCase): - def test_dump_ops_and_dtypes(self): - model = Linear(20, 30) - ( - ArmTester( - model, - example_inputs=model.get_inputs(), - compile_spec=common.get_tosa_compile_spec(), - ) - .quantize() - .dump_dtype_distribution() - .dump_operator_distribution() - .export() - .dump_dtype_distribution() - .dump_operator_distribution() - .to_edge() - .dump_dtype_distribution() - .dump_operator_distribution() - .partition() - .dump_dtype_distribution() - .dump_operator_distribution() +def test_dump_ops_and_dtypes(): + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .dump_dtype_distribution() + .dump_operator_distribution() + .export() + .dump_dtype_distribution() + .dump_operator_distribution() + .to_edge() + .dump_dtype_distribution() + .dump_operator_distribution() + .partition() + .dump_dtype_distribution() + .dump_operator_distribution() + ) + # Just test that there are no execptions. + + +def test_dump_tosa_ops(capsys): + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .partition() + .dump_operator_distribution() + ) + captured = capsys.readouterr() + assert "Partition operators:" in captured.out + assert "TOSA operators:" in captured.out + + +def test_fail_dump_tosa_ops(capsys): + class Add(torch.nn.Module): + def forward(self, x): + return x + x + + model = Add() + compile_spec = common.get_tosa_compile_spec_unbuilt() + compile_spec.output_format = "vela" + ( + ArmTester( + model, example_inputs=(torch.ones(5),), compile_spec=compile_spec.build() ) - # Just test that there are no execeptions. + .quantize() + .export() + .to_edge() + .partition() + .dump_operator_distribution() + ) + captured = capsys.readouterr() + assert "Partition operators:" in captured.out + assert "Can not get operator distribution for vela command stream." in captured.out diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 98fac29144c..ec44a02739e 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -13,7 +13,7 @@ import numpy as np -import torch +import torch.fx from executorch.backends.arm.arm_backend import get_intermediate_path, is_permute_memory from executorch.backends.arm.arm_partitioner import ArmPartitioner @@ -297,9 +297,7 @@ def get_graph(self, stage: str | None = None) -> Graph: return graph - def dump_operator_distribution( - self, path_to_dump: Optional[str] = None - ) -> ArmQuantizer: + def dump_operator_distribution(self, path_to_dump: Optional[str] = None): """Dump a dictionary with {operator: operator count} for the operators in the graph of the current stage. 
@@ -307,13 +305,16 @@ def dump_operator_distribution( """ graph = self.get_graph(self.cur) op_dist = _get_operator_distribution(graph) - to_print = self.cur + " operators: " + _format_dict(op_dist) + "\n" + to_print = self.cur + " operators: " + _format_dict(dict(op_dist)) + "\n" + + if self.cur == self.stage_name(tester.Partition): + to_print += _get_tosa_operator_distribution( + self.get_artifact(self.cur).exported_program().graph_module + ) _dump_str(to_print, path_to_dump) return self - def dump_dtype_distribution( - self, path_to_dump: Optional[str] = None - ) -> ArmQuantizer: + def dump_dtype_distribution(self, path_to_dump: Optional[str] = None): """Dump a dictionary with {dtype: dtype count} for the dtypes of the nodes in the graph of the current stage. @@ -421,6 +422,36 @@ def _get_operator_distribution(graph: Graph) -> dict[str, int]: ) +def _get_tosa_operator_distribution(graph_module: torch.fx.GraphModule) -> str: + """Counts the occurences of operator names of all lowered modules containing + a TOSA flatbuffer. + The result is a string with the operator distribution or an error message. + """ + op_list = [] + id = 0 + while lowered_module := getattr(graph_module, f"lowered_module_{id}", None): + for spec in lowered_module.compile_specs: + if spec.key != "output_format": + continue + if spec.value == b"tosa": + tosa_fb = lowered_module.processed_bytes + tosa_json = dbg_tosa_fb_to_json(tosa_fb) + for region in tosa_json["regions"]: + for block in region["blocks"]: + op_list.extend( + [operator["op"] for operator in block["operators"]] + ) + break + elif spec.value == b"vela": + return "Can not get operator distribution for vela command stream." + else: + return f"Unknown output format '{spec.value}'." + id += 1 + if id == 0: + return "No delegate with name 'lowered_module_0 found in graph module." + return "TOSA operators: " + _format_dict(dict(Counter(op_list))) + + def _dump_str(to_print: str, path_to_dump: Optional[str] = None): if path_to_dump: with open(path_to_dump, "a") as fp: From 2ba2f478739ee726dcb4138e3795c92bee045826 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:17:06 -0700 Subject: [PATCH 205/531] Allow user to define ANDROID_ABIS in build_android_llm_demo.sh Allow build arm64-v8a only Pull Request resolved: https://github.com/pytorch/executorch/pull/5109 --------- Co-authored-by: Huy Do --- .github/workflows/android-perf.yml | 3 ++- build/build_android_llm_demo.sh | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c44de955335..53d934a0a62 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -230,9 +230,10 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi - + # TODO: This needs to be replaced with a generic loader .apk # Build LLM Demo for Android + export ANDROID_ABIS="arm64-v8a" bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Upload artifacts to S3. 
The artifacts are needed not only by the device farm but also TorchChat diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 5a17c8745dc..9e051daa1c2 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -139,7 +139,9 @@ collect_artifacts_to_be_uploaded() { BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +if [ -z "$ANDROID_ABIS" ]; then + ANDROID_ABIS=("arm64-v8a" "x86_64") +fi export ANDROID_ABIS ARTIFACTS_DIR_NAME="$1" From 4e2cd6ce8914973c7076587f7ef7f738a8101548 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 5 Sep 2024 11:03:48 -0700 Subject: [PATCH 206/531] [executorch] Update Llava install_requirements.sh (#4886) --- examples/models/llava/install_requirements.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index 3bf803b356c..931d63b3919 100644 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -7,6 +7,6 @@ set -x -pip install transformers accelerate +pip install transformers accelerate sentencepiece pip list From cdb54389d0230868b2129e966888a5dbab9ae621 Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:19:10 -0700 Subject: [PATCH 207/531] Migrate executorch examples to use export_for_training Differential Revision: D62142608 Pull Request resolved: https://github.com/pytorch/executorch/pull/5036 --- examples/models/test/test_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/test/test_export.py b/examples/models/test/test_export.py index f258cc21391..b3030c24fea 100644 --- a/examples/models/test/test_export.py +++ b/examples/models/test/test_export.py @@ -29,7 +29,7 @@ def collect_executorch_and_eager_outputs( Returns a tuple containing the outputs of the eager mode model and the executorch mode model. 
""" eager_model = eager_model.eval() - model = torch._export.capture_pre_autograd_graph(eager_model, example_inputs) + model = torch.export.export_for_training(eager_model, example_inputs).module() edge_model = export_to_edge(model, example_inputs) executorch_prog = edge_model.to_executorch() From cea5abbcddedb72ce8cbbb4e456b8ee9bfe99e1c Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Thu, 5 Sep 2024 11:45:55 -0700 Subject: [PATCH 208/531] m to p start ops | add dim order sanity check Differential Revision: D59984020 Pull Request resolved: https://github.com/pytorch/executorch/pull/4331 --- kernels/portable/cpu/op_masked_fill.cpp | 3 +++ kernels/portable/cpu/op_max.cpp | 18 +++++++++++++ kernels/portable/cpu/op_maximum.cpp | 3 +++ kernels/portable/cpu/op_mean.cpp | 5 ++++ kernels/portable/cpu/op_min.cpp | 18 +++++++++++++ kernels/portable/cpu/op_minimum.cpp | 3 +++ kernels/portable/cpu/op_mm.cpp | 5 ++++ kernels/portable/cpu/op_mul.cpp | 6 +++++ kernels/portable/cpu/op_native_batch_norm.cpp | 22 +++++++++++++++ kernels/portable/cpu/op_native_group_norm.cpp | 25 +++++++++++++++++ kernels/portable/cpu/op_native_layer_norm.cpp | 27 +++++++++++++++++++ kernels/portable/cpu/op_ne.cpp | 6 +++++ kernels/portable/cpu/op_neg.cpp | 3 +++ kernels/portable/cpu/op_pdist_forward.cpp | 5 ++++ kernels/portable/cpu/op_permute_copy.cpp | 3 +++ kernels/portable/cpu/op_pixel_shuffle.cpp | 4 +++ 16 files changed, 156 insertions(+) diff --git a/kernels/portable/cpu/op_masked_fill.cpp b/kernels/portable/cpu/op_masked_fill.cpp index 7a72994b07a..e6c0bb4442d 100644 --- a/kernels/portable/cpu/op_masked_fill.cpp +++ b/kernels/portable/cpu/op_masked_fill.cpp @@ -39,6 +39,9 @@ Tensor& masked_fill_scalar_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, mask, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND( Bool, in_type, ctx, "masked_fill.Scalar_out", CTYPE, [&]() { ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index 8f363ced4e2..b36cde42e45 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -49,6 +49,24 @@ std::tuple max_out( InvalidArgument, (std::tuple({max, max_indices}))); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, max), + InvalidArgument, + (std::tuple({max, max_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(max_indices), + InvalidArgument, + (std::tuple({max, max_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(in), + InvalidArgument, + (std::tuple({max, max_indices}))); + dim = dim < 0 ? 
dim + in.dim() : dim; ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 1353479b294..e52a6fd072b 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -75,6 +75,9 @@ Tensor& maximum_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index 79e66c62b5e..e930eb6c838 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -33,6 +33,11 @@ Tensor& mean_dim_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok, diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp index 8e3b5a00b36..e4f5e5714f3 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -49,6 +49,24 @@ std::tuple min_out( InvalidArgument, (std::tuple({min, min_indices}))); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, min), + InvalidArgument, + (std::tuple({min, min_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(min_indices), + InvalidArgument, + (std::tuple({min, min_indices}))); + + ET_KERNEL_CHECK( + ctx, + tensor_is_default_dim_order(in), + InvalidArgument, + (std::tuple({min, min_indices}))); + dim = dim < 0 ? dim + in.dim() : dim; ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index f18d1a6d368..84024beffaa 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -75,6 +75,9 @@ Tensor& minimum_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); diff --git a/kernels/portable/cpu/op_mm.cpp b/kernels/portable/cpu/op_mm.cpp index 6903bf3cad5..4a6a8f3cfdc 100644 --- a/kernels/portable/cpu/op_mm.cpp +++ b/kernels/portable/cpu/op_mm.cpp @@ -29,6 +29,11 @@ mm_out(RuntimeContext& ctx, const Tensor& in, const Tensor& mat2, Tensor& out) { InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, mat2, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND(Half, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { size_t m = in.size(0); size_t n = in.size(1); diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index c933d10d274..1d29b8bfe8a 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -72,6 +72,9 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); @@ -113,6 +116,9 @@ 
Tensor& mul_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); diff --git a/kernels/portable/cpu/op_native_batch_norm.cpp b/kernels/portable/cpu/op_native_batch_norm.cpp index 2e613c0a637..fceb8b24d9a 100644 --- a/kernels/portable/cpu/op_native_batch_norm.cpp +++ b/kernels/portable/cpu/op_native_batch_norm.cpp @@ -73,6 +73,28 @@ std::tuple _native_batch_norm_legit_no_training_out( InvalidArgument, ret_val); + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, out, mean_out, invstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, bias.value()), + InvalidArgument, + ret_val); + } + size_t C_dim = in.dim() >= 1 ? 1 : 0; size_t C = in.size(C_dim); size_t outer = getLeadingDims(in, C_dim); diff --git a/kernels/portable/cpu/op_native_group_norm.cpp b/kernels/portable/cpu/op_native_group_norm.cpp index f9213fdeb1b..b61f5be6764 100644 --- a/kernels/portable/cpu/op_native_group_norm.cpp +++ b/kernels/portable/cpu/op_native_group_norm.cpp @@ -158,6 +158,31 @@ std::tuple native_group_norm_out( InvalidArgument, ret_val); + ET_KERNEL_CHECK( + ctx, tensor_is_default_dim_order(input), InvalidArgument, ret_val); + + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, bias.value()), + InvalidArgument, + ret_val); + } + constexpr auto name = "native_group_norm.out"; ET_SWITCH_FLOAT_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { diff --git a/kernels/portable/cpu/op_native_layer_norm.cpp b/kernels/portable/cpu/op_native_layer_norm.cpp index f10acda10ee..711c747ca2b 100644 --- a/kernels/portable/cpu/op_native_layer_norm.cpp +++ b/kernels/portable/cpu/op_native_layer_norm.cpp @@ -117,6 +117,33 @@ std::tuple native_layer_norm_out( InvalidArgument, ret_val); + // Only support default dim order for now. + // TODO: Support other dim orders. 
+ ET_KERNEL_CHECK( + ctx, tensor_is_default_dim_order(input), InvalidArgument, ret_val); + + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(input, bias.value()), + InvalidArgument, + ret_val); + } + Tensor::SizesType mean_rstd_sizes[kTensorDimensionLimit]; size_t mean_rstd_ndim = 0; get_layer_norm_out_target_size( diff --git a/kernels/portable/cpu/op_ne.cpp b/kernels/portable/cpu/op_ne.cpp index 5601fdafbd1..2c25dc7029e 100644 --- a/kernels/portable/cpu/op_ne.cpp +++ b/kernels/portable/cpu/op_ne.cpp @@ -30,6 +30,9 @@ Tensor& ne_tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); @@ -75,6 +78,9 @@ Tensor& ne_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType out_type = out.scalar_type(); diff --git a/kernels/portable/cpu/op_neg.cpp b/kernels/portable/cpu/op_neg.cpp index 026d1009c49..b88cdb03a23 100644 --- a/kernels/portable/cpu/op_neg.cpp +++ b/kernels/portable/cpu/op_neg.cpp @@ -30,6 +30,9 @@ Tensor& neg_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK( ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "neg.out", CTYPE, [&] { apply_unary_map_fn( [](const CTYPE val_in) { return static_cast(-val_in); }, diff --git a/kernels/portable/cpu/op_pdist_forward.cpp b/kernels/portable/cpu/op_pdist_forward.cpp index 88b5e881943..9b06b880b64 100644 --- a/kernels/portable/cpu/op_pdist_forward.cpp +++ b/kernels/portable/cpu/op_pdist_forward.cpp @@ -24,6 +24,11 @@ Tensor& _pdist_forward_out( ET_KERNEL_CHECK(ctx, check_pdist_args(in, p, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType target_sizes[kTensorDimensionLimit]; size_t target_ndim = 0; get_pdist_out_target_size(in, target_sizes, &target_ndim); diff --git a/kernels/portable/cpu/op_permute_copy.cpp b/kernels/portable/cpu/op_permute_copy.cpp index e7df5c9657d..1362b57c005 100644 --- a/kernels/portable/cpu/op_permute_copy.cpp +++ b/kernels/portable/cpu/op_permute_copy.cpp @@ -46,6 +46,9 @@ Tensor& permute_copy_out( ET_KERNEL_CHECK( ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_permute_copy_out_target_size( diff --git a/kernels/portable/cpu/op_pixel_shuffle.cpp b/kernels/portable/cpu/op_pixel_shuffle.cpp index 104348f3fed..e1e459b1b28 100644 --- a/kernels/portable/cpu/op_pixel_shuffle.cpp +++ b/kernels/portable/cpu/op_pixel_shuffle.cpp @@ -72,6 +72,10 @@ Tensor& pixel_shuffle_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, 
tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_pixel_shuffle_out_target_size( From 6ccb290c0185658cdd63ba933df50a4ff1e91d1d Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 5 Sep 2024 13:29:13 -0700 Subject: [PATCH 209/531] Switch to the new tensor API internally. Differential Revision: D61959575 Pull Request resolved: https://github.com/pytorch/executorch/pull/5111 --- .ci/scripts/build-qnn-sdk.sh | 1 + .ci/scripts/build_llama_android.sh | 3 +- .ci/scripts/test_llama.sh | 3 +- .ci/scripts/test_llava.sh | 5 +- backends/qualcomm/scripts/build.sh | 2 + backends/vulkan/docs/android_demo.md | 3 +- backends/xnnpack/README.md | 3 +- build/build_android_llm_demo.sh | 1 + ...d-run-qualcomm-ai-engine-direct-backend.md | 2 + docs/source/llm/getting-started.md | 14 ++-- .../tutorial-xnnpack-delegate-lowering.md | 3 +- .../android/ExecuTorchDemo/README.md | 2 + .../demo-apps/android/ExecuTorchDemo/setup.sh | 1 + .../android/LlamaDemo/setup-with-qnn.sh | 1 + examples/demo-apps/android/LlamaDemo/setup.sh | 1 + examples/llm_manual/CMakeLists.txt | 2 + examples/llm_manual/main.cpp | 7 +- examples/llm_manual/managed_tensor.h | 44 ----------- .../cross_attention/cross_attention_mask.cpp | 12 +-- .../cross_attention/cross_attention_mask.h | 8 +- .../cross_attention_mask_test.cpp | 30 ++++---- .../flamingo/cross_attention/targets.bzl | 2 +- examples/models/llama2/README.md | 6 +- examples/models/llama2/runner/CMakeLists.txt | 4 +- examples/models/llama2/runner/runner.cpp | 1 - examples/models/llama2/runner/targets.bzl | 2 +- examples/models/llava/README.md | 3 +- examples/models/llava/runner/CMakeLists.txt | 4 +- .../llava/runner/llava_image_prefiller.h | 14 ++-- .../llava/runner/llava_text_decoder_runner.h | 11 +-- examples/models/llava/runner/targets.bzl | 2 +- examples/models/phi-3-mini/CMakeLists.txt | 3 +- examples/models/phi-3-mini/README.md | 3 +- examples/models/phi-3-mini/runner.cpp | 14 +--- .../oss_scripts/llama2/CMakeLists.txt | 1 + .../oss_scripts/llama2/qnn_llama_runner.cpp | 1 - .../oss_scripts/llama2/runner/runner.cpp | 77 ++++++++----------- .../oss_scripts/llama2/runner/runner.h | 14 ++-- .../qaihub_scripts/llama/CMakeLists.txt | 2 + .../llama/llama2/qaihub_llama2_7b_runner.cpp | 1 - .../llama/llama3/qaihub_llama3_8b_runner.cpp | 1 - .../qaihub_scripts/llama/runner/runner.cpp | 1 - .../qaihub_scripts/llama/runner/runner.h | 1 - .../stable_diffusion/CMakeLists.txt | 1 + .../stable_diffusion/runner/runner.cpp | 42 ++++------ examples/xnnpack/README.md | 6 +- extension/android/CMakeLists.txt | 1 + extension/android/jni/BUCK | 4 +- extension/android/jni/jni_layer.cpp | 23 +++--- extension/aten_util/test/targets.bzl | 1 - extension/llm/runner/CMakeLists.txt | 4 +- extension/llm/runner/multimodal_runner.h | 1 - extension/llm/runner/targets.bzl | 6 +- extension/llm/runner/text_decoder_runner.cpp | 14 ++-- extension/llm/runner/text_decoder_runner.h | 6 +- extension/llm/runner/text_prefiller.cpp | 26 +++---- extension/llm/runner/text_token_generator.h | 15 ++-- kernels/README.md | 2 +- test/utils/OSSTestConfig.json | 3 +- 59 files changed, 206 insertions(+), 265 deletions(-) delete mode 100644 examples/llm_manual/managed_tensor.h diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index ec3a8a39e37..c48ac2056aa 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ 
b/.ci/scripts/build-qnn-sdk.sh @@ -29,6 +29,7 @@ set_up_aot() { -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 644fc4c2bb7..7d3370ee561 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -22,8 +22,9 @@ install_executorch_and_backend_lib() { -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 4fa8c94905f..290ece7b8e6 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -107,8 +107,9 @@ cmake_install_executorch_libraries() { retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 3543ea3fa57..90a2afa11f8 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -20,8 +20,9 @@ cmake_install_executorch_libraries() { cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -61,7 +62,7 @@ export_llava() { # Download a new image with different size, to test if the model can handle different image sizes prepare_image_tensor() { echo "Downloading image" - curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg + curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt } diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 61b363f1a77..5f77a747404 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -81,6 +81,7 @@ if [ "$BUILD_AARCH64" = true ]; then -DEXECUTORCH_BUILD_QNN=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -124,6 +125,7 @@ if [ "$BUILD_X86_64" = true ]; then -DEXECUTORCH_BUILD_QNN=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ 
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -S $PRJ_ROOT \ diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index aaff7a7a727..8570859ed34 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -94,8 +94,9 @@ binary using the Android NDK toolchain. cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_VULKAN=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DPYTHON_EXECUTABLE=python \ diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 33a0bfaf309..0c3d7e14428 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -105,9 +105,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 9e051daa1c2..4d1a0ac1235 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -38,6 +38,7 @@ build_android_native_library() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 5abaaeb7cef..230f007d3fc 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -136,6 +136,7 @@ cmake .. \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF @@ -167,6 +168,7 @@ cmake .. \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index a0865811462..9c03399444e 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -201,9 +201,9 @@ Create a file called main.cpp with the following contents: #include "basic_sampler.h" #include "basic_tokenizer.h" -#include "managed_tensor.h" #include +#include #include #include #include @@ -244,14 +244,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. // EValue is a unified data type in the ExecuTorch runtime. 
- ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). - Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. @@ -339,7 +338,6 @@ Finally, download the following files into the same directory as main.h: ``` curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h -curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` To learn more, see the [Runtime APIs Tutorial](../extension-module.md). @@ -364,6 +362,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) # Include the executorch subdirectory. @@ -377,6 +376,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels ``` @@ -386,7 +386,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -518,6 +517,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -534,6 +534,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` @@ -548,7 +549,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4491a6e8c80..8afa6d6fe77 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -149,9 +149,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . 
diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md
index 807561f44b5..9af1f5266eb 100644
--- a/examples/demo-apps/android/ExecuTorchDemo/README.md
+++ b/examples/demo-apps/android/ExecuTorchDemo/README.md
@@ -78,6 +78,7 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -Bcmake-android-out

 cmake --build cmake-android-out -j16 --target install
@@ -119,6 +120,7 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -Bcmake-android-out

 cmake --build cmake-android-out -j16 --target install
diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh
index 05dc3e4492e..00d9201b092 100644
--- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh
+++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh
@@ -15,6 +15,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
   -DEXECUTORCH_BUILD_XNNPACK=ON \
   -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
   -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
   -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
   -DCMAKE_BUILD_TYPE=Release \
   -B"${CMAKE_OUT}"
diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh
index 5e3ac6fc011..87d0f47c956 100644
--- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh
+++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh
@@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
   -DEXECUTORCH_BUILD_XNNPACK=ON \
   -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
   -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
   -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
   -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
   -DEXECUTORCH_BUILD_QNN=ON \
diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh
index ccb2a788d6e..91a68d4b88b 100644
--- a/examples/demo-apps/android/LlamaDemo/setup.sh
+++ b/examples/demo-apps/android/LlamaDemo/setup.sh
@@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
   -DEXECUTORCH_BUILD_XNNPACK=ON \
   -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
   -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
   -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
   -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
   -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
diff --git a/examples/llm_manual/CMakeLists.txt b/examples/llm_manual/CMakeLists.txt
index 185665180f9..e5054a683a6 100644
--- a/examples/llm_manual/CMakeLists.txt
+++ b/examples/llm_manual/CMakeLists.txt
@@ -13,6 +13,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)

 # Set options for executorch build.
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -29,6 +30,7 @@ target_link_libraries( nanogpt_runner PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform # kernels xnnpack_backend diff --git a/examples/llm_manual/main.cpp b/examples/llm_manual/main.cpp index c0fc482542e..3c4ecd71af0 100644 --- a/examples/llm_manual/main.cpp +++ b/examples/llm_manual/main.cpp @@ -10,9 +10,9 @@ #include "basic_sampler.h" #include "basic_tokenizer.h" -#include "managed_tensor.h" #include +#include #include #include #include @@ -42,14 +42,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. // EValue is a unified data type in the ExecuTorch runtime. - ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). - Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. diff --git a/examples/llm_manual/managed_tensor.h b/examples/llm_manual/managed_tensor.h deleted file mode 100644 index 204b38aa4e9..00000000000 --- a/examples/llm_manual/managed_tensor.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -/** - * Creates and owns the necessary metadata for a Tensor instance. Does not own - * the data pointer. - */ -class ManagedTensor { - public: - ManagedTensor( - void* data, - const std::vector& sizes, - exec_aten::ScalarType dtype) - : sizes_(sizes), - tensor_impl_( - /*type=*/dtype, - /*dim=*/sizes_.size(), - /*sizes=*/sizes_.data(), - /*data=*/data, - /*dim_order=*/nullptr, - /*strides=*/nullptr, - /*dynamism=*/ - executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND) {} - - /** - * Get the Tensor object managed by this class. - */ - exec_aten::Tensor get_tensor() { - return exec_aten::Tensor(&tensor_impl_); - } - - private: - std::vector sizes_; - exec_aten::TensorImpl tensor_impl_; -}; diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp index b2a2a6a8063..06887ec4735 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.cpp +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.cpp @@ -6,12 +6,11 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include -#include -#include - namespace torch::executor { // Fowrward declaration needed for ARM compilers. 
@@ -97,7 +96,7 @@ std::vector> _get_image_attention_intervals( return vision_masks; } -std::vector cross_attention_mask( +std::vector cross_attention_mask( const std::vector& tokens, const std::vector& images, size_t tile_size, @@ -121,7 +120,7 @@ std::vector cross_attention_mask( // Create mask for each individual image based on its number of tokens, // which can vary based on number of tiles since they are not yet tile padded. // The masks are padded and concatenated together in the batch collator. - std::vector cross_attention_masks; + std::vector cross_attention_masks; size_t text_seq_len = tokens.size(); for (size_t image_idx = 0; image_idx < image_intervals.size(); ++image_idx) { size_t n_tiles = images[image_idx].size(0); @@ -140,7 +139,8 @@ std::vector cross_attention_mask( size_t stride = image_seq_len; std::vector mask_data(num_elements); - ManagedTensor mask(mask_data.data(), sizes, ScalarType::Int); + auto mask = executorch::extension::from_blob( + mask_data.data(), sizes, ScalarType::Int); cross_attention_masks.emplace_back(std::move(mask)); // Add the allocated data to the output vector. diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask.h b/examples/models/flamingo/cross_attention/cross_attention_mask.h index 6998d91ad4a..ccbc9eb1710 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask.h +++ b/examples/models/flamingo/cross_attention/cross_attention_mask.h @@ -8,11 +8,11 @@ #pragma once -#include -#include - #include +#include +#include + namespace torch { namespace executor { @@ -59,7 +59,7 @@ namespace executor { * * @returns A vector of cross attention masks, as Tensors, one for each image. */ -std::vector cross_attention_mask( +std::vector<::executorch::extension::TensorPtr> cross_attention_mask( const std::vector& tokens, const std::vector& images, size_t tile_size, diff --git a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp index 5b9e58c216f..b232212fa31 100644 --- a/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp +++ b/examples/models/flamingo/cross_attention/cross_attention_mask_test.cpp @@ -7,10 +7,10 @@ */ #include + #include using namespace ::testing; -using torch::executor::ManagedTensor; using torch::executor::ScalarType; using torch::executor::Tensor; using torch::executor::TensorImpl; @@ -41,29 +41,27 @@ TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) { std::vector images = {a, b, c}; std::vector> mask_data; - std::vector output_masks = - torch::executor::cross_attention_mask( - tokens, - images, - /*tile_size=*/1, - /*patch_size=*/1, - /*image_token_id=*/1, - /*out=*/mask_data); + auto output_masks = torch::executor::cross_attention_mask( + tokens, + images, + /*tile_size=*/1, + /*patch_size=*/1, + /*image_token_id=*/1, + /*out=*/mask_data); // Check contents of the mask. 
std::vector> expected_intervals = { {0, 7}, {1, 7}, {7, 12}}; for (size_t mask_idx = 0; mask_idx < output_masks.size(); ++mask_idx) { - ManagedTensor& output_mask = output_masks[mask_idx]; - Tensor output_tensor = output_mask.get_aliasing_tensor(); - for (size_t i = 0; i < output_tensor.size(0); ++i) { - for (size_t j = 0; j < output_tensor.strides()[0]; ++j) { - size_t unrolled_index = i * output_tensor.strides()[0] + j; + auto& output_tensor = output_masks[mask_idx]; + for (size_t i = 0; i < output_tensor->size(0); ++i) { + for (size_t j = 0; j < output_tensor->strides()[0]; ++j) { + size_t unrolled_index = i * output_tensor->strides()[0] + j; if (i >= expected_intervals[mask_idx][0] && i < expected_intervals[mask_idx][1]) { - EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 1); + EXPECT_EQ(output_tensor->const_data_ptr()[unrolled_index], 1); } else { - EXPECT_EQ(output_tensor.const_data_ptr()[unrolled_index], 0); + EXPECT_EQ(output_tensor->const_data_ptr()[unrolled_index], 0); } } } diff --git a/examples/models/flamingo/cross_attention/targets.bzl b/examples/models/flamingo/cross_attention/targets.bzl index 7bc13270aa9..c3d9da01561 100644 --- a/examples/models/flamingo/cross_attention/targets.bzl +++ b/examples/models/flamingo/cross_attention/targets.bzl @@ -12,8 +12,8 @@ def define_common_targets(): srcs = ["cross_attention_mask.cpp"], exported_headers = ["cross_attention_mask.h"], exported_deps = [ + "//executorch/extension/tensor:tensor", "//executorch/runtime/core/exec_aten:lib", - "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core/exec_aten/util:tensor_util", ], ) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index ea95c7f965c..09ada515a10 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -200,8 +200,9 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}` -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -251,8 +252,9 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-out-android \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_XNNPACK=ON \ diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt index abad63a3b5f..79fcd267af0 100644 --- a/examples/models/llama2/runner/CMakeLists.txt +++ b/examples/models/llama2/runner/CMakeLists.txt @@ -75,8 +75,8 @@ add_subdirectory( ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) -set(llama_runner_deps executorch extension_module extension_data_loader - re2::re2 +set(llama_runner_deps executorch extension_data_loader extension_module + extension_tensor re2::re2 ) target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 0a5d773092e..02063ebfa59 100644 --- 
a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index 475c5d92ab1..9ee3f99567d 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -34,8 +34,8 @@ def define_common_targets(): "//executorch/extension/llm/runner:text_prefiller" + aten_suffix, "//executorch/extension/llm/runner:text_token_generator" + aten_suffix, "//executorch/extension/evalue_util:print_evalue" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, diff --git a/examples/models/llava/README.md b/examples/models/llava/README.md index 807e1b3ceef..8cb605d75fc 100644 --- a/examples/models/llava/README.md +++ b/examples/models/llava/README.md @@ -34,8 +34,9 @@ Run the following cmake commands from `executorch/`: cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/examples/models/llava/runner/CMakeLists.txt b/examples/models/llava/runner/CMakeLists.txt index 564d31f8e77..2d0c30a620e 100644 --- a/examples/models/llava/runner/CMakeLists.txt +++ b/examples/models/llava/runner/CMakeLists.txt @@ -40,8 +40,8 @@ add_subdirectory( add_library(llava_runner STATIC ${_llava_runner__srcs}) -set(llava_runner_deps executorch extension_module extension_data_loader - extension_llm_runner +set(llava_runner_deps executorch extension_data_loader extension_llm_runner + extension_module extension_tensor ) target_link_libraries(llava_runner PUBLIC ${llava_runner_deps}) diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index 50c981026a6..3597ff82efe 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace torch::executor { @@ -26,18 +26,18 @@ class LlavaImagePrefiller : public ImagePrefiller { */ inline Result prefill(Image& image, int64_t& start_pos) override { - ManagedTensor managed_images( + auto image_tensor = executorch::extension::from_blob( image.data.data(), {3, image.height, image.width}, ScalarType::Byte); // Run image encoder - std::vector image_encoder_outputs = ET_UNWRAP(module_->execute( - kImageEncoderMethod, managed_images.get_aliasing_tensor())); + auto image_encoder_outputs = + ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); // inputs:[start_pos, embeds] - ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long); - auto start_pos_tensor = managed_start_pos.get_aliasing_tensor(); + auto start_pos_tensor = + executorch::extension::from_blob(&start_pos, {1}, ScalarType::Long); // Run text model - std::vector outputs_res = ET_UNWRAP(module_->execute( + auto outputs_res = ET_UNWRAP(module_->execute( 
kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]})); ET_CHECK_MSG( outputs_res[0].isTensor(), diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index e70ba59d513..a58bcc47e0a 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -20,17 +20,14 @@ class LlavaTextDecoderRunner : public TextDecoderRunner { : TextDecoderRunner(module, true, vocab_size, temperature){}; inline Result step( - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos) override { - auto tokens = managed_tokens.get_aliasing_tensor(); - auto start_pos = managed_start_pos.get_aliasing_tensor(); - + executorch::extension::TensorPtr& tokens, + executorch::extension::TensorPtr& start_pos) override { // run token embedding - std::vector token_embedding_outputs = + auto token_embedding_outputs = ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, tokens)); // run text model - std::vector outputs_res = ET_UNWRAP(module_->execute( + auto outputs_res = ET_UNWRAP(module_->execute( kTextModelMethod, {start_pos, token_embedding_outputs[0]})); ET_CHECK_MSG( diff --git a/examples/models/llava/runner/targets.bzl b/examples/models/llava/runner/targets.bzl index 72942acf16f..c7523d6cc45 100644 --- a/examples/models/llava/runner/targets.bzl +++ b/examples/models/llava/runner/targets.bzl @@ -16,8 +16,8 @@ def define_common_targets(): "//executorch/extension/llm/runner:runner_lib", "//executorch/extension/llm/tokenizer:bpe_tokenizer", "//executorch/extension/evalue_util:print_evalue", - "//executorch/extension/runner_util:managed_tensor", "//executorch/extension/module:module", + "//executorch/extension/tensor:tensor", "//executorch/kernels/quantized:generated_lib", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/core/exec_aten/util:tensor_util", diff --git a/examples/models/phi-3-mini/CMakeLists.txt b/examples/models/phi-3-mini/CMakeLists.txt index 39358e088e7..e1ffd0da055 100644 --- a/examples/models/phi-3-mini/CMakeLists.txt +++ b/examples/models/phi-3-mini/CMakeLists.txt @@ -23,6 +23,7 @@ set(CMAKE_BUILD_TYPE Release) # Set options for executorch build. 
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) @@ -47,6 +48,6 @@ target_include_directories( PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src ) target_link_libraries( - phi_3_mini_runner PRIVATE executorch extension_module_static + phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor optimized_native_cpu_ops_lib xnnpack_backend gflags ) diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index 6619a111a29..19269716211 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -26,8 +26,9 @@ python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-m -DCMAKE_INSTALL_PREFIX=cmake-out \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp index a6cee57ea8f..9da323278f5 100644 --- a/examples/models/phi-3-mini/runner.cpp +++ b/examples/models/phi-3-mini/runner.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include namespace torch::executor { @@ -81,23 +81,17 @@ uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) { } uint64_t Runner::prefill(std::vector& tokens) { - ManagedTensor input_tokens( + auto result = module_->forward(from_blob( tokens.data(), {1, static_cast(tokens.size())}, - ScalarType::Long); - std::vector inputs = {input_tokens.get_aliasing_tensor()}; - - auto result = module_->forward(inputs); + ScalarType::Long)); ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens"); return logits_to_token(result.get()[0].toTensor()); } uint64_t Runner::run_model_step(uint64_t token) { - ManagedTensor input_token(&token, {1, 1}, ScalarType::Long); - std::vector inputs = {input_token.get_aliasing_tensor()}; - - auto result = module_->forward(inputs); + auto result = module_->forward(from_blob(&token, {1, 1}, ScalarType::Long)); ET_CHECK_MSG( result.error() == Error::Ok, "Failed to run forward() for token %" PRIu64, diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt index 006e0f75174..97995086335 100644 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt @@ -28,6 +28,7 @@ target_link_libraries( full_portable_ops_lib extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp index 7340672c9ed..599accfd1ed 100644 --- a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp index d452336175f..0ccaefa79e0 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ 
b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp @@ -13,9 +13,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -26,6 +26,7 @@ namespace torch { namespace executor { namespace { +using namespace executorch::extension; static constexpr auto kTopp = 0.9f; void printReport(const Runner::Stats& stats); std::string statsToJsonString(const Runner::Stats& stats); @@ -136,32 +137,30 @@ int32_t Runner::logitsToToken(const exec_aten::Tensor& logits_tensor) { // step. Returning the logits tensor. Result Runner::run_model_step( int64_t input_token, - Tensor& token, - Tensor& start_pos, - Tensor& atten_mask, - std::vector& kv_tensors, - std::vector& kv_outputs) { - token.mutable_data_ptr()[0] = input_token; + TensorPtr& token, + TensorPtr& start_pos, + TensorPtr& atten_mask, + std::vector& kv_tensors, + std::vector& kv_outputs) { + token->mutable_data_ptr()[0] = input_token; // inputs:[tokens, start_pos, atten_mask, k_cache, v_cache] - std::vector inputs = {token, start_pos, atten_mask}; - inputs.insert(inputs.end(), kv_tensors.begin(), kv_tensors.end()); - Result> outputs_res = module_->forward(inputs); + auto outputs_res = module_->forward({*token, *start_pos, *atten_mask}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); // TODO: need to handle batch size != 1 - size_t v_offset = kv_outputs[0].nbytes(); - size_t el_size = kv_outputs[0].element_size(); + size_t v_offset = kv_outputs[0]->nbytes(); + size_t el_size = kv_outputs[0]->element_size(); size_t k_input_step = (max_seq_len_ - 1) * el_size; int k_tensors_end = kv_tensors.size() / 2; // update k caches for (int j = 0; j < k_tensors_end; ++j) { uint8_t* input_addr = - static_cast(kv_tensors[j].mutable_data_ptr()); + static_cast(kv_tensors[j]->mutable_data_ptr()); uint8_t* output_addr = - static_cast(kv_outputs[j].mutable_data_ptr()); + static_cast(kv_outputs[j]->mutable_data_ptr()); // fill the output k values back - for (int src = 0, dst = k_input_step; src < kv_outputs[j].nbytes(); + for (int src = 0, dst = k_input_step; src < kv_outputs[j]->nbytes(); src += el_size, dst += k_input_step) { input_addr[dst] = output_addr[src]; } @@ -169,7 +168,7 @@ Result Runner::run_model_step( // inputs ET_CHECK_MSG( internal::set_tensor_data( - kv_tensors[j], new_inp_addr, kv_tensors[j].nbytes()) == Error::Ok, + *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating k_cache"); } // update v caches @@ -179,25 +178,25 @@ Result Runner::run_model_step( ET_CHECK_MSG( internal::set_tensor_data( - kv_tensors[j], new_inp_addr, kv_tensors[j].nbytes()) == Error::Ok, + *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, "Failed to set input tensor when updating v_cache"); // outputs char* new_out_addr = io_mem_mgr_.update_v_caches_write(v_idx, v_offset); ET_CHECK_MSG( internal::set_tensor_data( - kv_outputs[j], new_out_addr, kv_outputs[j].nbytes()) == Error::Ok, + *kv_outputs[j], new_out_addr, kv_outputs[j]->nbytes()) == Error::Ok, "Failed to set output tensor when updating v_cache"); ET_CHECK_MSG( - module_->set_output_data_ptr(kv_outputs[j], j + 1) == Error::Ok, + module_->set_output_data_ptr(*kv_outputs[j], j + 1) == Error::Ok, "Failed to set llama output data pointer"); } // Bump start_pos by 1 - start_pos.mutable_data_ptr()[0]++; + start_pos->mutable_data_ptr()[0]++; // update atten_mask - atten_mask.mutable_data_ptr() - [atten_mask.numel() - 1 - start_pos.const_data_ptr()[0]] = 0; + atten_mask->mutable_data_ptr() + [atten_mask->numel() - 1 - 
start_pos->const_data_ptr()[0]] = 0; return outputs_res.get()[0].toTensor(); } // TODO: add overloaded method for on-device tokenize @@ -253,19 +252,14 @@ Error Runner::generate( std::vector hidden_states_data_shape = {1, 1, dim_}; // initialize tensor wrappers - ManagedTensor managed_token( + auto token = from_blob( io_mem_mgr_.get_input_token_ptr(), token_shape, ScalarType::Int); - ManagedTensor managed_pos_id( + auto start_pos = from_blob( io_mem_mgr_.get_pos_idx_ptr(), start_pos_shape, ScalarType::Int); - ManagedTensor managed_atten_mask( + auto atten_mask = from_blob( io_mem_mgr_.get_atten_mask_ptr(), atten_mask_shape, ScalarType::Float); - Tensor token = managed_token.get_aliasing_tensor(); - Tensor atten_mask = managed_atten_mask.get_aliasing_tensor(); - Tensor start_pos = managed_pos_id.get_aliasing_tensor(); - - std::vector managed_kv_inputs, managed_kv_outputs; - std::vector kv_tensors, kv_outputs; + std::vector kv_tensors, kv_outputs; Result method_meta = get_method_meta(); size_t num_inputs = method_meta->num_inputs(); @@ -282,22 +276,20 @@ Error Runner::generate( auto tensor_shape = tensor_meta->sizes(); std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - managed_kv_inputs.emplace_back(ManagedTensor( + kv_tensors.emplace_back(from_blob( io_mem_mgr_.get_k_caches_read_ptr(i), sizes, tensor_meta->scalar_type())); - kv_tensors.emplace_back(managed_kv_inputs.back().get_aliasing_tensor()); // outpus Result out_tensor_meta = method_meta->output_tensor_meta(i + 1); tensor_shape = out_tensor_meta->sizes(); sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - managed_kv_outputs.emplace_back(ManagedTensor( + kv_outputs.emplace_back(from_blob( io_mem_mgr_.get_k_caches_write_ptr(i), sizes, - kv_tensors.back().scalar_type())); - kv_outputs.emplace_back(managed_kv_outputs.back().get_aliasing_tensor()); + kv_tensors.back()->scalar_type())); ET_CHECK_MSG( module_->set_output_data_ptr(kv_outputs.back(), i + 1) == Error::Ok, "Failed to set output tensor for kv cache"); @@ -314,11 +306,10 @@ Error Runner::generate( std::vector sizes( tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - managed_kv_inputs.emplace_back(ManagedTensor( + kv_tensors.emplace_back(from_blob( io_mem_mgr_.get_v_caches_read_ptr(i), sizes, tensor_meta->scalar_type())); - kv_tensors.push_back(managed_kv_inputs.back().get_aliasing_tensor()); // outputs Result out_tensor_meta = @@ -327,22 +318,20 @@ Error Runner::generate( sizes = std::vector{ tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - managed_kv_outputs.push_back(ManagedTensor( + kv_outputs.push_back(from_blob( io_mem_mgr_.get_v_caches_write_ptr(i), sizes, - kv_tensors.back().scalar_type())); - kv_outputs.push_back(managed_kv_outputs.back().get_aliasing_tensor()); + kv_tensors.back()->scalar_type())); ET_CHECK_MSG( module_->set_output_data_ptr(kv_outputs.back(), output_index) == Error::Ok, "Failed to set output tensor for llama block"); } - ManagedTensor affine_managed_logits( + auto affine_logits = from_blob( reinterpret_cast(io_mem_mgr_.get_logit_ptr()), logits_data_shape, ScalarType::Float); - Tensor affine_logits = affine_managed_logits.get_aliasing_tensor(); ET_CHECK_MSG( module_->set_output_data_ptr(affine_logits, 0) == Error::Ok, "Failed to set output tensor for affine module - logits"); @@ -351,7 +340,7 @@ Error Runner::generate( std::string final_output; while (pos < seq_len - 1) { // Run the model - Result logits_res = run_model_step( + auto logits_res = 
run_model_step( cur_token, token, start_pos, atten_mask, kv_tensors, kv_outputs); if (pos == num_prompt_tokens) { stats_.first_token_ms = util::time_in_ms(); diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.h b/examples/qualcomm/oss_scripts/llama2/runner/runner.h index cdbb2cdd2e8..1c35c821ceb 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include class RpcMemAllocator { public: @@ -248,13 +248,13 @@ class Runner { T getMetadataHelper(std::string method_name, T default_val); template int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); - Result run_model_step( + Result run_model_step( int64_t input_token, - Tensor& token, - Tensor& start_pos, - Tensor& atten_mask, - std::vector& kv_tensors, - std::vector& kv_outputs); + ::executorch::extension::TensorPtr& token, + ::executorch::extension::TensorPtr& start_pos, + ::executorch::extension::TensorPtr& atten_mask, + std::vector<::executorch::extension::TensorPtr>& kv_tensors, + std::vector<::executorch::extension::TensorPtr>& kv_outputs); // metadata int32_t vocab_size_; int64_t bos_id_; diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index c1fd5dc6538..1a9406ca955 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -35,6 +35,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) @@ -89,6 +90,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp index 49782cf8789..d69aa0aa7a8 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp index aae18434c61..9d06e8118da 100644 --- a/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b_runner.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index ec13cec37c5..d6d99112932 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index b9849a21327..bd24ea6beb4 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -21,7 +21,6 @@ #include #include #include -#include namespace torch { namespace executor { diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt index e6af95595b7..c59cea32b9f 
100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/CMakeLists.txt @@ -24,6 +24,7 @@ target_link_libraries( executorch_no_prim_ops extension_data_loader extension_module + extension_tensor gflags re2::re2 ) diff --git a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp index 3d3d99d7074..b6c211d8acb 100644 --- a/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/stable_diffusion/runner/runner.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include @@ -22,6 +22,8 @@ #include #include +using namespace ::executorch::extension; + namespace torch { namespace executor { @@ -350,31 +352,27 @@ Error Runner::generate(std::string prompt) { MethodMeta encoder_method_meta = method_metas[0].get(); // Initialize text_encoder input tensors: cond/uncond tokenized_input[1,77] - ManagedTensor managed_cond_tokens( + auto cond_tokens_tensor = from_blob( cond_tokens.data(), {1, 77}, encoder_method_meta.input_tensor_meta(0)->scalar_type()); - ManagedTensor managed_uncond_tokens( + auto uncond_tokens_tensor = from_blob( uncond_tokens.data(), {1, 77}, encoder_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor cond_tokens_tensor = managed_cond_tokens.get_aliasing_tensor(); - Tensor uncond_tokens_tensor = managed_uncond_tokens.get_aliasing_tensor(); // Initialize text_encoder output tensors: cond/uncond embedding[1, 77, 1024] constexpr int emb_size = 1 * 77 * 1024; std::vector cond_emb_vec(emb_size); std::vector uncond_emb_vec(emb_size); std::vector fp_emb_vec(emb_size); - ManagedTensor managed_cond_emb( + auto cond_emb_tensor = from_blob( cond_emb_vec.data(), {1, 77, 1024}, encoder_method_meta.output_tensor_meta(0)->scalar_type()); - ManagedTensor managed_uncond_emb( + auto uncond_emb_tensor = from_blob( uncond_emb_vec.data(), {1, 77, 1024}, encoder_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor cond_emb_tensor = managed_cond_emb.get_aliasing_tensor(); - Tensor uncond_emb_tensor = managed_uncond_emb.get_aliasing_tensor(); modules_[0]->set_output_data_ptr(cond_emb_tensor, 0); long encoder_start = util::time_in_ms(); auto cond_res = modules_[0]->forward(cond_tokens_tensor); @@ -403,22 +401,17 @@ Error Runner::generate(std::string prompt) { // 3. 
cond/uncond embedding[1,77,1024] std::vector latent_model_input(latent.size()); std::vector fp_latent_model_input(latent.size()); - ManagedTensor managed_latent( + auto latent_tensor = from_blob( latent_model_input.data(), {1, 64, 64, 4}, unet_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor latent_tensor = managed_latent.get_aliasing_tensor(); - std::vector managed_time_emb_tensors; - std::vector time_emb_tensors; - managed_time_emb_tensors.reserve(num_time_steps_); + std::vector time_emb_tensors; time_emb_tensors.reserve(num_time_steps_); - for (int step_index = 0; step_index < num_time_steps_; step_index++) { - managed_time_emb_tensors.emplace_back(ManagedTensor( + for (auto step_index = 0; step_index < num_time_steps_; step_index++) { + time_emb_tensors.emplace_back(from_blob( time_emb_list_[step_index].data(), {1, 1280}, unet_method_meta.input_tensor_meta(1)->scalar_type())); - time_emb_tensors.emplace_back( - managed_time_emb_tensors.back().get_aliasing_tensor()); } // requantize text encoders output dequant_tensor( @@ -447,17 +440,14 @@ Error Runner::generate(std::string prompt) { std::vector noise_pred_uncond(latent.size()); std::vector fp_noise_pred_text(noise_pred_text.size()); std::vector fp_noise_pred_uncond(noise_pred_uncond.size()); - ManagedTensor managed_noise_pred_text( + auto noise_pred_text_tensor = from_blob( noise_pred_text.data(), {1, 64, 64, 4}, unet_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor noise_pred_text_tensor = managed_noise_pred_text.get_aliasing_tensor(); - ManagedTensor managed_noise_pred_uncond( + auto noise_pred_uncond_tensor = from_blob( noise_pred_uncond.data(), {1, 64, 64, 4}, unet_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor noise_pred_uncond_tensor = - managed_noise_pred_uncond.get_aliasing_tensor(); // Execute unet for (int step_index = 0; step_index < num_time_steps_; step_index++) { @@ -514,20 +504,18 @@ Error Runner::generate(std::string prompt) { MethodMeta vae_method_meta = method_metas[2].get(); // Initialize vae input tensor : latent[1,64,64,4] std::vector vae_input(latent.size()); - ManagedTensor managed_vae_input( + auto vae_input_tensor = from_blob( vae_input.data(), {1, 64, 64, 4}, vae_method_meta.input_tensor_meta(0)->scalar_type()); - Tensor vae_input_tensor = managed_vae_input.get_aliasing_tensor(); // Intialize vae output tensor: output[1,512,512,3] constexpr int image_size = 1 * 512 * 512 * 3; std::vector q_out(image_size); std::vector out(image_size); - ManagedTensor managed_output( + auto output_tensor = from_blob( q_out.data(), {1, 512, 512, 3}, vae_method_meta.output_tensor_meta(0)->scalar_type()); - Tensor output_tensor = managed_output.get_aliasing_tensor(); quant_tensor(latent, vae_input, vae_input_scale_, vae_input_offset_); diff --git a/examples/xnnpack/README.md b/examples/xnnpack/README.md index 61c14b5c7e4..dcd5b9c5d70 100644 --- a/examples/xnnpack/README.md +++ b/examples/xnnpack/README.md @@ -38,9 +38,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . 
@@ -92,9 +93,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 6827ae79040..74f98960002 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -39,6 +39,7 @@ list( extension_data_loader extension_module extension_runner_util + extension_tensor extension_threadpool fbjni ) diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index 7afd9f8a941..7cdf8ef7ec4 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -40,7 +40,7 @@ fb_android_cxx_library( "//third-party/glog:glog", "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", - "//xplat/executorch/extension/runner_util:managed_tensor_static", + "//xplat/executorch/extension/tensor:tensor_static", ], ) @@ -64,7 +64,7 @@ fb_android_cxx_library( "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", - "//xplat/executorch/extension/runner_util:managed_tensor_static", + "//xplat/executorch/extension/tensor:tensor_static", ], ) diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index 79c6ebc5161..ef74d6480bb 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -55,6 +55,7 @@ void et_pal_emit_log_message( } #endif +using namespace executorch::extension; using namespace torch::executor; namespace executorch::extension { @@ -167,7 +168,7 @@ class JEValue : public facebook::jni::JavaClass { evalue.tag); } - static ManagedTensor JEValueToTensorImpl( + static TensorPtr JEValueToTensorImpl( facebook::jni::alias_ref JEValue) { static const auto typeCodeField = JEValue::javaClassStatic()->getField("mTypeCode"); @@ -221,7 +222,7 @@ class JEValue : public facebook::jni::JavaClass { numel, dataCapacity); } - return ManagedTensor( + return from_blob( jni->GetDirectBufferAddress(jbuffer.get()), shape_vec, scalar_type); } facebook::jni::throwNewJavaException( @@ -293,9 +294,8 @@ class ExecuTorchJni : public facebook::jni::HybridClass { facebook::jni::alias_ref< facebook::jni::JArrayClass::javaobject> jinputs) { - std::vector evalues = {}; - - std::vector managed_tensors = {}; + std::vector evalues; + std::vector tensors; static const auto typeCodeField = JEValue::javaClassStatic()->getField("mTypeCode"); @@ -304,18 +304,17 @@ class ExecuTorchJni : public facebook::jni::HybridClass { auto jevalue = jinputs->getElement(i); const auto typeCode = jevalue->getFieldValue(typeCodeField); if (typeCode == JEValue::kTypeCodeTensor) { - managed_tensors.emplace_back(JEValue::JEValueToTensorImpl(jevalue)); - evalues.emplace_back( - EValue(managed_tensors.back().get_aliasing_tensor())); + tensors.emplace_back(JEValue::JEValueToTensorImpl(jevalue)); + evalues.emplace_back(tensors.back()); } else if (typeCode == JEValue::kTypeCodeInt) { int64_t value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } else 
if (typeCode == JEValue::kTypeCodeDouble) { double value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } else if (typeCode == JEValue::kTypeCodeBool) { bool value = jevalue->getFieldValue(typeCodeField); - evalues.emplace_back(EValue(value)); + evalues.emplace_back(value); } } diff --git a/extension/aten_util/test/targets.bzl b/extension/aten_util/test/targets.bzl index b724bbce2bc..db2247fd60b 100644 --- a/extension/aten_util/test/targets.bzl +++ b/extension/aten_util/test/targets.bzl @@ -18,7 +18,6 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/kernel:operator_registry", "//executorch/extension/aten_util:aten_bridge", - "//executorch/extension/runner_util:managed_tensor", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ], external_deps = [ diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 27bc84fe115..a9245768b9d 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -43,7 +43,9 @@ target_include_directories( add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs}) -set(runner_deps executorch extension_module extension_data_loader) +set(runner_deps executorch extension_data_loader extension_module + extension_tensor +) target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index dbffac46fce..43bbe688448 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -31,7 +31,6 @@ #include #include #include -#include namespace executorch { namespace extension { diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 4d715980afe..f20240956cb 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -26,7 +26,7 @@ def define_common_targets(): ":stats", "//executorch/extension/llm/sampler:sampler" + aten_suffix, "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) @@ -41,7 +41,7 @@ def define_common_targets(): ":text_decoder_runner" + aten_suffix, "//executorch/extension/llm/tokenizer:tokenizer_header", "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) @@ -55,7 +55,7 @@ def define_common_targets(): ":text_decoder_runner" + aten_suffix, "//executorch/extension/llm/tokenizer:tokenizer_header", "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/runner_util:managed_tensor" + aten_suffix, + "//executorch/extension/tensor:tensor" + aten_suffix, ], ) diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 5b77c69825f..928a21244a2 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -38,14 +38,11 @@ TextDecoderRunner::TextDecoderRunner( // input. It should be safe to call multiple times with the same inputs. The // outer loop (call site) is responsible for managing state. 
::executorch::runtime::Result TextDecoderRunner::step( - ManagedTensor& managed_tokens, - ManagedTensor& managed_start_pos) { - auto tokens = managed_tokens.get_aliasing_tensor(); + TensorPtr& tokens, + TensorPtr& start_pos) { // ET_LOG(Info, "Input token %" PRIu64, input_token); if (use_kv_cache_) { - auto start_pos = managed_start_pos.get_aliasing_tensor(); - ::executorch::runtime::Result> - outputs_res = module_->forward({tokens, start_pos}); + auto outputs_res = module_->forward({*tokens, *start_pos}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, @@ -57,10 +54,9 @@ ::executorch::runtime::Result TextDecoderRunner::step( // Return the logits tensor return outputs_res.get()[0].toTensor(); } else { // no kv cache - (void)managed_start_pos; // unused + (void)start_pos; // unused - ::executorch::runtime::Result> - outputs_res = module_->forward(tokens); + auto outputs_res = module_->forward(tokens); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 70ee1d01364..16adeeed0a5 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -12,7 +12,7 @@ #include #include -#include +#include // patternlint-disable-next-line executorch-cpp-nostdinc #include @@ -38,8 +38,8 @@ class TextDecoderRunner { * @return The output of the LLM Module. This will be a tensor of logits. */ virtual ::executorch::runtime::Result step( - ManagedTensor& input, - ManagedTensor& start_pos); + TensorPtr& input, + TensorPtr& start_pos); /** * Load the Module for text decode purpose. diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp index 53a737e6afc..e6229e0b807 100644 --- a/extension/llm/runner/text_prefiller.cpp +++ b/extension/llm/runner/text_prefiller.cpp @@ -25,7 +25,7 @@ TextPrefiller::TextPrefiller( ::executorch::runtime::Result TextPrefiller::prefill( std::vector& prompt_tokens, - int64_t start_pos) { + int64_t start_pos_index) { ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); if (!text_decoder_runner_->is_method_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load()); @@ -38,16 +38,15 @@ ::executorch::runtime::Result TextPrefiller::prefill( uint64_t cur_token; if (enable_parallel_prefill_ || !use_kv_cache_) { // initialize tensor wrappers - ManagedTensor managed_tokens( + auto tokens = from_blob( prompt_tokens.data(), {1, num_prompt_tokens}, exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos( - &start_pos, {1}, exec_aten::ScalarType::Long); + auto start_pos = + from_blob(&start_pos_index, {1}, exec_aten::ScalarType::Long); - ::executorch::runtime::Result outputs_res = - text_decoder_runner_->step(managed_tokens, managed_start_pos); + auto outputs_res = text_decoder_runner_->step(tokens, start_pos); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_LOG( @@ -62,28 +61,25 @@ ::executorch::runtime::Result TextPrefiller::prefill( cur_token = prompt_tokens[0]; // initialize tensor wrappers - ManagedTensor managed_tokens( - &cur_token, {1, 1}, exec_aten::ScalarType::Long); + auto tokens = from_blob(&cur_token, {1, 1}, exec_aten::ScalarType::Long); - ManagedTensor managed_start_pos( - &pos_data, {1}, exec_aten::ScalarType::Long); + auto start_pos = from_blob(&pos_data, {1}, exec_aten::ScalarType::Long); // run the first token and get back logits tensor. 
Assuming the first token // is bos so don't callback. - exec_aten::Tensor logits_tensor = ET_UNWRAP( - text_decoder_runner_->step(managed_tokens, managed_start_pos)); + auto logits_tensor = + ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos)); pos = 1; // start from index 1 while (pos < num_prompt_tokens) { // Run the model - pos_data = start_pos + pos; + pos_data = start_pos_index + pos; // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[pos]; - logits_tensor = ET_UNWRAP( - text_decoder_runner_->step(managed_tokens, managed_start_pos)); + logits_tensor = ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos)); pos++; } diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 46d682a4e44..01887e75600 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace executorch { namespace extension { @@ -69,15 +70,18 @@ class TextTokenGenerator { } // initialize tensor wrappers - ManagedTensor tokens_managed( - token_data.data(), token_shape, exec_aten::ScalarType::Long); + auto tokens_managed = from_blob( + token_data.data(), + token_shape, + exec_aten::ScalarType::Long, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); - ManagedTensor start_pos_managed(&pos, {1}, exec_aten::ScalarType::Long); + auto start_pos_managed = from_blob(&pos, {1}, exec_aten::ScalarType::Long); // Generate our tokens while (pos < seq_len - 1) { // Run the model - ::executorch::runtime::Result logits_res = + auto logits_res = text_decoder_runner_->step(tokens_managed, start_pos_managed); ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); @@ -98,7 +102,8 @@ class TextTokenGenerator { } else { // push it to the back token_data.push_back(cur_token); - tokens_managed.resize({1, static_cast(token_data.size())}); + ET_CHECK_OK_OR_RETURN_ERROR(resize_tensor_ptr( + tokens_managed, {1, static_cast(token_data.size())})); } // print the token as string, decode it with the Tokenizer object diff --git a/kernels/README.md b/kernels/README.md index 4e9656e6e9e..026778cc287 100644 --- a/kernels/README.md +++ b/kernels/README.md @@ -355,7 +355,7 @@ cmake . 
\ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_BUILD_VULKAN=OFF \ -DEXECUTORCH_BUILD_XNNPACK=ON \ diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index 93ae82acc33..dca2a7bbbce 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -52,8 +52,7 @@ { "directory": "extension/runner_util/test", "sources": [ - "inputs_test.cpp", - "managed_tensor_test.cpp" + "inputs_test.cpp" ], "additional_libs": [ "extension_data_loader", From b8a1899e8ac208619e1cef5daf420cde99fd7beb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 5 Sep 2024 14:44:17 -0700 Subject: [PATCH 210/531] [ExecuTorch] Implement BFloat16 and hook it up to scalar_type_util Differential Revision: D61981361 Pull Request resolved: https://github.com/pytorch/executorch/pull/4975 --- kernels/portable/cpu/scalar_utils.h | 18 +- .../core/exec_aten/util/genScalarTypeTable.py | 41 ++- .../core/exec_aten/util/scalar_type_util.h | 206 +++++++++--- .../util/test/scalar_type_util_test.cpp | 54 +-- runtime/core/portable_type/bfloat16.h | 311 ++++++++++++++++++ .../core/portable_type/test/CMakeLists.txt | 2 +- .../core/portable_type/test/bfloat16_test.cpp | 191 +++++++++++ runtime/core/portable_type/test/targets.bzl | 8 + 8 files changed, 744 insertions(+), 87 deletions(-) create mode 100644 runtime/core/portable_type/test/bfloat16_test.cpp diff --git a/kernels/portable/cpu/scalar_utils.h b/kernels/portable/cpu/scalar_utils.h index 3daf3e72526..3d6dfb75e47 100644 --- a/kernels/portable/cpu/scalar_utils.h +++ b/kernels/portable/cpu/scalar_utils.h @@ -94,12 +94,6 @@ struct promote_type_with_scalar_type { static_assert( !is_bits_type::value, "promote_type_with_scalar_type not valid for bits dtypes"); - static_assert( - !std::is_same< - T1, - typename ScalarTypeToCppType::type>:: - value, - "promote_type_with_scalar_type not valid for BFloat16"); using promote_type_with_scalar_type_not_respecting_half_to_float = typename std::conditional< is_complex_type::value || @@ -119,10 +113,14 @@ struct promote_type_with_scalar_type { public: using type = typename std::conditional< half_to_float && - std::is_same< - promote_type_with_scalar_type_not_respecting_half_to_float, - typename ScalarTypeToCppType::type>:: - value, + (std::is_same< + promote_type_with_scalar_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::Half>::type>::value || + std::is_same< + promote_type_with_scalar_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::BFloat16>::type>::value), typename ScalarTypeToCppType::type, promote_type_with_scalar_type_not_respecting_half_to_float>::type; }; diff --git a/runtime/core/exec_aten/util/genScalarTypeTable.py b/runtime/core/exec_aten/util/genScalarTypeTable.py index 07100472ae4..c2bc84c2700 100644 --- a/runtime/core/exec_aten/util/genScalarTypeTable.py +++ b/runtime/core/exec_aten/util/genScalarTypeTable.py @@ -4,20 +4,35 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
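The table edited just below gains a BFloat16 ("BF") row and column in this generator script's promotion matrix. When extending it by hand, the properties to preserve are that the row/column count matches indexToType and that promotion stays symmetric. A hypothetical sanity-check helper along those lines (not part of the generator script):

def check_promote_table(index_to_type, table):
    n = len(index_to_type)
    assert len(table) == n, "one row per dtype"
    for row in table:
        assert len(row) == n, "one column per dtype"
    for i in range(n):
        for j in range(n):
            # promoteTypes(a, b) must equal promoteTypes(b, a)
            assert table[i][j] == table[j][i], (i, j)

check_promote_table(indexToType, promoteTypesLookup)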
-indexToType = ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1"] +indexToType = [ + "U1", + "I1", + "I2", + "I4", + "I8", + "F2", + "F4", + "F8", + "C2", + "C4", + "C8", + "B1", + "BF", +] promoteTypesLookup = [ - ["U1", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "U1"], - ["I2", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I1"], - ["I2", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I2"], - ["I4", "I4", "I4", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I4"], - ["I8", "I8", "I8", "I8", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I8"], - ["F2", "F2", "F2", "F2", "F2", "F2", "F4", "F8", "C2", "C4", "C8", "F2"], - ["F4", "F4", "F4", "F4", "F4", "F4", "F4", "F8", "C4", "C4", "C8", "F4"], - ["F8", "F8", "F8", "F8", "F8", "F8", "F8", "F8", "C8", "C8", "C8", "F8"], - ["C2", "C2", "C2", "C2", "C2", "C2", "C4", "C8", "C2", "C4", "C8", "C2"], - ["C4", "C4", "C4", "C4", "C4", "C4", "C4", "C8", "C4", "C4", "C8", "C4"], - ["C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8"], - ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1"], + ["U1", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "U1", "BF"], + ["I2", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I1", "BF"], + ["I2", "I2", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I2", "BF"], + ["I4", "I4", "I4", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I4", "BF"], + ["I8", "I8", "I8", "I8", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "I8", "BF"], + ["F2", "F2", "F2", "F2", "F2", "F2", "F4", "F8", "C2", "C4", "C8", "F2", "F4"], + ["F4", "F4", "F4", "F4", "F4", "F4", "F4", "F8", "C4", "C4", "C8", "F4", "F4"], + ["F8", "F8", "F8", "F8", "F8", "F8", "F8", "F8", "C8", "C8", "C8", "F8", "F8"], + ["C2", "C2", "C2", "C2", "C2", "C2", "C4", "C8", "C2", "C4", "C8", "C2", "C4"], + ["C4", "C4", "C4", "C4", "C4", "C4", "C4", "C8", "C4", "C4", "C8", "C4", "C4"], + ["C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8", "C8"], + ["U1", "I1", "I2", "I4", "I8", "F2", "F4", "F8", "C2", "C4", "C8", "B1", "BF"], + ["BF", "BF", "BF", "BF", "BF", "F4", "F4", "F8", "C4", "C4", "C8", "BF", "BF"], ] for rowIndex, row in enumerate(promoteTypesLookup): for colIndex, col in enumerate(row): diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index c92f910431f..479767b4abb 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -21,6 +21,7 @@ #pragma once +#include #include #include #include @@ -164,8 +165,21 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) ::exec_aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) +#define ET_FORALL_FLOAT_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + SCALARTYPE1) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + SCALARTYPE2) + #define ET_FORALL_FLOATH_TYPES(_) ET_FORALL_FLOAT_TYPES_AND(Half, _) +#define ET_FORALL_FLOATHBF16_TYPES(_) \ + ET_FORALL_FLOAT_TYPES_AND2(Half, BFloat16, _) + // Here `ANOTHER_INPUT` should be another variable to be forwarded to a given // function. Not to be confused with another scalar type as in // `ET_FORALL_FLOAT_TYPES_AND`. 
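The net effect on promoteTypes(): BFloat16 combined with any integral or Bool type stays BFloat16, while BFloat16 combined with Half widens to Float, since neither 16-bit format can exactly represent the other. A few illustrative assertions (a standalone sketch using the repo-relative include path; the authoritative checks are the scalar_type_util tests later in this patch):

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

using exec_aten::ScalarType;
using executorch::runtime::promoteTypes;

void bf16_promotion_examples() {
  // BFloat16 with integral or Bool inputs promotes to BFloat16.
  ET_CHECK(
      promoteTypes(ScalarType::Int, ScalarType::BFloat16) ==
      ScalarType::BFloat16);
  ET_CHECK(
      promoteTypes(ScalarType::Bool, ScalarType::BFloat16) ==
      ScalarType::BFloat16);
  // Half and BFloat16 have no common 16-bit representation, so they meet at
  // Float.
  ET_CHECK(
      promoteTypes(ScalarType::Half, ScalarType::BFloat16) ==
      ScalarType::Float);
  // With half_to_float=true, a Half or BFloat16 result is widened to Float.
  ET_CHECK(
      promoteTypes(
          ScalarType::BFloat16,
          ScalarType::BFloat16,
          /*half_to_float=*/true) == ScalarType::Float);
}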
@@ -177,6 +191,12 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) +#define ET_FORALL_FLOATHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, ::exec_aten::BFloat16, BFloat16) + // In this context, "REAL" means integer/float C types, which is why BFloat16 // and Half are not included. #define ET_FORALL_REAL_TYPES(_) \ @@ -209,6 +229,17 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) +#define ET_FORALL_REALHBF16_TYPES_WITH2(ANOTHER_INPUT1, ANOTHER_INPUT2, _) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, uint8_t, Byte) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int8_t, Char) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int16_t, Short) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int32_t, Int) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, int64_t, Long) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, float, Float) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, double, Double) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::Half, Half) \ + _(ANOTHER_INPUT1, ANOTHER_INPUT2, exec_aten::BFloat16, BFloat16) + // For macros that take `SCALARTYPEn` parameters, those parameters should be // an unquoted/unqualified enumerator name like `Int` or `Float`. #define ET_FORALL_REAL_TYPES_AND(SCALARTYPE, _) \ @@ -223,8 +254,29 @@ ET_FORALL_SCALAR_TYPES(SPECIALIZE_CppTypeToScalarType) ::exec_aten::ScalarType::SCALARTYPE>::type, \ SCALARTYPE) +#define ET_FORALL_REAL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int32_t, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE1>::type, \ + SCALARTYPE1) \ + _(::executorch::runtime::ScalarTypeToCppType< \ + ::exec_aten::ScalarType::SCALARTYPE2>::type, \ + SCALARTYPE2) + #define ET_FORALL_REALH_TYPES(_) ET_FORALL_REAL_TYPES_AND(Half, _) +#define ET_FORALL_REALHBF16_TYPES(_) \ + ET_FORALL_REAL_TYPES_AND2(Half, BFloat16, _) + +#define ET_FORALL_REALHBBF16_TYPES(_) \ + ET_FORALL_REAL_TYPES_AND3(Bool, Half, BFloat16, _) + #define ET_FORALL_REAL_TYPES_AND_WITH(SCALARTYPE, ANOTHER_INPUT, _) \ _(ANOTHER_INPUT, uint8_t, Byte) \ _(ANOTHER_INPUT, int8_t, Char) \ @@ -381,6 +433,10 @@ inline bool isRealHBType(exec_aten::ScalarType t) { return (isRealHType(t) || t == exec_aten::ScalarType::Bool); } +inline bool isRealHBBF16Type(exec_aten::ScalarType t) { + return (isRealHBType(t) || t == exec_aten::ScalarType::BFloat16); +} + inline constexpr bool isComplexType(exec_aten::ScalarType t) { return ( t == exec_aten::ScalarType::ComplexHalf || @@ -589,6 +645,7 @@ using C4 = using C8 = typename ScalarTypeToCppType::type; using B1 = typename ScalarTypeToCppType::type; +using BF = typename ScalarTypeToCppType::type; #define TABLE_ENTRY(key1, key2, value) \ template <> \ @@ -613,6 +670,7 @@ TABLE_ENTRY(U1, C2, C2); TABLE_ENTRY(U1, C4, C4); TABLE_ENTRY(U1, C8, C8); TABLE_ENTRY(U1, B1, U1); +TABLE_ENTRY(U1, BF, BF); TABLE_ENTRY(I1, U1, I2); TABLE_ENTRY(I1, I1, I1); TABLE_ENTRY(I1, I2, I2); @@ -625,6 +683,7 @@ TABLE_ENTRY(I1, C2, C2); TABLE_ENTRY(I1, C4, C4); TABLE_ENTRY(I1, C8, C8); TABLE_ENTRY(I1, B1, I1); +TABLE_ENTRY(I1, BF, BF); TABLE_ENTRY(I2, U1, I2); 
TABLE_ENTRY(I2, I1, I2); TABLE_ENTRY(I2, I2, I2); @@ -637,6 +696,7 @@ TABLE_ENTRY(I2, C2, C2); TABLE_ENTRY(I2, C4, C4); TABLE_ENTRY(I2, C8, C8); TABLE_ENTRY(I2, B1, I2); +TABLE_ENTRY(I2, BF, BF); TABLE_ENTRY(I4, U1, I4); TABLE_ENTRY(I4, I1, I4); TABLE_ENTRY(I4, I2, I4); @@ -649,6 +709,7 @@ TABLE_ENTRY(I4, C2, C2); TABLE_ENTRY(I4, C4, C4); TABLE_ENTRY(I4, C8, C8); TABLE_ENTRY(I4, B1, I4); +TABLE_ENTRY(I4, BF, BF); TABLE_ENTRY(I8, U1, I8); TABLE_ENTRY(I8, I1, I8); TABLE_ENTRY(I8, I2, I8); @@ -661,6 +722,7 @@ TABLE_ENTRY(I8, C2, C2); TABLE_ENTRY(I8, C4, C4); TABLE_ENTRY(I8, C8, C8); TABLE_ENTRY(I8, B1, I8); +TABLE_ENTRY(I8, BF, BF); TABLE_ENTRY(F2, U1, F2); TABLE_ENTRY(F2, I1, F2); TABLE_ENTRY(F2, I2, F2); @@ -673,6 +735,7 @@ TABLE_ENTRY(F2, C2, C2); TABLE_ENTRY(F2, C4, C4); TABLE_ENTRY(F2, C8, C8); TABLE_ENTRY(F2, B1, F2); +TABLE_ENTRY(F2, BF, F4); TABLE_ENTRY(F4, U1, F4); TABLE_ENTRY(F4, I1, F4); TABLE_ENTRY(F4, I2, F4); @@ -685,6 +748,7 @@ TABLE_ENTRY(F4, C2, C4); TABLE_ENTRY(F4, C4, C4); TABLE_ENTRY(F4, C8, C8); TABLE_ENTRY(F4, B1, F4); +TABLE_ENTRY(F4, BF, F4); TABLE_ENTRY(F8, U1, F8); TABLE_ENTRY(F8, I1, F8); TABLE_ENTRY(F8, I2, F8); @@ -697,6 +761,7 @@ TABLE_ENTRY(F8, C2, C8); TABLE_ENTRY(F8, C4, C8); TABLE_ENTRY(F8, C8, C8); TABLE_ENTRY(F8, B1, F8); +TABLE_ENTRY(F8, BF, F8); TABLE_ENTRY(C2, U1, C2); TABLE_ENTRY(C2, I1, C2); TABLE_ENTRY(C2, I2, C2); @@ -709,6 +774,7 @@ TABLE_ENTRY(C2, C2, C2); TABLE_ENTRY(C2, C4, C4); TABLE_ENTRY(C2, C8, C8); TABLE_ENTRY(C2, B1, C2); +TABLE_ENTRY(C2, BF, C4); TABLE_ENTRY(C4, U1, C4); TABLE_ENTRY(C4, I1, C4); TABLE_ENTRY(C4, I2, C4); @@ -721,6 +787,7 @@ TABLE_ENTRY(C4, C2, C4); TABLE_ENTRY(C4, C4, C4); TABLE_ENTRY(C4, C8, C8); TABLE_ENTRY(C4, B1, C4); +TABLE_ENTRY(C4, BF, C4); TABLE_ENTRY(C8, U1, C8); TABLE_ENTRY(C8, I1, C8); TABLE_ENTRY(C8, I2, C8); @@ -733,6 +800,7 @@ TABLE_ENTRY(C8, C2, C8); TABLE_ENTRY(C8, C4, C8); TABLE_ENTRY(C8, C8, C8); TABLE_ENTRY(C8, B1, C8); +TABLE_ENTRY(C8, BF, C8); TABLE_ENTRY(B1, U1, U1); TABLE_ENTRY(B1, I1, I1); TABLE_ENTRY(B1, I2, I2); @@ -745,6 +813,20 @@ TABLE_ENTRY(B1, C2, C2); TABLE_ENTRY(B1, C4, C4); TABLE_ENTRY(B1, C8, C8); TABLE_ENTRY(B1, B1, B1); +TABLE_ENTRY(B1, BF, BF); +TABLE_ENTRY(BF, U1, BF); +TABLE_ENTRY(BF, I1, BF); +TABLE_ENTRY(BF, I2, BF); +TABLE_ENTRY(BF, I4, BF); +TABLE_ENTRY(BF, I8, BF); +TABLE_ENTRY(BF, F2, F4); +TABLE_ENTRY(BF, F4, F4); +TABLE_ENTRY(BF, F8, F8); +TABLE_ENTRY(BF, C2, C4); +TABLE_ENTRY(BF, C4, C4); +TABLE_ENTRY(BF, C8, C8); +TABLE_ENTRY(BF, B1, BF); +TABLE_ENTRY(BF, BF, BF); } // namespace internal @@ -760,26 +842,20 @@ struct promote_types { (!is_bits_type::value && !is_bits_type::value), "promote_types not valid for bits dtypes"); - static_assert( - !std::is_same< - T1, - typename ScalarTypeToCppType::type>:: - value && - !std::is_same< - T2, - typename ScalarTypeToCppType< - exec_aten::ScalarType::BFloat16>::type>::value, - "promote_types not valid for BFloat16"); using promoted_type_not_respecting_half_to_float = typename internal::promote_types_lookup::type; public: using type = typename std::conditional< half_to_float && - std::is_same< - promoted_type_not_respecting_half_to_float, - typename ScalarTypeToCppType::type>:: - value, + (std::is_same< + promoted_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::Half>::type>::value || + std::is_same< + promoted_type_not_respecting_half_to_float, + typename ScalarTypeToCppType< + exec_aten::ScalarType::BFloat16>::type>::value), typename ScalarTypeToCppType::type, 
promoted_type_not_respecting_half_to_float>::type; }; @@ -787,7 +863,8 @@ struct promote_types { /** * Implements type promotion rules that are consistent with ATen behaviour, * which in turn is consistent with NumPy's promote_types. - * If half_to_float is set to true, then half will be promoted to float instead + * If half_to_float is set to true, then half and bfloat16 will be promoted to + * float instead */ inline exec_aten::ScalarType promoteTypes( exec_aten::ScalarType a, @@ -806,6 +883,7 @@ inline exec_aten::ScalarType promoteTypes( constexpr auto c4 = exec_aten::ScalarType::ComplexFloat; constexpr auto c8 = exec_aten::ScalarType::ComplexDouble; constexpr auto b1 = exec_aten::ScalarType::Bool; + constexpr auto bf = exec_aten::ScalarType::BFloat16; // For QInt types, only allow exact match if (executorch::runtime::isQIntType(a) && a == b) { @@ -825,34 +903,41 @@ inline exec_aten::ScalarType promoteTypes( ET_CHECK_MSG(false, "promoteTypes not valid for bits dtypes"); } - ET_CHECK_MSG( - a != exec_aten::ScalarType::BFloat16 && - b != exec_aten::ScalarType::BFloat16, - "promoteTypes not valid for BFloat16"); // 12 types are handled by this function, see the constexpr definitions above - const int NUM_PROMOTE_TYPES = 12; - + const int NUM_PROMOTE_TYPES = 13; + + static constexpr std::array + dtype2index = {{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, + }}; + auto ix_a = dtype2index[(int)a]; + ET_CHECK(ix_a != -1); + auto ix_b = dtype2index[(int)b]; + ET_CHECK(ix_b != -1); static constexpr exec_aten::ScalarType _promoteTypesLookup[NUM_PROMOTE_TYPES][NUM_PROMOTE_TYPES] = { - /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 */ - /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1}, - /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1}, - /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2}, - /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4}, - /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8}, - /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2}, - /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4}, - /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8}, - /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2}, - /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4}, - /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8}, - /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1}, + /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 bf*/ + /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, bf}, + /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, bf}, + /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, bf}, + /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, bf}, + /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, bf}, + /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, f4}, + /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, f4}, + /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, f8}, + /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, c4}, + /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, c4}, + /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8}, + /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, bf}, + /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, bf}, }; - exec_aten::ScalarType promoted_type = - _promoteTypesLookup[static_cast(a)][static_cast(b)]; + exec_aten::ScalarType promoted_type = _promoteTypesLookup[ix_a][ix_b]; - if (half_to_float && promoted_type == exec_aten::ScalarType::Half) 
{ + if (half_to_float && + (promoted_type == exec_aten::ScalarType::Half || + promoted_type == exec_aten::ScalarType::BFloat16)) { promoted_type = exec_aten::ScalarType::Float; } @@ -974,6 +1059,13 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::Byte, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1001,6 +1093,13 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__) +#define ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND( \ + ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) + #define ET_INTERNAL_SWITCH_CASE_QINT_TYPES(CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::QInt8, CTYPE_ALIAS, __VA_ARGS__) \ @@ -1112,6 +1211,22 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)) +#define ET_SWITCH_REAL_TYPES_AND3( \ + ADDITIONAL1, \ + ADDITIONAL2, \ + ADDITIONAL3, \ + TYPE, \ + CONTEXT, \ + NAME, \ + CTYPE_ALIAS, \ + ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__)) + #define ET_SWITCH_REALH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_SWITCH_REAL_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) @@ -1122,6 +1237,10 @@ inline exec_aten::ScalarType promoteTypes( ET_SWITCH_REAL_TYPES_AND2( \ Half, Bool, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) +#define ET_SWITCH_REALHBBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_SWITCH_REAL_TYPES_AND3( \ + Half, Bool, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) + #define ET_SWITCH_INT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH( \ TYPE, \ @@ -1154,9 +1273,22 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND( \ ADDITIONAL, CTYPE_ALIAS, __VA_ARGS__)) +#define ET_SWITCH_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_FLOAT_TYPES_AND2( \ + ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__)) + #define ET_SWITCH_FLOATH_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_SWITCH_FLOAT_TYPES_AND(Half, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) +#define ET_SWITCH_FLOATHBF16_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_SWITCH_FLOAT_TYPES_AND2( \ + Half, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) + #define ET_SWITCH_QINT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) 
\ ET_INTERNAL_SWITCH( \ TYPE, \ diff --git a/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp b/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp index b91c7009f45..9df01b7be9f 100644 --- a/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp +++ b/runtime/core/exec_aten/util/test/scalar_type_util_test.cpp @@ -139,37 +139,38 @@ TEST(ScalarTypeUtilTest, promoteTypesTest) { // Check some common cases - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Double) == - ScalarType::Double); - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Short) == ScalarType::Float); - - ET_CHECK( - promoteTypes(ScalarType::Float, ScalarType::Int) == ScalarType::Float); - ET_CHECK( - promoteTypes(ScalarType::Long, ScalarType::Float) == ScalarType::Float); - - ET_CHECK( - promoteTypes(ScalarType::Bool, ScalarType::Bool) == ScalarType::Bool); - - ET_CHECK(promoteTypes(ScalarType::Byte, ScalarType::Int) == ScalarType::Int); - ET_CHECK( - promoteTypes(ScalarType::Char, ScalarType::Bool) == ScalarType::Char); - ET_CHECK(promoteTypes(ScalarType::Bool, ScalarType::Int) == ScalarType::Int); + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Double), ScalarType::Double); + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Short), ScalarType::Float); + + EXPECT_EQ( + promoteTypes(ScalarType::Float, ScalarType::Int), ScalarType::Float); + EXPECT_EQ( + promoteTypes(ScalarType::Long, ScalarType::Float), ScalarType::Float); + + EXPECT_EQ(promoteTypes(ScalarType::Bool, ScalarType::Bool), ScalarType::Bool); + + EXPECT_EQ(promoteTypes(ScalarType::Byte, ScalarType::Int), ScalarType::Int); + EXPECT_EQ(promoteTypes(ScalarType::Char, ScalarType::Bool), ScalarType::Char); + EXPECT_EQ(promoteTypes(ScalarType::Bool, ScalarType::Int), ScalarType::Int); + + EXPECT_EQ( + promoteTypes(ScalarType::BFloat16, ScalarType::Half), ScalarType::Float); + EXPECT_EQ( + promoteTypes(ScalarType::BFloat16, ScalarType::Bool), + ScalarType::BFloat16); } template struct promote_types_is_valid : std::integral_constant< bool, - !std::is_same::value && - !std::is_same::value && - (std::is_same::value || - (!executorch::runtime::is_qint_type::value && - !executorch::runtime::is_qint_type::value && - !executorch::runtime::is_bits_type::value && - !executorch::runtime::is_bits_type::value))> {}; + (std::is_same::value || + (!executorch::runtime::is_qint_type::value && + !executorch::runtime::is_qint_type::value && + !executorch::runtime::is_bits_type::value && + !executorch::runtime::is_bits_type::value))> {}; template struct CompileTimePromoteTypesTestCase { @@ -195,7 +196,8 @@ struct CompileTimePromoteTypesTestCase { auto expected = executorch::runtime::promoteTypes( scalarType1, scalarType2, half_to_float); EXPECT_EQ(actual, expected) - << "promoting " << (int)scalarType1 << " to " << (int)scalarType2; + << "promoting " << (int)scalarType1 << " to " << (int)scalarType2 + << " (half to float: " << half_to_float << ')'; } template < diff --git a/runtime/core/portable_type/bfloat16.h b/runtime/core/portable_type/bfloat16.h index a1ceb0c56a7..e665e6152e3 100644 --- a/runtime/core/portable_type/bfloat16.h +++ b/runtime/core/portable_type/bfloat16.h @@ -8,11 +8,41 @@ #pragma once +#include #include +#include +#include +#include namespace torch { namespace executor { +namespace internal { +inline float f32_from_bits(uint16_t src) { + float res = 0; + uint32_t tmp = src; + tmp <<= 16; + std::memcpy(&res, &tmp, sizeof(tmp)); + return res; +} + +inline uint16_t bits_from_f32(float src) { + uint32_t res = 0; + 
std::memcpy(&res, &src, sizeof(res)); + return res >> 16; +} + +inline uint16_t round_to_nearest_even(float src) { + if (std::isnan(src)) { + return UINT16_C(0x7FC0); + } + uint32_t U32 = 0; + std::memcpy(&U32, &src, sizeof(U32)); + uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + return static_cast((U32 + rounding_bias) >> 16); +} +} // namespace internal + /** * The "brain floating-point" type, compatible with c10/util/BFloat16.h from * pytorch core. @@ -22,7 +52,288 @@ namespace executor { */ struct alignas(2) BFloat16 { uint16_t x; + + BFloat16() = default; + struct from_bits_t {}; + static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + constexpr BFloat16(unsigned short bits, from_bits_t) : x(bits) {} + /* implicit */ BFloat16(float value) + : x(internal::round_to_nearest_even(value)) {} + operator float() const { + return internal::f32_from_bits(x); + } }; +inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) { + out << (float)value; + return out; +} + +/// Arithmetic + +inline BFloat16 operator+(const BFloat16& a, const BFloat16& b) { + return static_cast(a) + static_cast(b); +} + +inline BFloat16 operator-(const BFloat16& a, const BFloat16& b) { + return static_cast(a) - static_cast(b); +} + +inline BFloat16 operator*(const BFloat16& a, const BFloat16& b) { + return static_cast(a) * static_cast(b); +} + +inline BFloat16 operator/(const BFloat16& a, const BFloat16& b) { + return static_cast(a) / static_cast(b); +} + +inline BFloat16 operator-(const BFloat16& a) { + return -static_cast(a); +} + +inline BFloat16& operator+=(BFloat16& a, const BFloat16& b) { + a = a + b; + return a; +} + +inline BFloat16& operator-=(BFloat16& a, const BFloat16& b) { + a = a - b; + return a; +} + +inline BFloat16& operator*=(BFloat16& a, const BFloat16& b) { + a = a * b; + return a; +} + +inline BFloat16& operator/=(BFloat16& a, const BFloat16& b) { + a = a / b; + return a; +} + +inline BFloat16& operator|(BFloat16& a, const BFloat16& b) { + a.x = a.x | b.x; + return a; +} + +inline BFloat16& operator^(BFloat16& a, const BFloat16& b) { + a.x = a.x ^ b.x; + return a; +} + +inline BFloat16& operator&(BFloat16& a, const BFloat16& b) { + a.x = a.x & b.x; + return a; +} + +/// Arithmetic with floats + +inline float operator+(BFloat16 a, float b) { + return static_cast(a) + b; +} +inline float operator-(BFloat16 a, float b) { + return static_cast(a) - b; +} +inline float operator*(BFloat16 a, float b) { + return static_cast(a) * b; +} +inline float operator/(BFloat16 a, float b) { + return static_cast(a) / b; +} + +inline float operator+(float a, BFloat16 b) { + return a + static_cast(b); +} +inline float operator-(float a, BFloat16 b) { + return a - static_cast(b); +} +inline float operator*(float a, BFloat16 b) { + return a * static_cast(b); +} +inline float operator/(float a, BFloat16 b) { + return a / static_cast(b); +} + +inline float& operator+=(float& a, const BFloat16& b) { + return a += static_cast(b); +} +inline float& operator-=(float& a, const BFloat16& b) { + return a -= static_cast(b); +} +inline float& operator*=(float& a, const BFloat16& b) { + return a *= static_cast(b); +} +inline float& operator/=(float& a, const BFloat16& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline double operator+(BFloat16 a, double b) { + return static_cast(a) + b; +} +inline double operator-(BFloat16 a, double b) { + return static_cast(a) - b; +} +inline double operator*(BFloat16 a, double b) { + return static_cast(a) * b; +} 
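Conversion semantics in practice: constructing a BFloat16 from float rounds to nearest-even on the upper 16 bits of the IEEE-754 encoding (NaN maps to a quiet NaN), and converting back zero-extends the discarded mantissa bits. A small round-trip sketch, assuming the header is reachable at its repo-relative path; the 1/128 bound is the one the new unit tests assert for 7 stored mantissa bits:

#include <cassert>
#include <cmath>

#include <executorch/runtime/core/portable_type/bfloat16.h>  // assumed path

int main() {
  using torch::executor::BFloat16;

  // Implicit construction rounds the float to the nearest representable
  // BFloat16 value (round-to-nearest-even on the top 16 bits).
  BFloat16 b = 3.1415926f;

  // operator float() widens back by shifting the stored bits into the high
  // half of a float's representation.
  float back = static_cast<float>(b);

  // With 7 stored mantissa bits the relative round-trip error stays below
  // 1/128 for normal values.
  assert(std::fabs(back - 3.1415926f) / 3.1415926f < 1.0f / 128);

  // Mixed arithmetic with float or double promotes to the wider type, per
  // the operator overloads defined in this header.
  float sum = b + 1.0f;
  (void)sum;
  return 0;
}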
+inline double operator/(BFloat16 a, double b) { + return static_cast(a) / b; +} + +inline double operator+(double a, BFloat16 b) { + return a + static_cast(b); +} +inline double operator-(double a, BFloat16 b) { + return a - static_cast(b); +} +inline double operator*(double a, BFloat16 b) { + return a * static_cast(b); +} +inline double operator/(double a, BFloat16 b) { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline BFloat16 operator+(BFloat16 a, int b) { + return a + static_cast(b); +} +inline BFloat16 operator-(BFloat16 a, int b) { + return a - static_cast(b); +} +inline BFloat16 operator*(BFloat16 a, int b) { + return a * static_cast(b); +} +inline BFloat16 operator/(BFloat16 a, int b) { + return a / static_cast(b); +} + +inline BFloat16 operator+(int a, BFloat16 b) { + return static_cast(a) + b; +} +inline BFloat16 operator-(int a, BFloat16 b) { + return static_cast(a) - b; +} +inline BFloat16 operator*(int a, BFloat16 b) { + return static_cast(a) * b; +} +inline BFloat16 operator/(int a, BFloat16 b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline BFloat16 operator+(BFloat16 a, int64_t b) { + return a + static_cast(b); +} +inline BFloat16 operator-(BFloat16 a, int64_t b) { + return a - static_cast(b); +} +inline BFloat16 operator*(BFloat16 a, int64_t b) { + return a * static_cast(b); +} +inline BFloat16 operator/(BFloat16 a, int64_t b) { + return a / static_cast(b); +} + +inline BFloat16 operator+(int64_t a, BFloat16 b) { + return static_cast(a) + b; +} +inline BFloat16 operator-(int64_t a, BFloat16 b) { + return static_cast(a) - b; +} +inline BFloat16 operator*(int64_t a, BFloat16 b) { + return static_cast(a) * b; +} +inline BFloat16 operator/(int64_t a, BFloat16 b) { + return static_cast(a) / b; +} + +// Overloading < and > operators, because std::max and std::min use them. 
+ +inline bool operator>(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) > float(rhs); +} + +inline bool operator<(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) < float(rhs); +} + } // namespace executor } // namespace torch + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_signed = true; + static constexpr bool is_specialized = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int max_digits10 = 4; + static constexpr int radix = 2; + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + + static constexpr torch::executor::BFloat16 min() { + return torch::executor::BFloat16( + 0x0080, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 lowest() { + return torch::executor::BFloat16( + 0xFF7F, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 max() { + return torch::executor::BFloat16( + 0x7F7F, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 epsilon() { + return torch::executor::BFloat16( + 0x3C00, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 round_error() { + return torch::executor::BFloat16( + 0x3F00, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 infinity() { + return torch::executor::BFloat16( + 0x7F80, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 quiet_NaN() { + return torch::executor::BFloat16( + 0x7FC0, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 signaling_NaN() { + return torch::executor::BFloat16( + 0x7F80, torch::executor::BFloat16::from_bits()); + } + static constexpr torch::executor::BFloat16 denorm_min() { + return torch::executor::BFloat16( + 0x0001, torch::executor::BFloat16::from_bits()); + } +}; + +} // namespace std diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt index 21eb4feae0f..58a69f656eb 100644 --- a/runtime/core/portable_type/test/CMakeLists.txt +++ b/runtime/core/portable_type/test/CMakeLists.txt @@ -24,7 +24,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) 
include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs optional_test.cpp tensor_test.cpp half_test.cpp scalar_test.cpp - tensor_impl_test.cpp + tensor_impl_test.cpp bfloat16_test.cpp ) et_cxx_test(runtime_core_portable_type_test SOURCES ${_test_srcs} EXTRA_LIBS) diff --git a/runtime/core/portable_type/test/bfloat16_test.cpp b/runtime/core/portable_type/test/bfloat16_test.cpp new file mode 100644 index 00000000000..9ea53e6cba2 --- /dev/null +++ b/runtime/core/portable_type/test/bfloat16_test.cpp @@ -0,0 +1,191 @@ +#include + +#include + +using torch::executor::BFloat16; + +namespace { +float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + uint32_t bytes; + bytes = 0; + bytes |= sign; + bytes <<= 8; + bytes |= exponent; + bytes <<= 23; + bytes |= fraction; + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + float res; + std::memcpy(&res, &bytes, sizeof(res)); + return res; +} + +TEST(BFloat16Conversion, FloatToBFloat16AndBack) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float in[100]; + for (int i = 0; i < 100; ++i) { + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) + in[i] = i + 1.25; + } + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + BFloat16 bfloats[100]; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float out[100]; + + for (int i = 0; i < 100; ++i) { + bfloats[i].x = torch::executor::internal::bits_from_f32(in[i]); + out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + + // The relative error should be less than 1/(2^7) since BFloat16 + // has 7 bits mantissa. + EXPECT_LE(std::fabs(out[i] - in[i]) / in[i], 1.0 / 128); + } +} + +TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float in[100]; + for (int i = 0; i < 100; ++i) { + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) + in[i] = i + 1.25; + } + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + BFloat16 bfloats[100]; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) + float out[100]; + + for (int i = 0; i < 100; ++i) { + bfloats[i].x = torch::executor::internal::round_to_nearest_even(in[i]); + out[i] = torch::executor::internal::f32_from_bits(bfloats[i].x); + + // The relative error should be less than 1/(2^7) since BFloat16 + // has 7 bits mantissa. 
+ EXPECT_LE(std::fabs(out[i] - in[i]) / in[i], 1.0 / 128); + } +} + +TEST(BFloat16Conversion, NaN) { + float inNaN = float_from_bytes(0, 0xFF, 0x7FFFFF); + EXPECT_TRUE(std::isnan(inNaN)); + + BFloat16 a = BFloat16(inNaN); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_TRUE(std::isnan(out)); +} + +TEST(BFloat16Conversion, Inf) { + float inInf = float_from_bytes(0, 0xFF, 0); + EXPECT_TRUE(std::isinf(inInf)); + + BFloat16 a = BFloat16(inInf); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_TRUE(std::isinf(out)); +} + +TEST(BFloat16Conversion, SmallestDenormal) { + float in = std::numeric_limits::denorm_min(); // The smallest non-zero + // subnormal number + BFloat16 a = BFloat16(in); + float out = torch::executor::internal::f32_from_bits(a.x); + + EXPECT_FLOAT_EQ(in, out); +} + +TEST(BFloat16Math, Addition) { + // This test verifies that if only first 7 bits of float's mantissa are + // changed after addition, we should have no loss in precision. + + // input bits + // S | Exponent | Mantissa + // 0 | 10000000 | 10010000000000000000000 = 3.125 + float input = float_from_bytes(0, 0, 0x40480000); + + // expected bits + // S | Exponent | Mantissa + // 0 | 10000001 | 10010000000000000000000 = 6.25 + float expected = float_from_bytes(0, 0, 0x40c80000); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + BFloat16 b; + b.x = torch::executor::internal::bits_from_f32(input); + b = b + b; + + float res = torch::executor::internal::f32_from_bits(b.x); + EXPECT_EQ(res, expected); +} + +TEST(BFloat16Math, Subtraction) { + // This test verifies that if only first 7 bits of float's mantissa are + // changed after subtraction, we should have no loss in precision. + + // input bits + // S | Exponent | Mantissa + // 0 | 10000001 | 11101000000000000000000 = 7.625 + float input = float_from_bytes(0, 0, 0x40f40000); + + // expected bits + // S | Exponent | Mantissa + // 0 | 10000000 | 01010000000000000000000 = 2.625 + float expected = float_from_bytes(0, 0, 0x40280000); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + BFloat16 b; + b.x = torch::executor::internal::bits_from_f32(input); + b = b - 5; + + float res = torch::executor::internal::f32_from_bits(b.x); + EXPECT_EQ(res, expected); +} + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +TEST(BFloat16Math, NextAfterZero) { + const BFloat16 zero{0}; + + auto check_nextafter = [](BFloat16 from, BFloat16 to, BFloat16 expected) { + BFloat16 actual = std::nextafter(from, to); + // Check for bitwise equality! 
+ ASSERT_EQ(actual.x ^ expected.x, uint16_t{0}); + }; + check_nextafter(zero, zero, /*expected=*/zero); + check_nextafter(zero, -zero, /*expected=*/-zero); + check_nextafter(-zero, zero, /*expected=*/zero); + check_nextafter(-zero, -zero, /*expected=*/-zero); +} + +float BinaryToFloat(uint32_t bytes) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + float res; + std::memcpy(&res, &bytes, sizeof(res)); + return res; +} + +struct BFloat16TestParam { + uint32_t input; + uint16_t rne; +}; + +class BFloat16Test : public ::testing::Test, + public ::testing::WithParamInterface {}; + +TEST_P(BFloat16Test, BFloat16RNETest) { + float value = BinaryToFloat(GetParam().input); + uint16_t rounded = torch::executor::internal::round_to_nearest_even(value); + EXPECT_EQ(GetParam().rne, rounded); +} + +INSTANTIATE_TEST_SUITE_P( + BFloat16TestInstantiation, + BFloat16Test, + ::testing::Values( + BFloat16TestParam{0x3F848000, 0x3F84}, + BFloat16TestParam{0x3F848010, 0x3F85}, + BFloat16TestParam{0x3F850000, 0x3F85}, + BFloat16TestParam{0x3F858000, 0x3F86}, + BFloat16TestParam{0x3FFF8000, 0x4000})); + +} // namespace diff --git a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl index af55f95e45e..c0b4ef00c78 100644 --- a/runtime/core/portable_type/test/targets.bzl +++ b/runtime/core/portable_type/test/targets.bzl @@ -6,6 +6,14 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ + runtime.cxx_test( + name = "bfloat16_test", + srcs = ["bfloat16_test.cpp"], + deps = [ + "//executorch/runtime/core/portable_type:portable_type", + ], + ) + runtime.cxx_test( name = "optional_test", srcs = ["optional_test.cpp"], From c9ac212b12a1b7400b6e67de7a4180e065fb3197 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 5 Sep 2024 14:44:21 -0700 Subject: [PATCH 211/531] [ExecuTorch] support BF16 in op_to_copy Differential Revision: D61981356 Pull Request resolved: https://github.com/pytorch/executorch/pull/4976 --- kernels/portable/cpu/op_to_copy.cpp | 9 +- kernels/test/op_to_copy_test.cpp | 27 +- runtime/core/exec_aten/exec_aten.h | 2 + .../exec_aten/testing_util/tensor_util.cpp | 20 +- runtime/core/portable_type/bfloat16_math.h | 290 ++++++++++++++++++ runtime/core/portable_type/targets.bzl | 1 + 6 files changed, 334 insertions(+), 15 deletions(-) create mode 100644 runtime/core/portable_type/bfloat16_math.h diff --git a/kernels/portable/cpu/op_to_copy.cpp b/kernels/portable/cpu/op_to_copy.cpp index 7ecd4f3b5e1..c0c04e65e93 100644 --- a/kernels/portable/cpu/op_to_copy.cpp +++ b/kernels/portable/cpu/op_to_copy.cpp @@ -46,10 +46,11 @@ Tensor& to_copy_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] { - ET_SWITCH_REALHB_TYPES(out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] { - _to_impl(self, out); - }); + ET_SWITCH_REALHBBF16_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES( + out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] { + _to_impl(self, out); + }); }); return out; diff --git a/kernels/test/op_to_copy_test.cpp b/kernels/test/op_to_copy_test.cpp index 1cc892dedbe..0a6529e736d 100644 --- a/kernels/test/op_to_copy_test.cpp +++ b/kernels/test/op_to_copy_test.cpp @@ -36,7 +36,9 @@ typedef std::map< std::type_index, std::variant< std::vector, - std::vector>> + std::vector, + std::vector, + std::vector>> FloatingTypeToDataMap; typedef std::map< @@ -309,9 +311,9 @@ TEST_F(OpToTest, 
AllDtypesSupported) { ScalarType::OUTPUT_DTYPE>(test_cases); #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ - ET_FORALL_REAL_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + ET_FORALL_REALHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_REAL_TYPES(TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY #undef TEST_KERNEL @@ -323,14 +325,14 @@ TEST_F(OpToTest, BoolTests) { #define TEST_TO_BOOL(INPUT_CTYPE, INPUT_DTYPE) \ test_runner_to_bool( \ test_case_to_bool, result_to_bool); - ET_FORALL_REAL_TYPES(TEST_TO_BOOL); + ET_FORALL_REALHBF16_TYPES(TEST_TO_BOOL); std::vector test_case_from_bool = {true, true, false}; std::vector result_from_bool = {1.0, 1.0, 0}; #define TEST_FROM_BOOL(OUTPUT_CTYPE, OUTPUT_DTYPE) \ test_runner_from_bool( \ test_case_from_bool, result_from_bool); - ET_FORALL_REAL_TYPES(TEST_FROM_BOOL); + ET_FORALL_REALHBF16_TYPES(TEST_FROM_BOOL); } TEST_F(OpToTest, NanInfSupported) { @@ -349,9 +351,9 @@ TEST_F(OpToTest, NanInfSupported) { ScalarType::OUTPUT_DTYPE>(test_cases); #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ - ET_FORALL_FLOAT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + ET_FORALL_FLOATHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY #undef TEST_KERNEL @@ -381,6 +383,13 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { -0.30919688936285893988}; // clang-format on + std::vector half_data; + std::vector bf16_data; + for (auto d : double_data) { + half_data.emplace_back(d); + bf16_data.emplace_back(d); + } + std::vector int64_data = { -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; std::vector int32_data = { @@ -394,6 +403,8 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { FloatingTypeToDataMap floating_point_data; floating_point_data[typeid(float)] = float_data; floating_point_data[typeid(double)] = double_data; + floating_point_data[typeid(exec_aten::Half)] = half_data; + floating_point_data[typeid(exec_aten::BFloat16)] = bf16_data; // Gathering all int data together for better traversial IntTypeToDataMap int_data; @@ -412,7 +423,7 @@ TEST_F(OpToTest, HardcodeFloatConvertInt) { #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ ET_FORALL_INT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); } TEST_F(OpToTest, MismatchedSizesDie) { diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 919b5420b3a..808d31502a9 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -17,6 +17,7 @@ #include // @manual #include // @manual #include // @manual +#include // @manual #include // @manual #include // @manual #include // @manual @@ -31,6 +32,7 @@ #else // use executor #include // @manual #include // @manual +#include // @manual #include // @manual #include // @manual #include // @manual diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 03dffd208f0..1fd751dc882 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -16,6 +16,8 @@ #include #include +using exec_aten::BFloat16; +using exec_aten::Half; using exec_aten::ScalarType; using exec_aten::Tensor; @@ -32,9 +34,7 @@ namespace { * T must be a floating point type. Non-floating point data should be compared * directly. 
*/ -template < - typename T, - typename = std::enable_if_t::value>> +template bool data_is_close( const T* a, const T* b, @@ -119,6 +119,20 @@ bool tensors_are_close( a.numel(), rtol, atol); + } else if (a.scalar_type() == ScalarType::Half) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); + } else if (a.scalar_type() == ScalarType::BFloat16) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); } else { // Non-floating-point types can be compared bitwise. return memcmp(a.const_data_ptr(), b.const_data_ptr(), a.nbytes()) == 0; diff --git a/runtime/core/portable_type/bfloat16_math.h b/runtime/core/portable_type/bfloat16_math.h new file mode 100644 index 00000000000..68ee77cf340 --- /dev/null +++ b/runtime/core/portable_type/bfloat16_math.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace std { + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same::value || + std::is_same::value> {}; + +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T acos(T a) { + return std::acos(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T asin(T a) { + return std::asin(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T atan(T a) { + return std::atan(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T atanh(T a) { + return std::atanh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T erf(T a) { + return std::erf(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T erfc(T a) { + return std::erfc(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T exp(T a) { + return std::exp(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T expm1(T a) { + return std::expm1(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline bool isfinite(T a) { + return std::isfinite(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log(T a) { + return std::log(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log10(T a) { + return std::log10(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log1p(T a) { + return std::log1p(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T log2(T a) { + return std::log2(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T ceil(T a) { + return std::ceil(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T cos(T a) { + return std::cos(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T floor(T a) { + return std::floor(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T nearbyint(T a) { + return std::nearbyint(float(a)); +} +template < + 
typename T, + typename std::enable_if::value, int>::type = 0> +inline T sin(T a) { + return std::sin(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T tan(T a) { + return std::tan(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sinh(T a) { + return std::sinh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T cosh(T a) { + return std::cosh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T tanh(T a) { + return std::tanh(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T trunc(T a) { + return std::trunc(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T lgamma(T a) { + return std::lgamma(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T sqrt(T a) { + return std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T rsqrt(T a) { + return 1.0 / std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T abs(T a) { + return std::abs(float(a)); +} +#if defined(_MSC_VER) && defined(__CUDACC__) +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, double b) { + return std::pow(float(a), float(b)); +} +#else +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, double b) { + return std::pow(float(a), b); +} +#endif +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T pow(T a, T b) { + return std::pow(float(a), float(b)); +} +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T fmod(T a, T b) { + return std::fmod(float(a), float(b)); +} + +/* + The following function is inspired from the implementation in `musl` + Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT + ---------------------------------------------------------------------- + Copyright © 2005-2020 Rich Felker, et al. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ---------------------------------------------------------------------- + */ +template < + typename T, + typename std::enable_if::value, int>::type = 0> +inline T nextafter(T from, T to) { + // Reference: + // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c + using int_repr_t = uint16_t; + constexpr uint8_t bits = 16; + union { + T f; + int_repr_t i; + } ufrom = {from}, uto = {to}; + + // get a mask to get the sign bit i.e. MSB + int_repr_t sign_mask = int_repr_t{1} << (bits - 1); + + // short-circuit: if either is NaN, return NaN + if (from != from || to != to) { + return from + to; + } + + // short-circuit: if they are exactly the same. + if (ufrom.i == uto.i) { + return from; + } + + // mask the sign-bit to zero i.e. positive + // equivalent to abs(x) + int_repr_t abs_from = ufrom.i & ~sign_mask; + int_repr_t abs_to = uto.i & ~sign_mask; + if (abs_from == 0) { + // if both are zero but with different sign, + // preserve the sign of `to`. + if (abs_to == 0) { + return to; + } + // smallest subnormal with sign of `to`. + ufrom.i = (uto.i & sign_mask) | int_repr_t{1}; + return ufrom.f; + } + + // if abs(from) > abs(to) or sign(from) != sign(to) + if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) { + ufrom.i--; + } else { + ufrom.i++; + } + + return ufrom.f; +} + +} // namespace std diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 0d65ef36b85..b8ccbe602ed 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -43,6 +43,7 @@ def define_common_targets(): name = "scalar_type", exported_headers = [ "bfloat16.h", + "bfloat16_math.h", "complex.h", "half.h", "scalar_type.h", From 10288a2043e936b54370accabf462d6e67f4e727 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 5 Sep 2024 14:44:26 -0700 Subject: [PATCH 212/531] [ExecuTorch] support BF16 in op_mul Differential Revision: D61981355 Pull Request resolved: https://github.com/pytorch/executorch/pull/4977 --- kernels/optimized/cpu/binary_ops.h | 3 +- kernels/optimized/cpu/op_mul.cpp | 17 +- kernels/portable/cpu/op_mul.cpp | 18 +- kernels/test/op_mul_test.cpp | 158 +++++++++++------- .../exec_aten/testing_util/tensor_util.cpp | 2 +- runtime/core/exec_aten/util/tensor_util.h | 9 + 6 files changed, 130 insertions(+), 77 deletions(-) diff --git a/kernels/optimized/cpu/binary_ops.h b/kernels/optimized/cpu/binary_ops.h index 01f3eed401e..6d941509f72 100644 --- a/kernels/optimized/cpu/binary_ops.h +++ b/kernels/optimized/cpu/binary_ops.h @@ -75,7 +75,8 @@ ElementwiseOptimizedPath inline select_optimized_path( ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half) { + if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half || + a_type == ScalarType::BFloat16) { return ElementwiseOptimizedPath::kNone; } if (a.sizes().equals(b.sizes()) || diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 3b93870a610..31b0f7754fb 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -80,7 +80,8 @@ Tensor& opt_mul_out( ScalarType out_type = out.scalar_type(); if (b.numel() == 1) { - if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half && + a_type != ScalarType::BFloat16) { auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( ctx, @@ -170,12 +171,12 @@ Tensor& 
opt_mul_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { apply_binary_elementwise_fn( [](const CTYPE_A val_a, const CTYPE_B val_b) { CTYPE_IN a_casted = static_cast(val_a); @@ -210,7 +211,7 @@ Tensor& opt_mul_scalar_out( ET_CHECK(common_type == out_type); - if (common_type == ScalarType::Half) { + if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) { common_type = ScalarType::Float; } @@ -219,7 +220,7 @@ Tensor& opt_mul_scalar_out( ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor."); if (a_type == common_type && a_type == out_type && - a_type != ScalarType::Half) { + a_type != ScalarType::Half && a_type != ScalarType::BFloat16) { ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { CTYPE_B b_val; @@ -235,11 +236,11 @@ Tensor& opt_mul_scalar_out( }); }); } else { - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { ET_SWITCH_REALB_TYPES( common_type, ctx, "mul.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out_type, ctx, "mul.Scalar_out", CTYPE_OUT, [&]() { CTYPE_B b_val; ET_EXTRACT_SCALAR(b, b_val); diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 1d29b8bfe8a..8fc4f9d4593 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -70,7 +70,11 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhbbf16_type(out), + InvalidArgument, + out); ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); @@ -82,12 +86,12 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { MulInner< can_cast::value, CTYPE_A, @@ -129,15 +133,15 @@ Tensor& mul_scalar_out( ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); - if (common_type == ScalarType::Half) { + if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) { common_type = ScalarType::Float; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { + 
ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "mul.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "mul.Scalar_out", CTYPE_B, [&]() { ET_SWITCH_REALB_TYPES( common_type, ctx, "mul.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out_type, ctx, "mul.Scalar_out", CTYPE_OUT, [&]() { CTYPE_B b_val; utils::extract_scalar(b, &b_val); diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 32b69352ef1..41a8656f967 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -72,7 +72,7 @@ class OpMulOutTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_mul_enumerate_out_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -89,29 +89,99 @@ class OpMulOutTest : public OperatorTest { // Multiply two tensors op_mul_out( - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes), out); - EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8})); + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), tf.ones(sizes), out); + EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875})); op_mul_out( tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.zeros(sizes), out); EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.0, 0.0, 0.0, 0.0})); op_mul_out( - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), + tf.make(sizes, /*data=*/{1.25, 2.5, 4.75, 8.875}), out); EXPECT_TENSOR_CLOSE( - out, tf.make(sizes, /*data=*/{1.21, 4.84, 19.36, 77.44})); + out, tf.make(sizes, /*data=*/{1.5625, 6.25, 22.5625, 78.765625})); } void test_mul_enumerate_a_types() { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_mul_enumerate_b_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } + + template + void test_optimized_path_ignores_leading_1_dimensions() { + TensorFactory tf; + + const std::vector sizes1 = {1, 1, 2, 2}; + const std::vector sizes2 = {1, 2, 2}; + + // Destination for the mul. + Tensor out = tf.zeros(sizes1); + + // Multiply two tensors + op_mul_out( + tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes2), out); + EXPECT_TENSOR_CLOSE(out, tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8})); + } + + template + void test_broadcast_a2b() { + TensorFactory tf_a; + + std::vector> b_sizeses = { + {2}, + {1, 2}, + }; + for (const auto& b_sizes : b_sizeses) { + // a and b of different shapes + Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); + Tensor b = tf_a.make(b_sizes, /*data=*/{2, 2}); + + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE( + op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + } + } + + template + void test_broadcast_b2a() { + TensorFactory tf_a; + // a and b of different shapes + Tensor a = tf_a.make({2}, /*data=*/{2, 2}); + Tensor b = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); + + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2}); + + // Check that it matches the expected output. 
+ EXPECT_TENSOR_CLOSE( + op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + } + + template + void test_scalar_input_broadcast() { + TensorFactory tf_a; + + // a is a 1d tensor and b is a scalar + Tensor a = tf_a.make({2}, /*data=*/{2, 2}); + Tensor b = tf_a.make({}, /*data=*/{2}); + + // Destination for output of mul. + Tensor out = tf_a.make({2}, /*data=*/{2, 2}); + Tensor expected = tf_a.make({2}, /*data=*/{4, 4}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); + EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); + } }; class OpMulScalarOutTest : public OperatorTest { @@ -141,6 +211,14 @@ TEST_F(OpMulOutTest, DoubleTensors) { test_floating_point_mul_out(); } +TEST_F(OpMulOutTest, HalfTensors) { + test_floating_point_mul_out(); +} + +TEST_F(OpMulOutTest, BFloat16Tensors) { + test_floating_point_mul_out(); +} + TEST_F(OpMulOutTest, BoolTensors) { TensorFactory tf; @@ -166,18 +244,12 @@ TEST_F(OpMulOutTest, BoolTensors) { } TEST_F(OpMulOutTest, OptimizedPathIgnoresLeading1Dimensions) { - TensorFactory tf; +#define ENUMERATE_TEST_ENTRY(ctype, dtype) \ + test_optimized_path_ignores_leading_1_dimensions(); - const std::vector sizes1 = {1, 1, 2, 2}; - const std::vector sizes2 = {1, 2, 2}; + ET_FORALL_FLOATHBF16_TYPES(ENUMERATE_TEST_ENTRY); - // Destination for the mul. - Tensor out = tf.zeros(sizes1); - - // Multiply two tensors - op_mul_out( - tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8}), tf.ones(sizes2), out); - EXPECT_TENSOR_CLOSE(out, tf.make(sizes1, /*data=*/{1.1, 2.2, 4.4, 8.8})); +#undef ENUMERATE_TEST_ENTRY } // Mismatched shape tests. @@ -202,40 +274,16 @@ TEST_F(OpMulOutTest, MismatchedNonBroadcastableInputShapesDies) { // Broadcast tensor b's size to tensor a's size TEST_F(OpMulOutTest, BroadcastA2BTest) { - TensorFactory tf_a; - - std::vector> b_sizeses = { - {2}, - {1, 2}, - }; - for (const auto& b_sizes : b_sizeses) { - // a and b of different shapes - Tensor a = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); - Tensor b = tf_a.make(b_sizes, /*data=*/{2, 2}); - - // Destination for output of mul. - Tensor out = tf_a.zeros({2, 2}); - - // Check that it matches the expected output. - EXPECT_TENSOR_CLOSE( - op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); - } + test_broadcast_a2b(); + test_broadcast_a2b(); + test_broadcast_a2b(); } // Broadcast tensor a's size to tensor b's size TEST_F(OpMulOutTest, BroadcastB2ATest) { - TensorFactory tf_a; - - // a and b of different shapes - Tensor a = tf_a.make({2}, /*data=*/{2, 2}); - Tensor b = tf_a.make({2, 2}, /*data=*/{1, 2, 3, 4}); - - // Destination for output of mul. - Tensor out = tf_a.zeros({2, 2}); - - // Check that it matches the expected output. - EXPECT_TENSOR_CLOSE( - op_mul_out(a, b, out), tf_a.make({2, 2}, /*data=*/{2, 4, 6, 8})); + test_broadcast_b2a(); + test_broadcast_b2a(); + test_broadcast_b2a(); } // Broadcast tensor a and b's size to a new size c. @@ -256,19 +304,9 @@ TEST_F(OpMulOutTest, BroadcastAB2CTest) { } TEST_F(OpMulOutTest, ScalarInputBroadcastTest) { - TensorFactory tf_a; - - // a is a 1d tensor and b is a scalar - Tensor a = tf_a.make({2}, /*data=*/{2, 2}); - Tensor b = tf_a.make({}, /*data=*/{2}); - - // Destination for output of mul. - Tensor out = tf_a.make({2}, /*data=*/{2, 2}); - Tensor expected = tf_a.make({2}, /*data=*/{4, 4}); - - // Check that it matches the expected output. 
- EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); - EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); + test_scalar_input_broadcast(); + test_scalar_input_broadcast(); + test_scalar_input_broadcast(); } TEST_F(OpMulOutTest, MismatchedOutputShapesDies) { diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 1fd751dc882..0301cc9a519 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -283,7 +283,7 @@ std::ostream& operator<<(std::ostream& os, const Tensor& t) { break; switch (t.scalar_type()) { - ET_FORALL_REAL_TYPES_AND2(Half, Bool, PRINT_CASE) + ET_FORALL_REAL_TYPES_AND3(Half, Bool, BFloat16, PRINT_CASE) default: ET_CHECK_MSG( false, diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index cadb5ecd9ab..630f0cdb4a1 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -516,6 +516,15 @@ inline bool tensor_is_realhb_type(exec_aten::Tensor t) { return true; } +inline bool tensor_is_realhbbf16_type(exec_aten::Tensor t) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + executorch::runtime::isRealHBBF16Type(t.scalar_type()), + "Expected to find a real type, but tensor has type %s", + torch::executor::toString(t.scalar_type())); + + return true; +} + inline bool tensor_is_complex_type(exec_aten::Tensor t) { ET_LOG_MSG_AND_RETURN_IF_FALSE( torch::executor::isComplexType(t.scalar_type()), From 8fb1defe00745e8103ddf800e8fe638167d1bacd Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:36:28 -0700 Subject: [PATCH 213/531] Revert default to 8650 for llama Differential Revision: D62221547 Pull Request resolved: https://github.com/pytorch/executorch/pull/5100 --- extension/llm/export/partitioner_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 0d9f7c6cfd9..e75d5bef3fb 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -140,7 +140,7 @@ def get_qnn_partitioner( return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=QcomChipset.SM8450, # default to SM8450 # pyre-fixme[16] + soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, From 89829b5faf7e7f4e316358a233142d39da508361 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Thu, 5 Sep 2024 16:10:43 -0700 Subject: [PATCH 214/531] [Build] Use C++17 Constructor for tiktoken.cpp when C++20 is unavailable (#5025) The basic_string_view constructor: ``` template< class It, class End > constexpr basic_string_view( It first, End last ); ``` requires C++20. 
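Both constructors can describe the same contiguous character range; a minimal standalone sketch of the equivalence (illustrative only; `make_view` is a hypothetical helper, not part of this patch):

```
#include <string_view>

// Build a view over the half-open character range [first, last).
std::string_view make_view(const char* first, const char* last) {
#if __cplusplus >= 202002L
  // C++20: iterator/sentinel pair constructor.
  return std::string_view(first, last);
#else
  // C++17: pointer + count constructor yields the identical view.
  return std::string_view(first, static_cast<std::size_t>(last - first));
#endif
}
```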
To allow the code to compile with C++17, use the basic_string_view constructor: ``` constexpr basic_string_view( const CharT* s, size_type count ); ``` For #4661 --- extension/llm/tokenizer/tiktoken.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/extension/llm/tokenizer/tiktoken.cpp b/extension/llm/tokenizer/tiktoken.cpp index f8ccf74fd6b..f99ac2e955e 100644 --- a/extension/llm/tokenizer/tiktoken.cpp +++ b/extension/llm/tokenizer/tiktoken.cpp @@ -266,7 +266,11 @@ Tiktoken::_split_with_allowed_special_token( return std::make_pair(std::nullopt, input); } +#if __cplusplus >= 202002L auto start = input.begin(); +#else + const char* start = input.data(); +#endif std::string special; while (true) { if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { @@ -276,9 +280,15 @@ Tiktoken::_split_with_allowed_special_token( if (allowed_special.count(special) == 1) { // Found an allowed special token, split the text with it. +#if __cplusplus >= 202002L return std::make_pair( special, re2::StringPiece(start, input.begin() - start - special.size())); +#else + return std::make_pair( + special, + re2::StringPiece(start, (input.data() - start) - special.size())); +#endif } // else try to find the next special token } From 3c582377aeee56bea506fee202cd5a2d4a2b7d30 Mon Sep 17 00:00:00 2001 From: lucylq Date: Thu, 5 Sep 2024 16:33:28 -0700 Subject: [PATCH 215/531] Update module/test .pte file Differential Revision: D62260916 Pull Request resolved: https://github.com/pytorch/executorch/pull/5116 --- extension/module/test/module_test.cpp | 105 ++++++++++++---------- extension/module/test/resources/README.md | 4 + extension/module/test/resources/add.pte | Bin 0 -> 728 bytes extension/module/test/resources/model.pte | Bin 1600 -> 0 bytes 4 files changed, 61 insertions(+), 48 deletions(-) create mode 100644 extension/module/test/resources/README.md create mode 100644 extension/module/test/resources/add.pte delete mode 100644 extension/module/test/resources/model.pte diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 4ef454e1c75..75cead25a72 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -22,7 +22,7 @@ namespace torch::executor { class ModuleTest : public ::testing::Test { protected: static void SetUpTestSuite() { - model_path_ = std::getenv("RESOURCES_PATH") + std::string("/model.pte"); + model_path_ = std::getenv("RESOURCES_PATH") + std::string("/add.pte"); } static std::string model_path_; @@ -95,7 +95,7 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto meta = module.method_meta("forward"); EXPECT_TRUE(meta.ok()); EXPECT_STREQ(meta->name(), "forward"); - EXPECT_EQ(meta->num_inputs(), 1); + EXPECT_EQ(meta->num_inputs(), 2); EXPECT_EQ(*(meta->input_tag(0)), Tag::Tensor); EXPECT_EQ(meta->num_outputs(), 1); EXPECT_EQ(*(meta->output_tag(0)), Tag::Tensor); @@ -103,9 +103,8 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto input_meta = meta->input_tensor_meta(0); EXPECT_TRUE(input_meta.ok()); EXPECT_EQ(input_meta->scalar_type(), ScalarType::Float); - EXPECT_EQ(input_meta->sizes().size(), 2); + EXPECT_EQ(input_meta->sizes().size(), 1); EXPECT_EQ(input_meta->sizes()[0], 1); - EXPECT_EQ(input_meta->sizes()[1], 2); const auto output_meta = meta->output_tensor_meta(0); EXPECT_TRUE(output_meta.ok()); @@ -124,19 +123,22 @@ TEST_F(ModuleTest, TestNonExistentMethodMeta) { TEST_F(ModuleTest, TestExecute) { Module module(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array 
input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + EXPECT_TRUE(result.ok()); + EXPECT_TRUE(result.ok()); EXPECT_TRUE(module.is_loaded()); EXPECT_TRUE(module.is_method_loaded("forward")); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreload) { @@ -145,17 +147,18 @@ TEST_F(ModuleTest, TestExecutePreload) { const auto error = module.load(); EXPECT_EQ(error, Error::Ok); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreload_method) { @@ -164,17 +167,18 @@ TEST_F(ModuleTest, TestExecutePreload_method) { const auto error = module.load_method("forward"); EXPECT_EQ(error, Error::Ok); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { @@ -186,17 +190,18 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { const auto load_method_error = module.load_method("forward"); EXPECT_EQ(load_method_error, Error::Ok); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.execute("forward", Tensor(&tensor)); + const auto result = + module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestExecuteOnNonExistent) { @@ -218,41 +223,42 @@ TEST_F(ModuleTest, TestExecuteOnCurrupted) { TEST_F(ModuleTest, TestGet) { Module module(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.get("forward", Tensor(&tensor)); + const auto result = module.get("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestForward) { auto module = std::make_unique(model_path_); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = 
module->forward(Tensor(&tensor)); + + const auto result = module->forward({Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); std::array input2{2, 3}; TensorImpl tensor2( ScalarType::Float, sizes.size(), sizes.data(), input2.data()); - const auto result2 = module->forward(Tensor(&tensor2)); + const auto result2 = module->forward({Tensor(&tensor2), Tensor(&tensor2)}); EXPECT_TRUE(result2.ok()); const auto data2 = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data2[0], 2.5, 1e-5); + EXPECT_NEAR(data2[0], 4, 1e-5); } TEST_F(ModuleTest, TestForwardWithInvalidInputs) { @@ -303,23 +309,26 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { EXPECT_EQ(load_error, Error::Ok); EXPECT_TRUE(module1->is_loaded()); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result1 = module1->execute("forward", Tensor(&tensor)); + auto result1 = + module1->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result1.ok()); auto module2 = std::make_unique(module1->program()); - auto result2 = module2->execute("forward", Tensor(&tensor)); + auto result2 = + module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result2.ok()); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = module2->execute("forward", Tensor(&tensor)); + auto result3 = + module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result3.ok()); } @@ -351,17 +360,17 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { EXPECT_EQ(module.program(), shared_program); - std::array input{1, 2}; - std::array sizes{1, 2}; + std::array input{1}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result = module.execute("forward", Tensor(&tensor)); + auto result = module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], 1.5, 1e-5); + EXPECT_NEAR(data[0], 2, 1e-5); } TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { @@ -379,24 +388,24 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { EXPECT_TRUE(program != nullptr); auto thread = [](std::shared_ptr program, - const std::array& input) { + const std::array& input) { Module module(program); - std::array sizes{1, 2}; + std::array sizes{1}; TensorImpl tensor( ScalarType::Float, sizes.size(), sizes.data(), (void*)input.data()); - const auto result = module.forward(Tensor(&tensor)); + const auto result = module.forward({Tensor(&tensor), Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); - EXPECT_NEAR(data[0], (input[0] + input[1]) / 2.0, 1e-5); + EXPECT_NEAR(data[0], (input[0] * 2), 1e-5); }; - std::thread t1(thread, program, std::array{1, 2}); - std::thread t2(thread, program, std::array{2, 3}); - std::thread t3(thread, program, std::array{3, 4}); - std::thread t4(thread, program, std::array{4, 5}); - std::thread t5(thread, program, std::array{5, 6}); + std::thread t1(thread, program, std::array{1}); + std::thread t2(thread, program, std::array{2}); + std::thread t3(thread, program, std::array{3}); + std::thread t4(thread, 
program, std::array{4}); + std::thread t5(thread, program, std::array{5}); t1.join(); t2.join(); diff --git a/extension/module/test/resources/README.md b/extension/module/test/resources/README.md new file mode 100644 index 00000000000..5067c870a3c --- /dev/null +++ b/extension/module/test/resources/README.md @@ -0,0 +1,4 @@ +## Resources + +### model.pte +- generated via `buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="add"` after D62209852. diff --git a/extension/module/test/resources/add.pte b/extension/module/test/resources/add.pte new file mode 100644 index 0000000000000000000000000000000000000000..43252ca7d3d05e8fe847e122c9c7de976e0e0096 GIT binary patch literal 728 zcmZ`$O-_Sg5Pi1LVna-08q!40d z9v`$+<8G4NtCNznyo*d;{901sofq4 z&95FC(_^3>z=$tz@ie@O=wFQ6?sRR{e3+V%PkTlvcFUN0_=L1XT6i=0w*D?~&6+W_ zm?cw28ad;8ZTa+8zxfcf= X8imU+Ugk*@$4qIuZ+H9Wa$n&GV)QgK literal 0 HcmV?d00001 diff --git a/extension/module/test/resources/model.pte b/extension/module/test/resources/model.pte deleted file mode 100644 index 91b52416847fff9a794db423583a0c8c5a303d66..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1600 zcmbtUF>6y%6h2LpG`67x38h1a$0LYCkyaeiEbSs>a5ICDhBhX*B=CHR$qS80K{`1U zXNL|&931)!#9tu#4;(T$Iyg9V(8lk(C%1VGUBnAt?z!iF=R4<~d-K*rl*siS$iyu-6$x zFl7m-M2@Fz2}U6M0xKZM07ofLj3j#QKR!jPJqb zp3{bj>>9(kS26A**tmi#F5`Gk<{-`|+^qF;hi6cJ8ncUKU0gGsaD)4-k27I)K=^>g zK2QX*z%T5LfHQ!1Tmg0fo?+wC_98|5SnV|5io44@V?6_8?BGnalxusq56&>A^EsxI z2bbDU;A7kd;KsS~65M|ejZZO+Wvt;|^6=v{PA3|yZML2Zjnz9vY)0efwVz2_!%%$- z>=QehftP({e(LcZaGWVG;dp#o?mm=(^DD>p?W5Xh51X#mGJMpeZ?@KqEaUz&_O7v= z;(tu!He&fI_$A@U!DGiYUDifD_Q1XcEI8B0nbC1=*2C}8R5@p3;Z9S7{c-idwDEj} z^p^qlW(%lpu6r;2=tbA~B`+KV!Mf)+I$>mnQOwb52hE-d_xfGafV)~dsv-7rHr{_a zXa`X{?B9z9-N5+Qw0x>t*R0xA{{M~tm->%l=bq+mwj0q*+11*OMWY350{ef^=^fU4 GP5BEPFB_=< From 5f4a81154ce660b7822046f0a41b64e3a67014c2 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:55:37 -0700 Subject: [PATCH 216/531] [ET-VK] Fix negative dim in `normalize_to_dim_index` Differential Revision: D62270925 Pull Request resolved: https://github.com/pytorch/executorch/pull/5118 --- backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h index 45dfceb3f0d..4bd8e9b900b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h @@ -32,7 +32,8 @@ constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; inline DimIndex normalize_to_dim_index(const api::vTensor& v_in, int32_t dim) { - return static_cast(dim - v_in.dim()); + return dim < 0 ? 
static_cast(dim) + : static_cast(dim - v_in.dim()); } /* From cab29eaf6f585e7a73f2e8e8ccd3c976e0e52c0c Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Thu, 5 Sep 2024 23:28:02 -0400 Subject: [PATCH 217/531] [ET-VK] Introduce axis mapping for no-copy permute of texture-backed tensors Differential Revision: D62210118 Pull Request resolved: https://github.com/pytorch/executorch/pull/5092 --- .../vulkan/runtime/api/containers/Tensor.cpp | 129 ++++++++++++++---- .../vulkan/runtime/api/containers/Tensor.h | 65 +++++++-- backends/vulkan/runtime/graph/ComputeGraph.h | 4 + .../vulkan/test/vulkan_compute_api_test.cpp | 10 +- 4 files changed, 167 insertions(+), 41 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 7b9d30ef658..6fe6746ec0d 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -80,6 +80,42 @@ std::vector calculate_strides( return strides; } +/* + * Axis mapping is somewhat analogous to strides for texture backed tensors. + * + * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. + * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture + * axis that corresponds to the width, height, and channels dimension of the + * tensor. Thus the axis mapping can be considered to be in WHCN dimension + * order. + * + * The last value `axis_mapping.at(3)` indicates the WHCN index of the tensor + * dimension along which batches will be concatenated. This dimension can be + * referred to as the "inner dimension" To determine which image texture axis is + * used for the concatenation, a double lookup will need to be performed + * (axis_mapping.at(axis_mapping.at(3))). + * + * The reason for strucuring axis mapping this way is because for the batch dim, + * two things need to be easily derived: + * + * 1. The dim idx of the inner dimension, so that the size of the inner + * dimension can be easily determined. + * 2. The texture axis used to concatenate batches + * + * By storing the dim index of the inner dimension instead of the texture axis + * it maps to, both pieces of information are readily available. + * + * The axis mapping allows for permuted views of texture-backed tensors. + */ +std::vector default_axis_mapping() { + // Currently, all compute shaders have an assumption that the channels dim is + // used to combine with the batch dim of a tensor. However, once dim mapping + // is integrated into the tensor indexing logic for each compute shader, we + // can be more flexible with mapping the batch dim to different texture axes + // in order to improve performance or memory footprint. + return {0, 1, 2, 2}; +} + bool dim_order_is_valid(const std::vector& dim_order) { int64_t sum = 0; for (size_t i = 0; i < dim_order.size(); ++i) { @@ -137,30 +173,44 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout memory_layout) { VK_CHECK_COND(padded_sizes.size() == 4); + VK_CHECK_COND(axis_mapping.size() == 4); + + utils::uvec3 extents({1, 1, 1}); + // First three elements of axis_mapping indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. 
+ for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_mapping.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } - uint32_t N = utils::safe_downcast(padded_sizes.at(0)); - uint32_t C = utils::safe_downcast(padded_sizes.at(1)); - uint32_t H = utils::safe_downcast(padded_sizes.at(2)); - uint32_t W = utils::safe_downcast(padded_sizes.at(3)); + // axis_mapping[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_mapping.at(3); + const int64_t batch_axis = axis_mapping.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. + extents[batch_axis] *= padded_sizes.at(0); switch (memory_layout) { case utils::kWidthPacked: - VK_CHECK_COND(W % 4 == 0); - W /= 4; + VK_CHECK_COND(extents[0] % 4 == 0); + extents[0] /= 4; break; case utils::kHeightPacked: - VK_CHECK_COND(H % 4 == 0); - H /= 4; + VK_CHECK_COND(extents[1] % 4 == 0); + extents[1] /= 4; break; case utils::kChannelsPacked: - VK_CHECK_COND(C % 4 == 0); - C /= 4; + VK_CHECK_COND(extents[2] % 4 == 0); + extents[2] /= 4; break; } - return {W, H, C * N}; + return extents; } // @@ -176,9 +226,10 @@ vTensor::vTensor( const bool allocate_memory) : dtype_(dtype), memory_layout_(memory_layout), - // Calculate tensor size metadata + // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + axis_mapping_(default_axis_mapping()), strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, @@ -189,12 +240,14 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Construct Tensor storage storage_( context, storage_type, memory_layout_, + axis_mapping_, padded_sizes_, dtype_, allocate_memory) { @@ -222,6 +275,7 @@ vTensor::vTensor(const vTensor& other) // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + axis_mapping_(other.axis_mapping_.begin(), other.axis_mapping_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, @@ -234,6 +288,7 @@ vTensor::vTensor(const vTensor& other) sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_) {} @@ -248,6 +303,7 @@ vTensor::vTensor( // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), dim_order_(dim_order.begin(), dim_order.end()), + axis_mapping_(default_axis_mapping()), strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, @@ -258,6 +314,7 @@ vTensor::vTensor( sizes_uniform_(), strides_uniform_(), numel_uniform_(), + axis_mapping_uniform_(), texture_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { @@ -315,6 +372,14 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { return vkapi::BufferBindInfo(strides_uniform_.buffer()); } +const vkapi::BufferBindInfo vTensor::axis_mapping_ubo() { + if 
(!axis_mapping_uniform_.buffer()) { + axis_mapping_uniform_ = + ParamsBuffer(storage_.context_, utils::make_ivec4(axis_mapping_)); + } + return vkapi::BufferBindInfo(axis_mapping_uniform_.buffer()); +} + const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { if (!texture_limits_uniform_.buffer()) { texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); @@ -376,11 +441,7 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order) { - sizes_ = new_sizes; - dim_order_ = new_dim_order; +void vTensor::update_metadata() { strides_ = calculate_strides(sizes_, dim_order_); // Only update the memory layout for buffer-backed tensors. Strides are // meaningless for texture-backed tensors and do not impact the memory layout. @@ -396,7 +457,7 @@ void vTensor::update_metadata( // Calculate the extents of the image texture that would have been required // for a tensor of the new sizes. utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); // Update the texture limits to reflect the new virtual extents. texture_limits_.limits = utils::ivec3{ @@ -407,15 +468,18 @@ void vTensor::update_metadata( if (sizes_uniform_.buffer()) { sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); } - if (texture_limits_uniform_.buffer()) { - texture_limits_uniform_.update(texture_limits_); - } if (strides_uniform_.buffer()) { strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_)); } if (numel_uniform_.buffer()) { numel_uniform_.update(numel_); } + if (axis_mapping_uniform_.buffer()) { + axis_mapping_uniform_.update(utils::make_ivec4(axis_mapping_)); + } + if (texture_limits_uniform_.buffer()) { + texture_limits_uniform_.update(texture_limits_); + } } void vTensor::check_sizes(const std::vector& sizes) const { @@ -423,7 +487,7 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_); bool valid_resize = virtual_extents[0] <= image_extents()[0]; valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1]; @@ -454,7 +518,9 @@ void vTensor::virtual_reconfigure( VK_CHECK_COND(dim_order_is_valid(new_dim_order)); check_sizes(new_sizes); - update_metadata(new_sizes, new_dim_order); + sizes_ = new_sizes; + dim_order_ = new_dim_order; + update_metadata(); } void vTensor::virtual_resize(const std::vector& new_sizes) { @@ -463,13 +529,16 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { "new sizes cannot modify the dimensionality of the tensor "); check_sizes(new_sizes); - update_metadata(new_sizes, dim_order_); + sizes_ = new_sizes; + update_metadata(); } void vTensor::reallocate(const std::vector& new_sizes) { - update_metadata(new_sizes, dim_order_); + sizes_ = new_sizes; + update_metadata(); storage_.discard_and_reallocate( calculate_padded_sizes(new_sizes, memory_layout_), + axis_mapping_, memory_layout_, dtype_); } @@ -547,12 +616,16 @@ vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, + const std::vector& axis_mapping, const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, - image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), + image_extents_(calculate_image_extents( + padded_sizes, + axis_mapping, + gpu_memory_layout)), buffer_length_{utils::multiply_integers(padded_sizes)}, buffer_offset_{0}, image_(allocate_image( @@ -665,6 +738,7 @@ bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { void vTensorStorage::discard_and_reallocate( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout gpu_memory_layout, const vkapi::ScalarType dtype) { const bool image_owns_memory = image_.owns_memory(); @@ -672,7 +746,8 @@ void vTensorStorage::discard_and_reallocate( flush(); - image_extents_ = calculate_image_extents(padded_sizes, gpu_memory_layout); + image_extents_ = + calculate_image_extents(padded_sizes, axis_mapping, gpu_memory_layout); image_ = allocate_image( context_, image_extents_, diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index d37628e4adc..70f363796fd 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -60,11 +60,11 @@ std::vector calculate_padded_sizes( const utils::GPUMemoryLayout memory_layout); /* - * Given the padded sizes of a tensor and the GPU memory layout, calculate the - * 3D image extents required to store the tensor data as an image texture. + * Calculate the image extents required of a texture backed tensor. 
*/ utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout memory_layout); struct LastAccess { @@ -90,7 +90,8 @@ class vTensorStorage final { Context* context, const utils::StorageType storage_type, const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& sizes, + const std::vector& axis_mapping, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -159,6 +160,7 @@ class vTensorStorage final { void discard_and_reallocate( const std::vector& padded_sizes, + const std::vector& axis_mapping, const utils::GPUMemoryLayout gpu_memory_layout, const vkapi::ScalarType dtype); }; @@ -218,21 +220,58 @@ class vTensor final { vTensor& operator=(vTensor&& other) = default; private: + /* + * "Core" tensor metadata. They are the minimum amount of information required + * to construct a tensor. + */ + + // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; + // Describes which dimension is "tightly packed". For texture backed tensors, + // this describes which dimension is packed along a texel. For buffer backed + // tensors, this describes which dimension has a stride of 1 (i.e. is last in + // the dim order). utils::GPUMemoryLayout memory_layout_; - // sizes of the tensor in NCHW dimension order std::vector sizes_; + + /* + * "Layout" metadata. These describe with further detail how tensor data is + * laid out in memory. However, they are considered secondary to the "core" + * metadata members above because defaults can be assumed based on a given + * memory layout. When permuting the tensor without performing a copy, these + * metadata members are the ones that will be changed. All other metadata is + * derived from a combination of sizes, memory layout, and the below members. + */ + // dim order of the tensor; dimension indices are in NCHW dimension order // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger // strides precede the dims with smaller strides in the dim order. The last // dim is always the fastest moving dim with a stride of 1. std::vector dim_order_; + // Describes which axis of an image texture each dimension of the tensor maps + // to. The axis mapping allows texture based tensors to be permuted and + // transposed without modifying the underlying texture storage. For a more in + // depth explanation of axis mapping, see the `default_axis_mapping()` + // function. + std::vector axis_mapping_; + + /* + * The below can be consider "layout" metadata as well, but are derived from + * the above data members. + */ + // strides of the tensor in NCHW dimension order std::vector strides_; // Contains the number of elements in the tensor according to the canonical // sizes. size_t numel_; + + /* + * The below metadata members are derived from the above, and are typically + * to i.e. pass tensor metadata to compute shaders. + */ + // padded sizes of the tensor in NCHW dimension order. See the // calculate_padded_sizes() function for more context. Note that padded sizes // are only used for texture storage, and not for buffer storage. 
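To make the axis-mapping convention concrete, a small sketch of the values involved (the default comes from `default_axis_mapping()` above; the transposed variant is a hypothetical illustration, not values taken from this diff):

```
#include <cstdint>
#include <vector>

// Default mapping, in WHCN order: {0, 1, 2, 2}
//   axis_mapping[0] == 0: width    -> texture X axis
//   axis_mapping[1] == 1: height   -> texture Y axis
//   axis_mapping[2] == 2: channels -> texture Z axis
//   axis_mapping[3] == 2: batches concatenate along the channels dim, i.e.
//     along texture axis axis_mapping[axis_mapping[3]], which is Z here.
std::vector<int64_t> default_mapping = {0, 1, 2, 2};

// A hypothetical width<->height transposed view of the same image could
// instead carry {1, 0, 2, 2}: width reads along Y and height along X, so the
// view is permuted without copying or reallocating the underlying texture.
std::vector<int64_t> transposed_view_mapping = {1, 0, 2, 2};
```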
@@ -260,6 +299,7 @@ class vTensor final { ParamsBuffer sizes_uniform_; ParamsBuffer strides_uniform_; ParamsBuffer numel_uniform_; + ParamsBuffer axis_mapping_uniform_; ParamsBuffer texture_limits_uniform_; vTensorStorage storage_; @@ -365,14 +405,18 @@ class vTensor final { */ const vkapi::BufferBindInfo strides_ubo(); + /* + * Returns a GPU buffer containing the texture axis mapping for each dimension + * of the tensor, in WHCN dimension order. + */ + const vkapi::BufferBindInfo axis_mapping_ubo(); + /* * Returns a GPU buffer containing the virtual image extents of the tensor. * Since a tensor can be resized with the virtual_resize() function, this * GPU buffer contains the image extents of the tensor calculated using the * virtual_resize() function. This allows shaders to exit early if they are * working outside the limits of the texture. - * - * This buffer should only be used to */ const vkapi::BufferBindInfo texture_limits_ubo(); @@ -423,13 +467,10 @@ class vTensor final { private: /* - * Given new sizes and new strides of the dim order, update the sizes and dim - * order metadata of the vTensor. New strides are computed using the new sizes - * and new dim order. + * Assuming sizes, dim order, or axis mapping was modified, recompute all + * derived metadata and update metadata UBO with new values. */ - void update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order); + void update_metadata(); /* * Check that tensor sizes are valid given the current storage resource's diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 210b03c4cad..afdc8290cdd 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -327,6 +327,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().numel_ubo(); } + inline vkapi::BufferBindInfo axis_mapping_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().axis_mapping_ubo(); + } + inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().texture_limits_ubo(); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 1112548b855..c7d20c38675 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1007,10 +1007,16 @@ TEST_F(VulkanComputeAPITest, print_object_sizes) { // The actual sizes of each object is dependent on the platform. However, we // can alert ourselves to any significant changes in the sizes of these // objects by checking the `sizeof()` the class against some loose thresholds. - EXPECT_TRUE(sizeof(vTensor) < 1800); - EXPECT_TRUE(sizeof(Value) < 2400); + + // Current known size on 64 bit system: 1824 B + EXPECT_TRUE(sizeof(vTensor) < 2000); + // Current known size on 64 bit system: 1840 B + EXPECT_TRUE(sizeof(Value) < 2200); + // Current known size on 64 bit system: 240 B EXPECT_TRUE(sizeof(StagingBuffer) < 500); + // Current known size on 64 bit system: 384 B EXPECT_TRUE(sizeof(ComputeGraph) < 500); + // Current known size on 64 bit system: 248 B EXPECT_TRUE(sizeof(ExecuteNode) < 500); } From 0458c2e4df202f9ecbba969a45896d245c9b0c43 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Thu, 5 Sep 2024 20:31:58 -0700 Subject: [PATCH 218/531] Retire the ManagedTensor. 
Differential Revision: D62245939 Pull Request resolved: https://github.com/pytorch/executorch/pull/5126 --- extension/runner_util/managed_tensor.h | 107 ------------------ extension/runner_util/targets.bzl | 15 --- extension/runner_util/test/CMakeLists.txt | 2 +- .../runner_util/test/managed_tensor_test.cpp | 86 -------------- extension/runner_util/test/targets.bzl | 12 -- 5 files changed, 1 insertion(+), 221 deletions(-) delete mode 100644 extension/runner_util/managed_tensor.h delete mode 100644 extension/runner_util/test/managed_tensor_test.cpp diff --git a/extension/runner_util/managed_tensor.h b/extension/runner_util/managed_tensor.h deleted file mode 100644 index 5c74f7550ae..00000000000 --- a/extension/runner_util/managed_tensor.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -// @nolint PATTERNLINT Ok to use stdlib for this optional library -#include - -#include -#include -#include -#include - -#ifdef USE_ATEN_LIB -#include -#endif - -namespace executorch { -namespace extension { - -/** - * A tensor wrapper takes ownership of all the memory of the necessary metadata - * for exec_aten::Tensor. Note that it doesn't own the data memory. - */ -class ManagedTensor { - public: - /// The type used for elements of `sizes()`. - using SizesType = exec_aten::SizesType; - /// The type used for elements of `dim_order()`. - using DimOrderType = exec_aten::DimOrderType; - /// The type used for elements of `strides()`. - using StridesType = exec_aten::StridesType; - - ManagedTensor() = delete; - - explicit ManagedTensor( - void* data, - const std::vector& sizes, - exec_aten::ScalarType dtype) - : sizes_(sizes) { -#ifdef USE_ATEN_LIB - tensor_ = torch::from_blob(data, sizes, dtype); -#else - // Calculate strides. - strides_ = std::vector(sizes_.size()); - if (sizes_.size() > 0) { - strides_.back() = 1; - for (size_t i = strides_.size() - 1; i > 0; --i) { - strides_[i - 1] = strides_[i] * sizes_[i]; - } - } - - // Allocate TensorImpl. - tensor_impl_ = std::make_unique( - dtype, - sizes_.size(), - sizes_.data(), - data, - /*dim_order=*/nullptr, - strides_.data(), - executorch::runtime::TensorShapeDynamism::DYNAMIC_BOUND); -#endif - } - - void resize(const std::vector& new_sizes) { - auto err = executorch::runtime::resize_tensor( - this->get_aliasing_tensor(), - exec_aten::ArrayRef(new_sizes.data(), new_sizes.size())); - ET_CHECK(err == executorch::runtime::Error::Ok); - } - - /** - * Get the underlying Tensor object. This is assuming the copying is cheap. - */ - exec_aten::Tensor get_aliasing_tensor() { -#ifdef USE_ATEN_LIB - return tensor_; -#else - return exec_aten::Tensor(tensor_impl_.get()); -#endif - } - - private: - std::unique_ptr tensor_impl_; - std::vector sizes_; - std::vector strides_; -#ifdef USE_ATEN_LIB - exec_aten::Tensor tensor_; -#endif -}; - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. 
-using ::executorch::extension::ManagedTensor; -} // namespace executor -} // namespace torch diff --git a/extension/runner_util/targets.bzl b/extension/runner_util/targets.bzl index 43c0ed08f32..bc0fee197d6 100644 --- a/extension/runner_util/targets.bzl +++ b/extension/runner_util/targets.bzl @@ -26,18 +26,3 @@ def define_common_targets(): "//executorch/runtime/executor:program" + aten_suffix, ], ) - - runtime.cxx_library( - name = "managed_tensor" + aten_suffix, - exported_headers = [ - "managed_tensor.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/runtime/core/exec_aten:lib" + aten_suffix, - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) diff --git a/extension/runner_util/test/CMakeLists.txt b/extension/runner_util/test/CMakeLists.txt index 6b295611fd2..aefb3b0417a 100644 --- a/extension/runner_util/test/CMakeLists.txt +++ b/extension/runner_util/test/CMakeLists.txt @@ -23,7 +23,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs inputs_test.cpp managed_tensor_test.cpp) +set(_test_srcs inputs_test.cpp) et_cxx_test( extension_runner_util_test diff --git a/extension/runner_util/test/managed_tensor_test.cpp b/extension/runner_util/test/managed_tensor_test.cpp deleted file mode 100644 index 8ac1285f2bd..00000000000 --- a/extension/runner_util/test/managed_tensor_test.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -using namespace ::testing; -using exec_aten::DimOrderType; -using exec_aten::ScalarType; -using exec_aten::SizesType; -using exec_aten::StridesType; -using executorch::extension::ManagedTensor; -using executorch::runtime::ArrayRef; - -class ManagedTensorTest : public ::testing::Test { - protected: - void SetUp() override { - executorch::runtime::runtime_init(); - - data_ = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - sizes_ = {2, 3, 4}; - expected_strides_ = {12, 4, 1}; - managed_tensor_ = - std::make_unique(data_.data(), sizes_, ScalarType::Long); - } - - protected: - std::vector data_; - std::vector sizes_; - std::vector expected_strides_; - std::unique_ptr managed_tensor_; -}; - -TEST_F(ManagedTensorTest, Smoke) { - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - EXPECT_EQ(tensor.sizes(), ArrayRef(sizes_.data(), sizes_.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); - for (size_t i = 0; i < expected_strides_.size(); ++i) { - EXPECT_EQ(tensor.strides()[i], expected_strides_[i]); - } -} - -TEST_F(ManagedTensorTest, ResizeWithUpdatedRank) { - // gtest death test doesn't work on iOS: - // https://github.com/google/googletest/issues/2834 -#if !GTEST_OS_IOS - EXPECT_EXIT( - managed_tensor_->resize(std::vector{2, 3, 4, 5}), - ::testing::KilledBySignal(SIGABRT), - ""); -#endif -} - -TEST_F(ManagedTensorTest, ResizeShrink) { - managed_tensor_->resize(std::vector{2, 2, 2}); - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - std::vector expected_sizes = {2, 2, 2}; - EXPECT_EQ( - tensor.sizes(), - ArrayRef(expected_sizes.data(), expected_sizes.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); -} - -TEST_F(ManagedTensorTest, 
Resize) { - managed_tensor_->resize(std::vector{4, 3, 2}); - const auto tensor = managed_tensor_->get_aliasing_tensor(); - - std::vector expected_sizes = {4, 3, 2}; - EXPECT_EQ( - tensor.sizes(), - ArrayRef(expected_sizes.data(), expected_sizes.size())); - EXPECT_EQ(tensor.scalar_type(), ScalarType::Long); - EXPECT_EQ(tensor.const_data_ptr(), data_.data()); -} diff --git a/extension/runner_util/test/targets.bzl b/extension/runner_util/test/targets.bzl index 7c042ca9d94..f55a1ea995f 100644 --- a/extension/runner_util/test/targets.bzl +++ b/extension/runner_util/test/targets.bzl @@ -30,15 +30,3 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", }, ) - - runtime.cxx_test( - name = "managed_tensor_test", - srcs = [ - "managed_tensor_test.cpp", - ], - deps = [ - "//executorch/extension/runner_util:managed_tensor", - "//executorch/runtime/core/exec_aten:lib", - "//executorch/runtime/core/exec_aten/util:tensor_util", - ], - ) From 40720f0650112550e14a5f51390ab4fd12aeb727 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 5 Sep 2024 21:12:30 -0700 Subject: [PATCH 219/531] Use Android llm benchmark runner Differential Revision: D62279317 Pull Request resolved: https://github.com/pytorch/executorch/pull/5094 --- .../workflows/upload-android-test-specs.yml | 2 +- .../android-llm-device-farm-test-spec.yml | 22 +++++++++++++++++++ .../LlmBenchmarkRunner.java | 22 +++++++++++++++++-- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/.github/workflows/upload-android-test-specs.yml b/.github/workflows/upload-android-test-specs.yml index 5a468da44f1..04f7cf40d73 100644 --- a/.github/workflows/upload-android-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -41,7 +41,7 @@ jobs: with: # Just use a small model here with a minimal amount of configuration to test the spec models: stories110M - devices: samsung_galaxy_s2x + devices: samsung_galaxy_s22 delegates: xnnpack test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml diff --git a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml index cac83b8e6f5..896e7b73fbf 100644 --- a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml +++ b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml @@ -73,8 +73,30 @@ phases: fi fi; + # Run the new generic benchmark activity https://developer.android.com/tools/adb#am + - echo "Run LLM benchmark" + - | + adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n com.example.executorchllamademo/.LlmBenchmarkRunner \ + --es "model_dir" "/data/local/tmp/llama" \ + --es "tokenizer_path" "/data/local/tmp/llama/tokenizer.bin" + post_test: commands: + - echo "Gather LLM benchmark results" + - | + BENCHMARK_RESULTS="" + ATTEMPT=0 + MAX_ATTEMPT=10 + while [ -z "${BENCHMARK_RESULTS}" ] && [ $ATTEMPT -lt $MAX_ATTEMPT ]; do + echo "Waiting for benchmark results..." 
+ BENCHMARK_RESULTS=$(adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo cat files/benchmark_results.json) + sleep 30 + ((ATTEMPT++)) + done + + adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo ls -la files/ + # Trying to pull the file using adb ends up with permission error, but this works too, so why not + echo "${BENCHMARK_RESULTS}" > $DEVICEFARM_LOG_DIR/benchmark_results.json artifacts: # By default, Device Farm will collect your artifacts from the $DEVICEFARM_LOG_DIR directory. diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java index 33b230b1dff..cee623507fd 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -14,8 +14,11 @@ import android.util.Log; import android.widget.TextView; import androidx.annotation.NonNull; +import com.google.gson.Gson; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.Arrays; public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -32,7 +35,12 @@ protected void onCreate(Bundle savedInstanceState) { Intent intent = getIntent(); - String modelPath = intent.getStringExtra("model_path"); + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); String tokenizerPath = intent.getStringExtra("tokenizer_path"); float temperature = intent.getFloatExtra("temperature", 0.8f); @@ -42,7 +50,7 @@ protected void onCreate(Bundle savedInstanceState) { } mStatsDump = new StatsDump(); - mModelRunner = new ModelRunner(modelPath, tokenizerPath, temperature, this); + mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); mStatsDump.loadStart = System.currentTimeMillis(); } @@ -79,11 +87,21 @@ public void onGenerationStopped() { mTextView.append(mStatsDump.toString()); }); + // TODO (huydhn): Remove txt files here once the JSON format is ready try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { writer.write(mStatsDump.toString()); } catch (IOException e) { e.printStackTrace(); } + + // TODO (huydhn): Figure out on what the final JSON results looks like, we need something + // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042 + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(mStatsDump)); + } catch (IOException e) { + e.printStackTrace(); + } } } From 030fc3f8190c7e577f90d3af73688e9e9e9af191 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 5 Sep 2024 23:12:35 -0500 Subject: [PATCH 220/531] [LLAVA] Enable 2nd XNNPACK Partition pass for the text model Differential Revision: D62279641 Pull Request resolved: https://github.com/pytorch/executorch/pull/4968 --- examples/models/llava/export_llava.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 4f8a403bb34..a41d8d3ba07 100644 --- a/examples/models/llava/export_llava.py +++ 
b/examples/models/llava/export_llava.py @@ -211,10 +211,15 @@ def export_all(llava_model: LlavaModel): partitioner={ "image_encoder": [XnnpackPartitioner()], "text_model": [ + # First partition the DQLinear nodes, then partition the rest of the nodes, + # to avoid multiple DQLinear nodes in the same partition, + # to avoid holding multiple unpacked and packed weight buffers in memory, + # to reduce peak memory footprint. XnnpackPartitioner( config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=True, - ) + ), + XnnpackPartitioner(), ], }, compile_config=EdgeCompileConfig(_check_ir_validity=False), From 97396091080d152a09c20d4fa5a7ef3981b2ed48 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 5 Sep 2024 21:18:30 -0700 Subject: [PATCH 221/531] [llava] Expose prefill image and prompt APIs Differential Revision: D62273041 Pull Request resolved: https://github.com/pytorch/executorch/pull/5119 --- examples/models/llama2/runner/runner.cpp | 4 +- examples/models/llava/runner/llava_runner.cpp | 85 ++++++++++++------- examples/models/llava/runner/llava_runner.h | 42 +++++++++ extension/llm/runner/text_prefiller.cpp | 25 +++--- extension/llm/runner/text_prefiller.h | 2 +- 5 files changed, 110 insertions(+), 48 deletions(-) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 02063ebfa59..2c72b4c724e 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -204,8 +204,8 @@ Error Runner::generate( // print prompts wrapped_callback(prompt); - - auto prefill_res = text_prefiller_->prefill(prompt_tokens, 0); + int64_t pos = 0; + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); stats_.first_token_ms = util::time_in_ms(); stats_.prompt_eval_end_ms = util::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 0fc06da0c56..04c77a1064b 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -72,6 +72,54 @@ Error LlavaRunner::load() { return Error::Ok; } +Error LlavaRunner::prefill_images( + std::vector& images, + int64_t& start_pos) { + for (auto& image : images) { + // pos is updated inside image prefill. + ET_UNWRAP(image_prefiller_->prefill(image, start_pos)); + } + return Error::Ok; +} + +Result LlavaRunner::prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos, + int8_t eos) { + std::vector prompt_tokens = + ET_UNWRAP(tokenizer_->encode(prompt, bos, eos)); + + return text_prefiller_->prefill(prompt_tokens, start_pos); +} + +Error LlavaRunner::generate_from_pos( + const std::string& prompt, + int32_t seq_len, + int64_t start_pos, + std::function token_callback, + std::function + stats_callback) { + // prefill user prompt. No BOS because preset prompt already has it. 
+ token_callback(prompt); + + uint64_t prefill_next_token = + ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); + stats_.num_prompt_tokens = start_pos; + + // Generate tokens + int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( + {prefill_next_token}, start_pos, seq_len, token_callback)); + + // Bookkeeping + stats_.num_generated_tokens = num_generated_tokens; + ::executorch::llm::print_report(stats_); + if (stats_callback) { + stats_callback(stats_); + } + return Error::Ok; +} + Error LlavaRunner::generate( std::vector images, const std::string& prompt, @@ -96,43 +144,14 @@ Error LlavaRunner::generate( int64_t pos = 0; // prefill preset prompt - std::vector preset_prompt_tokens = - ET_UNWRAP(tokenizer_->encode(kPresetPrompt, /*bos=*/1, /*eos=*/0)); - size_t num_preset_tokens = preset_prompt_tokens.size(); - - ET_UNWRAP(text_prefiller_->prefill(preset_prompt_tokens, pos)); - pos += num_preset_tokens; + prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0); // prefill images - for (auto& image : images) { - // pos is updated inside image prefill. - ET_UNWRAP(image_prefiller_->prefill(image, pos)); - } - - // prefill user prompt. No BOS because preset prompt already has it. - wrapped_callback(prompt); - - std::vector user_prompt_tokens = - ET_UNWRAP(tokenizer_->encode(prompt, /*bos=*/0, /*eos=*/0)); - size_t num_user_tokens = user_prompt_tokens.size(); - - uint64_t prefill_next_token = - ET_UNWRAP(text_prefiller_->prefill(user_prompt_tokens, pos)); - pos += num_user_tokens; + prefill_images(images, pos); // Generate tokens - int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - {prefill_next_token}, pos, seq_len, wrapped_callback)); - - // Bookkeeping - stats_.num_prompt_tokens = num_preset_tokens + num_user_tokens; - stats_.num_generated_tokens = num_generated_tokens; - ::executorch::llm::print_report(stats_); - if (stats_callback) { - stats_callback(stats_); - } - - return Error::Ok; + return generate_from_pos( + prompt, seq_len, pos, wrapped_callback, stats_callback); } } // namespace torch::executor diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index 9b14bc9283a..923f8180a83 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -38,6 +38,48 @@ class LlavaRunner : public MultimodalRunner { std::function stats_callback = {}); + /** + * Prefill an LLaVA Module with the given images input. + * @param images The image input to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @return The error status of prefilling images. + */ + Error prefill_images(std::vector& images, int64_t& start_pos); + + /** + * Prefill an LLaVA Module with the given text input. + * @param prompt The text prompt to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. + * @param eos The number of EOS (end of sequence) token. + * @return The generated token of the LLaVA Module after prefill prompt. + */ + Result prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos = 0, + int8_t eos = 0); + + /** + * Generate tokens from the given prompt, starting from the given position. + * @param prompt The text prompt to LLaVA. 
+ * @param seq_len The total sequence length, including the prompt tokens and + * new tokens. + * @param start_pos The starting position in KV cache of the input in the LLM. + * @param token_callback What to do after a token is generated. + * @param stats_callback What to do with Stats. + * @return The error code. + */ + Error generate_from_pos( + const std::string& prompt, + int32_t seq_len = 1024, + int64_t start_pos = 0, + std::function token_callback = {}, + std::function + stats_callback = {}); + private: inline static const std::string kPresetPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "; diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp index e6229e0b807..705583d638b 100644 --- a/extension/llm/runner/text_prefiller.cpp +++ b/extension/llm/runner/text_prefiller.cpp @@ -25,7 +25,7 @@ TextPrefiller::TextPrefiller( ::executorch::runtime::Result TextPrefiller::prefill( std::vector& prompt_tokens, - int64_t start_pos_index) { + int64_t& start_pos) { ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null"); if (!text_decoder_runner_->is_method_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load()); @@ -43,45 +43,46 @@ ::executorch::runtime::Result TextPrefiller::prefill( {1, num_prompt_tokens}, exec_aten::ScalarType::Long); - auto start_pos = - from_blob(&start_pos_index, {1}, exec_aten::ScalarType::Long); + auto start_pos_tensor = + from_blob(&start_pos, {1}, exec_aten::ScalarType::Long); - auto outputs_res = text_decoder_runner_->step(tokens, start_pos); + auto outputs_res = text_decoder_runner_->step(tokens, start_pos_tensor); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_LOG( Info, "Prefill token result numel(): %zu", outputs_res.get().numel()); + start_pos += num_prompt_tokens; cur_token = text_decoder_runner_->logits_to_token(outputs_res.get()); } else { // sequential prefill int64_t pos = 0; // position in the sequence - // token & pos - int64_t pos_data = 0; // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[0]; // initialize tensor wrappers auto tokens = from_blob(&cur_token, {1, 1}, exec_aten::ScalarType::Long); - auto start_pos = from_blob(&pos_data, {1}, exec_aten::ScalarType::Long); + auto start_pos_tensor = + from_blob(&start_pos, {1}, exec_aten::ScalarType::Long); // run the first token and get back logits tensor. Assuming the first token // is bos so don't callback. 
auto logits_tensor = - ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos)); + ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor)); - pos = 1; // start from index 1 + pos += 1; // start the loop from index 1 + start_pos += 1; while (pos < num_prompt_tokens) { // Run the model - pos_data = start_pos_index + pos; - // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds) cur_token = prompt_tokens[pos]; - logits_tensor = ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos)); + logits_tensor = + ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor)); pos++; + start_pos++; } cur_token = text_decoder_runner_->logits_to_token(logits_tensor); diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h index a8ba77b860a..0ea126f32d6 100644 --- a/extension/llm/runner/text_prefiller.h +++ b/extension/llm/runner/text_prefiller.h @@ -36,7 +36,7 @@ class TextPrefiller { */ ::executorch::runtime::Result prefill( std::vector& prompt_tokens, - int64_t start_pos = 0); + int64_t& start_pos); private: TextDecoderRunner* text_decoder_runner_; From 41ec7fa1a7bcbbe43026bf9b4d4f7e97774d7814 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Fri, 6 Sep 2024 12:22:34 -0400 Subject: [PATCH 222/531] [ET-VK] Integrate axis mapping into staging <-> image transfer shaders Differential Revision: D62210117 Pull Request resolved: https://github.com/pytorch/executorch/pull/5093 --- .../runtime/graph/ops/glsl/image_to_nchw.glsl | 9 +- .../runtime/graph/ops/glsl/indexing_utils.h | 93 +++++++++++++++++++ .../ops/glsl/int8_image_to_nchw_noint8.glsl | 9 +- .../runtime/graph/ops/glsl/nchw_to_image.glsl | 9 +- .../ops/glsl/nchw_to_int8_image_noint8.glsl | 15 +-- .../runtime/graph/ops/impl/Convolution.cpp | 2 +- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 8 +- backends/vulkan/test/utils/test_utils.cpp | 7 +- .../vulkan/test/vulkan_compute_api_test.cpp | 15 +-- 9 files changed, 136 insertions(+), 31 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index b51d5a3f6ed..8f113bd2cc2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_buffer(0, "w", "nchw_out", DTYPE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -51,7 +52,7 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); if (any(greaterThanEqual(tensor_idx, sizes))) { return; diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 21eadff0b36..9dc06bd8552 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -183,6 +183,42 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { return tensor_idx; } +/* + * Derive (w,h,c,n) tensor 
indices from (x,y,z) texture position using axis + * mapping. + */ +ivec4 to_tensor_idx( + ivec3 pos, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + // Packed dim contains 4 elements per texel, so moving 1 unit traverses 4 + // elements in the tensor. + pos[axis_mapping[packed_dim]] *= 4; + + ivec4 tensor_idx; + for (int dim = 0; dim < 3; ++dim) { + tensor_idx[dim] = pos[axis_mapping[dim]]; + } + + // Early return if batch is 1. Batch index will be 0. + if (sizes.w == 1) { + tensor_idx.w = 0; + return tensor_idx; + } + + // Else, adjust the dim that's concatenated with batch. Note that the axis + // mapping for the batch dim indicates WHCN dim index of the dim that it is + // concatenated with, not a texture axis. + tensor_idx.w = tensor_idx[axis_mapping[3]] / sizes[axis_mapping[3]]; + tensor_idx[axis_mapping[3]] %= sizes[axis_mapping[3]]; + + return tensor_idx; +} + /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim * is packed along a texel @@ -199,6 +235,34 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } +/* + * Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis + * mapping. + */ +ivec3 to_texture_pos( + const ivec4 idx, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_mapping[dim]] = idx[dim]; + } + + // Adjust batch dim if needed + if (sizes.w > 1) { + pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; + } + + // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 + // tensor elements in that dim. + pos[axis_mapping[packed_dim]] /= 4; + return pos; +} + /* * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim * is packed along a texel @@ -218,6 +282,35 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } +/* + * Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index using + * the axis mapping. + */ +ivec4 to_texture_elem_pos( + const ivec4 idx, + ivec4 sizes, + const ivec4 axis_mapping, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec4 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_mapping[dim]] = idx[dim]; + } + + // Adjust batch dim if needed + if (sizes.w > 1) { + pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes.w; + } + + // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4 + // tensor elements in that dim. 
+ pos[axis_mapping[packed_dim]] /= 4; + pos.w = idx[packed_dim] % 4; + return pos; +} + // // Texel Access and Storage // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl index b1e3a0abdfe..3ef984bfc95 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -16,10 +16,11 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_buffer(0, "w", "nchw_out", "int")} -${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} -${layout_declare_ubo(3, "int", "out_numel")} +${layout_declare_buffer(B, "w", "nchw_out", "int")} +${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")} +${layout_declare_ubo(B, "ivec4", "tensor_sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} +${layout_declare_ubo(B, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index abe93904805..04b6a26cc44 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -53,7 +54,7 @@ VEC4_T read_texel(ivec4 tensor_idx) { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); if (any(greaterThanEqual(tensor_idx, sizes))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl index 378cf09d129..813a174d2a5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl @@ -16,9 +16,10 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")} -${layout_declare_buffer(1, "r", "nchw_in", "int")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} +${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")} +${layout_declare_buffer(B, "r", "nchw_in", "int")} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_mapping")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -36,7 +37,7 @@ int extend_sign(int x) { ivec4 read_texel(ivec4 tensor_idx) { const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, tensor_sizes, packed_dim); + tensor_idx, sizes, packed_dim); int shift = (1 << 8) - 1; ivec4 masks; @@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) { ivec4 out_tex = ivec4(0); [[unroll]] for (int i = 0; i < 4; ++i) { - if (tensor_idx[packed_dim] + i < 
tensor_sizes[packed_dim]) { + if (tensor_idx[packed_dim] + i < sizes[packed_dim]) { int in_texel = nchw_in[buf_indices[i] / 4]; int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); extracted_val = extend_sign(extracted_val); @@ -64,9 +65,9 @@ ivec4 read_texel(ivec4 tensor_idx) { void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim); + const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim); - if (any(greaterThanEqual(tensor_idx, tensor_sizes))) { + if (any(greaterThanEqual(tensor_idx, sizes))) { return; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 74113197d46..dcdd2dccfa0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,7 +106,7 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo()}, + {t->sizes_ubo(), t->axis_mapping_ubo()}, // Specialization constants {SV(t->packed_dim_whcn_idx())})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 9df5b73c1a1..6a759e0fd2e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -31,7 +31,8 @@ void add_staging_to_tensor_node( graph.strides_ubo(out_tensor), graph.numel_ubo(out_tensor)}); } else { - ubos.append(graph.sizes_ubo(out_tensor)); + ubos.append( + {graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -69,7 +70,8 @@ void add_tensor_to_staging_node( graph.strides_ubo(in_tensor), graph.numel_ubo(in_tensor)}); } else { - ubos.append(graph.sizes_ubo(in_tensor)); + ubos.append( + {graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)}); } // Normally, the image_to_nchw shader is structured so that each thread reads @@ -113,7 +115,7 @@ ValueRef prepack( if (graph.is_buffer_storage(v)) { ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); } else { - ubos.append(graph.sizes_ubo(v)); + ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)}); } graph.prepack_nodes().emplace_back(new PrepackNode( diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index a469a44dc1a..4feaecced53 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -85,7 +85,8 @@ void record_nchw_to_image_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo()); + v_dst.sizes_ubo(), + v_dst.axis_mapping_ubo()); } void record_image_to_nchw_op( @@ -106,7 +107,8 @@ void record_image_to_nchw_op( 0, dst_buffer, v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo()); + v_src.sizes_ubo(), + v_src.axis_mapping_ubo()); } void record_int8_image_to_nchw_noint8_op( @@ -127,6 +129,7 @@ void record_int8_image_to_nchw_noint8_op( dst_buffer.buffer(), v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), + v_src.axis_mapping_ubo(), v_src.numel_ubo()); } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index c7d20c38675..53d0c820f41 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -1233,8 +1233,8 @@ 
TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; + std::vector size_big = {1, 8, 8}; + std::vector size_small = {1, 1, 8}; // Build graph @@ -1415,8 +1415,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader + // +2: t.axis_mapping_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); + EXPECT_TRUE(get_vma_allocation_count() == 6); ValueRef c = graph.add_tensor( size_big, @@ -1433,8 +1434,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() uniform buffer for staging shader + // +1: t.axis_mapping_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 9); + EXPECT_TRUE(get_vma_allocation_count() == 12); ValueRef e = graph.add_tensor( size_big, @@ -1450,14 +1452,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() for staging shader + // +1: t.axis_mapping_ubo() for staging shader // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 13); + EXPECT_TRUE(get_vma_allocation_count() == 17); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 16); + EXPECT_TRUE(get_vma_allocation_count() == 20); // Run graph From a48f91653380e15b90b6f1cc9ea617081bc5d64d Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Fri, 6 Sep 2024 09:33:16 -0700 Subject: [PATCH 223/531] Revert "Implement dumping operator distribution for TOSA graph" (#5131) Reverts #4970 The pull / unittest-arm (buck2) / linux-job and trunk / test-arm-reference-delegation / linux-job jobs started failing in trunk (dashboard) after this PR merged. See #4970 (comment) for more details. --- backends/arm/test/misc/test_debug_feats.py | 85 ++++++---------------- backends/arm/test/tester/arm_tester.py | 47 ++---------- 2 files changed, 30 insertions(+), 102 deletions(-) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index dd59fddbd47..aa9703f9eba 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -126,67 +126,26 @@ def test_numerical_diff_prints(self): self.fail() -def test_dump_ops_and_dtypes(): - model = Linear(20, 30) - ( - ArmTester( - model, - example_inputs=model.get_inputs(), - compile_spec=common.get_tosa_compile_spec(), - ) - .quantize() - .dump_dtype_distribution() - .dump_operator_distribution() - .export() - .dump_dtype_distribution() - .dump_operator_distribution() - .to_edge() - .dump_dtype_distribution() - .dump_operator_distribution() - .partition() - .dump_dtype_distribution() - .dump_operator_distribution() - ) - # Just test that there are no execptions. 
- - -def test_dump_tosa_ops(capsys): - model = Linear(20, 30) - ( - ArmTester( - model, - example_inputs=model.get_inputs(), - compile_spec=common.get_tosa_compile_spec(), - ) - .quantize() - .export() - .to_edge() - .partition() - .dump_operator_distribution() - ) - captured = capsys.readouterr() - assert "Partition operators:" in captured.out - assert "TOSA operators:" in captured.out - - -def test_fail_dump_tosa_ops(capsys): - class Add(torch.nn.Module): - def forward(self, x): - return x + x - - model = Add() - compile_spec = common.get_tosa_compile_spec_unbuilt() - compile_spec.output_format = "vela" - ( - ArmTester( - model, example_inputs=(torch.ones(5),), compile_spec=compile_spec.build() +class TestDumpOperatorsAndDtypes(unittest.TestCase): + def test_dump_ops_and_dtypes(self): + model = Linear(20, 30) + ( + ArmTester( + model, + example_inputs=model.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .dump_dtype_distribution() + .dump_operator_distribution() + .export() + .dump_dtype_distribution() + .dump_operator_distribution() + .to_edge() + .dump_dtype_distribution() + .dump_operator_distribution() + .partition() + .dump_dtype_distribution() + .dump_operator_distribution() ) - .quantize() - .export() - .to_edge() - .partition() - .dump_operator_distribution() - ) - captured = capsys.readouterr() - assert "Partition operators:" in captured.out - assert "Can not get operator distribution for vela command stream." in captured.out + # Just test that there are no execeptions. diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index ec44a02739e..98fac29144c 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -13,7 +13,7 @@ import numpy as np -import torch.fx +import torch from executorch.backends.arm.arm_backend import get_intermediate_path, is_permute_memory from executorch.backends.arm.arm_partitioner import ArmPartitioner @@ -297,7 +297,9 @@ def get_graph(self, stage: str | None = None) -> Graph: return graph - def dump_operator_distribution(self, path_to_dump: Optional[str] = None): + def dump_operator_distribution( + self, path_to_dump: Optional[str] = None + ) -> ArmQuantizer: """Dump a dictionary with {operator: operator count} for the operators in the graph of the current stage. @@ -305,16 +307,13 @@ def dump_operator_distribution(self, path_to_dump: Optional[str] = None): """ graph = self.get_graph(self.cur) op_dist = _get_operator_distribution(graph) - to_print = self.cur + " operators: " + _format_dict(dict(op_dist)) + "\n" - - if self.cur == self.stage_name(tester.Partition): - to_print += _get_tosa_operator_distribution( - self.get_artifact(self.cur).exported_program().graph_module - ) + to_print = self.cur + " operators: " + _format_dict(op_dist) + "\n" _dump_str(to_print, path_to_dump) return self - def dump_dtype_distribution(self, path_to_dump: Optional[str] = None): + def dump_dtype_distribution( + self, path_to_dump: Optional[str] = None + ) -> ArmQuantizer: """Dump a dictionary with {dtype: dtype count} for the dtypes of the nodes in the graph of the current stage. @@ -422,36 +421,6 @@ def _get_operator_distribution(graph: Graph) -> dict[str, int]: ) -def _get_tosa_operator_distribution(graph_module: torch.fx.GraphModule) -> str: - """Counts the occurences of operator names of all lowered modules containing - a TOSA flatbuffer. - The result is a string with the operator distribution or an error message. 
- """ - op_list = [] - id = 0 - while lowered_module := getattr(graph_module, f"lowered_module_{id}", None): - for spec in lowered_module.compile_specs: - if spec.key != "output_format": - continue - if spec.value == b"tosa": - tosa_fb = lowered_module.processed_bytes - tosa_json = dbg_tosa_fb_to_json(tosa_fb) - for region in tosa_json["regions"]: - for block in region["blocks"]: - op_list.extend( - [operator["op"] for operator in block["operators"]] - ) - break - elif spec.value == b"vela": - return "Can not get operator distribution for vela command stream." - else: - return f"Unknown output format '{spec.value}'." - id += 1 - if id == 0: - return "No delegate with name 'lowered_module_0 found in graph module." - return "TOSA operators: " + _format_dict(dict(Counter(op_list))) - - def _dump_str(to_print: str, path_to_dump: Optional[str] = None): if path_to_dump: with open(path_to_dump, "a") as fp: From 20d93fb165ccac277be7d71fa9ec3aeab7ab045d Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:36:57 -0700 Subject: [PATCH 224/531] Trigger android.yml with tag ciflow/android/* Pull Request resolved: https://github.com/pytorch/executorch/pull/4990 --- .github/workflows/android.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 4c693a90e61..1ea7f398cee 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -5,6 +5,8 @@ on: branches: - main - release/* + tags: + - ciflow/android/* pull_request: paths: - .ci/docker/** From 7122d310844ba701db1f45a13ccaa8c5a002fc89 Mon Sep 17 00:00:00 2001 From: cccclai Date: Fri, 6 Sep 2024 10:10:54 -0700 Subject: [PATCH 225/531] Add proper pt2e calibration (#5095) * Add proper pt2e calibration * distinguish dynamic shape * remove unnecessary code * remove unnecessary code * add comments * Address comments and add template calibration * remove logging * address comments * remove cuda * add graph module eval wrapper --- examples/models/llama2/eval_llama_lib.py | 65 +++++++++++-- examples/models/llama2/export_llama_lib.py | 29 +++++- extension/llm/export/builder.py | 101 ++++++++++++++++++++- 3 files changed, 183 insertions(+), 12 deletions(-) diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 3ea4e77a1a6..7cdde228b35 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -29,6 +29,51 @@ ) +class GraphModuleEvalWrapper(EagerEvalWrapper): + """ + A wrapper class for ExecuTorch py-binded integration with the + lm-evaluation-harness library. + """ + + def __init__( + self, + model: torch.fx.GraphModule, + tokenizer: Union[SentencePieceTokenizer, Tiktoken], + max_seq_length: Optional[int] = None, + use_kv_cache: bool = False, + enable_dynamic_shape: bool = True, + ): + super().__init__( + model=model, tokenizer=tokenizer, max_seq_length=max_seq_length + ) + self._model = model.to(self.device) + self._use_kv_cache = use_kv_cache + self._enable_dynamic_shape = enable_dynamic_shape + + def _model_call(self, inps): + if self._use_kv_cache: + if not self._enable_dynamic_shape: + # graph module exported without dynamic shape won't work with a different shape. + # And we have to do single token prefill here. 
+ result_logits = [] + for pos in range(inps.shape[-1]): + pos_tensor = torch.tensor([pos], dtype=torch.int64) + logits = self._model(inps[:, pos : pos + 1], pos_tensor) + result_logits.append(logits) + return torch.cat(result_logits, dim=1) + else: + pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) + # Batch process the whole sequence. + logits = self._model(inps[:, : self._max_seq_length], pos_tensor) + return logits + + else: + return self._model(inps) + + def _model_generate(self, context, max_length, eos_token_id): + raise Exception("unimplemented") + + class ETPybindEvalWrapper(EagerEvalWrapper): """ A wrapper class for ExecuTorch py-binded integration with the @@ -148,6 +193,13 @@ def gen_eval_wrapper( if torch.cuda.is_available() else manager.pre_autograd_graph_module.to(device="cpu") ) + return GraphModuleEvalWrapper( + model=model, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + use_kv_cache=args.use_kv_cache, + enable_dynamic_shape=args.enable_dynamic_shape, + ) else: # TODO: use manager.pre_autograd_graph_module for the eval to remove the if-else branch # for quantizers. Currently capture_pre_autograd_graph only works with --kv_cache, but @@ -157,13 +209,12 @@ def gen_eval_wrapper( if torch.cuda.is_available() else manager.model.eval().to(device="cpu") ) - - return EagerEvalWrapper( - model=model, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - use_kv_cache=args.use_kv_cache, - ) + return EagerEvalWrapper( + model=model, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + use_kv_cache=args.use_kv_cache, + ) def build_args_parser() -> argparse.ArgumentParser: diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 1dac12cc853..5dac3e9adbb 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -16,7 +16,7 @@ from enum import Enum from json import JSONDecodeError from pathlib import Path -from typing import Optional, Union +from typing import List, Optional, Union import pkg_resources @@ -166,19 +166,25 @@ def build_args_parser() -> argparse.ArgumentParser: nargs="+", type=str, default=None, - help="Tasks for GPTQ calibration", + help="Tasks for GPTQ calibration from lm_eval", ) parser.add_argument( "--calibration_limit", type=int, default=None, - help="number of samples used for calibration", + help="number of samples used for calibration from lm_eval", ) parser.add_argument( "--calibration_seq_length", type=int, default=None, - help="Sequence length for GPTQ calibration", + help="Sequence length for GPTQ calibration from lm_eval", + ) + parser.add_argument( + "--calibration_data", + type=str, + default="Once upon a time", + help="Calibration prompts from users", ) parser.add_argument( "-t", @@ -421,6 +427,11 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: generate_full_logits=args.generate_full_logits, weight_type=weight_type, enable_dynamic_shape=args.enable_dynamic_shape, + calibration_tasks=args.calibration_tasks, + calibration_limit=args.calibration_limit, + calibration_seq_length=args.calibration_seq_length, + calibration_data=args.calibration_data, + tokenizer_path=args.tokenizer_path, verbose=args.verbose, max_seq_len=args.max_seq_length, metadata_str=args.metadata, @@ -630,6 +641,11 @@ def _load_llama_model( generate_full_logits: bool = False, weight_type: WeightType = WeightType.LLAMA, enable_dynamic_shape: bool = False, + calibration_tasks: Optional[List[str]] = None, + 
calibration_limit: Optional[int] = None, + calibration_seq_length: Optional[int] = None, + calibration_data: Optional[str] = None, + tokenizer_path: Optional[str] = None, verbose: bool = False, max_seq_len: int = 128, metadata_str: Optional[str] = None, @@ -685,6 +701,11 @@ def _load_llama_model( use_kv_cache=use_kv_cache, example_inputs=example_inputs, enable_dynamic_shape=enable_dynamic_shape, + calibration_tasks=calibration_tasks, + calibration_limit=calibration_limit, + calibration_seq_length=calibration_seq_length, + calibration_data=calibration_data, + tokenizer_path=tokenizer_path, verbose=verbose, metadata=_load_llama_model_metadata( weight_type, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 4f5bab7bc02..70ecab898f9 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -27,6 +27,7 @@ from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.extension.export_util.utils import export_to_edge, save_pte_program +from executorch.extension.llm.tokenizer.utils import get_tokenizer from torch._export import capture_pre_autograd_graph from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer import Quantizer @@ -66,6 +67,11 @@ def __init__( use_kv_cache, example_inputs, enable_dynamic_shape: bool = False, + calibration_tasks: Optional[List[str]] = None, + calibration_limit: Optional[int] = None, + calibration_seq_length: Optional[int] = None, + calibration_data: Optional[str] = None, + tokenizer_path: Optional[str] = None, verbose: bool = False, metadata: Optional[dict] = None, dynamic_shapes: Optional[Any] = None, @@ -87,6 +93,11 @@ def __init__( self.output_dir = "." self.dynamic_shapes = dynamic_shapes self._saved_pte_filename = None + self.calibration_tasks = calibration_tasks + self.calibration_limit = calibration_limit + self.calibration_seq_length = calibration_seq_length + self.calibration_data = calibration_data + self.tokenizer_path = tokenizer_path def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -167,6 +178,69 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": ) return self + def pt2e_calibrate( + self, + prepared_module, + calibration_tasks, + calibration_limit, + calibration_seq_length, + calibration_data, + tokenizer_path, + ): + logging.info("Run calibration...") + try: + from executorch.examples.models.llama2.eval_llama_lib import ( + GraphModuleEvalWrapper, + ) + from executorch.examples.models.llama2.evaluate import evaluate_model + except ImportError: + raise ImportError( + "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh" + ) + + tokenizer = get_tokenizer(tokenizer_path) + + def calibrate_template( + module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int + ): + # TODO: change criteria & support batch inputs if necessary + pos = torch.tensor(0, dtype=torch.int64) + token_list = tokenizer.encode(prompts, bos=True, eos=False) + + with torch.no_grad(): + while token_list[-1] != tokenizer.eos_id and pos < max_len: + logits = module( + torch.full((1, 1), token_list[pos]), + torch.tensor((pos,)), + ) + pos += 1 + if pos >= len(token_list): + token_list.append(torch.argmax(logits[:], dim=-1).item()) + + calibrate_template( + module=prepared_module, + tokenizer=tokenizer, + prompts=calibration_data, + max_len=calibration_seq_length, + ) + + eval_wrapper = GraphModuleEvalWrapper( + model=prepared_module, + tokenizer=tokenizer, + 
max_seq_length=calibration_seq_length, + use_kv_cache=self.use_kv_cache, + enable_dynamic_shape=self.enable_dynamic_shape, + ) + eval_results = evaluate_model( + eval_wrapper, + calibration_tasks, + calibration_limit, + ) + + for task, res in eval_results["results"].items(): + print(f"{task}: {res}") + logging.info("Calibration finish...") + def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager": """ Quantize the model via pt2e flow and retrieve LLMEdgeManager including the quantized model. @@ -189,8 +263,33 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage self.pre_autograd_graph_module is not None ), "Please run capture_pre_autograd_graph first" m = prepare_pt2e(self.pre_autograd_graph_module, composed_quantizer) + logging.info( + f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" + ) # Calibrate - m(*self.example_inputs) + if ( + self.calibration_tasks is not None + and self.calibration_limit is not None + and self.calibration_seq_length is not None + and self.calibration_data is not None + and self.tokenizer_path is not None + ): + logging.info( + f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" + ) + self.pt2e_calibrate( + prepared_module=m, + calibration_tasks=self.calibration_tasks, + calibration_limit=self.calibration_limit, + calibration_seq_length=self.calibration_seq_length, + calibration_data=self.calibration_data, + tokenizer_path=self.tokenizer_path, + ) + else: + logging.info( + "No calibration provided, using dummy input to calibrate..." + ) + m(*self.example_inputs) m = convert_pt2e(m) DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m From c83fd2e2f480f894b9e9be7f5dcb1abe518141fc Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Fri, 6 Sep 2024 11:06:01 -0700 Subject: [PATCH 226/531] Build frameworks with EXECUTORCH_XNNPACK_SHARED_WORKSPACE flag. 
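
Enabling EXECUTORCH_XNNPACK_SHARED_WORKSPACE lets the XNNPACK delegate instances
built into the Apple frameworks share a single scratch workspace instead of each
allocating their own, which is intended to reduce memory use when several
delegated methods are loaded. A minimal sketch of an equivalent local configure
step (the extra option and output directory below are illustrative, not part of
this change):

    cmake -DEXECUTORCH_BUILD_XNNPACK=ON \
          -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \
          -Bcmake-out .
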
Differential Revision: D62308655 Pull Request resolved: https://github.com/pytorch/executorch/pull/5135 --- build/build_apple_frameworks.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index 8bd9e0539ff..348111e2b4c 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -163,6 +163,7 @@ cmake_build() { -DEXECUTORCH_BUILD_COREML=$COREML \ -DEXECUTORCH_BUILD_MPS=$MPS \ -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ From 2ce4ad1218d19d4689cdfe0aea52497e628197ce Mon Sep 17 00:00:00 2001 From: lucylq Date: Fri, 6 Sep 2024 11:58:56 -0700 Subject: [PATCH 227/531] Remove extract_constant_segment from config Differential Revision: D61996249 Pull Request resolved: https://github.com/pytorch/executorch/pull/5096 --- exir/_serialize/_program.py | 26 ++++++++---------- exir/capture/_config.py | 6 ---- exir/program/_program.py | 5 ---- runtime/executor/test/method_test.cpp | 8 ++---- runtime/executor/test/program_test.cpp | 7 ++--- runtime/executor/test/targets.bzl | 5 ++-- schema/program.fbs | 1 + test/end2end/exported_module.py | 2 -- .../ModuleLinear-no-constant-segment.pte | Bin 0 -> 1040 bytes test/models/deprecated/README.md | 14 ++++++++++ test/models/deprecated/TARGETS | 12 ++++++++ test/models/export_program.py | 21 ++++++-------- test/run_oss_cpp_tests.sh | 8 +++--- 13 files changed, 59 insertions(+), 56 deletions(-) create mode 100644 test/models/deprecated/ModuleLinear-no-constant-segment.pte create mode 100644 test/models/deprecated/README.md create mode 100644 test/models/deprecated/TARGETS diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 24ee6bd21a3..aa5aba1fd7a 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -347,7 +347,6 @@ def serialize_pte_binary( *, mutable_data: Optional[List[Buffer]] = None, extract_delegate_segments: bool = False, - extract_constant_segment: bool = True, segment_alignment: int = 128, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, @@ -363,8 +362,6 @@ def serialize_pte_binary( and the starting segment offset. - Update the Program.segments field with the offsets and lengths of each segment. - extract_constant_segment: Whether to move the constant data from the Program - into a separate segment. segment_alignment: Alignment in bytes. The starting offset of each segment will be aligned to this value in the output data. constant_tensor_alignment: The minimum alignment of tensor @@ -387,19 +384,18 @@ def serialize_pte_binary( # Store extracted segment data; this may be constant data or delegate data. segments: List[Cord] = [] - if extract_constant_segment: - constant_segment_data, constant_segment_offsets = _extract_constant_segment( - program.constant_buffer, tensor_alignment=constant_tensor_alignment + constant_segment_data, constant_segment_offsets = _extract_constant_segment( + program.constant_buffer, tensor_alignment=constant_tensor_alignment + ) + if len(constant_segment_data) > 0: + # Update program.constant_segment with constant subsegment offset information. + program.constant_segment = SubsegmentOffsets( + segment_index=len(segments), offsets=constant_segment_offsets ) - if len(constant_segment_data) > 0: - # Update program.constant_segment with constant subsegment offset information. 
- program.constant_segment = SubsegmentOffsets( - segment_index=len(segments), offsets=constant_segment_offsets - ) - # Clear the constant buffer, as constant data will be stored in segments. - program.constant_buffer = [] - # Add to the aggregate segments cord. - segments.append(constant_segment_data) + # Clear the constant buffer, as constant data will be stored in segments. + program.constant_buffer = [] + # Add to the aggregate segments cord. + segments.append(constant_segment_data) if mutable_data is not None: mutable_segment_data, mutable_segment_offsets = _extract_constant_segment( diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 7b91464bdce..2d0a6c4ca80 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -65,12 +65,6 @@ class ExecutorchBackendConfig: # This makes it possible to free those blobs at runtime. extract_delegate_segments: bool = True - # Whether to extract constants from the Program into separate segments, - # rather than encoding those constants in the flatbuffer data. - # This reduces the memory overhead of creating the .pte file for models with - # large constant data. - extract_constant_segment: bool = True - # When extracting segments, the starting offset of each segment will be # aligned to this value (in bytes). Must be a power of two. segment_alignment: int = 128 diff --git a/exir/program/_program.py b/exir/program/_program.py index 849eae4f6f0..1339760f215 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -439,7 +439,6 @@ def to_executorch( new_prog, emit_stacktrace=config.emit_stacktrace, extract_delegate_segments=config.extract_delegate_segments, - extract_constant_segment=config.extract_constant_segment, segment_alignment=config.segment_alignment, constant_tensor_alignment=config.constant_tensor_alignment, delegate_alignment=config.delegate_alignment, @@ -468,7 +467,6 @@ def __init__( exir_exported_program: ExirExportedProgram, emit_stacktrace: bool, extract_delegate_segments: bool, - extract_constant_segment: bool, segment_alignment: int, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, @@ -483,7 +481,6 @@ def __init__( self._emitter_output: Optional[EmitterOutput] = None self._emit_stacktrace: bool = emit_stacktrace self._extract_delegate_segments: bool = extract_delegate_segments - self._extract_constant_segment: bool = extract_constant_segment self._segment_alignment: int = segment_alignment self._constant_tensor_alignment: Optional[int] = constant_tensor_alignment self._delegate_alignment: Optional[int] = delegate_alignment @@ -493,7 +490,6 @@ def _get_pte_data(self) -> Cord: self._pte_data = _serialize_pte_binary( program=self.program, extract_delegate_segments=self._extract_delegate_segments, - extract_constant_segment=self._extract_constant_segment, segment_alignment=self._segment_alignment, constant_tensor_alignment=self._constant_tensor_alignment, delegate_alignment=self._delegate_alignment, @@ -1351,7 +1347,6 @@ def __init__( program=self._emitter_output.program, mutable_data=self._emitter_output.mutable_data, extract_delegate_segments=backend_config.extract_delegate_segments, - extract_constant_segment=backend_config.extract_constant_segment, segment_alignment=backend_config.segment_alignment, constant_tensor_alignment=backend_config.constant_tensor_alignment, delegate_alignment=backend_config.delegate_alignment, diff --git a/runtime/executor/test/method_test.cpp b/runtime/executor/test/method_test.cpp index 06b84d338e1..0163c8ceef9 100644 --- 
a/runtime/executor/test/method_test.cpp +++ b/runtime/executor/test/method_test.cpp @@ -59,11 +59,9 @@ class MethodTest : public ::testing::Test { load_program(std::getenv("ET_MODULE_INDEX_PATH"), "index"); load_program( std::getenv("ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH"), "cat"); + load_program(std::getenv("ET_MODULE_LINEAR_PATH"), "linear"); load_program( - std::getenv("ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH"), - "linear_constant_segment"); - load_program( - std::getenv("ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), + std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"), "linear_constant_buffer"); } @@ -274,7 +272,7 @@ TEST_F(MethodTest, ConstantSegmentTest) { // Execute model with constants stored in segment. ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); Result method = - programs_["linear_constant_segment"]->load_method("forward", &mmm.get()); + programs_["linear"]->load_method("forward", &mmm.get()); ASSERT_EQ(method.error(), Error::Ok); // Can execute the method. diff --git a/runtime/executor/test/program_test.cpp b/runtime/executor/test/program_test.cpp index 00e8b0e234b..2cc9b4369db 100644 --- a/runtime/executor/test/program_test.cpp +++ b/runtime/executor/test/program_test.cpp @@ -382,8 +382,7 @@ TEST_F(ProgramTest, DEPRECATEDLoad) { TEST_F(ProgramTest, LoadConstantSegment) { // Load the serialized ModuleLinear data, with constants in the segment and no // constants in the flatbuffer. - const char* linear_path = - std::getenv("ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH"); + const char* linear_path = std::getenv("ET_MODULE_LINEAR_PATH"); Result linear_loader = FileDataLoader::from(linear_path); ASSERT_EQ(linear_loader.error(), Error::Ok); @@ -424,11 +423,11 @@ TEST_F(ProgramTest, LoadConstantSegment) { EXPECT_GE(flatbuffer_program->constant_segment()->offsets()->size(), 1); } -TEST_F(ProgramTest, LoadConstantSegmentWithNoConstantSegment) { +TEST_F(ProgramTest, LoadConstantSegmentWhenConstantBufferExists) { // Load the serialized ModuleLinear data, with constants in the flatbuffer and // no constants in the segment. const char* linear_path = - std::getenv("ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"); + std::getenv("DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH"); Result linear_loader = FileDataLoader::from(linear_path); ASSERT_EQ(linear_loader.error(), Error::Ok); diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index d6e3bc3d89d..72923e9868f 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -97,6 +97,8 @@ def define_common_targets(is_fbcode = False): # file in fbcode. See https://fburl.com/9esapdmd if not runtime.is_oss and is_fbcode: modules_env = { + # Deprecated model that still works with ExecuTorch runtime. + "DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH": "$(location fbcode//executorch/test/models/deprecated:ModuleLinear-no-constant-segment.pte)", # The tests use this var to find the program file to load. This uses # an fbcode target path because the authoring/export tools # intentionally don't work in xplat (since they're host-only tools). 
@@ -104,8 +106,7 @@ def define_common_targets(is_fbcode = False): "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", "ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleDynamicCatUnallocatedIO.pte])", "ET_MODULE_INDEX_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleIndex.pte])", - "ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte])", - "ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", + "ET_MODULE_LINEAR_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleLinear.pte])", "ET_MODULE_MULTI_ENTRY_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleMultipleEntry.pte])", "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", } diff --git a/schema/program.fbs b/schema/program.fbs index cbdda2d3606..e3c7597fcdc 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -429,6 +429,7 @@ table Program { // Each constant is assigned an index into the table which are each individually aligned. // 0 index is reserved to be pointed to by non-constant Tensors. // If this field is non-empty, constant_segment.offsets must be empty. + // DEPRECATED: After D61996249 on 2024-09-05, no new PTE files will use this field. constant_buffer:[Buffer]; // List of delegate data. Pointed to by BackendDelegateDataReference. diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index 656b5705122..6e6b97b7186 100644 --- a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -67,7 +67,6 @@ def export( ignore_to_out_var_failure: bool = False, dynamic_memory_planning_mode: DynamicMemoryPlanningMode = DynamicMemoryPlanningMode.UPPER_BOUND, capture_config=None, - extract_constant_segment: bool = True, skip_type_promotion: bool = False, export_joint_graph: bool = False, ) -> "ExportedModule": @@ -206,7 +205,6 @@ def __init__(self, method): dynamic_memory_planning_mode=dynamic_memory_planning_mode, memory_planning_pass=memory_planning_pass, to_out_var_pass=ToOutVarPass(ignore_to_out_var_failure), - extract_constant_segment=extract_constant_segment, ) ) diff --git a/test/models/deprecated/ModuleLinear-no-constant-segment.pte b/test/models/deprecated/ModuleLinear-no-constant-segment.pte new file mode 100644 index 0000000000000000000000000000000000000000..42b8643fb91a6709d40a70b0ac68d0a194d9e878 GIT binary patch literal 1040 zcmaJ=y-osQ5F8NBlMr(;giugi!WBnC!4V5XMWG#)xyo2bg2qB6ArKQ|Vc{eA7(Ruu zpvqi8F`s$h zkmQi46HkoWS)`BY%>2wVj&dzRybJ2k4DvPOXga|j&%xyKuG<|9hxDKS<=jqh$f1N} zr$C9P(OqKfk(dxx(^2XN+%w&r;&ay5efdQ~I`k}GF(FR*W%@2ZikfeeXCRqbcLi&< zu1?ME5Z8Rr-e&Zry+x=;f01a)tB07HLwwz}_F;4s#eXw=N57Zd;VEObmoBQO#jD{L z^{xR2d<%DZ>W!3nj3nmxJSv2GAg!nRs0Mt5b#zv|CY z&C>F2d`H@}dpp-}P31LB;_E-RoYqVB#c)<@+s?Ed{nF}ZcPy^vP~EuGh#KuMjv7%I VG+W2*u-Rld>2)<4-w(z^`~V(NT@e5P literal 0 HcmV?d00001 diff --git a/test/models/deprecated/README.md b/test/models/deprecated/README.md new file mode 100644 index 00000000000..f1d47d03264 --- /dev/null +++ b/test/models/deprecated/README.md @@ -0,0 +1,14 @@ +## Deprecated Models + +This readme documents deprecated models that remain compatible with versions of the ExecuTorch runtime. 
+ +ModuleLinear-no-constant-segment.pte +- This file contains constants stored in the constant_buffer, which was deprecated in D61996249 on 2024-09-05. Now, constants are stored in a separate segment. +- This .pte file was generated internally using hg commit hash rFBS5e49dc0319b1d2d9969bbcef92857ab76a899c34, with command: + ``` + buck2 build fbcode//executorch/test/models:exported_programs[ModuleLinear-no-constant-segment.pte] --show-output + ``` +- In OSS, the same .pte file can be generated with https://github.com/pytorch/executorch/commit/cea5abbcdded, via: + ``` + python -m test.models.export_program --modules "ModuleLinear" --outdir . + ``` diff --git a/test/models/deprecated/TARGETS b/test/models/deprecated/TARGETS new file mode 100644 index 00000000000..369fc3c4067 --- /dev/null +++ b/test/models/deprecated/TARGETS @@ -0,0 +1,12 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.export_file( + name = "ModuleLinear-no-constant-segment.pte", + src = "ModuleLinear-no-constant-segment.pte", + visibility = [ + "//executorch/runtime/executor/test/...", + "//executorch/test/...", + ], +) diff --git a/test/models/export_program.py b/test/models/export_program.py index 7941af376fe..d753475b829 100644 --- a/test/models/export_program.py +++ b/test/models/export_program.py @@ -190,7 +190,6 @@ def export_joint(): def export_module_to_program( module_class: Type[nn.Module], - extract_constant_segment: bool, skip_type_promotion: bool, ): """Exports the module and returns the serialized program data.""" @@ -211,7 +210,6 @@ def export_module_to_program( module = ExportedModule.export( module_class, methods, - extract_constant_segment=extract_constant_segment, skip_type_promotion=skip_type_promotion, export_joint_graph=export_joint, **export_kwargs, @@ -259,18 +257,15 @@ def main() -> None: # Skip type promotion to keep the model in fp16. # Type promotion will convert to fp32. 
skip_type_promotion = True - for extract_constant_segment in (True, False): - suffix = "" if extract_constant_segment else "-no-constant-segment" - outfile = os.path.join(args.outdir, f"{module_name}{suffix}.pte") - with open(outfile, "wb") as fp: - fp.write( - export_module_to_program( - module_class, - extract_constant_segment=extract_constant_segment, - skip_type_promotion=skip_type_promotion, - ) + outfile = os.path.join(args.outdir, f"{module_name}.pte") + with open(outfile, "wb") as fp: + fp.write( + export_module_to_program( + module_class, + skip_type_promotion=skip_type_promotion, ) - print(f"Exported {module_name} and wrote program data to {outfile}") + ) + print(f"Exported {module_name} and wrote program data to {outfile}") if __name__ == "__main__": diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 3693700e831..078196bfc1e 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -56,23 +56,23 @@ export_test_model() { python3 -m test.models.export_program --modules "ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain" --outdir "cmake-out" 2> /dev/null python3 -m test.models.export_delegated_program --modules "ModuleAddMul" --backend_id "StubBackend" --outdir "cmake-out" || true + DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH="$(realpath test/models/deprecated/ModuleLinear-no-constant-segment.pte)" ET_MODULE_ADD_HALF_PATH="$(realpath cmake-out/ModuleAddHalf.pte)" ET_MODULE_ADD_PATH="$(realpath cmake-out/ModuleAdd.pte)" ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH="$(realpath cmake-out/ModuleDynamicCatUnallocatedIO.pte)" ET_MODULE_INDEX_PATH="$(realpath cmake-out/ModuleIndex.pte)" - ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH="$(realpath cmake-out/ModuleLinear-no-constant-segment.pte)" - ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH="$(realpath cmake-out/ModuleLinear.pte)" + ET_MODULE_LINEAR_PATH="$(realpath cmake-out/ModuleLinear.pte)" ET_MODULE_MULTI_ENTRY_PATH="$(realpath cmake-out/ModuleMultipleEntry.pte)" ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH="$(realpath cmake-out/ModuleAddMul-nosegments-da1024.pte)" ET_MODULE_ADD_MUL_NOSEGMENTS_PATH="$(realpath cmake-out/ModuleAddMul-nosegments.pte)" ET_MODULE_ADD_MUL_PATH="$(realpath cmake-out/ModuleAddMul.pte)" ET_MODULE_SIMPLE_TRAIN_PATH="$(realpath cmake-out/ModuleSimpleTrain.pte)" + export DEPRECATED_ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH export ET_MODULE_ADD_HALF_PATH export ET_MODULE_ADD_PATH export ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH export ET_MODULE_INDEX_PATH - export ET_MODULE_LINEAR_CONSTANT_BUFFER_PATH - export ET_MODULE_LINEAR_CONSTANT_SEGMENT_PATH + export ET_MODULE_LINEAR_PATH export ET_MODULE_MULTI_ENTRY_PATH export ET_MODULE_ADD_MUL_NOSEGMENTS_DA1024_PATH export ET_MODULE_ADD_MUL_NOSEGMENTS_PATH From a25db2f9548dbd1d58021f8e3b0742e85eb840fb Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Fri, 6 Sep 2024 14:58:27 -0500 Subject: [PATCH 228/531] [WIP][Llava] Add support to cross compile llava_runner for Android Differential Revision: D62281425 Pull Request resolved: https://github.com/pytorch/executorch/pull/5108 --- .ci/scripts/test_llava.sh | 144 ++++++++++++++++++++------- examples/models/llava/CMakeLists.txt | 18 +++- examples/models/llava/main.cpp | 15 +++ 3 files changed, 139 insertions(+), 38 deletions(-) diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 90a2afa11f8..7dc6d15e407 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -9,48 +9,97 @@ set -exu # shellcheck 
source=/dev/null BUILD_TYPE=${1:-Debug} +TARGET_OS=${2:-Native} +BUILD_DIR=${3:-cmake-out} -echo "Building with BUILD_TYPE: $BUILD_TYPE" +echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 + PYTHON_EXECUTABLE=python3 fi +TARGET_OS_lower="$(echo "${TARGET_OS}" | awk '{print tolower($0)}')" +if [[ "${TARGET_OS_lower}" == "android" ]]; then + if [[ -z "${ANDROID_NDK}" ]]; then + echo "Set ANDROID_NDK environment variable to build for Android." + exit 1 + fi +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +EXECUTORCH_COMMON_CMAKE_ARGS=" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON" + cmake_install_executorch_libraries() { - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -Bcmake-out . - - - cmake --build cmake-out -j9 --target install --config ${BUILD_TYPE} + cmake \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + +cmake_install_executorch_libraries_for_android() { + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . 
+ + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} } + +LLAVA_COMMON_CMAKE_ARGS=" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON" + cmake_build_llava_runner() { dir=examples/models/llava python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_PREFIX_PATH="$python_lib" \ - -Bcmake-out/${dir} \ + cmake \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -B${BUILD_DIR}/${dir} \ ${dir} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} +} + - cmake --build cmake-out/${dir} -j9 --config ${BUILD_TYPE} +cmake_build_llava_runner_for_android() { + dir=examples/models/llava + python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \ + -B${BUILD_DIR}/${dir} \ + ${dir} + + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} } # only export the one without custom op for now since it's @@ -81,13 +130,24 @@ run_and_verify() { echo "tokenizer.bin is missing." exit 1 fi - RUNTIME_ARGS="--model_path=llava.pte \ - --tokenizer_path=tokenizer.bin \ - --image_path=image.pt \ - --prompt=ASSISTANT: \ - --temperature=0 \ - --seq_len=650" - cmake-out/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + + + + RUNTIME_ARGS="--model_path=llava.pte \ + --tokenizer_path=tokenizer.bin \ + --image_path=image.pt \ + --prompt=ASSISTANT: \ + --temperature=0 \ + --seq_len=650" + + if [[ "${TARGET_OS_lower}" == "android" ]]; then + echo "Transfer relevant files to the phone via ADB and run llava_main with following args," + echo "$ llava_main ${RUNTIME_ARGS} " + exit 0; + fi + + ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + # verify result.txt RESULT=$(cat result.txt) # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. @@ -110,8 +170,20 @@ run_and_verify() { fi } -cmake_install_executorch_libraries -cmake_build_llava_runner +# Step1. Build stuff +if [[ "${TARGET_OS_lower}" == "android" ]]; then + cmake_install_executorch_libraries_for_android + cmake_build_llava_runner_for_android +elif [[ "${TARGET_OS_lower}" == "native" ]]; then + cmake_install_executorch_libraries + cmake_build_llava_runner +else + echo "Invalid TARGET_OS ($2): ${TARGET_OS}" +fi + +# Step2. Generate the PTE export_llava + +# Step3. 
Run prepare_image_tensor run_and_verify diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index 444f6b33892..c36e39a04cb 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -21,6 +21,9 @@ project(llava) # Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) +# This is a temporary hack to get around Torch dep so we can test this on android +option(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE "Hack option to feed dummy image to remove torch.load dep" OFF) + include(CMakeDependentOption) # # pthreadpool: build pthreadpool library. Disable on unsupported platforms @@ -70,7 +73,14 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags) find_package(gflags REQUIRED) -find_package(Torch CONFIG REQUIRED) +# Avoid torch dep from torch.load()-ing the image. +# This is a temporary hack. +if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) + add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1) + message("Buidling the runner without Torch, feeding a dummy image!") +else() + find_package(Torch CONFIG REQUIRED) +endif() add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) # @@ -95,7 +105,11 @@ endif() # llava_runner library add_subdirectory(runner) -set(link_libraries gflags torch) +set(LINK_LIBS gflags) +if(NOT LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) + list(APPEND LINK_LIBS torch) +endif() +set(link_libraries ${LINK_LIBS}) set(_srcs main.cpp) if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp index 171eb77077f..53f6329b4d8 100644 --- a/examples/models/llava/main.cpp +++ b/examples/models/llava/main.cpp @@ -8,7 +8,11 @@ #include #include +#ifndef LLAVA_NO_TORCH_DUMMY_IMAGE #include +#else +#include // std::fill +#endif #if defined(ET_USE_THREADPOOL) #include @@ -80,6 +84,15 @@ int32_t main(int32_t argc, char** argv) { // read image and resize the longest edge to 336 std::vector image_data; + +#ifdef LLAVA_NO_TORCH_DUMMY_IMAGE + // Work without torch using a random data + image_data.resize(3 * 240 * 336); + std::fill(image_data.begin(), image_data.end(), 0); // black + std::array image_shape = {3, 240, 336}; + std::vector images = { + {.data = image_data, .width = image_shape[2], .height = image_shape[1]}}; +#else // LLAVA_NO_TORCH_DUMMY_IMAGE // cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR); // int longest_edge = std::max(image.rows, image.cols); // float scale_factor = 336.0f / longest_edge; @@ -102,6 +115,8 @@ int32_t main(int32_t argc, char** argv) { {.data = image_data, .width = static_cast(image_tensor.size(2)), .height = static_cast(image_tensor.size(1))}}; +#endif // LLAVA_NO_TORCH_DUMMY_IMAGE + // generate runner.generate(std::move(images), prompt, seq_len); return 0; From 17103dcd9df50d770a603b2d876a3c4c713238e1 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Fri, 6 Sep 2024 15:16:58 -0500 Subject: [PATCH 229/531] [Llama] Dump RSS info for Linux Differential Revision: D62222512 Pull Request resolved: https://github.com/pytorch/executorch/pull/5101 --- examples/models/llama2/runner/runner.cpp | 13 ++++++++++ examples/models/llava/runner/llava_runner.cpp | 21 ++++++++++++++-- extension/llm/runner/util.h | 25 +++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 2c72b4c724e..b048604251a 100644 --- 
a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -153,6 +153,11 @@ Error Runner::generate( stats_.model_load_end_ms = util::time_in_ms(); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -213,6 +218,10 @@ Error Runner::generate( // print the first token from prefill. No prev_token so use cur_token for it. wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); + ET_LOG( + Info, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); // start the main loop prompt_tokens.push_back(cur_token); @@ -221,6 +230,10 @@ Error Runner::generate( stats_.inference_end_ms = util::time_in_ms(); printf("\n"); + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); if (num_prompt_tokens + num_generated_tokens == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 04c77a1064b..64763c72576 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -131,6 +131,11 @@ Error LlavaRunner::generate( ET_CHECK_OK_OR_RETURN_ERROR(load()); } + ET_LOG( + Info, + "RSS after loading model: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { @@ -149,9 +154,21 @@ Error LlavaRunner::generate( // prefill images prefill_images(images, pos); + ET_LOG( + Info, + "RSS after prompt and image prefill: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + // Generate tokens - return generate_from_pos( - prompt, seq_len, pos, wrapped_callback, stats_callback); + Error err = + generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback); + + ET_LOG( + Info, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + util::get_rss_bytes() / 1024.0 / 1024.0); + + return err; } } // namespace torch::executor diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index baf6af328b4..2f1d084811e 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -10,6 +10,9 @@ #include #include #include +#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__) +#include +#endif namespace executorch { namespace extension { @@ -44,6 +47,27 @@ long inline time_in_ms() { return time.tv_sec * 1000 + time.tv_nsec / 1000000; } +// ---------------------------------------------------------------------------- +// utilities: memory usage + +// Returns the current RSS in bytes. Returns 0 if not supported. +// RSS: Resident Set Size, the amount of memory currently in the RAM for this +// process. These values are approximate, and are only used for logging +// purposes. +size_t inline get_rss_bytes() { +#if defined(__linux__) || defined(__ANDROID__) || defined(__unix__) + struct rusage r_usage; + if (getrusage(RUSAGE_SELF, &r_usage) == 0) { + return r_usage.ru_maxrss * 1024; + } +#endif // __linux__ || __ANDROID__ || __unix__ + // Unsupported platform like Windows, or getrusage() failed. 
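As a point of reference for the macOS limitation described in this comment block: a Mach-based path could report the current resident set in bytes without relying on `ru_maxrss`. The sketch below is not part of this patch and has not been wired into `get_rss_bytes()`; it only illustrates the alternative API.

```
#if defined(__APPLE__) && defined(__MACH__)
#include <mach/mach.h>

// Sketch only; not part of this patch. Reports the *current* resident set
// size in bytes (ru_maxrss, by contrast, is the peak and has inconsistent
// units across macOS versions).
size_t get_current_rss_bytes_macos() {
  mach_task_basic_info info;
  mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
  if (task_info(
          mach_task_self(),
          MACH_TASK_BASIC_INFO,
          reinterpret_cast<task_info_t>(&info),
          &count) == KERN_SUCCESS) {
    return info.resident_size;
  }
  return 0; // Same convention as the patch: 0 means unsupported or failed.
}
#endif // __APPLE__ && __MACH__
```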
+ // __APPLE__ and __MACH__ are not supported because r_usage.ru_maxrss does not + // consistently return kbytes on macOS. On older versions of macOS, it + // returns bytes, but on newer versions it returns kbytes. Need to figure out + // when this changed. + return 0; +} } // namespace llm } // namespace extension } // namespace executorch @@ -53,6 +77,7 @@ namespace executor { namespace util { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. +using ::executorch::extension::llm::get_rss_bytes; using ::executorch::extension::llm::safe_printf; using ::executorch::extension::llm::time_in_ms; } // namespace util From 27632333c002d650313a594d09aef8de23f256fa Mon Sep 17 00:00:00 2001 From: cccclai Date: Fri, 6 Sep 2024 14:07:35 -0700 Subject: [PATCH 230/531] Revert "Add proper pt2e calibration" (#5136) Revert "Add proper pt2e calibration (#5095)" This reverts commit 7122d310844ba701db1f45a13ccaa8c5a002fc89. --- examples/models/llama2/eval_llama_lib.py | 65 ++----------- examples/models/llama2/export_llama_lib.py | 29 +----- extension/llm/export/builder.py | 101 +-------------------- 3 files changed, 12 insertions(+), 183 deletions(-) diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 7cdde228b35..3ea4e77a1a6 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -29,51 +29,6 @@ ) -class GraphModuleEvalWrapper(EagerEvalWrapper): - """ - A wrapper class for ExecuTorch py-binded integration with the - lm-evaluation-harness library. - """ - - def __init__( - self, - model: torch.fx.GraphModule, - tokenizer: Union[SentencePieceTokenizer, Tiktoken], - max_seq_length: Optional[int] = None, - use_kv_cache: bool = False, - enable_dynamic_shape: bool = True, - ): - super().__init__( - model=model, tokenizer=tokenizer, max_seq_length=max_seq_length - ) - self._model = model.to(self.device) - self._use_kv_cache = use_kv_cache - self._enable_dynamic_shape = enable_dynamic_shape - - def _model_call(self, inps): - if self._use_kv_cache: - if not self._enable_dynamic_shape: - # graph module exported without dynamic shape won't work with a different shape. - # And we have to do single token prefill here. - result_logits = [] - for pos in range(inps.shape[-1]): - pos_tensor = torch.tensor([pos], dtype=torch.int64) - logits = self._model(inps[:, pos : pos + 1], pos_tensor) - result_logits.append(logits) - return torch.cat(result_logits, dim=1) - else: - pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) - # Batch process the whole sequence. - logits = self._model(inps[:, : self._max_seq_length], pos_tensor) - return logits - - else: - return self._model(inps) - - def _model_generate(self, context, max_length, eos_token_id): - raise Exception("unimplemented") - - class ETPybindEvalWrapper(EagerEvalWrapper): """ A wrapper class for ExecuTorch py-binded integration with the @@ -193,13 +148,6 @@ def gen_eval_wrapper( if torch.cuda.is_available() else manager.pre_autograd_graph_module.to(device="cpu") ) - return GraphModuleEvalWrapper( - model=model, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - use_kv_cache=args.use_kv_cache, - enable_dynamic_shape=args.enable_dynamic_shape, - ) else: # TODO: use manager.pre_autograd_graph_module for the eval to remove the if-else branch # for quantizers. 
Currently capture_pre_autograd_graph only works with --kv_cache, but @@ -209,12 +157,13 @@ def gen_eval_wrapper( if torch.cuda.is_available() else manager.model.eval().to(device="cpu") ) - return EagerEvalWrapper( - model=model, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - use_kv_cache=args.use_kv_cache, - ) + + return EagerEvalWrapper( + model=model, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + use_kv_cache=args.use_kv_cache, + ) def build_args_parser() -> argparse.ArgumentParser: diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 5dac3e9adbb..1dac12cc853 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -16,7 +16,7 @@ from enum import Enum from json import JSONDecodeError from pathlib import Path -from typing import List, Optional, Union +from typing import Optional, Union import pkg_resources @@ -166,25 +166,19 @@ def build_args_parser() -> argparse.ArgumentParser: nargs="+", type=str, default=None, - help="Tasks for GPTQ calibration from lm_eval", + help="Tasks for GPTQ calibration", ) parser.add_argument( "--calibration_limit", type=int, default=None, - help="number of samples used for calibration from lm_eval", + help="number of samples used for calibration", ) parser.add_argument( "--calibration_seq_length", type=int, default=None, - help="Sequence length for GPTQ calibration from lm_eval", - ) - parser.add_argument( - "--calibration_data", - type=str, - default="Once upon a time", - help="Calibration prompts from users", + help="Sequence length for GPTQ calibration", ) parser.add_argument( "-t", @@ -427,11 +421,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: generate_full_logits=args.generate_full_logits, weight_type=weight_type, enable_dynamic_shape=args.enable_dynamic_shape, - calibration_tasks=args.calibration_tasks, - calibration_limit=args.calibration_limit, - calibration_seq_length=args.calibration_seq_length, - calibration_data=args.calibration_data, - tokenizer_path=args.tokenizer_path, verbose=args.verbose, max_seq_len=args.max_seq_length, metadata_str=args.metadata, @@ -641,11 +630,6 @@ def _load_llama_model( generate_full_logits: bool = False, weight_type: WeightType = WeightType.LLAMA, enable_dynamic_shape: bool = False, - calibration_tasks: Optional[List[str]] = None, - calibration_limit: Optional[int] = None, - calibration_seq_length: Optional[int] = None, - calibration_data: Optional[str] = None, - tokenizer_path: Optional[str] = None, verbose: bool = False, max_seq_len: int = 128, metadata_str: Optional[str] = None, @@ -701,11 +685,6 @@ def _load_llama_model( use_kv_cache=use_kv_cache, example_inputs=example_inputs, enable_dynamic_shape=enable_dynamic_shape, - calibration_tasks=calibration_tasks, - calibration_limit=calibration_limit, - calibration_seq_length=calibration_seq_length, - calibration_data=calibration_data, - tokenizer_path=tokenizer_path, verbose=verbose, metadata=_load_llama_model_metadata( weight_type, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 70ecab898f9..4f5bab7bc02 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -27,7 +27,6 @@ from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.extension.export_util.utils import export_to_edge, save_pte_program -from executorch.extension.llm.tokenizer.utils import get_tokenizer from torch._export import 
capture_pre_autograd_graph from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer import Quantizer @@ -67,11 +66,6 @@ def __init__( use_kv_cache, example_inputs, enable_dynamic_shape: bool = False, - calibration_tasks: Optional[List[str]] = None, - calibration_limit: Optional[int] = None, - calibration_seq_length: Optional[int] = None, - calibration_data: Optional[str] = None, - tokenizer_path: Optional[str] = None, verbose: bool = False, metadata: Optional[dict] = None, dynamic_shapes: Optional[Any] = None, @@ -93,11 +87,6 @@ def __init__( self.output_dir = "." self.dynamic_shapes = dynamic_shapes self._saved_pte_filename = None - self.calibration_tasks = calibration_tasks - self.calibration_limit = calibration_limit - self.calibration_seq_length = calibration_seq_length - self.calibration_data = calibration_data - self.tokenizer_path = tokenizer_path def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -178,69 +167,6 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": ) return self - def pt2e_calibrate( - self, - prepared_module, - calibration_tasks, - calibration_limit, - calibration_seq_length, - calibration_data, - tokenizer_path, - ): - logging.info("Run calibration...") - try: - from executorch.examples.models.llama2.eval_llama_lib import ( - GraphModuleEvalWrapper, - ) - from executorch.examples.models.llama2.evaluate import evaluate_model - except ImportError: - raise ImportError( - "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh" - ) - - tokenizer = get_tokenizer(tokenizer_path) - - def calibrate_template( - module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int - ): - # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int64) - token_list = tokenizer.encode(prompts, bos=True, eos=False) - - with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id and pos < max_len: - logits = module( - torch.full((1, 1), token_list[pos]), - torch.tensor((pos,)), - ) - pos += 1 - if pos >= len(token_list): - token_list.append(torch.argmax(logits[:], dim=-1).item()) - - calibrate_template( - module=prepared_module, - tokenizer=tokenizer, - prompts=calibration_data, - max_len=calibration_seq_length, - ) - - eval_wrapper = GraphModuleEvalWrapper( - model=prepared_module, - tokenizer=tokenizer, - max_seq_length=calibration_seq_length, - use_kv_cache=self.use_kv_cache, - enable_dynamic_shape=self.enable_dynamic_shape, - ) - eval_results = evaluate_model( - eval_wrapper, - calibration_tasks, - calibration_limit, - ) - - for task, res in eval_results["results"].items(): - print(f"{task}: {res}") - logging.info("Calibration finish...") - def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager": """ Quantize the model via pt2e flow and retrieve LLMEdgeManager including the quantized model. 
@@ -263,33 +189,8 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage self.pre_autograd_graph_module is not None ), "Please run capture_pre_autograd_graph first" m = prepare_pt2e(self.pre_autograd_graph_module, composed_quantizer) - logging.info( - f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" - ) # Calibrate - if ( - self.calibration_tasks is not None - and self.calibration_limit is not None - and self.calibration_seq_length is not None - and self.calibration_data is not None - and self.tokenizer_path is not None - ): - logging.info( - f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" - ) - self.pt2e_calibrate( - prepared_module=m, - calibration_tasks=self.calibration_tasks, - calibration_limit=self.calibration_limit, - calibration_seq_length=self.calibration_seq_length, - calibration_data=self.calibration_data, - tokenizer_path=self.tokenizer_path, - ) - else: - logging.info( - "No calibration provided, using dummy input to calibrate..." - ) - m(*self.example_inputs) + m(*self.example_inputs) m = convert_pt2e(m) DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m From f55ce1f2327546580705491f1923292ef14d4336 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:09:59 -0700 Subject: [PATCH 231/531] Llava prefill Java API Add Java API and JNI layer Pull Request resolved: https://github.com/pytorch/executorch/pull/5132 --- extension/android/jni/jni_layer_llama.cpp | 80 +++++++++++++++++++ .../org/pytorch/executorch/LlamaModule.java | 57 +++++++++++++ extension/llm/runner/multimodal_runner.h | 44 ++++++++++ 3 files changed, 181 insertions(+) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index dda9ece589d..5f2cac188fc 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -180,6 +180,86 @@ class ExecuTorchLlamaJni return 0; } + // Returns a tuple of (error, start_pos) + // Contract is valid within an AAR (JNI + corresponding Java code) + // If the first element is not Error::Ok, the other element is undefined. + facebook::jni::local_ref prefill_prompt( + facebook::jni::alias_ref prompt, + jlong start_pos, + jint bos, + jint eos) { + facebook::jni::local_ref tuple_result = + facebook::jni::make_long_array(2); + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + tuple_result->pin()[0] = static_cast(Error::NotSupported); + return tuple_result; + } + + auto&& result = multi_modal_runner_->prefill_prompt( + prompt->toStdString(), start_pos, bos, eos); + tuple_result->pin()[0] = static_cast(Error::Ok); + if (result.ok()) { + tuple_result->pin()[1] = static_cast(start_pos); + } + return tuple_result; + } + + // Returns a tuple of (error, start_pos) + // Contract is valid within an AAR (JNI + corresponding Java code) + // If the first element is not Error::Ok, the other element is undefined. 
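The prefill_images and prefill_prompt wrappers below, together with generate_from_pos, expose the MultimodalRunner API declared later in this patch (extension/llm/runner/multimodal_runner.h), chained through a shared start_pos. A minimal native-side sketch of that sequence follows; the namespaces, the Image type, and the prompt strings are assumptions taken from this patch rather than a verified build, and error handling is abbreviated.

```
#include <executorch/extension/llm/runner/multimodal_runner.h>

#include <string>
#include <vector>

// Namespaces assumed from this patch.
using executorch::extension::llm::Image;
using executorch::extension::llm::MultimodalRunner;

void run_multimodal(MultimodalRunner& runner, Image image) {
  int64_t start_pos = 0;

  // 1) Prefill the text prompt; start_pos is updated in place and the result
  //    holds the next predicted token (unused here).
  auto prompt_token =
      runner.prefill_prompt("ASSISTANT:", start_pos, /*bos=*/1, /*eos=*/0);
  if (!prompt_token.ok()) {
    return;
  }

  // 2) Prefill the image(s); start_pos advances past the image tokens.
  std::vector<Image> images = {std::move(image)};
  if (runner.prefill_images(images, start_pos) !=
      executorch::runtime::Error::Ok) {
    return;
  }

  // 3) Generate from the current position, streaming pieces via the callback.
  runner.generate_from_pos(
      "What is in this image?",
      /*seq_len=*/1024,
      start_pos,
      [](const std::string& piece) { /* forward to the caller */ });
}
```

This is also the contract the JNI layer mirrors: each native prefill call returns the error code and the updated start_pos, which the Java side threads into the next call.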
+ + facebook::jni::local_ref prefill_images( + facebook::jni::alias_ref image, + jint width, + jint height, + jint channels, + jlong start_pos) { + facebook::jni::local_ref tuple_result = + facebook::jni::make_long_array(2); + + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + tuple_result->pin()[0] = static_cast(Error::NotSupported); + return tuple_result; + } + + auto image_size = image->size(); + std::vector images; + if (image_size != 0) { + std::vector image_data_jint(image_size); + std::vector image_data(image_size); + image->getRegion(0, image_size, image_data_jint.data()); + for (int i = 0; i < image_size; i++) { + image_data[i] = image_data_jint[i]; + } + Image image_runner{image_data, width, height, channels}; + images.push_back(image_runner); + } + // TODO(hsz): make start_pos a reference and update it here + jint result = static_cast( + multi_modal_runner_->prefill_images(images, start_pos)); + tuple_result->pin()[0] = result; + tuple_result->pin()[1] = static_cast(start_pos); + return tuple_result; + } + + jint generate_from_pos( + facebook::jni::alias_ref prompt, + jint seq_len, + jlong start_pos, + facebook::jni::alias_ref callback) { + if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { + return static_cast(Error::NotSupported); + } + return static_cast(multi_modal_runner_->generate_from_pos( + prompt->toStdString(), + seq_len, + start_pos, + [callback](const std::string& result) { callback->onResult(result); }, + [callback](const ::executorch::extension::llm::Stats& stats) { + callback->onStats(stats); + })); + } + void stop() { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { multi_modal_runner_->stop(); diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index bdc8506aa9c..e636c5f3f80 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -94,6 +94,63 @@ public native int generate( int seqLen, LlamaCallback llamaCallback); + /** + * Prefill an LLaVA Module with the given images input. + * + * @param image Input image as a byte array + * @param width Input image width + * @param height Input image height + * @param channels Input image number of channels + * @param startPos The starting position in KV cache of the input in the LLM. + * @return The updated starting position in KV cache of the input in the LLM. + * @throws RuntimeException if the prefill failed + */ + public long prefillImages(int[] image, int width, int height, int channels, long startPos) { + long[] nativeResult = prefillImagesNative(image, width, height, channels, startPos); + if (nativeResult[0] != 0) { + throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); + } + return nativeResult[1]; + } + + // returns a tuple of (status, updated startPos) + private native long[] prefillImagesNative( + int[] image, int width, int height, int channels, long startPos); + + /** + * Prefill an LLaVA Module with the given text input. + * + * @param prompt The text prompt to LLaVA. + * @param startPos The starting position in KV cache of the input in the LLM. It's passed as + * reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. + * @param eos The number of EOS (end of sequence) token. + * @return The updated starting position in KV cache of the input in the LLM. 
+ * @throws RuntimeException if the prefill failed + */ + public long prefillPrompt(String prompt, long startPos, int bos, int eos) { + long[] nativeResult = prefillPromptNative(prompt, startPos, bos, eos); + if (nativeResult[0] != 0) { + throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); + } + return nativeResult[1]; + } + + // returns a tuple of (status, updated startPos) + private native long[] prefillPromptNative(String prompt, long startPos, int bos, int eos); + + /** + * Generate tokens from the given prompt, starting from the given position. + * + * @param prompt The text prompt to LLaVA. + * @param seqLen The total sequence length, including the prompt tokens and new tokens. + * @param startPos The starting position in KV cache of the input in the LLM. + * @param llamaCallback callback object to receive results. + * @return The error code. + */ + public native int generateFromPos( + String prompt, int seqLen, long startPos, LlamaCallback callback); + /** Stop current generate() before it finishes. */ @DoNotStrip public native void stop(); diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 43bbe688448..70ecafee810 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -61,6 +61,50 @@ class MultimodalRunner { std::function token_callback = {}, std::function stats_callback = {}) = 0; + /** + * Prefill an LLaVA Module with the given images input. + * @param images The image input to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @return The error status of prefilling images. + */ + virtual runtime::Error prefill_images( + std::vector& images, + int64_t& start_pos) = 0; + + /** + * Prefill an LLaVA Module with the given text input. + * @param prompt The text prompt to LLaVA. + * @param start_pos The starting position in KV cache of the input in the LLM. + * It's passed as reference and will be updated inside this function. + * @param bos The number of BOS (begin of sequence) token. + * @param eos The number of EOS (end of sequence) token. + * @return The generated token of the LLaVA Module after prefill prompt. + */ + virtual runtime::Result prefill_prompt( + const std::string& prompt, + int64_t& start_pos, + int8_t bos = 0, + int8_t eos = 0) = 0; + + /** + * Generate tokens from the given prompt, starting from the given position. + * @param prompt The text prompt to LLaVA. + * @param seq_len The total sequence length, including the prompt tokens and + * new tokens. + * @param start_pos The starting position in KV cache of the input in the LLM. + * @param token_callback What to do after a token is generated. + * @param stats_callback What to do with Stats. + * @return The error code. 
+ */ + virtual runtime::Error generate_from_pos( + const std::string& prompt, + int32_t seq_len = 1024, + int64_t start_pos = 0, + std::function token_callback = {}, + std::function + stats_callback = {}) = 0; + inline void stop() { text_token_generator_->stop(); } From fb86e610e5ab6ab0085e7570e5796ff028bdc466 Mon Sep 17 00:00:00 2001 From: Chirag Modi <98582575+cmodi-meta@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:39:24 -0700 Subject: [PATCH 232/531] Add Echo parameter to llama runner, jni+java layer, and demo app Differential Revision: D62247137 Pull Request resolved: https://github.com/pytorch/executorch/pull/5011 --- .../executorchllamademo/MainActivity.java | 16 ++++++++-- examples/models/llama2/export_llama_lib.py | 1 - examples/models/llama2/runner/runner.cpp | 7 +++-- examples/models/llama2/runner/runner.h | 3 +- extension/android/jni/jni_layer_llama.cpp | 4 ++- .../org/pytorch/executorch/LlamaModule.java | 30 +++++++++++++++++-- 6 files changed, 51 insertions(+), 10 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index f24254efb31..96b200303c9 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -73,8 +73,15 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa @Override public void onResult(String result) { - mResultMessage.appendText(result); - run(); + if (result.equals("\n\n")) { + if (!mResultMessage.getText().isEmpty()) { + mResultMessage.appendText(result); + run(); + } + } else { + mResultMessage.appendText(result); + run(); + } } @Override @@ -614,6 +621,7 @@ public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, prompt, ModelUtils.VISION_MODEL_SEQ_LEN, + false, MainActivity.this); } else { // no image selected, we pass in empty int array @@ -624,10 +632,12 @@ public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, prompt, ModelUtils.VISION_MODEL_SEQ_LEN, + false, MainActivity.this); } } else { - mModule.generate(prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, MainActivity.this); + mModule.generate( + prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, false, MainActivity.this); } long generateDuration = System.currentTimeMillis() - generateStartTime; diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 1dac12cc853..855e26e9169 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -313,7 +313,6 @@ def build_args_parser() -> argparse.ArgumentParser: def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str: - path = str(path) if verbose_export(): diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index b048604251a..1e17c754007 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -143,7 +143,8 @@ Error Runner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback) { + std::function stats_callback, + bool echo) { // Prepare the inputs. // Use ones-initialized inputs. ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); @@ -208,7 +209,9 @@ Error Runner::generate( // after the prompt. 
After that we will enter generate loop. // print prompts - wrapped_callback(prompt); + if (echo) { + wrapped_callback(prompt); + } int64_t pos = 0; auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); stats_.first_token_ms = util::time_in_ms(); diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h index 4e3c1daef7b..cec8c61157f 100644 --- a/examples/models/llama2/runner/runner.h +++ b/examples/models/llama2/runner/runner.h @@ -40,7 +40,8 @@ class Runner { const std::string& prompt, int32_t seq_len = 128, std::function token_callback = {}, - std::function stats_callback = {}); + std::function stats_callback = {}, + bool echo = true); void stop(); private: diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 5f2cac188fc..0d43317c3ca 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -150,6 +150,7 @@ class ExecuTorchLlamaJni jint channels, facebook::jni::alias_ref prompt, jint seq_len, + jboolean echo, facebook::jni::alias_ref callback) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); @@ -175,7 +176,8 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }); + [callback](const Stats& result) { callback->onStats(result); }, + echo); } return 0; } diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index e636c5f3f80..c4de23df0ee 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -33,6 +33,7 @@ public class LlamaModule { private final HybridData mHybridData; private static final int DEFAULT_SEQ_LEN = 128; + private static final boolean DEFAULT_ECHO = true; @DoNotStrip private static native HybridData initHybrid( @@ -59,7 +60,7 @@ public void resetNative() { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, LlamaCallback llamaCallback) { - return generate(prompt, DEFAULT_SEQ_LEN, llamaCallback); + return generate(prompt, DEFAULT_SEQ_LEN, DEFAULT_ECHO, llamaCallback); } /** @@ -70,7 +71,30 @@ public int generate(String prompt, LlamaCallback llamaCallback) { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback); + return generate(null, 0, 0, 0, prompt, seqLen, DEFAULT_ECHO, llamaCallback); + } + + /** + * Start generating tokens from the module. + * + * @param prompt Input prompt + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) + * @param llamaCallback callback object to receive results. + */ + public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { + return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, echo, llamaCallback); + } + + /** + * Start generating tokens from the module. + * + * @param prompt Input prompt + * @param seqLen sequence length + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) + * @param llamaCallback callback object to receive results. 
+ */ + public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llamaCallback) { + return generate(null, 0, 0, 0, prompt, seqLen, echo, llamaCallback); } /** @@ -82,6 +106,7 @@ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { * @param channels Input image number of channels * @param prompt Input prompt * @param seqLen sequence length + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) * @param llamaCallback callback object to receive results. */ @DoNotStrip @@ -92,6 +117,7 @@ public native int generate( int channels, String prompt, int seqLen, + boolean echo, LlamaCallback llamaCallback); /** From 5d4d821a0c158568e33eece283f9978e543fa95d Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Fri, 6 Sep 2024 14:45:51 -0700 Subject: [PATCH 233/531] App skeleton. Differential Revision: D62281653 Pull Request resolved: https://github.com/pytorch/executorch/pull/5128 --- .../apple/Benchmark/App/App.entitlements | 12 + extension/apple/Benchmark/App/App.swift | 16 + .../Benchmark.xcodeproj/project.pbxproj | 535 ++++++++++++++++++ .../xcshareddata/xcschemes/Benchmark.xcscheme | 107 ++++ extension/apple/Benchmark/Tests/Tests.mm | 105 ++++ .../apple/Benchmark/Tests/Tests.xcconfig | 26 + .../apple/Benchmark/Tests/Tests.xctestplan | 28 + 7 files changed, 829 insertions(+) create mode 100644 extension/apple/Benchmark/App/App.entitlements create mode 100644 extension/apple/Benchmark/App/App.swift create mode 100644 extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj create mode 100644 extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme create mode 100644 extension/apple/Benchmark/Tests/Tests.mm create mode 100644 extension/apple/Benchmark/Tests/Tests.xcconfig create mode 100644 extension/apple/Benchmark/Tests/Tests.xctestplan diff --git a/extension/apple/Benchmark/App/App.entitlements b/extension/apple/Benchmark/App/App.entitlements new file mode 100644 index 00000000000..e461e7f22f6 --- /dev/null +++ b/extension/apple/Benchmark/App/App.entitlements @@ -0,0 +1,12 @@ + + + + + com.apple.security.app-sandbox + + com.apple.security.files.user-selected.read-only + + com.apple.developer.kernel.increased-memory-limit + + + diff --git a/extension/apple/Benchmark/App/App.swift b/extension/apple/Benchmark/App/App.swift new file mode 100644 index 00000000000..30fbd221dc0 --- /dev/null +++ b/extension/apple/Benchmark/App/App.swift @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +@main +struct BenchmarkApp: App { + var body: some Scene { + WindowGroup {} + } +} diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj new file mode 100644 index 00000000000..4dcffaffbf6 --- /dev/null +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -0,0 +1,535 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; + 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* Tests.mm */; }; + 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Models */; }; + 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */; }; + 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */; }; + 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */; }; + 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */; }; + 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */; }; + 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */; }; + 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */; }; + 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */; }; + 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */; }; + 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */; }; + 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */; }; + 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */; }; + 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D162C8AB00500F2D6EE /* CoreML.framework */; }; + 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */; }; +/* End PBXBuildFile section */ + +/* Begin PBXContainerItemProxy section */ + 03B2D3762C8A515C0046936E /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = 03B2D35C2C8A515A0046936E /* Project object */; + proxyType = 1; + remoteGlobalIDString = 03B2D3632C8A515A0046936E; + remoteInfo = Benchmark; + }; +/* End PBXContainerItemProxy section */ + +/* Begin PBXFileReference section */ + 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = Tests.xctestplan; sourceTree = ""; }; + 03B019502C8A80D30044D558 /* Tests.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = 
Tests.xcconfig; sourceTree = ""; }; + 03B2D3642C8A515A0046936E /* Benchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Benchmark.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 03B2D3672C8A515A0046936E /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; + 03B2D36D2C8A515B0046936E /* App.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = App.entitlements; sourceTree = ""; }; + 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; + 03B2D3792C8A515C0046936E /* Tests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = Tests.mm; sourceTree = ""; }; + 03C7FA322C8AA24200E6E9AE /* Models */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Models; sourceTree = SOURCE_ROOT; }; + 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; + 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; + 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; + 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; + 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; + 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; + 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; + 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; + 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.0.tbd; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/usr/lib/libsqlite3.0.tbd; sourceTree = DEVELOPER_DIR; }; + 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework 
*/ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Metal.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D162C8AB00500F2D6EE /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/CoreML.framework; sourceTree = DEVELOPER_DIR; }; + 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Accelerate.framework; sourceTree = DEVELOPER_DIR; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 03B2D3612C8A515A0046936E /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3722C8A515C0046936E /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */, + 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */, + 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */, + 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */, + 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */, + 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */, + 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */, + 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */, + 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */, + 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */, + 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */, + 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */, + 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */, + 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 03B2D35B2C8A515A0046936E = { + isa = PBXGroup; + children = ( + 03B2D3662C8A515A0046936E /* App */, + 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */, + 03C7FA322C8AA24200E6E9AE /* Models */, + 03B2D3782C8A515C0046936E /* Tests */, + 03B2D3652C8A515A0046936E /* Products */, + ); + sourceTree = ""; + }; + 03B2D3652C8A515A0046936E /* Products */ = { + isa = PBXGroup; + children = ( + 03B2D3642C8A515A0046936E /* Benchmark.app */, + 03B2D3752C8A515C0046936E /* Tests.xctest */, + ); + name = Products; + sourceTree = ""; + }; + 03B2D3662C8A515A0046936E /* App */ = { + isa = PBXGroup; + children = ( + 03B2D3672C8A515A0046936E /* App.swift */, + 03B2D36D2C8A515B0046936E /* App.entitlements */, + ); + path = App; + sourceTree = SOURCE_ROOT; + }; + 03B2D3782C8A515C0046936E /* Tests */ = { + isa = PBXGroup; + children = ( + 03B2D3792C8A515C0046936E /* Tests.mm */, + 
03B019502C8A80D30044D558 /* Tests.xcconfig */, + 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */, + ); + path = Tests; + sourceTree = SOURCE_ROOT; + }; + 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */ = { + isa = PBXGroup; + children = ( + 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */, + 03ED6D162C8AB00500F2D6EE /* CoreML.framework */, + 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */, + 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */, + 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */, + 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */, + 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */, + 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */, + 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */, + 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */, + 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */, + 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */, + 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */, + 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */, + ); + name = Frameworks; + sourceTree = SOURCE_ROOT; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 03B2D3632C8A515A0046936E /* App */ = { + isa = PBXNativeTarget; + buildConfigurationList = 03B2D3892C8A515C0046936E /* Build configuration list for PBXNativeTarget "App" */; + buildPhases = ( + 03B2D3602C8A515A0046936E /* Sources */, + 03B2D3612C8A515A0046936E /* Frameworks */, + 03B2D3622C8A515A0046936E /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = App; + productName = Benchmark; + productReference = 03B2D3642C8A515A0046936E /* Benchmark.app */; + productType = "com.apple.product-type.application"; + }; + 03B2D3742C8A515C0046936E /* Tests */ = { + isa = PBXNativeTarget; + buildConfigurationList = 03B2D38C2C8A515C0046936E /* Build configuration list for PBXNativeTarget "Tests" */; + buildPhases = ( + 03B2D3712C8A515C0046936E /* Sources */, + 03B2D3722C8A515C0046936E /* Frameworks */, + 03B2D3732C8A515C0046936E /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + 03B2D3772C8A515C0046936E /* PBXTargetDependency */, + ); + name = Tests; + productName = BenchmarkTests; + productReference = 03B2D3752C8A515C0046936E /* Tests.xctest */; + productType = "com.apple.product-type.bundle.unit-test"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 03B2D35C2C8A515A0046936E /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1540; + LastUpgradeCheck = 1540; + TargetAttributes = { + 03B2D3632C8A515A0046936E = { + CreatedOnToolsVersion = 15.4; + }; + 03B2D3742C8A515C0046936E = { + CreatedOnToolsVersion = 15.4; + TestTargetID = 03B2D3632C8A515A0046936E; + }; + }; + }; + buildConfigurationList = 03B2D35F2C8A515A0046936E /* Build configuration list for PBXProject "Benchmark" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 03B2D35B2C8A515A0046936E; + productRefGroup = 03B2D3652C8A515A0046936E /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 03B2D3632C8A515A0046936E /* App */, + 03B2D3742C8A515C0046936E /* Tests */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 03B2D3622C8A515A0046936E /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 
0; + }; + 03B2D3732C8A515C0046936E /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 03B2D3602C8A515A0046936E /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03B2D3682C8A515A0046936E /* App.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; + 03B2D3712C8A515C0046936E /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXTargetDependency section */ + 03B2D3772C8A515C0046936E /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 03B2D3632C8A515A0046936E /* App */; + targetProxy = 03B2D3762C8A515C0046936E /* PBXContainerItemProxy */; + }; +/* End PBXTargetDependency section */ + +/* Begin XCBuildConfiguration section */ + 03B2D3872C8A515C0046936E /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 03B2D3882C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + 
CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SWIFT_COMPILATION_MODE = wholemodule; + }; + name = Release; + }; + 03B2D38A2C8A515C0046936E /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_ENTITLEMENTS = App/App.entitlements; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphoneos*]" = UIStatusBarStyleDefault; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphonesimulator*]" = UIStatusBarStyleDefault; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks"; + "LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 11.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; + PRODUCT_NAME = Benchmark; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 03B2D38B2C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_ENTITLEMENTS = App/App.entitlements; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + "INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES; + 
"INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphoneos*]" = YES; + "INFOPLIST_KEY_UILaunchScreen_Generation[sdk=iphonesimulator*]" = YES; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphoneos*]" = UIStatusBarStyleDefault; + "INFOPLIST_KEY_UIStatusBarStyle[sdk=iphonesimulator*]" = UIStatusBarStyleDefault; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks"; + "LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 11.0; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; + PRODUCT_NAME = Benchmark; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; + 03B2D38D2C8A515C0046936E /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 03B019502C8A80D30044D558 /* Tests.xcconfig */; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MACOSX_DEPLOYMENT_TARGET = 10.15; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Benchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/Benchmark"; + }; + name = Debug; + }; + 03B2D38E2C8A515C0046936E /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 03B019502C8A80D30044D558 /* Tests.xcconfig */; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + BUNDLE_LOADER = "$(TEST_HOST)"; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = ""; + GENERATE_INFOPLIST_FILE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + MACOSX_DEPLOYMENT_TARGET = 10.15; + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = auto; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SWIFT_EMIT_LOC_STRINGS = NO; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + TEST_HOST = "$(BUILT_PRODUCTS_DIR)/Benchmark.app/$(BUNDLE_EXECUTABLE_FOLDER_PATH)/Benchmark"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 03B2D35F2C8A515A0046936E /* Build configuration list for PBXProject "Benchmark" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D3872C8A515C0046936E /* Debug */, + 03B2D3882C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 03B2D3892C8A515C0046936E /* Build configuration list for PBXNativeTarget "App" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D38A2C8A515C0046936E /* Debug */, + 03B2D38B2C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 03B2D38C2C8A515C0046936E /* Build 
configuration list for PBXNativeTarget "Tests" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 03B2D38D2C8A515C0046936E /* Debug */, + 03B2D38E2C8A515C0046936E /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 03B2D35C2C8A515A0046936E /* Project object */; +} diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme b/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme new file mode 100644 index 00000000000..ebfe1e5fd35 --- /dev/null +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/xcshareddata/xcschemes/Benchmark.xcscheme @@ -0,0 +1,107 @@ diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm new file mode 100644 index 00000000000..5cf958765d3 --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.mm @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import <XCTest/XCTest.h> + +#import <objc/runtime.h> + +#import <executorch/extension/module/module.h> +#import <executorch/extension/tensor/tensor.h> + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +@interface Tests : XCTestCase +@end + +@implementation Tests + ++ (void)initialize { + if (self == [Tests class]) { + NSString *modelsDir = [[NSBundle bundleForClass:[self class]].resourcePath + stringByAppendingPathComponent:@"Models"]; + NSArray *models = + [NSFileManager.defaultManager contentsOfDirectoryAtPath:modelsDir + error:nil]; + for (NSString *model in models) { + NSString *modelName = model.stringByDeletingPathExtension; + NSString *modelPath = [modelsDir stringByAppendingPathComponent:model]; + XCTAssertGreaterThan(modelPath.length, 0); + + SEL testLoadSelector = NSSelectorFromString( + [NSString stringWithFormat:@"test_load_%@", modelName]); + IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique<Module>(modelPath.UTF8String); + [_self + measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->load_method("forward"), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testLoadSelector, testLoadImplementation, "v@:"); + + SEL testForwardSelector = NSSelectorFromString( + [NSString stringWithFormat:@"test_forward_%@", modelName]); + IMP testForwardImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique<Module>(modelPath.UTF8String); + XCTAssertEqual(module->load_method("forward"), Error::Ok); + + const auto method_meta = module->method_meta("forward"); + XCTAssertEqual(method_meta.error(), Error::Ok); + + const auto num_inputs = method_meta->num_inputs(); + XCTAssertGreaterThan(num_inputs, 0); + + std::vector<std::vector<uint8_t>> buffers; + buffers.reserve(num_inputs); + std::vector<TensorPtr> tensors; + tensors.reserve(num_inputs); + std::vector<EValue> __block inputs; + inputs.reserve(num_inputs); + + for (auto index = 0; index < num_inputs; ++index) { + auto input_tag = method_meta->input_tag(index); + XCTAssertEqual(input_tag.error(), Error::Ok); + + switch (*input_tag) { + case Tag::Tensor: { + const auto tensor_meta = method_meta->input_tensor_meta(index); + XCTAssertEqual(tensor_meta.error(), Error::Ok); + + const auto sizes =
tensor_meta->sizes(); + buffers.emplace_back(tensor_meta->nbytes(), + 0b01010101); // Set all bytes to be non-zero. + tensors.emplace_back(from_blob(buffers.rbegin()->data(), + {sizes.begin(), sizes.end()}, + tensor_meta->scalar_type())); + inputs.emplace_back(tensors.back()); + } break; + default: + XCTFail("Unsupported tag %i at input %d", *input_tag, index); + } + } + [_self + measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->forward(inputs).error(), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testForwardSelector, testForwardImplementation, "v@:"); + } + } +} + +@end diff --git a/extension/apple/Benchmark/Tests/Tests.xcconfig b/extension/apple/Benchmark/Tests/Tests.xcconfig new file mode 100644 index 00000000000..e8168046c3d --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.xcconfig @@ -0,0 +1,26 @@ +OTHER_LDFLAGS[sdk=iphonesimulator*] = $(inherited) \ +-force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a + +OTHER_LDFLAGS[sdk=iphoneos*] = $(inherited) \ +-force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-release.a + +OTHER_LDFLAGS[sdk=macos*] = $(inherited) \ +-force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-macos-release.a diff --git a/extension/apple/Benchmark/Tests/Tests.xctestplan b/extension/apple/Benchmark/Tests/Tests.xctestplan new file mode 100644 index 00000000000..025f50f1942 --- /dev/null +++ b/extension/apple/Benchmark/Tests/Tests.xctestplan @@ -0,0 +1,28 @@ +{ + "configurations" : [ + { + "id" : "0430A5ED-FD8D-444E-9933-740E01CCD53C", + "name" : "Test Scheme Action", + "options" : { + + } + } + ], + "defaultOptions" : { + "targetForVariableExpansion" : { + "containerPath" : "container:Benchmark.xcodeproj", + "identifier" : "03B2D3632C8A515A0046936E", + "name" : "App" + } + }, + "testTargets" : [ + { + "target" : { + "containerPath" : "container:Benchmark.xcodeproj", + "identifier" : "03B2D3742C8A515C0046936E", + "name" : "Tests" + } + } + ], + "version" : 1 +} From 617f9d8af6693aa121f219f991eaa517e0571a42 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Fri, 6 Sep 2024 15:01:51 -0700 Subject: [PATCH 234/531] Disable 
fail_fast for benchmark jobs Differential Revision: D62316886 Pull Request resolved: https://github.com/pytorch/executorch/pull/5139 --- .github/workflows/android-perf.yml | 1 + .github/workflows/apple-perf.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 53d934a0a62..11950623ea0 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -279,6 +279,7 @@ jobs: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: device-type: android runner: linux.2xlarge diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 41e2868bfbb..8da58653a82 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -290,6 +290,7 @@ jobs: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: device-type: ios # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS From 8afdc48e8378269469f40ee5525800d4a121c8ef Mon Sep 17 00:00:00 2001 From: lucylq Date: Fri, 6 Sep 2024 15:06:25 -0700 Subject: [PATCH 235/531] Ensure 0-index in constant buffer is carried through Differential Revision: D62209852 Pull Request resolved: https://github.com/pytorch/executorch/pull/5145 --- exir/_serialize/_program.py | 7 +++++- exir/_serialize/test/test_program.py | 27 +++++++++++++++++++++++ runtime/executor/test/program_test.cpp | 30 ++++++++++++++++++++++---- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index aa5aba1fd7a..2256d5fcc99 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -387,7 +387,12 @@ def serialize_pte_binary( constant_segment_data, constant_segment_offsets = _extract_constant_segment( program.constant_buffer, tensor_alignment=constant_tensor_alignment ) - if len(constant_segment_data) > 0: + + # If there are no constants, len(constant_segment_data) = 0. However, there may + # be non-constants, in which case len(constant_segment_offsets) = 1, containing + # the placeholder value 0. Ensure the placeholder value is put into + # program.constant_segment.offsets. + if len(constant_segment_offsets) > 0: # Update program.constant_segment with constant subsegment offset information. program.constant_segment = SubsegmentOffsets( segment_index=len(segments), offsets=constant_segment_offsets diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index c4f4df0d0b2..afd8e3d282e 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -583,6 +583,33 @@ def test_round_trip_with_segments(self) -> None: program2 = deserialize_pte_binary(pte_data) self.assert_programs_equal(program, program2) + def test_no_constants(self) -> None: + program = get_test_program() + # Insert placeholder for non-const tensors. + add_constant_data(program, [b""]) + + pte_data = bytes( + serialize_pte_binary( + program, + extract_delegate_segments=True, + segment_alignment=SEGMENT_ALIGNMENT, + constant_tensor_alignment=CONSTANT_TENSOR_ALIGNMENT, + ) + ) + # The input Program should not be modified. 
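+ # (its .segments list must still be empty once serialize_pte_binary returns) + # The flatbuffer checks below verify that the non-constant placeholder is + # carried through: constant_buffer == [] and constant_segment.offsets == [0].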
+ self.assertEqual(program.segments, []) + + # Peek inside the actual flatbuffer data to see the segments. + flatbuffer_program = _json_to_program(_program_flatbuffer_to_json(pte_data)) + + # Constant buffer should be empty. + self.assertEqual(len(flatbuffer_program.constant_buffer), 0) + + # Constant segment should contain the placeholder. + self.assertEqual(flatbuffer_program.constant_segment.segment_index, 0) + self.assertEqual(len(flatbuffer_program.constant_segment.offsets), 1) + self.assertEqual(flatbuffer_program.constant_segment.offsets[0], 0) + def test_unused_inline_delegate_blobs_with_segments(self) -> None: # Create a program with some delegate data blobs. program = get_test_program() diff --git a/runtime/executor/test/program_test.cpp b/runtime/executor/test/program_test.cpp index 2cc9b4369db..80f91f1af6a 100644 --- a/runtime/executor/test/program_test.cpp +++ b/runtime/executor/test/program_test.cpp @@ -379,9 +379,31 @@ TEST_F(ProgramTest, DEPRECATEDLoad) { EXPECT_EQ(program_res.error(), Error::Ok); } +TEST_F(ProgramTest, LoadConstantSegmentWithNoConstantSegment) { + Result<Program> program = + Program::load(add_loader_.get(), kDefaultVerification); + ASSERT_EQ(program.error(), Error::Ok); + + // Load constant segment data should fail. + const auto segment_info = DataLoader::SegmentInfo( + DataLoader::SegmentInfo::Type::Constant, + /*segment_index=*/0); + Result<FreeableBuffer> segment = + ProgramTestFriend::LoadSegment(&program.get(), segment_info); + EXPECT_NE(segment.error(), Error::Ok); + + const executorch_flatbuffer::Program* flatbuffer_program = + ProgramTestFriend::GetInternalProgram(&program.get()); + + // The constant buffer should be empty. + EXPECT_EQ(flatbuffer_program->constant_buffer()->size(), 0); + + // Expect 1 constant segment, placeholder for non-const tensors. + EXPECT_EQ(flatbuffer_program->segments()->size(), 1); +} + TEST_F(ProgramTest, LoadConstantSegment) { - // Load the serialized ModuleLinear data, with constants in the segment and no - // constants in the flatbuffer. + // Load the serialized ModuleLinear data, with constants in the segment. const char* linear_path = std::getenv("ET_MODULE_LINEAR_PATH"); Result<FileDataLoader> linear_loader = FileDataLoader::from(linear_path); ASSERT_EQ(linear_loader.error(), Error::Ok); @@ -504,8 +526,8 @@ TEST_F(ProgramTest, LoadFromMutableSegment) { const executorch_flatbuffer::Program* flatbuffer_program = ProgramTestFriend::GetInternalProgram(&program.get()); - // Expect 1 segment. 1 mutable segment and no constant segment. - EXPECT_EQ(flatbuffer_program->segments()->size(), 1); + // Expect 2 segments. 1 mutable segment and 1 constant segment. + EXPECT_EQ(flatbuffer_program->segments()->size(), 2); // Expect a mutable data segment.
EXPECT_EQ(flatbuffer_program->mutable_data_segments()->size(), 1); From 1cc8503056eab95eaf2f753c5a1bf237102a26ba Mon Sep 17 00:00:00 2001 From: cccclai Date: Fri, 6 Sep 2024 16:19:48 -0700 Subject: [PATCH 236/531] Allow qnn to use the IR from torch.export.export Differential Revision: D62219962 Pull Request resolved: https://github.com/pytorch/executorch/pull/4942 --- examples/models/llama2/export_llama_lib.py | 3 +++ examples/models/llava/export_llava.py | 2 ++ extension/llm/export/builder.py | 25 ++++++++++++++++------ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 855e26e9169..c19ddd58a26 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -423,6 +423,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: verbose=args.verbose, max_seq_len=args.max_seq_length, metadata_str=args.metadata, + args=args, ) .set_output_dir(output_dir_path) .to_dtype(dtype_override) @@ -632,6 +633,7 @@ def _load_llama_model( verbose: bool = False, max_seq_len: int = 128, metadata_str: Optional[str] = None, + args, ) -> "LLMEdgeManager": """ A helper util that builds a Llama2 model. It returns a LLMEdgeManager that @@ -693,4 +695,5 @@ def _load_llama_model( model.params, metadata_str, ), + args=args, ) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index a41d8d3ba07..bdeaef15fe6 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -89,6 +89,7 @@ def forward(self, input_pos, embeddings): use_kv_cache=True, example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings), dynamic_shapes=dynamic_shapes, + args=llava.text_model_args, ) dtype_override = DType.fp32 @@ -145,6 +146,7 @@ def forward(self, images): use_kv_cache=True, example_inputs=(resized,), dynamic_shapes=dynamic_shapes, + args=None, ) .capture_pre_autograd_graph() .pt2e_quantize([quantizer]) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 4f5bab7bc02..2c2e52c744f 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -65,6 +65,7 @@ def __init__( dtype, use_kv_cache, example_inputs, + args: Optional[Any] = None, enable_dynamic_shape: bool = False, verbose: bool = False, metadata: Optional[dict] = None, @@ -87,6 +88,7 @@ def __init__( self.output_dir = "." self.dynamic_shapes = dynamic_shapes self._saved_pte_filename = None + self.args = args def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -162,9 +164,20 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): # pyre-fixme[8] - self.pre_autograd_graph_module = capture_pre_autograd_graph( - self.model, self.example_inputs, dynamic_shapes=dynamic_shape - ) + if hasattr(self.args, "qnn") and self.args.qnn: + # TODO: this is temporary and export_for_training doesn't work with qnn either. We need a + # functional graph. 
See issue https://github.com/pytorch/executorch/pull/4627 for more details + self.pre_autograd_graph_module = torch.export.export( + self.model, + self.example_inputs, + dynamic_shapes=dynamic_shape, + strict=True, + ).module() + else: + self.pre_autograd_graph_module = capture_pre_autograd_graph( + self.model, self.example_inputs, dynamic_shapes=dynamic_shape + ) + return self def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager": @@ -210,10 +223,8 @@ def export_to_edge(self) -> "LLMEdgeManager": # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): if self.pre_autograd_graph_module is None: - # pyre-fixme[8] - self.pre_autograd_graph_module = capture_pre_autograd_graph( - self.model, self.example_inputs, dynamic_shapes=dynamic_shape - ) + # Run capture_pre_autograd_graph if it didn't run + self.capture_pre_autograd_graph() self.edge_manager = export_to_edge( self.pre_autograd_graph_module, # pyre-fixme[6] self.example_inputs, From e33c25cbb7a7e62eb3f40d9be49ba71bed7d9074 Mon Sep 17 00:00:00 2001 From: Riandy Date: Fri, 6 Sep 2024 16:49:55 -0700 Subject: [PATCH 237/531] Add logic to not print stop token on Android Differential Revision: D62261849 Pull Request resolved: https://github.com/pytorch/executorch/pull/5151 --- .../example/executorchllamademo/MainActivity.java | 3 +++ .../example/executorchllamademo/PromptFormat.java | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 96b200303c9..7ed9c9ec979 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -73,6 +73,9 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa @Override public void onResult(String result) { + if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { + return; + } if (result.equals("\n\n")) { if (!mResultMessage.getText().isEmpty()) { mResultMessage.appendText(result); diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index a077f4d677f..7342b4ab00c 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -41,4 +41,16 @@ public static String getUserPromptTemplate(ModelType modelType) { return USER_PLACEHOLDER; } } + + public static String getStopToken(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return "<|eot_id|>"; + case LLAVA_1_5: + return ""; + default: + return ""; + } + } } From c02546c8c9eec19d4f5c05e35d75d279ab4c4324 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:53:46 -0700 Subject: [PATCH 238/531] [ExecuTorch] support BF16 in op_mm Differential Revision: D61981353 Pull Request resolved: https://github.com/pytorch/executorch/pull/4978 --- kernels/portable/cpu/op_mm.cpp | 27 ++++++++++++++------------- 
kernels/test/op_mm_test.cpp | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/kernels/portable/cpu/op_mm.cpp b/kernels/portable/cpu/op_mm.cpp index 4a6a8f3cfdc..1241182e4a9 100644 --- a/kernels/portable/cpu/op_mm.cpp +++ b/kernels/portable/cpu/op_mm.cpp @@ -34,19 +34,20 @@ mm_out(RuntimeContext& ctx, const Tensor& in, const Tensor& mat2, Tensor& out) { ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); - ET_SWITCH_REAL_TYPES_AND(Half, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { - size_t m = in.size(0); - size_t n = in.size(1); - size_t p = mat2.size(1); - - vec_matmul( - out.mutable_data_ptr(), - in.const_data_ptr(), - mat2.const_data_ptr(), - m, - n, - p); - }); + ET_SWITCH_REAL_TYPES_AND2( + Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { + size_t m = in.size(0); + size_t n = in.size(1); + size_t p = mat2.size(1); + + vec_matmul( + out.mutable_data_ptr(), + in.const_data_ptr(), + mat2.const_data_ptr(), + m, + n, + p); + }); return out; } diff --git a/kernels/test/op_mm_test.cpp b/kernels/test/op_mm_test.cpp index 70d4b5ff0f5..c05792523f2 100644 --- a/kernels/test/op_mm_test.cpp +++ b/kernels/test/op_mm_test.cpp @@ -81,7 +81,7 @@ TEST_F(OpMmOutTest, OutputDim) { /// zeros(). TEST_F(OpMmOutTest, AllDtypesSupported) { #define TEST_ENTRY(ctype, dtype) test_dtype(); - ET_FORALL_REAL_TYPES_AND(Half, TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY // TODO: Also add tests for half, complex, quantized, and other types. Easiest // way to do that would be to make TensorFactory support zeros() and ones() From 3d6edb06cd530e61c6b744f0b7d8f46b3229d0d2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:53:50 -0700 Subject: [PATCH 239/531] [ExecuTorch] support BF16 in op_copy Differential Revision: D61981357 Pull Request resolved: https://github.com/pytorch/executorch/pull/4979 --- kernels/portable/cpu/op_copy.cpp | 8 ++++---- kernels/test/op_copy_test.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 764a50a5d20..86f2d5c62be 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -45,8 +45,8 @@ Tensor& copy_out( ScalarType in_type = in.scalar_type(); ScalarType src_type = src.scalar_type(); - ET_SWITCH_REALHB_TYPES(in_type, ctx, "copy.out", CTYPE, [&]() { - ET_SWITCH_REALHB_TYPES(src_type, ctx, "copy.out", CTYPE_SRC, [&]() { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "copy.out", CTYPE, [&]() { + ET_SWITCH_REALHBBF16_TYPES(src_type, ctx, "copy.out", CTYPE_SRC, [&]() { apply_binary_elementwise_fn( [](const CTYPE val_in, const CTYPE_SRC val_src) { return convert(val_src); @@ -75,8 +75,8 @@ copy_(RuntimeContext& ctx, Tensor& in, const Tensor& src, bool non_blocking) { ScalarType in_type = in.scalar_type(); ScalarType src_type = src.scalar_type(); - ET_SWITCH_REALHB_TYPES(in_type, ctx, "copy_", CTYPE, [&]() { - ET_SWITCH_REALHB_TYPES(src_type, ctx, "copy_", CTYPE_SRC, [&]() { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "copy_", CTYPE, [&]() { + ET_SWITCH_REALHBBF16_TYPES(src_type, ctx, "copy_", CTYPE_SRC, [&]() { apply_binary_elementwise_fn( [](const CTYPE val_in, const CTYPE_SRC val_src) { return convert(val_src); diff --git a/kernels/test/op_copy_test.cpp b/kernels/test/op_copy_test.cpp index 82332f85eb2..007b10a7636 100644 --- a/kernels/test/op_copy_test.cpp +++ b/kernels/test/op_copy_test.cpp @@ -125,13 +125,13 @@ class OpCopyInplaceTest : public OperatorTest { // 
regular test for copy.out TEST_F(OpCopyTest, AllRealDtypesSupported) { #define TEST_ENTRY(ctype, dtype) test_dtype(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY } TEST_F(OpCopyTest, EmptyInputSupported) { #define TEST_ENTRY(ctype, dtype) test_empty_input(); - ET_FORALL_REAL_TYPES_AND(Bool, TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY } From 9fed53b5703a5c3b1b3ee02d85ad56670950d704 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:53:55 -0700 Subject: [PATCH 240/531] [ExecuTorch] support BF16 in op_slice_scatter Differential Revision: D61981364 Pull Request resolved: https://github.com/pytorch/executorch/pull/4980 --- kernels/portable/cpu/op_slice_scatter.cpp | 4 ++-- kernels/test/op_slice_scatter_test.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernels/portable/cpu/op_slice_scatter.cpp b/kernels/portable/cpu/op_slice_scatter.cpp index a1f9ce4d921..47374716b4e 100644 --- a/kernels/portable/cpu/op_slice_scatter.cpp +++ b/kernels/portable/cpu/op_slice_scatter.cpp @@ -74,8 +74,8 @@ Tensor& slice_scatter_out( ScalarType in_type = input.scalar_type(); ScalarType src_type = src.scalar_type(); - ET_SWITCH_REALHB_TYPES(in_type, ctx, "slice_scatter.out", CTYPE, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "slice_scatter.out", CTYPE, [&]() { + ET_SWITCH_REALHBBF16_TYPES( src_type, ctx, "slice_scatter.out", CTYPE_SRC, [&]() { CTYPE* out_data = out.mutable_data_ptr(); const CTYPE_SRC* src_data = src.const_data_ptr(); diff --git a/kernels/test/op_slice_scatter_test.cpp b/kernels/test/op_slice_scatter_test.cpp index 4901f832a33..1d5c8a43b10 100644 --- a/kernels/test/op_slice_scatter_test.cpp +++ b/kernels/test/op_slice_scatter_test.cpp @@ -49,7 +49,7 @@ class OpSliceScatterTensorOutTest : public OperatorTest { 5, 6, 7, 8, // [1, :] 9, 10, 11, 12, // [2, :] }); - + // op_slice_scatter_out(input, src, /*dim=*/0, /*start=*/0, /*end=*/2, /*step=*/1, out), // src shape should equal to input[0:2:1, :] Tensor src = tf.make( @@ -670,7 +670,7 @@ TEST_F(OpSliceScatterTensorOutTest, LegalStepsSupported) { /// zeros(). TEST_F(OpSliceScatterTensorOutTest, AllRealDtypesSupported) { #define TEST_ENTRY(ctype, dtype) test_dtype(); - ET_FORALL_REAL_TYPES(TEST_ENTRY); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY // TODO: Also add tests for half, complex, quantized, and other types. 
Easiest // way to do that would be to make TensorFactory support zeros() and ones() From b2ca270936b4b84ba39f9c158d63726987096cb2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:53:58 -0700 Subject: [PATCH 241/531] [ExecuTorch] support BF16 in op_scalar_tensor Differential Revision: D61981360 Pull Request resolved: https://github.com/pytorch/executorch/pull/4981 --- kernels/portable/cpu/op_scalar_tensor.cpp | 15 ++++++++------- kernels/test/op_scalar_tensor_test.cpp | 4 ++-- runtime/core/portable_type/scalar.h | 4 ++++ 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/kernels/portable/cpu/op_scalar_tensor.cpp b/kernels/portable/cpu/op_scalar_tensor.cpp index b69267c9917..b79d447f6af 100644 --- a/kernels/portable/cpu/op_scalar_tensor.cpp +++ b/kernels/portable/cpu/op_scalar_tensor.cpp @@ -24,13 +24,14 @@ Tensor& scalar_tensor_out(RuntimeContext& ctx, const Scalar& s, Tensor& out) { constexpr auto name = "scalar_tensor.out"; - ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE, [&]() { - ET_SWITCH_SCALAR_OBJ_TYPES(s_type, ctx, name, CTYPE_S, [&]() { - CTYPE_S val_s; - utils::extract_scalar(s, &val_s); - out.mutable_data_ptr()[0] = convert(val_s); - }); - }); + ET_SWITCH_REAL_TYPES_AND3( + Half, Bool, BFloat16, out_type, ctx, name, CTYPE, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(s_type, ctx, name, CTYPE_S, [&]() { + CTYPE_S val_s; + utils::extract_scalar(s, &val_s); + out.mutable_data_ptr()[0] = convert(val_s); + }); + }); return out; } diff --git a/kernels/test/op_scalar_tensor_test.cpp b/kernels/test/op_scalar_tensor_test.cpp index 7a2f5ca9dab..482f6073a69 100644 --- a/kernels/test/op_scalar_tensor_test.cpp +++ b/kernels/test/op_scalar_tensor_test.cpp @@ -80,7 +80,7 @@ class OpScalarTensorOutTest : public OperatorTest { test_scalar_tensor_out_0d(9); \ } -ET_FORALL_REAL_TYPES(GENERATE_TEST_0D) +ET_FORALL_REAL_TYPES_AND3(Half, Bool, BFloat16, GENERATE_TEST_0D) #define GENERATE_TEST(ctype, dtype) \ TEST_F(OpScalarTensorOutTest, dtype##Tensors) { \ @@ -98,7 +98,7 @@ ET_FORALL_REAL_TYPES(GENERATE_TEST_0D) test_scalar_tensor_out_3d(7); \ } -ET_FORALL_REAL_TYPES(GENERATE_TEST) +ET_FORALL_REAL_TYPES_AND3(Half, Bool, BFloat16, GENERATE_TEST) TEST_F(OpScalarTensorOutTest, InvalidOutShapeFails) { if (torch::executor::testing::SupportedFeatures::get()->is_aten) { diff --git a/runtime/core/portable_type/scalar.h b/runtime/core/portable_type/scalar.h index 2619f9e2614..1147fee7cc9 100644 --- a/runtime/core/portable_type/scalar.h +++ b/runtime/core/portable_type/scalar.h @@ -8,6 +8,8 @@ #pragma once +#include +#include #include #include @@ -39,6 +41,8 @@ class Scalar { /*implicit*/ Scalar(double val) : tag(Tag::Double) { v.as_double = val; } + /*implicit*/ Scalar(BFloat16 val) : Scalar((double)(float)val) {} + /*implicit*/ Scalar(Half val) : Scalar((double)(float)val) {} /// Returns the concrete scalar value stored within. 
template From 234f94894087fcbf80a3cee9feb30366d7eb8ef3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:54:02 -0700 Subject: [PATCH 242/531] [ExecuTorch] support BF16 in op_where Differential Revision: D61981359 Pull Request resolved: https://github.com/pytorch/executorch/pull/4982 --- kernels/portable/cpu/op_where.cpp | 4 ++-- kernels/test/op_where_test.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index bf42447582e..6ff4cb85fb3 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -41,8 +41,8 @@ Tensor& where_out( cond_type == ScalarType::Bool || cond_type == ScalarType::Byte, "Unhandled dtype %s for where.self_out", torch::executor::toString(cond_type)); - ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() { using CTYPE_OUT = typename torch::executor::promote_types::type; apply_ternary_elementwise_fn( diff --git a/kernels/test/op_where_test.cpp b/kernels/test/op_where_test.cpp index 3388e62e2f5..7ddbbef2d74 100644 --- a/kernels/test/op_where_test.cpp +++ b/kernels/test/op_where_test.cpp @@ -80,7 +80,7 @@ class OpWhereOutTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_where(); - ET_FORALL_FLOAT_TYPES(ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -90,7 +90,7 @@ class OpWhereOutTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_where(); - ET_FORALL_REAL_TYPES(ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -148,7 +148,7 @@ class OpWhereOutTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_where_enumerate_b_types(); - ET_FORALL_REAL_TYPES(ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -157,7 +157,7 @@ class OpWhereOutTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_where(); - ET_FORALL_REAL_TYPES(ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } From 03a016863491f396b09d06ef81c6c73e95c79f14 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:54:05 -0700 Subject: [PATCH 243/531] [ExecuTorch] support BF16 in op_add Differential Revision: D61981362 Pull Request resolved: https://github.com/pytorch/executorch/pull/4983 --- kernels/optimized/cpu/op_add.cpp | 17 +++++++++-------- kernels/portable/cpu/op_add.cpp | 20 ++++++++++++++------ kernels/test/op_add_test.cpp | 23 +++++++++++++++++------ 3 files changed, 40 insertions(+), 20 deletions(-) diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index a2a05891e54..d46dd85fb3f 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -83,7 +83,8 @@ Tensor& opt_add_out( ScalarType out_type = out.scalar_type(); if (b.numel() == 1) { - if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) { + if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half && + a_type != ScalarType::BFloat16) { auto error = resize_tensor(out, a.sizes()); ET_KERNEL_CHECK_MSG( ctx, @@ -186,12 +187,12 @@ Tensor& opt_add_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES(a_type, 
ctx, "add.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "add.out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() { CTYPE_IN alpha_val; ET_KERNEL_CHECK( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); @@ -226,7 +227,7 @@ Tensor& opt_add_scalar_out( ET_CHECK(common_type == out_type); - if (common_type == ScalarType::Half) { + if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) { common_type = ScalarType::Float; } @@ -235,7 +236,7 @@ Tensor& opt_add_scalar_out( ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor."); if (a_type == common_type && a_type == out_type && - a_type != ScalarType::Half) { + a_type != ScalarType::Half && a_type != ScalarType::BFloat16) { ET_SWITCH_REALB_TYPES(a_type, ctx, "add.Scalar_out", CTYPE, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "add.Scalar_out", CTYPE_B, [&]() { CTYPE_B b_val; @@ -255,11 +256,11 @@ Tensor& opt_add_scalar_out( }); }); } else { - ET_SWITCH_REALHB_TYPES(a_type, ctx, "add.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "add.Scalar_out", CTYPE_A, [&]() { ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "add.Scalar_out", CTYPE_B, [&]() { ET_SWITCH_REALB_TYPES( common_type, ctx, "add.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out_type, ctx, "add.Scalar_out", CTYPE_OUT, [&]() { CTYPE_B b_val; ET_EXTRACT_SCALAR(b, b_val); diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index a435e4ee658..2cc01a97fa6 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -78,7 +78,11 @@ Tensor& add_out( InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhbbf16_type(out), + InvalidArgument, + out); ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); @@ -94,15 +98,15 @@ Tensor& add_out( constexpr auto name = "add.out"; - ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); CTYPE_IN alpha_val; utils::extract_scalar(alpha, &alpha_val); - ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { AddInner< can_cast::value, CTYPE_A, @@ -132,7 +136,11 @@ Tensor& add_scalar_out( out, "Failed to resize output tensor."); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhbbf16_type(out), + InvalidArgument, + out); ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); @@ -153,7 +161,7 @@ Tensor& add_scalar_out( constexpr auto name = "add.Scalar_out"; - ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() { 
ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { using CTYPE_IN = typename utils::promote_type_with_scalar_type< CTYPE_A, diff --git a/kernels/test/op_add_test.cpp b/kernels/test/op_add_test.cpp index 79a58a0c7ce..51ace05b752 100644 --- a/kernels/test/op_add_test.cpp +++ b/kernels/test/op_add_test.cpp @@ -58,6 +58,7 @@ class OpAddOutKernelTest : public OperatorTest { template void test_add_enumerate_out_types() { + test_add(); test_add(); test_add(); test_add(); @@ -73,7 +74,7 @@ class OpAddOutKernelTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_add_enumerate_out_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -82,7 +83,7 @@ class OpAddOutKernelTest : public OperatorTest { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_add_enumerate_b_types(); - ET_FORALL_REAL_TYPES_AND(Half, ENUMERATE_TEST_ENTRY) + ET_FORALL_REALHBF16_TYPES(ENUMERATE_TEST_ENTRY) #undef ENUMERATE_TEST_ENTRY } @@ -99,13 +100,15 @@ class OpAddOutKernelTest : public OperatorTest { // Add two tensors. op_add_out( - tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}), + tf.make(sizes, /*data=*/{1.25, 2.25, 4.5, 8.875}), tf.ones(sizes), - /*alpha=*/1.1, + /*alpha=*/1.25, out); - // Check that it matches the expected output. - EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{2.2, 3.3, 5.5, 9.9})); + // Check that it matches the expected output. Values selected to + // be exactly representable to avoid throwing off half/bfloat16 + // tests. + EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{2.5, 3.5, 5.75, 10.125})); } }; @@ -136,6 +139,14 @@ TEST_F(OpAddOutKernelTest, DoubleTensors) { test_floating_point_add_out(); } +TEST_F(OpAddOutKernelTest, HalfTensors) { + test_floating_point_add_out(); +} + +TEST_F(OpAddOutKernelTest, BFloat16Tensors) { + test_floating_point_add_out(); +} + TEST_F(OpAddOutKernelTest, BoolAndIntInputTensor) { TensorFactory tf; TensorFactory tfi; From 1d420c95f5673d445fdec7b05c07025b6500723b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:54:09 -0700 Subject: [PATCH 244/531] [ExecuTorch] support BF16 in LLM runner & sampler Differential Revision: D61981354 Pull Request resolved: https://github.com/pytorch/executorch/pull/4984 --- extension/llm/runner/text_decoder_runner.h | 58 ++++++++----------- extension/llm/sampler/sampler.cpp | 2 + .../core/exec_aten/util/scalar_type_util.h | 33 ++++++++--- 3 files changed, 51 insertions(+), 42 deletions(-) diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 16adeeed0a5..14614775f3a 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -67,39 +67,31 @@ class TextDecoderRunner { * @return The next token. */ inline int32_t logits_to_token(const exec_aten::Tensor& logits_tensor) { - switch (logits_tensor.scalar_type()) { - // If the logit_tensor rank is 3, the shape is [batch, seq_length, - // vocab_size], get the last logits, sample and return. Else the model - // outputs the last logit, directly sample and return. 
- case exec_aten::ScalarType::Float: { - float* logits = logits_tensor.mutable_data_ptr(); - if (logits_tensor.dim() == 3) { - auto num_tokens = logits_tensor.size(1); - auto vocab_size = logits_tensor.size(2); - float* logits_last = logits; - logits_last += (num_tokens - 1) * vocab_size; - return sampler_->sample(logits_last); - } - return sampler_->sample(logits); - } - case exec_aten::ScalarType::Half: { - exec_aten::Half* logits = - logits_tensor.mutable_data_ptr(); - if (logits_tensor.dim() == 3) { - auto num_tokens = logits_tensor.size(1); - auto vocab_size = logits_tensor.size(2); - exec_aten::Half* logits_last = logits; - logits_last += (num_tokens - 1) * vocab_size; - return sampler_->sample(logits_last); - } - return sampler_->sample(logits); - } - default: - ET_CHECK_MSG( - false, - "Unsupported dtype output %hhd", - static_cast(logits_tensor.scalar_type())); - } + int32_t result = 0; + ET_SWITCH_THREE_TYPES( + Float, + Half, + BFloat16, + logits_tensor.scalar_type(), + unused, + "logits_to_token", + CTYPE, + [&]() { + // If the logit_tensor rank is 3, the shape is [batch, seq_length, + // vocab_size], get the last logits, sample and return. Else the model + // outputs the last logit, directly sample and return. + auto* logits = logits_tensor.mutable_data_ptr(); + if (logits_tensor.dim() == 3) { + auto num_tokens = logits_tensor.size(1); + auto vocab_size = logits_tensor.size(2); + auto* logits_last = logits; + logits_last += (num_tokens - 1) * vocab_size; + result = sampler_->sample(logits_last); + } else { + result = sampler_->sample(logits); + } + }); + return result; } protected: diff --git a/extension/llm/sampler/sampler.cpp b/extension/llm/sampler/sampler.cpp index 64e1307d262..f7342c48f70 100644 --- a/extension/llm/sampler/sampler.cpp +++ b/extension/llm/sampler/sampler.cpp @@ -192,6 +192,8 @@ int32_t Sampler::sample(T* logits) { template int32_t Sampler::sample(float* logits); template int32_t Sampler::sample(exec_aten::Half* logits); +template int32_t Sampler::sample( + exec_aten::BFloat16* logits); } // namespace llm } // namespace extension diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index 479767b4abb..9a00acc6432 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -953,17 +953,19 @@ inline exec_aten::ScalarType promoteTypes( // #ifdef ET_INTERNAL_CHECK_SELECTIVE_BUILD -#define ET_INTERNAL_SWITCH_CASE(enum_type, CTYPE_ALIAS, ...) \ - case enum_type: { \ - ET_INTERNAL_CHECK_SELECTIVE_BUILD(enum_type); \ - using CTYPE_ALIAS = ScalarTypeToCppType::type; \ - return __VA_ARGS__(); \ +#define ET_INTERNAL_SWITCH_CASE(enum_type, CTYPE_ALIAS, ...) \ + case enum_type: { \ + ET_INTERNAL_CHECK_SELECTIVE_BUILD(enum_type); \ + using CTYPE_ALIAS = \ + ::executorch::runtime::ScalarTypeToCppType::type; \ + return __VA_ARGS__(); \ } #else -#define ET_INTERNAL_SWITCH_CASE(enum_type, CTYPE_ALIAS, ...) \ - case enum_type: { \ - using CTYPE_ALIAS = ScalarTypeToCppType::type; \ - return __VA_ARGS__(); \ +#define ET_INTERNAL_SWITCH_CASE(enum_type, CTYPE_ALIAS, ...) \ + case enum_type: { \ + using CTYPE_ALIAS = \ + ::executorch::runtime::ScalarTypeToCppType::type; \ + return __VA_ARGS__(); \ } #endif @@ -1343,6 +1345,19 @@ inline exec_aten::ScalarType promoteTypes( ET_INTERNAL_SWITCH_CASE( \ exec_aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__)) +#define ET_SWITCH_THREE_TYPES( \ + T1, T2, T3, TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) 
\ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::T1, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::T2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + exec_aten::ScalarType::T3, CTYPE_ALIAS, __VA_ARGS__)) + } // namespace runtime } // namespace executorch From 1511fc1d7fc9c72b635c6433f2de6c0e2785bf74 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 6 Sep 2024 16:54:12 -0700 Subject: [PATCH 245/531] [ExecuTorch] Allow setting dtype to bf16 in export_llama Differential Revision: D61981363 Pull Request resolved: https://github.com/pytorch/executorch/pull/4985 --- examples/models/llama2/export_llama_lib.py | 4 ++-- extension/llm/export/builder.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index c19ddd58a26..e56d2fe848b 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -256,9 +256,9 @@ def build_args_parser() -> argparse.ArgumentParser: "--dtype-override", default="fp32", type=str, - choices=["fp32", "fp16"], + choices=["fp32", "fp16", "bf16"], help="Override the dtype of the model (default is the checkpoint dtype)." - "Options: fp32, fp16. Please be aware that only some backends support fp16.", + "Options: fp32, fp16, bf16. Please be aware that only some backends support fp16 and bf16.", ) parser.add_argument( diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 2c2e52c744f..6eecebb9466 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -46,6 +46,7 @@ def to_torch_dtype(self) -> torch.dtype: mapping = { DType.fp32: torch.float32, DType.fp16: torch.float16, + DType.bf16: torch.bfloat16, } if self not in mapping: raise ValueError(f"Unsupported dtype {self}") From 26e72e4c3b4bac9edf40d6ba7b90bde60b8a45ed Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Fri, 6 Sep 2024 17:02:30 -0700 Subject: [PATCH 246/531] Add coreml models to CI and benchmark workflow Differential Revision: D62323397 Pull Request resolved: https://github.com/pytorch/executorch/pull/5150 --- .ci/scripts/{test.sh => test_model.sh} | 16 ++++++++++++ .github/workflows/android-perf.yml | 2 +- .github/workflows/apple-perf.yml | 3 +-- .github/workflows/periodic.yml | 2 +- .github/workflows/pull.yml | 4 +-- .github/workflows/trunk.yml | 34 ++++++++++++++++++++++++-- 6 files changed, 53 insertions(+), 8 deletions(-) rename .ci/scripts/{test.sh => test_model.sh} (93%) diff --git a/.ci/scripts/test.sh b/.ci/scripts/test_model.sh similarity index 93% rename from .ci/scripts/test.sh rename to .ci/scripts/test_model.sh index 04398c5a483..e589337666d 100755 --- a/.ci/scripts/test.sh +++ b/.ci/scripts/test_model.sh @@ -182,6 +182,16 @@ test_model_with_qnn() { EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME} } +test_model_with_coreml() { + if [[ "${BUILD_TOOL}" == "buck2" ]]; then + echo "coreml doesn't support buck2." + exit 1 + fi + + "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" + EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit) +} + if [[ "${BACKEND}" == "portable" ]]; then echo "Testing ${MODEL_NAME} with portable kernels..." test_model @@ -191,6 +201,12 @@ elif [[ "${BACKEND}" == "qnn" ]]; then if [[ $? 
-eq 0 ]]; then prepare_artifacts_upload fi +elif [[ "${BACKEND}" == "coreml" ]]; then + echo "Testing ${MODEL_NAME} with coreml..." + test_model_with_coreml + if [[ $? -eq 0 ]]; then + prepare_artifacts_upload + fi elif [[ "${BACKEND}" == "xnnpack" ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." test_model_with_xnnpack true true diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 11950623ea0..c98fa98bb26 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -170,7 +170,7 @@ jobs: fi PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" else - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" fi echo "::endgroup::" diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 8da58653a82..416d1ca805e 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -169,9 +169,8 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" else - # TODO (huydhn): Extend the export script here to support other backends such as coreml, mps PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - bash .ci/scripts/test.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" fi echo "::endgroup::" diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 4cc57b0c7f1..df13140ca92 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -62,4 +62,4 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3e346c716e7..ca13d9bbd22 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -54,7 +54,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch with the add model on portable backend. 
- PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "add" "${BUILD_TOOL}" "portable" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable" test-models-linux: name: test-models-linux @@ -81,7 +81,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-llama-runner-linux: name: test-llama-runner-linux diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 9d41f39172b..c1a0d175d04 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -59,7 +59,7 @@ jobs: # Setup MacOS dependencies as there is no Docker support on MacOS atm PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test xecutorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-custom-ops-macos: name: test-custom-ops-macos @@ -320,4 +320,34 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh ${{ matrix.model }} "cmake" "qnn" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn" + + test-coreml-model: + name: test-coreml-model + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + fail-fast: false + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + BUILD_TOOL=cmake + BACKEND=coreml + + bash .ci/scripts/setup-conda.sh + + # Setup MacOS dependencies as there is no Docker support on MacOS atm + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh + echo "Finishing installing coreml." 
+ + # Build and test coreml model + MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l) + for MODEL_NAME in "${MODELS[@]}"; do + echo "::group::Exporting coreml model: $MODEL_NAME" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" + echo "::endgroup::" + done From ab4810cc931cc6200374debb027f836037bf42f5 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 6 Sep 2024 17:12:18 -0700 Subject: [PATCH 247/531] Use metal instance for Android emulator for KVM support Differential Revision: D62299612 Pull Request resolved: https://github.com/pytorch/executorch/pull/5127 --- .github/workflows/android.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 1ea7f398cee..6ed558d3ad2 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -80,7 +80,8 @@ jobs: # Running Android emulator directly on the runner and not using Docker run-emulator: needs: build-llm-demo - runs-on: amz2023.linux.4xlarge + # NB: Use metal install for KVM support to run the emulator faster + runs-on: linux.24xl.spr-metal env: ANDROID_NDK_VERSION: r26c API_LEVEL: 34 @@ -128,9 +129,6 @@ jobs: uses: reactivecircus/android-emulator-runner@v2 with: api-level: ${{ env.API_LEVEL }} - # NB: x86_64 emulator is slow because the lack of KVM support on AWS, it - # seems that we can use metal instance for that but it hasn't been tried - # out yet. Also arm64-v8a arch requires an ARM runner arch: x86_64 script: ./build/run_android_emulator.sh # NB: This is to boot the emulator faster following the instructions on From 8ff79efbd10ec792dde82b4826a5d0b6fae34cab Mon Sep 17 00:00:00 2001 From: Nathanael See Date: Fri, 6 Sep 2024 17:35:50 -0700 Subject: [PATCH 248/531] print tensor storage bytes in print_readable value list section Differential Revision: D62327284 Pull Request resolved: https://github.com/pytorch/executorch/pull/5155 --- backends/vulkan/runtime/graph/Logging.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/runtime/graph/Logging.cpp b/backends/vulkan/runtime/graph/Logging.cpp index 2e4833bfc64..e05fa4e4876 100644 --- a/backends/vulkan/runtime/graph/Logging.cpp +++ b/backends/vulkan/runtime/graph/Logging.cpp @@ -71,8 +71,8 @@ void ComputeGraph::print_readable() { << std::setfill(' ') << std::endl; std::cout << std::setw(6) << "idx" << std::setw(10) << "type" << std::setw(20) - << "sizes" << std::setw(10) << "node_type" << std::setw(10) - << "so_idx" << std::endl; + << "sizes" << std::setw(10) << "node_type" << std::setw(15) + << "storage_bytes" << std::setw(10) << "so_idx" << std::endl; size_t value_idx = 0; for (Value& val : values_) { @@ -108,6 +108,16 @@ void ComputeGraph::print_readable() { } } + // Actual storage bytes used + std::cout << std::setw(15); + if (val.isTensor()) { + const api::vTensor& v_tensor = val.toTensor(); + auto memory_reqs = v_tensor.get_memory_requirements(); + std::cout << memory_reqs.size; + } else { + std::cout << ""; + } + std::cout << std::setw(10); if (value_ref_to_shared_object_idx.count(value_idx) > 0) { size_t shared_obj_idx = value_ref_to_shared_object_idx.at(value_idx); From 32d83b0dbd2d2d2769df534a3657eb8dee437028 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 6 Sep 2024 23:36:22 -0700 Subject: [PATCH 249/531] [Android Java] Get rid of forwardOnes Differential Revision: D62327373 Pull Request resolved: 
https://github.com/pytorch/executorch/pull/5153 --- extension/android/jni/jni_layer.cpp | 31 ++++++++++++++----- .../java/org/pytorch/executorch/Module.java | 10 ++---- .../org/pytorch/executorch/NativePeer.java | 9 ------ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index ef74d6480bb..f2cfc4a5cff 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -294,6 +294,29 @@ class ExecuTorchJni : public facebook::jni::HybridClass { facebook::jni::alias_ref< facebook::jni::JArrayClass::javaobject> jinputs) { + // If no inputs is given, it will run with sample inputs (ones) + if (jinputs->size() == 0) { + if (module_->load_method(method) != Error::Ok) { + return {}; + } + auto&& underlying_method = module_->methods_[method].method; + auto&& buf = prepare_input_tensors(*underlying_method); + auto result = underlying_method->execute(); + if (result != Error::Ok) { + return {}; + } + facebook::jni::local_ref> jresult = + facebook::jni::JArrayClass::newArray( + underlying_method->outputs_size()); + + for (int i = 0; i < underlying_method->outputs_size(); i++) { + auto jevalue = + JEValue::newJEValueFromEValue(underlying_method->get_output(i)); + jresult->setElement(i, *jevalue); + } + return jresult; + } + std::vector evalues; std::vector tensors; @@ -352,20 +375,12 @@ class ExecuTorchJni : public facebook::jni::HybridClass { return jresult; } - jint forward_ones() { - auto&& load_result = module_->load_method("forward"); - auto&& buf = prepare_input_tensors(*(module_->methods_["forward"].method)); - auto&& result = module_->methods_["forward"].method->execute(); - return (jint)result; - } - static void registerNatives() { registerHybrid({ makeNativeMethod("initHybrid", ExecuTorchJni::initHybrid), makeNativeMethod("forward", ExecuTorchJni::forward), makeNativeMethod("execute", ExecuTorchJni::execute), makeNativeMethod("loadMethod", ExecuTorchJni::load_method), - makeNativeMethod("forwardOnes", ExecuTorchJni::forward_ones), }); } }; diff --git a/extension/android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/src/main/java/org/pytorch/executorch/Module.java index dc4bf710d9b..de2ed78b520 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Module.java +++ b/extension/android/src/main/java/org/pytorch/executorch/Module.java @@ -79,16 +79,12 @@ public static Module load(final String modelPath) { /** * Runs the 'forward' method of this module with the specified arguments. * - * @param inputs arguments for the ExecuTorch module's 'forward' method. + * @param inputs arguments for the ExecuTorch module's 'forward' method. Note: if method 'forward' + * requires inputs but no inputs are given, the function will not error out, but run 'forward' + * with sample inputs. * @return return value from the 'forward' method. */ public EValue[] forward(EValue... 
inputs) { - if (inputs.length == 0) { - // forward default args (ones) - mNativePeer.forwardOnes(); - // discard the return value - return null; - } return mNativePeer.forward(inputs); } diff --git a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java index 0e6c0a231cb..f63de985069 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java +++ b/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java @@ -43,15 +43,6 @@ public void resetNative() { @DoNotStrip public native EValue[] forward(EValue... inputs); - /** - * Run a "forward" call with the sample inputs (ones) to test a module - * - * @return the outputs of the forward call - * @apiNote This is experimental and test-only API - */ - @DoNotStrip - public native int forwardOnes(); - /** Run an arbitrary method on the module */ @DoNotStrip public native EValue[] execute(String methodName, EValue... inputs); From 3268da2adee51c219c745645dc7b631ab1b10e9a Mon Sep 17 00:00:00 2001 From: Mengtao Yuan Date: Sat, 7 Sep 2024 08:37:55 -0700 Subject: [PATCH 250/531] [eval_llama] Add option to save checkpoint after eager transforms. Differential Revision: D62150021 Pull Request resolved: https://github.com/pytorch/executorch/pull/5045 --- examples/models/llama2/eval_llama_lib.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 3ea4e77a1a6..bd650fab1ad 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -158,6 +158,15 @@ def gen_eval_wrapper( else manager.model.eval().to(device="cpu") ) + # Save the checkpoint after the eager model preparation is done. + # The reason for this option is that the checkpoint can be used + # to do evaluations in other evaluation platforms, or with data + # that is not available in this eval_llama. We save the checkpoint + # here for consistency with eval_llama. The accuracy results we + # get from eval_llama can be used as a reference to other evaluations. + if args.output_eager_checkpoint_file is not None: + torch.save(model, args.output_eager_checkpoint_file) + return EagerEvalWrapper( model=model, tokenizer=tokenizer, @@ -196,6 +205,12 @@ def build_args_parser() -> argparse.ArgumentParser: default=None, help="[For ExecuTorch] Path to the Tokenizer binary for evaluating ExecuTorch models via runtime", ) + parser.add_argument( + "--output_eager_checkpoint_file", + type=str, + default=None, + help="Save the checkpoint after source transformations, for other evaluation platform to run the same checkpoint.", + ) return parser From 13da62b00ac5f1440b5a56c9156b1b54669189ce Mon Sep 17 00:00:00 2001 From: Gyanendra Sinha Date: Sat, 7 Sep 2024 12:53:51 -0700 Subject: [PATCH 251/531] [CoreML] Add support for running statefule model. 
(#5143) --- .../delegate/ETCoreMLDefaultModelExecutor.mm | 7 ++- .../coreml/runtime/delegate/ETCoreMLModel.h | 26 +++++++-- .../coreml/runtime/delegate/ETCoreMLModel.mm | 57 ++++++++++++++++++- .../runtime/delegate/ETCoreMLModelManager.mm | 17 +----- .../runtime/delegate/MLModel_Prewarm.mm | 2 +- .../runtime/sdk/ETCoreMLModelAnalyzer.mm | 7 +-- .../runtime/sdk/ETCoreMLModelProfiler.h | 10 ++-- .../runtime/sdk/ETCoreMLModelProfiler.mm | 22 +++---- 8 files changed, 96 insertions(+), 52 deletions(-) diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 57316e28015..226307f3c8f 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -29,9 +29,10 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { if (self.ignoreOutputBackings) { predictionOptions.outputBackings = @{}; } - id outputs = [self.model.mlModel predictionFromFeatures:inputs - options:predictionOptions - error:error]; + + id outputs = [self.model predictionFromFeatures:inputs + options:predictionOptions + error:error]; if (!outputs) { return nil; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h index 9bf3183e65a..0bbd1132e9f 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h @@ -8,6 +8,10 @@ #import #import +#if !defined(MODEL_STATE_IS_SUPPORTED) && __has_include() +#define MODEL_STATE_IS_SUPPORTED 1 +#endif + NS_ASSUME_NONNULL_BEGIN @class ETCoreMLAsset; @@ -37,15 +41,12 @@ __attribute__((objc_subclassing_restricted)) orderedOutputNames:(NSOrderedSet*)orderedOutputNames error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; -- (nullable NSArray*)prepareInputs:(const std::vector&)inputs - error:(NSError* __autoreleasing*)error; - -- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs - error:(NSError* __autoreleasing*)error; - /// The underlying MLModel. @property (strong, readonly, nonatomic) MLModel* mlModel; +/// The model state. +@property (strong, readonly, nonatomic) id state API_AVAILABLE(macos(15.0), ios(18.0), tvos(18.0), watchos(11.0)); + /// The asset from which the model is loaded. @property (strong, readonly, nonatomic) ETCoreMLAsset* asset; @@ -58,6 +59,19 @@ __attribute__((objc_subclassing_restricted)) /// The ordered output names of the model. 
@property (copy, readonly, nonatomic) NSOrderedSet* orderedOutputNames; + +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions*)options + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareInputs:(const std::vector&)inputs + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs + error:(NSError* __autoreleasing*)error; + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index ee7218bd271..250d5cd951a 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -7,10 +7,12 @@ #import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLLogging.h" +#import "multiarray.h" +#import "objc_array_util.h" +#import "MLModel_Prewarm.h" #import -#import -#import #import #pragma mark - ETCoreMLMultiArrayDescriptor @@ -194,6 +196,11 @@ - (nullable instancetype)initWithAsset:(ETCoreMLAsset *)asset _cache = [[NSCache alloc] init]; _inputConstraintsByName = get_multi_array_input_constraints_by_name(mlModel.modelDescription); _outputConstraintsByName = get_multi_array_output_constraints_by_name(mlModel.modelDescription); +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + _state = mlModel.modelDescription.stateDescriptionsByName.count > 0 ? [_mlModel newState] : nil; + } +#endif } return self; @@ -272,4 +279,48 @@ MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type } +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions *)options + error:(NSError **)error { + +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + if (self.state != nil) { + return [self.mlModel predictionFromFeatures:input + usingState:(MLState *)self.state + options:options + error:error]; + } + } +#endif + + return [self.mlModel predictionFromFeatures:input + options:options + error:error]; +} + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { + BOOL prewarm = YES; +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + prewarm = (self.mlModel.modelDescription.stateDescriptionsByName.count == 0); + } +#endif + + NSError *localError = nil; + BOOL result = prewarm ? 
[self.mlModel prewarmAndReturnError:&localError] : NO; + if (!result) { + ETCoreMLLogError(localError, + "%@: Failed to prewarm model with identifier = %@", + NSStringFromClass(self.class), + self.identifier); + } + + if (error) { + *error = localError; + } + + return result; +} + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 8d6d537385b..2a5c3ed6961 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -598,21 +598,8 @@ - (BOOL)prewarmModelWithHandle:(ModelHandle *)handle if (!model) { return NO; } - - NSError *localError = nil; - BOOL result = [model.mlModel prewarmAndReturnError:&localError]; - if (!result) { - ETCoreMLLogError(localError, - "%@: Failed to prewarm model with identifier = %@", - NSStringFromClass(self.assetManager.class), - model.identifier); - } - - if (error) { - *error = localError; - } - - return result; + + return [model prewarmAndReturnError:error]; } - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm index 71ce967ac3e..97d0400796f 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm @@ -77,7 +77,7 @@ - (BOOL)prewarmAndReturnError:(NSError * __autoreleasing *)error { if (!inputs) { return NO; } - + id outputs = [self predictionFromFeatures:inputs error:error]; return outputs != nil; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index 1740faf00e6..988b5d808a0 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -88,10 +88,9 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod eventLogger:(const executorchcoreml::ModelEventLogger *)eventLogger error:(NSError * __autoreleasing *)error { if (self.profiler == nil) { - ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithCompiledModelAsset:self.model.asset - outputNames:self.model.orderedOutputNames - configuration:self.configuration - error:error]; + ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithModel:self.model + configuration:self.configuration + error:error]; self.profiler = profiler; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h index 07a384a5167..7a43a30d752 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h @@ -31,14 +31,12 @@ __attribute__((objc_subclassing_restricted)) /// Constructs an `ETCoreMLModelProfiler` instance. /// -/// @param compiledModelAsset The compiled model asset (mlmodelc). -/// @param outputNames The model output names. +/// @param model The model. /// @param configuration The model configuration. /// @param error On failure, error is filled with the failure information. 
-- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset*)compiledModelAsset - outputNames:(NSOrderedSet*)outputNames - configuration:(MLModelConfiguration*)configuration - error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; +- (nullable instancetype)initWithModel:(ETCoreMLModel*)model + configuration:(MLModelConfiguration*)configuration + error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; /// Returns profiling info of operations at the specified paths. /// diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm index c9ad324a6c0..5998701eb0f 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm @@ -8,6 +8,7 @@ #import "ETCoreMLModelProfiler.h" #import "ETCoreMLAsset.h" +#import "ETCoreMLModel.h" #import "ETCoreMLLogging.h" #import "ETCoreMLModelStructurePath.h" #import "ETCoreMLOperationProfilingInfo.h" @@ -221,8 +222,8 @@ void set_model_outputs(id output_features, } @interface ETCoreMLModelProfiler () -/// The CoreML model. -@property (readonly, strong, nonatomic) MLModel *model; +/// The model. +@property (readonly, strong, nonatomic) ETCoreMLModel *model; /// The model output names. @property (readonly, copy, nonatomic) NSOrderedSet *outputNames; #if MODEL_PROFILING_IS_AVAILABLE @@ -240,25 +241,19 @@ @interface ETCoreMLModelProfiler () @implementation ETCoreMLModelProfiler -- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledModelAsset - outputNames:(NSOrderedSet *)outputNames - configuration:(MLModelConfiguration *)configuration - error:(NSError * __autoreleasing *)error { +- (nullable instancetype)initWithModel:(ETCoreMLModel *)model + configuration:(MLModelConfiguration *)configuration + error:(NSError * __autoreleasing *)error { #if MODEL_PROFILING_IS_AVAILABLE if (@available(macOS 14.4, iOS 17.4, tvOS 17.4, watchOS 10.4, *)) { - NSURL *compiledModelURL = compiledModelAsset.contentURL; + NSURL *compiledModelURL = model.asset.contentURL; MLComputePlan *computePlan = get_compute_plan_of_model_at_url(compiledModelURL, configuration, error); if (!computePlan) { return nil; } - - MLModel *model = [MLModel modelWithContentsOfURL:compiledModelURL error:error]; - if (!model) { - return nil; - } - + __block NSMutableArray *operationPaths = [NSMutableArray array]; __block NSMutableDictionary *operationToPathMap = [NSMutableDictionary dictionary]; __block NSMutableArray *topologicallySortedOperations = [NSMutableArray new]; @@ -280,7 +275,6 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod self = [super init]; if (self) { - _outputNames = [outputNames copy]; _model = model; _computePlan = computePlan; _operationToPathMap = operationToPathMap; From 258cf71a8e37ef245ee3949daae7575ee2c5d73b Mon Sep 17 00:00:00 2001 From: cccclai Date: Sat, 7 Sep 2024 22:31:57 -0700 Subject: [PATCH 252/531] Reland add proper calibration for pt2e flow Differential Revision: D62323396 Pull Request resolved: https://github.com/pytorch/executorch/pull/5152 --- examples/models/llama2/eval_llama_lib.py | 82 ++++++++++++--- examples/models/llama2/export_llama_lib.py | 29 +++++- examples/models/llama2/tokenizer/targets.bzl | 16 +++ extension/llm/export/TARGETS | 1 + extension/llm/export/builder.py | 101 ++++++++++++++++++- extension/llm/tokenizer/targets.bzl | 22 +--- 6 files changed, 212 insertions(+), 39 deletions(-) diff --git 
a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index bd650fab1ad..2d10f5edc0a 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -29,6 +29,51 @@ ) +class GraphModuleEvalWrapper(EagerEvalWrapper): + """ + A wrapper class for ExecuTorch py-binded integration with the + lm-evaluation-harness library. + """ + + def __init__( + self, + model: torch.fx.GraphModule, + tokenizer: Union[SentencePieceTokenizer, Tiktoken], + max_seq_length: Optional[int] = None, + use_kv_cache: bool = False, + enable_dynamic_shape: bool = True, + ): + super().__init__( + model=model, tokenizer=tokenizer, max_seq_length=max_seq_length + ) + self._model = model.to(self.device) + self._use_kv_cache = use_kv_cache + self._enable_dynamic_shape = enable_dynamic_shape + + def _model_call(self, inps): + if self._use_kv_cache: + if not self._enable_dynamic_shape: + # graph module exported without dynamic shape won't work with a different shape. + # And we have to do single token prefill here. + result_logits = [] + for pos in range(inps.shape[-1]): + pos_tensor = torch.tensor([pos], dtype=torch.int64) + logits = self._model(inps[:, pos : pos + 1], pos_tensor) + result_logits.append(logits) + return torch.cat(result_logits, dim=1) + else: + pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) + # Batch process the whole sequence. + logits = self._model(inps[:, : self._max_seq_length], pos_tensor) + return logits + + else: + return self._model(inps) + + def _model_generate(self, context, max_length, eos_token_id): + raise Exception("unimplemented") + + class ETPybindEvalWrapper(EagerEvalWrapper): """ A wrapper class for ExecuTorch py-binded integration with the @@ -148,6 +193,13 @@ def gen_eval_wrapper( if torch.cuda.is_available() else manager.pre_autograd_graph_module.to(device="cpu") ) + return GraphModuleEvalWrapper( + model=model, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + use_kv_cache=args.use_kv_cache, + enable_dynamic_shape=args.enable_dynamic_shape, + ) else: # TODO: use manager.pre_autograd_graph_module for the eval to remove the if-else branch # for quantizers. Currently capture_pre_autograd_graph only works with --kv_cache, but @@ -158,21 +210,21 @@ def gen_eval_wrapper( else manager.model.eval().to(device="cpu") ) - # Save the checkpoint after the eager model preparation is done. - # The reason for this option is that the checkpoint can be used - # to do evaluations in other evaluation platforms, or with data - # that is not available in this eval_llama. We save the checkpoint - # here for consistency with eval_llama. The accuracy results we - # get from eval_llama can be used as a reference to other evaluations. - if args.output_eager_checkpoint_file is not None: - torch.save(model, args.output_eager_checkpoint_file) - - return EagerEvalWrapper( - model=model, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - use_kv_cache=args.use_kv_cache, - ) + # Save the checkpoint after the eager model preparation is done. + # The reason for this option is that the checkpoint can be used + # to do evaluations in other evaluation platforms, or with data + # that is not available in this eval_llama. We save the checkpoint + # here for consistency with eval_llama. The accuracy results we + # get from eval_llama can be used as a reference to other evaluations. 
+ if args.output_eager_checkpoint_file is not None: + torch.save(model, args.output_eager_checkpoint_file) + + return EagerEvalWrapper( + model=model, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + use_kv_cache=args.use_kv_cache, + ) def build_args_parser() -> argparse.ArgumentParser: diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index e56d2fe848b..f6abc3aaf4e 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -16,7 +16,7 @@ from enum import Enum from json import JSONDecodeError from pathlib import Path -from typing import Optional, Union +from typing import List, Optional, Union import pkg_resources @@ -166,19 +166,25 @@ def build_args_parser() -> argparse.ArgumentParser: nargs="+", type=str, default=None, - help="Tasks for GPTQ calibration", + help="Tasks for GPTQ calibration from lm_eval", ) parser.add_argument( "--calibration_limit", type=int, default=None, - help="number of samples used for calibration", + help="number of samples used for calibration from lm_eval", ) parser.add_argument( "--calibration_seq_length", type=int, default=None, - help="Sequence length for GPTQ calibration", + help="Sequence length for GPTQ calibration from lm_eval", + ) + parser.add_argument( + "--calibration_data", + type=str, + default="Once upon a time", + help="Calibration prompts from users", ) parser.add_argument( "-t", @@ -420,6 +426,11 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: generate_full_logits=args.generate_full_logits, weight_type=weight_type, enable_dynamic_shape=args.enable_dynamic_shape, + calibration_tasks=args.calibration_tasks, + calibration_limit=args.calibration_limit, + calibration_seq_length=args.calibration_seq_length, + calibration_data=args.calibration_data, + tokenizer_path=args.tokenizer_path, verbose=args.verbose, max_seq_len=args.max_seq_length, metadata_str=args.metadata, @@ -630,6 +641,11 @@ def _load_llama_model( generate_full_logits: bool = False, weight_type: WeightType = WeightType.LLAMA, enable_dynamic_shape: bool = False, + calibration_tasks: Optional[List[str]] = None, + calibration_limit: Optional[int] = None, + calibration_seq_length: Optional[int] = None, + calibration_data: Optional[str] = None, + tokenizer_path: Optional[str] = None, verbose: bool = False, max_seq_len: int = 128, metadata_str: Optional[str] = None, @@ -686,6 +702,11 @@ def _load_llama_model( use_kv_cache=use_kv_cache, example_inputs=example_inputs, enable_dynamic_shape=enable_dynamic_shape, + calibration_tasks=calibration_tasks, + calibration_limit=calibration_limit, + calibration_seq_length=calibration_seq_length, + calibration_data=calibration_data, + tokenizer_path=tokenizer_path, verbose=verbose, metadata=_load_llama_model_metadata( weight_type, diff --git a/examples/models/llama2/tokenizer/targets.bzl b/examples/models/llama2/tokenizer/targets.bzl index 70318740d6a..40f8f29ac1e 100644 --- a/examples/models/llama2/tokenizer/targets.bzl +++ b/examples/models/llama2/tokenizer/targets.bzl @@ -21,3 +21,19 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) + + runtime.python_library( + name = "tiktoken_py", + srcs = [ + "tiktoken.py", + ], + _is_external_target = True, + visibility = [ + "//bento/...", + "//bento_kernels/...", + "//executorch/...", + ], + deps = [ + "fbsource//third-party/pypi/tiktoken:tiktoken", + ], + ) diff --git a/extension/llm/export/TARGETS b/extension/llm/export/TARGETS index 75f5cf937e8..be9bc183dbe 100644 
--- a/extension/llm/export/TARGETS +++ b/extension/llm/export/TARGETS @@ -33,5 +33,6 @@ runtime.python_library( "//executorch/exir:lib", "//executorch/exir/backend:backend_details", "//executorch/extension/export_util:export_util", + "//executorch/extension/llm/tokenizer:tokenizer_py_lib", ], ) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 6eecebb9466..bc64ae869fc 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -27,6 +27,7 @@ from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.extension.export_util.utils import export_to_edge, save_pte_program +from executorch.extension.llm.tokenizer.utils import get_tokenizer from torch._export import capture_pre_autograd_graph from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer import Quantizer @@ -68,6 +69,11 @@ def __init__( example_inputs, args: Optional[Any] = None, enable_dynamic_shape: bool = False, + calibration_tasks: Optional[List[str]] = None, + calibration_limit: Optional[int] = None, + calibration_seq_length: Optional[int] = None, + calibration_data: Optional[str] = None, + tokenizer_path: Optional[str] = None, verbose: bool = False, metadata: Optional[dict] = None, dynamic_shapes: Optional[Any] = None, @@ -90,6 +96,11 @@ def __init__( self.dynamic_shapes = dynamic_shapes self._saved_pte_filename = None self.args = args + self.calibration_tasks = calibration_tasks + self.calibration_limit = calibration_limit + self.calibration_seq_length = calibration_seq_length + self.calibration_data = calibration_data + self.tokenizer_path = tokenizer_path def set_output_dir(self, output_dir: str) -> "LLMEdgeManager": """ @@ -181,6 +192,69 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager": return self + def pt2e_calibrate( + self, + prepared_module, + calibration_tasks, + calibration_limit, + calibration_seq_length, + calibration_data, + tokenizer_path, + ): + logging.info("Run calibration...") + try: + from executorch.examples.models.llama2.eval_llama_lib import ( + GraphModuleEvalWrapper, + ) + from executorch.examples.models.llama2.evaluate import evaluate_model + except ImportError: + raise ImportError( + "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh" + ) + + tokenizer = get_tokenizer(tokenizer_path) + + def calibrate_template( + module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int + ): + # TODO: change criteria & support batch inputs if necessary + pos = torch.tensor(0, dtype=torch.int64) + token_list = tokenizer.encode(prompts, bos=True, eos=False) + + with torch.no_grad(): + while token_list[-1] != tokenizer.eos_id and pos < max_len: + logits = module( + torch.full((1, 1), token_list[pos]), + torch.tensor((pos,)), + ) + pos += 1 + if pos >= len(token_list): + token_list.append(torch.argmax(logits[:], dim=-1).item()) + + calibrate_template( + module=prepared_module, + tokenizer=tokenizer, + prompts=calibration_data, + max_len=calibration_seq_length, + ) + + eval_wrapper = GraphModuleEvalWrapper( + model=prepared_module, + tokenizer=tokenizer, + max_seq_length=calibration_seq_length, + use_kv_cache=self.use_kv_cache, + enable_dynamic_shape=self.enable_dynamic_shape, + ) + eval_results = evaluate_model( + eval_wrapper, + calibration_tasks, + calibration_limit, + ) + + for task, res in eval_results["results"].items(): + print(f"{task}: {res}") + logging.info("Calibration finish...") + def 
pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager": """ Quantize the model via pt2e flow and retrieve LLMEdgeManager including the quantized model. @@ -203,8 +277,33 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage self.pre_autograd_graph_module is not None ), "Please run capture_pre_autograd_graph first" m = prepare_pt2e(self.pre_autograd_graph_module, composed_quantizer) + logging.info( + f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" + ) # Calibrate - m(*self.example_inputs) + if ( + self.calibration_tasks is not None + and self.calibration_limit is not None + and self.calibration_seq_length is not None + and self.calibration_data is not None + and self.tokenizer_path is not None + ): + logging.info( + f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" + ) + self.pt2e_calibrate( + prepared_module=m, + calibration_tasks=self.calibration_tasks, + calibration_limit=self.calibration_limit, + calibration_seq_length=self.calibration_seq_length, + calibration_data=self.calibration_data, + tokenizer_path=self.tokenizer_path, + ) + else: + logging.info( + "No calibration provided, using dummy input to calibrate..." + ) + m(*self.example_inputs) m = convert_pt2e(m) DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m diff --git a/extension/llm/tokenizer/targets.bzl b/extension/llm/tokenizer/targets.bzl index f8e4df095ca..fa6cc915c4b 100644 --- a/extension/llm/tokenizer/targets.bzl +++ b/extension/llm/tokenizer/targets.bzl @@ -11,36 +11,20 @@ def define_common_targets(): srcs = [ "__init__.py", "tokenizer.py", + "utils.py", ], base_module = "executorch.extension.llm.tokenizer", visibility = [ "//executorch/examples/...", "//executorch/extension/llm/tokenizer/...", + "//executorch/extension/llm/export/...", "//bento/...", "//bento_kernels/...", ], _is_external_target = True, - external_deps = [ - "sentencepiece-py", - ], - ) - - runtime.python_library( - name = "utils", - srcs = [ - "utils.py", - ], - base_module = "executorch.extension.llm.utils", - visibility = [ - "//executorch/examples/...", - "//executorch/extension/llm/tokenizer/...", - "//bento/...", - "//bento_kernels/...", - ], deps = [ - "//executorch/examples/models/llama2/tokenizer:tiktoken", + "//executorch/examples/models/llama2/tokenizer:tiktoken_py", ], - _is_external_target = True, external_deps = [ "sentencepiece-py", ], From cb7119328f80e9c9f16a2d6ff81a8e314933ded8 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Sun, 8 Sep 2024 00:12:19 -0700 Subject: [PATCH 253/531] Make TensorImplPtr custom deleter copyable. 
Differential Revision: D62338334 Pull Request resolved: https://github.com/pytorch/executorch/pull/5161 --- extension/tensor/tensor_impl_ptr.cpp | 29 +++++++------ extension/tensor/tensor_impl_ptr.h | 9 ++-- .../tensor/test/tensor_impl_ptr_test.cpp | 43 +++++++++++++++++++ extension/tensor/test/tensor_ptr_test.cpp | 33 ++++++++++++++ 4 files changed, 99 insertions(+), 15 deletions(-) diff --git a/extension/tensor/tensor_impl_ptr.cpp b/extension/tensor/tensor_impl_ptr.cpp index ea4d83f5afd..7ab278f729c 100644 --- a/extension/tensor/tensor_impl_ptr.cpp +++ b/extension/tensor/tensor_impl_ptr.cpp @@ -30,20 +30,23 @@ static void noop_deleter(void*) {} * TensorImpl is destroyed. */ struct TensorImplPtrDeleter final { - std::unique_ptr> data; - std::vector sizes; - std::vector dim_order; - std::vector strides; + // A custom deleter of the std::shared_ptr is required to be copyable until + // C++20, so any data it holds must be copyable too. Hence, we use shared_ptr + // to hold the data and metadata to avoid unnecessary copies. + std::shared_ptr data; + std::shared_ptr> sizes; + std::shared_ptr> dim_order; + std::shared_ptr> strides; void operator()(exec_aten::TensorImpl* pointer) { // Release all resources immediately since the data held by the - // TensorImplDeleter is tied to the managed object, not the smart pointer + // TensorImplPtrDeleter is tied to the managed object, not the smart pointer // itself. We need to free this memory when the object is destroyed, not // when the smart pointer (and deleter) are eventually destroyed or reset. data.reset(); - sizes = {}; - dim_order = {}; - strides = {}; + sizes.reset(); + dim_order.reset(); + strides.reset(); delete pointer; } }; @@ -90,11 +93,13 @@ TensorImplPtr make_tensor_impl_ptr( return TensorImplPtr( tensor_impl.release(), TensorImplPtrDeleter{ - std::unique_ptr>( + std::shared_ptr( data, deleter ? 
std::move(deleter) : noop_deleter), - std::move(sizes), - std::move(dim_order), - std::move(strides)}); + std::make_shared>(std::move(sizes)), + std::make_shared>( + std::move(dim_order)), + std::make_shared>( + std::move(strides))}); #else auto options = c10::TensorOptions() .dtype(c10::scalarTypeToTypeMeta(type)) diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h index e8180d93e72..347c9a36f9a 100644 --- a/extension/tensor/tensor_impl_ptr.h +++ b/extension/tensor/tensor_impl_ptr.h @@ -94,15 +94,18 @@ TensorImplPtr make_tensor_impl_ptr( std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::STATIC) { - const auto data_ptr = data.data(); + auto raw_data_ptr = data.data(); + auto data_ptr = std::make_shared< + std::vector::type>>( + std::move(data)); return make_tensor_impl_ptr( T, std::move(sizes), - data_ptr, + raw_data_ptr, std::move(dim_order), std::move(strides), dynamism, - [data = std::move(data)](void*) {}); + [data_ptr = std::move(data_ptr)](void*) {}); } } // namespace extension diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp index a95f807a736..60f625177b6 100644 --- a/extension/tensor/test/tensor_impl_ptr_test.cpp +++ b/extension/tensor/test/tensor_impl_ptr_test.cpp @@ -181,3 +181,46 @@ TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) { EXPECT_EQ(tensor_impl->strides()[1], 1); EXPECT_EQ(tensor_impl->data(), nullptr); } + +TEST_F(TensorImplPtrTest, SharedDataManagement) { + auto data = std::make_shared>(100, 1.0f); + auto tensor_impl1 = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {10, 10}, data->data()); + auto tensor_impl2 = tensor_impl1; + + EXPECT_EQ(tensor_impl1.get(), tensor_impl2.get()); + EXPECT_EQ(tensor_impl1.use_count(), 2); + EXPECT_EQ(((float*)tensor_impl1->data())[0], 1.0f); + + ((float*)tensor_impl1->mutable_data())[0] = 2.0f; + EXPECT_EQ(((float*)tensor_impl2->data())[0], 2.0f); + + tensor_impl1.reset(); + EXPECT_NE(tensor_impl2.get(), nullptr); + EXPECT_EQ(tensor_impl2.use_count(), 1); + + EXPECT_EQ(((float*)tensor_impl2->data())[0], 2.0f); +} + +TEST_F(TensorImplPtrTest, CustomDeleterWithSharedData) { + auto data = std::make_shared>(100, 1.0f); + bool deleter_called = false; + { + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, + {10, 10}, + data->data(), + {}, + {}, + exec_aten::TensorShapeDynamism::STATIC, + [data, &deleter_called](void*) mutable { + deleter_called = true; + data.reset(); + }); + + EXPECT_EQ(data.use_count(), 2); + EXPECT_FALSE(deleter_called); + } + EXPECT_TRUE(deleter_called); + EXPECT_EQ(data.use_count(), 1); +} diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp index 0d76600a666..1542824fb73 100644 --- a/extension/tensor/test/tensor_ptr_test.cpp +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -176,3 +176,36 @@ TEST_F(TensorPtrTest, TensorOwningEmptyData) { EXPECT_EQ(tensor->strides()[1], 1); EXPECT_EQ(tensor->data_ptr(), nullptr); } + +TEST_F(TensorPtrTest, TensorSharingImplModifiesSharedDataVector) { + std::vector data = {1, 2, 3, 4, 5, 6}; + + auto tensor1 = make_tensor_ptr({2, 3}, std::move(data)); + auto tensor2 = make_tensor_ptr(tensor1); + + tensor1->mutable_data_ptr()[0] = 10; + EXPECT_EQ(tensor2->const_data_ptr()[0], 10); + + tensor2->mutable_data_ptr()[5] = 20; + EXPECT_EQ(tensor1->const_data_ptr()[5], 20); +} + +TEST_F(TensorPtrTest, TensorSharingImplResizingAffectsBothVector) { + std::vector data = {1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + + auto tensor1 = make_tensor_ptr( + {3, 4}, + std::move(data), + {}, + {}, + exec_aten::TensorShapeDynamism::DYNAMIC_UNBOUND); + auto tensor2 = make_tensor_ptr(tensor1); + + EXPECT_EQ(resize_tensor_ptr(tensor1, {2, 6}), Error::Ok); + EXPECT_EQ(tensor2->size(0), 2); + EXPECT_EQ(tensor2->size(1), 6); + + EXPECT_EQ(resize_tensor_ptr(tensor2, {4, 3}), Error::Ok); + EXPECT_EQ(tensor1->size(0), 4); + EXPECT_EQ(tensor1->size(1), 3); +} From cb944b702f2f6a770bccfbd238a2a818e8ba397d Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Sun, 8 Sep 2024 14:13:15 -0700 Subject: [PATCH 254/531] Expose the compute number of elements helper function. Differential Revision: D62352386 Pull Request resolved: https://github.com/pytorch/executorch/pull/5166 --- runtime/core/exec_aten/exec_aten.h | 8 ++++++++ runtime/core/exec_aten/util/scalar_type_util.h | 13 +++++++++++-- runtime/core/portable_type/tensor_impl.cpp | 2 -- runtime/core/portable_type/tensor_impl.h | 15 +++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 808d31502a9..536128d633d 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -87,6 +87,11 @@ using IntArrayRef = at::IntArrayRef; template using OptionalArrayRef = c10::OptionalArrayRef; +inline ssize_t compute_numel(const SizesType* sizes, ssize_t dim) { + return static_cast( + c10::multiply_integers(c10::ArrayRef(sizes, dim))); +} + #else // Use executor types using Tensor = torch::executor::Tensor; @@ -127,9 +132,12 @@ template using OptionalArrayRef = torch::executor::optional>; +using torch::executor::compute_numel; + #endif // Use executor types } // namespace exec_aten + namespace torch { namespace executor { using TensorList = exec_aten::TensorList; diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index 9a00acc6432..4d8712c1590 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -28,6 +28,7 @@ #include #include + #ifdef USE_ATEN_LIB // Note that a lot of the macros/functions defined in this ScalarTypeUtil.h file // are also defined in c10/core/ScalarType.h, which is included via @@ -39,14 +40,14 @@ namespace exec_aten { using ScalarType = at::ScalarType; } -#else +#else // !USE_ATEN_LIB #include #include namespace exec_aten { using ScalarType = torch::executor::ScalarType; using string_view = torch::executor::string_view; } // namespace exec_aten -#endif +#endif // USE_ATEN_LIB namespace executorch { namespace runtime { @@ -1361,6 +1362,14 @@ inline exec_aten::ScalarType promoteTypes( } // namespace runtime } // namespace executorch +namespace exec_aten { +#ifdef USE_ATEN_LIB +using ::at::elementSize; +#else // USE_ATEN_LIB +using ::executorch::runtime::elementSize; +#endif // USE_ATEN_LIB +} // namespace exec_aten + namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index 1ef0ed435bb..ad0fa5868c1 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -20,7 +20,6 @@ namespace torch { namespace executor { -namespace { /** * Compute the number of elements based on the sizes of a tensor. 
*/ @@ -39,7 +38,6 @@ ssize_t compute_numel(const TensorImpl::SizesType* sizes, ssize_t dim) { } return numel; } -} // namespace TensorImpl::TensorImpl( ScalarType type, diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h index 19977b71e09..57fc96aa325 100644 --- a/runtime/core/portable_type/tensor_impl.h +++ b/runtime/core/portable_type/tensor_impl.h @@ -253,5 +253,20 @@ class TensorImpl { const TensorShapeDynamism shape_dynamism_; }; +/** + * Compute the number of elements based on the sizes of a tensor. + */ +ssize_t compute_numel( + const ::torch::executor::TensorImpl::SizesType* sizes, + ssize_t dim); + } // namespace executor } // namespace torch + +namespace executorch { +namespace runtime { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using torch::executor::compute_numel; +} // namespace runtime +} // namespace executorch From 237744facd0d1806a4d7815671bf8af46b7a155e Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Sun, 8 Sep 2024 14:57:36 -0700 Subject: [PATCH 255/531] Give more instructions on java format fix Pull Request resolved: https://github.com/pytorch/executorch/pull/5144 --- .github/workflows/lint.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c2f5ed31c16..56b70409d79 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -69,6 +69,10 @@ jobs: extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java) if [ -n "$FILES_NEEDS_FORMAT" ]; then echo "Warning: The following files need formatting. Please use google-java-format." - echo "$FILES_NEEDS_FORMAT" + echo "Use a binary from https://github.com/google/google-java-format/releases/" + echo "For example:" + echo "wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64" + echo "chmod +x google-java-format_linux-x86-64" + echo "./google-java-format_linux-x86-64 -i $FILES_NEEDS_FORMAT" exit 1 fi From 7c682efdbdcf432fdc2cc27477428c88c39c8b63 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Sun, 8 Sep 2024 14:58:45 -0700 Subject: [PATCH 256/531] [Android script] Add QNN related lib to AAR Add QNN related lib to AAR. Copy from QNN SDK. 
Pull Request resolved: https://github.com/pytorch/executorch/pull/5156 --- build/build_android_llm_demo.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 4d1a0ac1235..3c076cc5bdf 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -84,6 +84,19 @@ build_android_native_library() { # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + + # Copy QNN related so library + if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + fi } build_aar() { @@ -96,8 +109,8 @@ build_aar() { # between Java and JNI find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file - zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so AndroidManifest.xml + zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml popd } From 2863536122d2af3b0a8dce05a5f268d8e6ee9222 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Sun, 8 Sep 2024 16:08:25 -0700 Subject: [PATCH 257/531] [ExecuTorch] disable text animation in iOS Llama demo app Differential Revision: D62308551 Pull Request resolved: https://github.com/pytorch/executorch/pull/5134 Co-authored-by: Scott Wolchok --- .../LLaMA/LLaMA/Application/ContentView.swift | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift index d64314e5349..bac1b9ccf28 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift @@ -220,13 +220,11 @@ struct ContentView: View { let count = tokens.count tokens = [] DispatchQueue.main.async { - withAnimation { - var message = messages.removeLast() - message.text += text - message.tokenCount += count - message.dateUpdated = Date() - messages.append(message) - } + var message = messages.removeLast() + message.text += text + message.tokenCount += count + message.dateUpdated = Date() + messages.append(message) } } if shouldStopGenerating { From 
fd6a59085138316df357dd083251b642a40cea17 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Sun, 8 Sep 2024 16:32:01 -0700 Subject: [PATCH 258/531] [ExecuTorch] Handle rank 0 tensors correctly in optimized add/sub/div/mul Differential Revision: D62310838 Pull Request resolved: https://github.com/pytorch/executorch/pull/5140 --- kernels/optimized/cpu/op_add.cpp | 9 ++++----- kernels/optimized/cpu/op_div.cpp | 8 +++----- kernels/optimized/cpu/op_mul.cpp | 9 ++++----- kernels/optimized/cpu/op_sub.cpp | 8 +++----- kernels/test/op_add_test.cpp | 17 +++++++++++++++++ kernels/test/op_div_test.cpp | 19 +++++++++++++++++++ kernels/test/op_mul_test.cpp | 23 +++++++++++++++++++++++ kernels/test/op_sub_test.cpp | 19 +++++++++++++++++++ 8 files changed, 92 insertions(+), 20 deletions(-) diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index d46dd85fb3f..9f43cde1532 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -85,13 +85,12 @@ Tensor& opt_add_out( if (b.numel() == 1) { if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half && a_type != ScalarType::BFloat16) { - auto error = resize_tensor(out, a.sizes()); - ET_KERNEL_CHECK_MSG( + ET_KERNEL_CHECK( ctx, - error == Error::Ok, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, - out, - "Failed to resize output tensor."); + out); + ET_SWITCH_REALB_TYPES(a_type, ctx, "add.out", CTYPE, [&]() { ET_SWITCH_REALB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { CTYPE alpha_val; diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index ae3fd7b943c..01ca274bdb8 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -66,13 +66,11 @@ Tensor& opt_div_out( scalar = &b; scalar_type = b_type; } - auto error = resize_tensor(out, tensor->sizes()); - ET_KERNEL_CHECK_MSG( + ET_KERNEL_CHECK( ctx, - error == Error::Ok, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, - out, - "Failed to resize output tensor."); + out); ET_SWITCH_REALB_TYPES(tensor_type, ctx, "div.out", CTYPE, [&]() { ET_SWITCH_REALB_TYPES(scalar_type, ctx, "div.out", CTYPE_SCALAR, [&]() { CTYPE_SCALAR scalar_val = *scalar->const_data_ptr(); diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 31b0f7754fb..7f1c2405c52 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -82,13 +82,12 @@ Tensor& opt_mul_out( if (b.numel() == 1) { if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half && a_type != ScalarType::BFloat16) { - auto error = resize_tensor(out, a.sizes()); - ET_KERNEL_CHECK_MSG( + ET_KERNEL_CHECK( ctx, - error == Error::Ok, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, - out, - "Failed to resize output tensor."); + out); + ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.out", CTYPE, [&]() { ET_SWITCH_REALB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { CTYPE_B b_val = *b.const_data_ptr(); diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 252bee8aee8..609468f0d8e 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -101,13 +101,11 @@ Tensor& opt_sub_out( scalar = &b; scalar_type = b_type; } - auto error = resize_tensor(out, tensor->sizes()); - ET_KERNEL_CHECK_MSG( + ET_KERNEL_CHECK( ctx, - error == Error::Ok, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, - out, - "Failed to resize output tensor."); 
+ out); ET_SWITCH_REAL_TYPES(tensor_type, ctx, "sub.out", CTYPE, [&]() { ET_SWITCH_REAL_TYPES(scalar_type, ctx, "sub.out", CTYPE_SCALAR, [&]() { CTYPE alpha_val; diff --git a/kernels/test/op_add_test.cpp b/kernels/test/op_add_test.cpp index 51ace05b752..e35a4100c9a 100644 --- a/kernels/test/op_add_test.cpp +++ b/kernels/test/op_add_test.cpp @@ -352,6 +352,23 @@ TEST_F(OpAddOutKernelTest, BroadcastOneElementTensorTypePromotion) { EXPECT_TENSOR_EQ(out, expected); } +TEST_F(OpAddOutKernelTest, BroadcastOneElementRank0Tensor) { + TensorFactory tf; + + Tensor a = tf.make({1}, {5}); + Tensor b = tf.make({}, {2}); + + Tensor out = tf.zeros({1}); + + op_add_out(a, b, 1, out); + + Tensor ret = tf.make({1}, {7}); + EXPECT_TENSOR_EQ(out, ret); + + op_add_out(b, a, 1, out); + EXPECT_TENSOR_EQ(out, ret); +} + // // Death Tests // diff --git a/kernels/test/op_div_test.cpp b/kernels/test/op_div_test.cpp index eb01893d48b..df7fdaf2ec9 100644 --- a/kernels/test/op_div_test.cpp +++ b/kernels/test/op_div_test.cpp @@ -237,6 +237,25 @@ TEST_F(OpDivOutTest, BroadcastScalarSupported2) { EXPECT_TENSOR_EQ(out, ret); } +TEST_F(OpDivOutTest, BroadcastScalarRank0Supported) { + TensorFactory tf; + + Tensor a = tf.make({1}, {8}); + Tensor b = tf.make({}, {2}); + + Tensor out = tf.zeros({1}); + + op_div_out(a, b, out); + + Tensor ret = tf.make({1}, {4}); + EXPECT_TENSOR_EQ(out, ret); + + op_div_out(b, a, out); + + ret = tf.make({1}, {0.25}); + EXPECT_TENSOR_EQ(out, ret); +} + TEST_F(OpDivOutTest, BroadcastDimSizeIsOneAB) { TensorFactory tf; diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 41a8656f967..84a7e8dedc4 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -182,6 +182,23 @@ class OpMulOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); } + + template + void test_both_scalar_input_broadcast() { + TensorFactory tf_a; + + // a is a rank-1 scalar and b is a rank-0 scalar + Tensor a = tf_a.make({1}, /*data=*/{2}); + Tensor b = tf_a.make({}, /*data=*/{2}); + + // Destination for output of mul. + Tensor out = tf_a.make({1}, /*data=*/{2}); + Tensor expected = tf_a.make({1}, /*data=*/{4}); + + // Check that it matches the expected output. 
+ EXPECT_TENSOR_CLOSE(op_mul_out(a, b, out), expected); + EXPECT_TENSOR_CLOSE(op_mul_out(b, a, out), expected); + } }; class OpMulScalarOutTest : public OperatorTest { @@ -309,6 +326,12 @@ TEST_F(OpMulOutTest, ScalarInputBroadcastTest) { test_scalar_input_broadcast(); } +TEST_F(OpMulOutTest, BothScalarInputBroadcastTest) { + test_both_scalar_input_broadcast(); + test_both_scalar_input_broadcast(); + test_both_scalar_input_broadcast(); +} + TEST_F(OpMulOutTest, MismatchedOutputShapesDies) { if (SupportedFeatures::get()->is_aten) { GTEST_SKIP() << "ATen currently supports mismatched shapes"; diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp index 4bfc22bfcbb..9f795516723 100644 --- a/kernels/test/op_sub_test.cpp +++ b/kernels/test/op_sub_test.cpp @@ -206,6 +206,25 @@ TEST_F(OpSubOutTest, BroadcastScalarSupported2) { EXPECT_TENSOR_EQ(out, ret); } +TEST_F(OpSubOutTest, BroadcastScalarRank0Supported) { + TensorFactory tf; + + Tensor a = tf.make({1}, {5}); + Tensor b = tf.make({}, {2}); + + Tensor out = tf.zeros({1}); + + op_sub_out(a, b, 1, out); + + Tensor ret = tf.make({1}, {3}); + EXPECT_TENSOR_EQ(out, ret); + + op_sub_out(b, a, 1, out); + + ret = tf.make({1}, {-3}); + EXPECT_TENSOR_EQ(out, ret); +} + // // Death Tests // From 99fbca390e4ce190ab078f9ca78d96bfbaa6e2e5 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Sun, 8 Sep 2024 16:54:54 -0700 Subject: [PATCH 259/531] Provide more options to create an owning tensor. Differential Revision: D62339509 Pull Request resolved: https://github.com/pytorch/executorch/pull/5169 --- extension/tensor/targets.bzl | 2 +- extension/tensor/tensor_impl_ptr.cpp | 23 +++++++ extension/tensor/tensor_impl_ptr.h | 47 ++++++++++---- extension/tensor/tensor_ptr.h | 48 +++++++++++--- extension/tensor/test/targets.bzl | 1 - .../tensor/test/tensor_impl_ptr_test.cpp | 62 +++++++++++++++++++ 6 files changed, 162 insertions(+), 21 deletions(-) diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index d00136f8d5b..4998b5cf15b 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -27,10 +27,10 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, - "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, ], exported_deps = [ "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, ], ) diff --git a/extension/tensor/tensor_impl_ptr.cpp b/extension/tensor/tensor_impl_ptr.cpp index 7ab278f729c..01f69095b23 100644 --- a/extension/tensor/tensor_impl_ptr.cpp +++ b/extension/tensor/tensor_impl_ptr.cpp @@ -121,5 +121,28 @@ TensorImplPtr make_tensor_impl_ptr( #endif // USE_ATEN_LIB } +TensorImplPtr make_tensor_impl_ptr( + exec_aten::ScalarType scalar_type, + std::vector sizes, + std::vector data, + std::vector dim_order, + std::vector strides, + exec_aten::TensorShapeDynamism dynamism) { + ET_CHECK_MSG( + data.size() >= exec_aten::compute_numel(sizes.data(), sizes.size()) * + exec_aten::elementSize(scalar_type), + "Data size is smaller than required by sizes and scalar type."); + auto raw_data_ptr = data.data(); + auto data_ptr = std::make_shared>(std::move(data)); + return make_tensor_impl_ptr( + scalar_type, + std::move(sizes), + raw_data_ptr, + std::move(dim_order), + std::move(strides), + dynamism, + [data_ptr = std::move(data_ptr)](void*) {}); +} + } // namespace extension } 
// namespace executorch diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h index 347c9a36f9a..3ccede79b1d 100644 --- a/extension/tensor/tensor_impl_ptr.h +++ b/extension/tensor/tensor_impl_ptr.h @@ -74,32 +74,32 @@ TensorImplPtr make_tensor_impl_ptr( * specified properties. * * This template overload is specialized for cases where the tensor data is - * provided as a vector of a specific scalar type, rather than a raw pointer. - * The deleter ensures that the data vector is properly managed and its - * lifetime is tied to the TensorImpl. + * provided as a vector. The scalar type is automatically deduced from the + * vector's data type. The deleter ensures that the data vector is properly + * managed and its lifetime is tied to the TensorImpl. * - * @tparam T The scalar type of the tensor elements. + * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param sizes A vector specifying the size of each dimension. * @param data A vector containing the tensor's data. * @param dim_order A vector specifying the order of dimensions. * @param strides A vector specifying the strides of each dimension. * @param dynamism Specifies the mutability of the tensor's shape. - * @return A TensorImplPtr managing the newly created TensorImpl. + * @return A TensorImplPtr that manages the newly created TensorImpl. */ -template +template TensorImplPtr make_tensor_impl_ptr( std::vector sizes, - std::vector::type> data, + std::vector data, std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::STATIC) { + constexpr exec_aten::ScalarType scalar_type = + runtime::CppTypeToScalarType::value; auto raw_data_ptr = data.data(); - auto data_ptr = std::make_shared< - std::vector::type>>( - std::move(data)); + auto data_ptr = std::make_shared>(std::move(data)); return make_tensor_impl_ptr( - T, + scalar_type, std::move(sizes), raw_data_ptr, std::move(dim_order), @@ -108,5 +108,30 @@ TensorImplPtr make_tensor_impl_ptr( [data_ptr = std::move(data_ptr)](void*) {}); } +/** + * Creates a TensorImplPtr that manages a newly created TensorImpl with the + * specified properties. + * + * This overload accepts a raw memory buffer stored in a std::vector + * and a scalar type to interpret the data. The vector is managed, and the + * memory's lifetime is tied to the TensorImpl. + * + * @param scalar_type The scalar type of the tensor elements. + * @param sizes A vector specifying the size of each dimension. + * @param data A vector containing the raw memory for the tensor's data. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of each dimension. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorImplPtr managing the newly created TensorImpl. + */ +TensorImplPtr make_tensor_impl_ptr( + exec_aten::ScalarType scalar_type, + std::vector sizes, + std::vector data, + std::vector dim_order = {}, + std::vector strides = {}, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC); + } // namespace extension } // namespace executorch diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index 76b5dc833ed..18568876607 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -141,27 +141,59 @@ inline TensorPtr make_tensor_ptr( * Creates a TensorPtr that manages a Tensor with the specified properties. 
* * This template overload is specialized for cases where the tensor data is - * provided as a vector of a specific scalar type, rather than a raw pointer. - * The deleter ensures that the data vector is properly managed and its - * lifetime is tied to the TensorImpl. + * provided as a vector. The scalar type is automatically deduced from the + * vector's data type. The deleter ensures that the data vector is properly + * managed and its lifetime is tied to the TensorImpl. * - * @tparam T The scalar type of the tensor elements. + * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param sizes A vector specifying the size of each dimension. * @param data A vector containing the tensor's data. * @param dim_order A vector specifying the order of dimensions. * @param strides A vector specifying the strides of each dimension. * @param dynamism Specifies the mutability of the tensor's shape. - * @return A TensorImplPtr managing the newly created TensorImpl. + * @return A TensorPtr that manages the newly created TensorImpl. */ -template +template TensorPtr make_tensor_ptr( std::vector sizes, - std::vector::type> data, + std::vector data, std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = exec_aten::TensorShapeDynamism::STATIC) { - return make_tensor_ptr(make_tensor_impl_ptr( + return make_tensor_ptr(make_tensor_impl_ptr( + std::move(sizes), + std::move(data), + std::move(dim_order), + std::move(strides), + dynamism)); +} + +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This overload accepts a raw memory buffer stored in a std::vector + * and a scalar type to interpret the data. The vector is managed, and the + * memory's lifetime is tied to the TensorImpl. + * + * @param scalar_type The scalar type of the tensor elements. + * @param sizes A vector specifying the size of each dimension. + * @param data A vector containing the raw memory for the tensor's data. + * @param dim_order A vector specifying the order of dimensions. + * @param strides A vector specifying the strides of each dimension. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr managing the newly created Tensor. 
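+ *
+ * A minimal usage sketch (illustrative only; it assumes a 2x2 float layout
+ * with default dim order and strides):
+ *
+ *   std::vector<uint8_t> buffer(4 * sizeof(float));
+ *   auto tensor = make_tensor_ptr(
+ *       exec_aten::ScalarType::Float, {2, 2}, std::move(buffer));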
+ */ +inline TensorPtr make_tensor_ptr( + exec_aten::ScalarType scalar_type, + std::vector sizes, + std::vector data, + std::vector dim_order = {}, + std::vector strides = {}, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + return make_tensor_ptr(make_tensor_impl_ptr( + scalar_type, std::move(sizes), std::move(data), std::move(dim_order), diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl index ad62031ec08..632cc3fb88e 100644 --- a/extension/tensor/test/targets.bzl +++ b/extension/tensor/test/targets.bzl @@ -19,6 +19,5 @@ def define_common_targets(): ], deps = [ "//executorch/extension/tensor:tensor" + aten_suffix, - "//executorch/runtime/core/exec_aten/testing_util:tensor_util" + aten_suffix, ], ) diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp index 60f625177b6..45d79f240af 100644 --- a/extension/tensor/test/tensor_impl_ptr_test.cpp +++ b/extension/tensor/test/tensor_impl_ptr_test.cpp @@ -224,3 +224,65 @@ TEST_F(TensorImplPtrTest, CustomDeleterWithSharedData) { EXPECT_TRUE(deleter_called); EXPECT_EQ(data.use_count(), 1); } + +TEST_F(TensorImplPtrTest, TensorImplDeducedScalarType) { + std::vector data = {1.0, 2.0, 3.0, 4.0}; + auto tensor_impl = make_tensor_impl_ptr({2, 2}, std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 2); + EXPECT_EQ(tensor_impl->size(1), 2); + EXPECT_EQ(tensor_impl->strides()[0], 2); + EXPECT_EQ(tensor_impl->strides()[1], 1); + EXPECT_EQ(((double*)tensor_impl->data())[0], 1.0); + EXPECT_EQ(((double*)tensor_impl->data())[3], 4.0); +} + +TEST_F(TensorImplPtrTest, TensorImplUint8BufferWithFloatScalarType) { + std::vector data( + 4 * exec_aten::elementSize(exec_aten::ScalarType::Float)); + + float* float_data = reinterpret_cast(data.data()); + float_data[0] = 1.0f; + float_data[1] = 2.0f; + float_data[2] = 3.0f; + float_data[3] = 4.0f; + + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {2, 2}, std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 2); + EXPECT_EQ(tensor_impl->size(1), 2); + EXPECT_EQ(tensor_impl->strides()[0], 2); + EXPECT_EQ(tensor_impl->strides()[1], 1); + + EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); + EXPECT_EQ(((float*)tensor_impl->data())[1], 2.0f); + EXPECT_EQ(((float*)tensor_impl->data())[2], 3.0f); + EXPECT_EQ(((float*)tensor_impl->data())[3], 4.0f); +} + +TEST_F(TensorImplPtrTest, TensorImplUint8BufferTooSmallExpectDeath) { + std::vector data( + 2 * exec_aten::elementSize(exec_aten::ScalarType::Float)); + ET_EXPECT_DEATH( + { + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {2, 2}, std::move(data)); + }, + ""); +} + +TEST_F(TensorImplPtrTest, TensorImplUint8BufferTooLarge) { + std::vector data( + 4 * exec_aten::elementSize(exec_aten::ScalarType::Float)); + auto tensor_impl = make_tensor_impl_ptr( + exec_aten::ScalarType::Float, {2, 2}, std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 2); + EXPECT_EQ(tensor_impl->size(1), 2); + EXPECT_EQ(tensor_impl->strides()[0], 2); + EXPECT_EQ(tensor_impl->strides()[1], 1); +} From 0f4caa10a6a41b28fd475e94dcfa7cbc5f2b4776 Mon Sep 17 00:00:00 2001 From: lucylq Date: Mon, 9 Sep 2024 10:29:01 -0700 Subject: [PATCH 260/531] [flamingo] Update preproc imports (#5160) update preproc --- examples/models/flamingo/preprocess/export_preprocess_lib.py | 2 +- examples/models/flamingo/preprocess/test_preprocess.py | 2 
+- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/flamingo/preprocess/export_preprocess_lib.py b/examples/models/flamingo/preprocess/export_preprocess_lib.py index 358b1f2149a..366f5989222 100644 --- a/examples/models/flamingo/preprocess/export_preprocess_lib.py +++ b/examples/models/flamingo/preprocess/export_preprocess_lib.py @@ -14,7 +14,7 @@ from executorch.extension.llm.custom_ops import preprocess_custom_ops # noqa from torch.export import Dim, ExportedProgram -from torchtune.models.clip.inference._transforms import _CLIPImageTransform +from torchtune.models.clip.inference._transform import _CLIPImageTransform def get_example_inputs() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/examples/models/flamingo/preprocess/test_preprocess.py b/examples/models/flamingo/preprocess/test_preprocess.py index 34ad0ab8ed1..b990f44ca1b 100644 --- a/examples/models/flamingo/preprocess/test_preprocess.py +++ b/examples/models/flamingo/preprocess/test_preprocess.py @@ -22,7 +22,7 @@ from parameterized import parameterized from PIL import Image -from torchtune.models.clip.inference._transforms import ( +from torchtune.models.clip.inference._transform import ( _CLIPImageTransform, CLIPImageTransform, ) From 2dee34e5dd9550cca9fffdf2d76e07465787f444 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 9 Sep 2024 10:33:58 -0700 Subject: [PATCH 261/531] Refactor namespace usage in module tests. Differential Revision: D62366679 Pull Request resolved: https://github.com/pytorch/executorch/pull/5172 --- extension/module/test/module_test.cpp | 108 +++++++++++++------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 75cead25a72..7db4784dc93 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -15,9 +15,8 @@ #include -using namespace ::testing; - -namespace torch::executor { +using namespace ::executorch::extension; +using namespace ::executorch::runtime; class ModuleTest : public ::testing::Test { protected: @@ -102,13 +101,13 @@ TEST_F(ModuleTest, TestMethodMeta) { const auto input_meta = meta->input_tensor_meta(0); EXPECT_TRUE(input_meta.ok()); - EXPECT_EQ(input_meta->scalar_type(), ScalarType::Float); + EXPECT_EQ(input_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(input_meta->sizes().size(), 1); EXPECT_EQ(input_meta->sizes()[0], 1); const auto output_meta = meta->output_tensor_meta(0); EXPECT_TRUE(output_meta.ok()); - EXPECT_EQ(output_meta->scalar_type(), ScalarType::Float); + EXPECT_EQ(output_meta->scalar_type(), exec_aten::ScalarType::Float); EXPECT_EQ(output_meta->sizes().size(), 1); EXPECT_EQ(output_meta->sizes()[0], 1); } @@ -125,11 +124,11 @@ TEST_F(ModuleTest, TestExecute) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); EXPECT_TRUE(result.ok()); @@ -149,11 +148,11 @@ TEST_F(ModuleTest, TestExecutePreload) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + 
exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -169,11 +168,11 @@ TEST_F(ModuleTest, TestExecutePreload_method) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -192,11 +191,11 @@ TEST_F(ModuleTest, TestExecutePreloadProgramAndMethod) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = - module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -225,10 +224,11 @@ TEST_F(ModuleTest, TestGet) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module.get("forward", {Tensor(&tensor), Tensor(&tensor)}); + const auto result = module.get( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->toTensor().const_data_ptr(); @@ -240,10 +240,11 @@ TEST_F(ModuleTest, TestForward) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - const auto result = module->forward({Tensor(&tensor), Tensor(&tensor)}); + const auto result = + module->forward({exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -251,9 +252,10 @@ TEST_F(ModuleTest, TestForward) { EXPECT_NEAR(data[0], 2, 1e-5); std::array input2{2, 3}; - TensorImpl tensor2( - ScalarType::Float, sizes.size(), sizes.data(), input2.data()); - const auto result2 = module->forward({Tensor(&tensor2), Tensor(&tensor2)}); + exec_aten::TensorImpl tensor2( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input2.data()); + const auto result2 = module->forward( + {exec_aten::Tensor(&tensor2), exec_aten::Tensor(&tensor2)}); EXPECT_TRUE(result2.ok()); const auto data2 = result->at(0).toTensor().const_data_ptr(); @@ -298,10 +300,9 @@ TEST_F(ModuleTest, TestProgramSharingBetweenModules) { } TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { - auto loader = util::FileDataLoader::from(model_path_.c_str()); + auto loader = FileDataLoader::from(model_path_.c_str()); EXPECT_TRUE(loader.ok()); - auto data_loader = - 
std::make_unique(std::move(loader.get())); + auto data_loader = std::make_unique(std::move(loader.get())); auto module1 = std::make_unique(std::move(data_loader)); @@ -311,24 +312,24 @@ TEST_F(ModuleTest, TestProgramSharingAndDataLoaderManagement) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result1 = - module1->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result1 = module1->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result1.ok()); auto module2 = std::make_unique(module1->program()); - auto result2 = - module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result2 = module2->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result2.ok()); module1 = std::make_unique("/path/to/nonexistent/file.pte"); EXPECT_FALSE(module1->is_loaded()); - auto result3 = - module2->execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result3 = module2->execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result3.ok()); } @@ -336,10 +337,10 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { std::shared_ptr shared_program; { - auto loader = util::FileDataLoader::from(model_path_.c_str()); + auto loader = FileDataLoader::from(model_path_.c_str()); EXPECT_TRUE(loader.ok()); auto data_loader = - std::make_unique(std::move(loader.get())); + std::make_unique(std::move(loader.get())); auto* data_loader_ptr = data_loader.get(); Module module(std::move(data_loader)); @@ -362,10 +363,11 @@ TEST_F(ModuleTest, TestProgramPersistenceAndReuseAfterModuleDestruction) { std::array input{1}; std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), input.data()); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, sizes.size(), sizes.data(), input.data()); - auto result = module.execute("forward", {Tensor(&tensor), Tensor(&tensor)}); + auto result = module.execute( + "forward", {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); auto data = result->at(0).toTensor().const_data_ptr(); @@ -391,10 +393,14 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { const std::array& input) { Module module(program); std::array sizes{1}; - TensorImpl tensor( - ScalarType::Float, sizes.size(), sizes.data(), (void*)input.data()); - - const auto result = module.forward({Tensor(&tensor), Tensor(&tensor)}); + exec_aten::TensorImpl tensor( + exec_aten::ScalarType::Float, + sizes.size(), + sizes.data(), + (void*)input.data()); + + const auto result = module.forward( + {exec_aten::Tensor(&tensor), exec_aten::Tensor(&tensor)}); EXPECT_TRUE(result.ok()); const auto data = result->at(0).toTensor().const_data_ptr(); @@ -413,5 +419,3 @@ TEST_F(ModuleTest, TestConcurrentExecutionWithSharedProgram) { t4.join(); t5.join(); } - -} // namespace torch::executor From 647bfd4ee25de169597a4e0f6bee154cc72b25a6 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 9 Sep 2024 10:34:00 -0700 Subject: [PATCH 262/531] Add an overload to skip dtype and sizes. 
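A minimal usage sketch of the new data-only overload (illustrative only): the
scalar type is deduced from the vector's element type, and the result is a
rank-1 tensor whose single dimension equals data.size().

    std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
    auto tensor = make_tensor_ptr(std::move(data));
    // tensor->dim() == 1, tensor->size(0) == 4, dtype deduced as Float.
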
Differential Revision: D62366751 Pull Request resolved: https://github.com/pytorch/executorch/pull/5177 --- extension/tensor/tensor_impl_ptr.h | 36 +++++++++- extension/tensor/tensor_ptr.h | 21 ++++++ .../tensor/test/tensor_impl_ptr_test.cpp | 70 ++++++++++++++++++- extension/tensor/test/tensor_ptr_test.cpp | 70 ++++++++++++++++++- 4 files changed, 194 insertions(+), 3 deletions(-) diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h index 3ccede79b1d..f336faf07b0 100644 --- a/extension/tensor/tensor_impl_ptr.h +++ b/extension/tensor/tensor_impl_ptr.h @@ -96,7 +96,7 @@ TensorImplPtr make_tensor_impl_ptr( exec_aten::TensorShapeDynamism::STATIC) { constexpr exec_aten::ScalarType scalar_type = runtime::CppTypeToScalarType::value; - auto raw_data_ptr = data.data(); + const auto raw_data_ptr = data.data(); auto data_ptr = std::make_shared>(std::move(data)); return make_tensor_impl_ptr( scalar_type, @@ -108,6 +108,40 @@ TensorImplPtr make_tensor_impl_ptr( [data_ptr = std::move(data_ptr)](void*) {}); } +/** + * Creates a TensorImplPtr that manages a newly created TensorImpl with the + * specified properties. + * + * This template overload is specialized for cases where the tensor data is + * provided as a vector. The scalar type is automatically deduced from the + * vector's data type. The deleter ensures that the data vector is properly + * managed and its lifetime is tied to the TensorImpl. + * + * @tparam T The C++ type of the tensor elements, deduced from the vector. + * @param data A vector containing the tensor's data. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorImplPtr that manages the newly created TensorImpl. + */ +template +TensorImplPtr make_tensor_impl_ptr( + std::vector data, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + constexpr exec_aten::ScalarType scalar_type = + runtime::CppTypeToScalarType::value; + std::vector sizes{exec_aten::SizesType(data.size())}; + const auto raw_data_ptr = data.data(); + auto data_ptr = std::make_shared>(std::move(data)); + return make_tensor_impl_ptr( + scalar_type, + std::move(sizes), + raw_data_ptr, + {0}, + {1}, + dynamism, + [data_ptr = std::move(data_ptr)](void*) {}); +} + /** * Creates a TensorImplPtr that manages a newly created TensorImpl with the * specified properties. diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index 18568876607..ef29d598b84 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -169,6 +169,27 @@ TensorPtr make_tensor_ptr( dynamism)); } +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload is specialized for cases where the tensor data is + * provided as a vector. The scalar type is automatically deduced from the + * vector's data type. The deleter ensures that the data vector is properly + * managed and its lifetime is tied to the TensorImpl. + * + * @tparam T The C++ type of the tensor elements, deduced from the vector. + * @param data A vector containing the tensor's data. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr that manages the newly created TensorImpl. 
+ */ +template +TensorPtr make_tensor_ptr( + std::vector data, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::STATIC) { + return make_tensor_ptr(make_tensor_impl_ptr(std::move(data), dynamism)); +} + /** * Creates a TensorPtr that manages a Tensor with the specified properties. * diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp index 45d79f240af..09d55de3e8e 100644 --- a/extension/tensor/test/tensor_impl_ptr_test.cpp +++ b/extension/tensor/test/tensor_impl_ptr_test.cpp @@ -172,7 +172,7 @@ TEST_F(TensorImplPtrTest, TensorImplOwningData) { } TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) { - auto tensor_impl = make_tensor_impl_ptr({0, 5}, {}); + auto tensor_impl = make_tensor_impl_ptr({0, 5}, std::vector()); EXPECT_EQ(tensor_impl->dim(), 2); EXPECT_EQ(tensor_impl->size(0), 0); @@ -182,6 +182,74 @@ TEST_F(TensorImplPtrTest, TensorImplOwningEmptyData) { EXPECT_EQ(tensor_impl->data(), nullptr); } +TEST_F(TensorImplPtrTest, TensorImplDataOnlyDoubleType) { + std::vector data = {1.0, 2.0, 3.0, 4.0}; + auto tensor_impl = make_tensor_impl_ptr(std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 1); + EXPECT_EQ(tensor_impl->size(0), 4); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(((double*)tensor_impl->data())[0], 1.0); + EXPECT_EQ(((double*)tensor_impl->data())[3], 4.0); +} + +TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt32Type) { + std::vector data = {10, 20, 30, 40}; + auto tensor_impl = make_tensor_impl_ptr(std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 1); + EXPECT_EQ(tensor_impl->size(0), 4); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(((int32_t*)tensor_impl->data())[0], 10); + EXPECT_EQ(((int32_t*)tensor_impl->data())[3], 40); +} + +TEST_F(TensorImplPtrTest, TensorImplDataOnlyInt64Type) { + std::vector data = {100, 200, 300, 400}; + auto tensor_impl = make_tensor_impl_ptr(std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 1); + EXPECT_EQ(tensor_impl->size(0), 4); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(((int64_t*)tensor_impl->data())[0], 100); + EXPECT_EQ(((int64_t*)tensor_impl->data())[3], 400); +} + +TEST_F(TensorImplPtrTest, TensorImplDataOnlyUint8Type) { + std::vector data = {10, 20, 30, 40}; + auto tensor_impl = make_tensor_impl_ptr(std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 1); + EXPECT_EQ(tensor_impl->size(0), 4); + EXPECT_EQ(tensor_impl->strides()[0], 1); + EXPECT_EQ(((uint8_t*)tensor_impl->data())[0], 10); + EXPECT_EQ(((uint8_t*)tensor_impl->data())[3], 40); +} + +TEST_F(TensorImplPtrTest, TensorImplAmbiguityWithMixedVectors) { + std::vector sizes = {2, 2}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + auto tensor_impl = make_tensor_impl_ptr(std::move(sizes), std::move(data)); + + EXPECT_EQ(tensor_impl->dim(), 2); + EXPECT_EQ(tensor_impl->size(0), 2); + EXPECT_EQ(tensor_impl->size(1), 2); + EXPECT_EQ(tensor_impl->strides()[0], 2); + EXPECT_EQ(tensor_impl->strides()[1], 1); + EXPECT_EQ(((float*)tensor_impl->data())[0], 1.0f); + EXPECT_EQ(((float*)tensor_impl->data())[3], 4.0f); + + auto tensor_impl2 = make_tensor_impl_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f}); + + EXPECT_EQ(tensor_impl2->dim(), 2); + EXPECT_EQ(tensor_impl2->size(0), 2); + EXPECT_EQ(tensor_impl2->size(1), 2); + EXPECT_EQ(tensor_impl2->strides()[0], 2); + EXPECT_EQ(tensor_impl2->strides()[1], 1); + EXPECT_EQ(((float*)tensor_impl2->data())[0], 1.0f); + EXPECT_EQ(((float*)tensor_impl2->data())[3], 4.0f); +} + TEST_F(TensorImplPtrTest, SharedDataManagement) { auto data = 
std::make_shared>(100, 1.0f); auto tensor_impl1 = make_tensor_impl_ptr( diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp index 1542824fb73..24aa20a8355 100644 --- a/extension/tensor/test/tensor_ptr_test.cpp +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -167,7 +167,7 @@ TEST_F(TensorPtrTest, TensorOwningData) { } TEST_F(TensorPtrTest, TensorOwningEmptyData) { - auto tensor = make_tensor_ptr({0, 5}, {}); + auto tensor = make_tensor_ptr({0, 5}, std::vector()); EXPECT_EQ(tensor->dim(), 2); EXPECT_EQ(tensor->size(0), 0); @@ -177,6 +177,74 @@ TEST_F(TensorPtrTest, TensorOwningEmptyData) { EXPECT_EQ(tensor->data_ptr(), nullptr); } +TEST_F(TensorPtrTest, TensorImplDataOnlyDoubleType) { + std::vector data = {1.0, 2.0, 3.0, 4.0}; + auto tensor = make_tensor_ptr(std::move(data)); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 1.0); + EXPECT_EQ(tensor->const_data_ptr()[3], 4.0); +} + +TEST_F(TensorPtrTest, TensorImplDataOnlyInt32Type) { + std::vector data = {10, 20, 30, 40}; + auto tensor = make_tensor_ptr(std::move(data)); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 10); + EXPECT_EQ(tensor->const_data_ptr()[3], 40); +} + +TEST_F(TensorPtrTest, TensorImplDataOnlyInt64Type) { + std::vector data = {100, 200, 300, 400}; + auto tensor = make_tensor_ptr(std::move(data)); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 100); + EXPECT_EQ(tensor->const_data_ptr()[3], 400); +} + +TEST_F(TensorPtrTest, TensorImplDataOnlyUint8Type) { + std::vector data = {10, 20, 30, 40}; + auto tensor = make_tensor_ptr(std::move(data)); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 10); + EXPECT_EQ(tensor->const_data_ptr()[3], 40); +} + +TEST_F(TensorPtrTest, TensorImplAmbiguityWithMixedVectors) { + std::vector sizes = {2, 2}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + auto tensor = make_tensor_ptr(std::move(sizes), std::move(data)); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 2); + EXPECT_EQ(tensor->strides()[0], 2); + EXPECT_EQ(tensor->strides()[1], 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 1.0f); + EXPECT_EQ(tensor->const_data_ptr()[3], 4.0f); + + auto tensor2 = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f}); + + EXPECT_EQ(tensor2->dim(), 2); + EXPECT_EQ(tensor2->size(0), 2); + EXPECT_EQ(tensor2->size(1), 2); + EXPECT_EQ(tensor2->strides()[0], 2); + EXPECT_EQ(tensor2->strides()[1], 1); + EXPECT_EQ(tensor2->const_data_ptr()[0], 1.0f); + EXPECT_EQ(tensor2->const_data_ptr()[3], 4.0f); +} + TEST_F(TensorPtrTest, TensorSharingImplModifiesSharedDataVector) { std::vector data = {1, 2, 3, 4, 5, 6}; From b52d4b6f8fe8167c88c3d78d5a34242982acb661 Mon Sep 17 00:00:00 2001 From: Chirag Modi <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:34:14 -0700 Subject: [PATCH 263/531] Enable Llama3 Multi-turn conversation Differential Revision: D61134262 Pull Request resolved: https://github.com/pytorch/executorch/pull/4721 --- .../executorchllamademo/MainActivity.java | 71 +++++++++++++++++-- .../executorchllamademo/MessageAdapter.java | 38 ++++++++++ .../executorchllamademo/PromptFormat.java | 15 +++- 
.../executorchllamademo/SettingsFields.java | 4 +- 4 files changed, 118 insertions(+), 10 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 7ed9c9ec979..308f5fac50a 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -70,6 +70,9 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa private SettingsFields mCurrentSettingsFields; private Handler mMemoryUpdateHandler; private Runnable memoryUpdater; + private int promptID = 0; + + private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; @Override public void onResult(String result) { @@ -195,6 +198,11 @@ private void populateExistingMessages(String existingMsgJSON) { mMessageAdapter.notifyDataSetChanged(); } + private int setPromptID() { + + return mMessageAdapter.getMaxPromptID() + 1; + } + @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); @@ -216,6 +224,7 @@ protected void onCreate(Bundle savedInstanceState) { String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); if (!existingMsgJSON.isEmpty()) { populateExistingMessages(existingMsgJSON); + promptID = setPromptID(); } mSettingsButton = requireViewById(R.id.settings); mSettingsButton.setOnClickListener( @@ -552,6 +561,48 @@ private void addSelectedImagesToChatThread(List selectedImageUri) { mMessageAdapter.notifyDataSetChanged(); } + private String getConversationHistory() { + String conversationHistory = ""; + + ArrayList conversations = + mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); + if (conversations.isEmpty()) { + return conversationHistory; + } + + int prevPromptID = conversations.get(0).getPromptID(); + String conversationFormat = + PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); + String format = conversationFormat; + for (int i = 0; i < conversations.size(); i++) { + Message conversation = conversations.get(i); + int currentPromptID = conversation.getPromptID(); + if (currentPromptID != prevPromptID) { + conversationHistory = conversationHistory + format; + format = conversationFormat; + prevPromptID = currentPromptID; + } + if (conversation.getIsSent()) { + format = format.replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()); + } else { + format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); + } + } + conversationHistory = conversationHistory + format; + + return conversationHistory; + } + + private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { + if (conversationHistory.isEmpty()) { + return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } + + return mCurrentSettingsFields.getFormattedSystemPrompt() + + conversationHistory + + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt); + } + private void onModelRunStarted() { mSendButton.setClickable(false); mSendButton.setImageResource(R.drawable.baseline_stop_24); @@ -586,19 +637,19 @@ private void onModelRunStopped() { + image.getBytes().length); }); String rawPrompt = mEditTextMessage.getText().toString(); - String prompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); // We store 
raw prompt into message adapter, because we don't want to show the extra // tokens from system prompt - mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, 0)); + mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false, MessageType.TEXT, 0); + mResultMessage = new Message("", false, MessageType.TEXT, promptID); mMessageAdapter.add(mResultMessage); // Scroll to bottom of the list mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); // After images are added to prompt and chat thread, we clear the imageURI list // Note: This has to be done after imageURIs are no longer needed by LlamaModule mSelectedImageUri = null; + promptID++; Runnable runnable = new Runnable() { @Override @@ -610,10 +661,10 @@ public void run() { onModelRunStarted(); } }); - ETLogging.getInstance().log("Running inference.. prompt=" + prompt); long generateStartTime = System.currentTimeMillis(); if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) == ModelUtils.VISION_MODEL) { + ETLogging.getInstance().log("Running inference.. prompt=" + rawPrompt); if (!processedImageList.isEmpty()) { // For now, Llava only support 1 image. ETImage img = processedImageList.get(0); @@ -622,7 +673,7 @@ public void run() { img.getWidth(), img.getHeight(), ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - prompt, + rawPrompt, ModelUtils.VISION_MODEL_SEQ_LEN, false, MainActivity.this); @@ -633,14 +684,20 @@ public void run() { 0, 0, ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - prompt, + rawPrompt, ModelUtils.VISION_MODEL_SEQ_LEN, false, MainActivity.this); } } else { + String finalPrompt = + getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + ETLogging.getInstance().log("Running inference.. 
prompt=" + finalPrompt); mModule.generate( - prompt, ModelUtils.TEXT_MODEL_SEQ_LEN, false, MainActivity.this); + finalPrompt, + (int) (finalPrompt.length() * 0.75) + 64, + false, + MainActivity.this); } long generateDuration = System.currentTimeMillis() - generateStartTime; diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java index d9cbd95a1a7..2538c852e48 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -16,6 +16,7 @@ import android.widget.ImageView; import android.widget.TextView; import java.util.ArrayList; +import java.util.Collections; public class MessageAdapter extends ArrayAdapter { @@ -90,4 +91,41 @@ public void clear() { public ArrayList getSavedMessages() { return savedMessages; } + + public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { + ArrayList recentMessages = new ArrayList(); + int lastIndex = savedMessages.size() - 1; + Message messageToAdd = savedMessages.get(lastIndex); + int oldPromptID = messageToAdd.getPromptID(); + + for (int i = 0; i < savedMessages.size(); i++) { + messageToAdd = savedMessages.get(lastIndex - i); + if (messageToAdd.getMessageType() != MessageType.SYSTEM) { + if (messageToAdd.getPromptID() != oldPromptID) { + numOfLatestPromptMessages--; + oldPromptID = messageToAdd.getPromptID(); + } + if (numOfLatestPromptMessages > 0) { + if (messageToAdd.getMessageType() == MessageType.TEXT) { + recentMessages.add(messageToAdd); + } + } else { + break; + } + } + } + + // To place the order in [input1, output1, input2, output2...] 
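+    // The loop above walks savedMessages from newest to oldest, so reverse the
+    // collected messages to restore chronological order before they are used
+    // to build the conversation history.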
+ Collections.reverse(recentMessages); + return recentMessages; + } + + public int getMaxPromptID() { + int maxPromptID = -1; + for (Message msg : savedMessages) { + + maxPromptID = Math.max(msg.getPromptID(), maxPromptID); + } + return maxPromptID; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 7342b4ab00c..4b450553236 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -12,6 +12,7 @@ public class PromptFormat { public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; public static String getSystemPromptTemplate(ModelType modelType) { switch (modelType) { @@ -33,8 +34,20 @@ public static String getUserPromptTemplate(ModelType modelType) { case LLAMA_3_1: return "<|start_header_id|>user<|end_header_id|>\n" + USER_PLACEHOLDER - + "<|eot_id|>\n" + + "<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>"; + + case LLAVA_1_5: + default: + return USER_PLACEHOLDER; + } + } + + public static String getConversationFormat(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + return getUserPromptTemplate(modelType) + "\n" + ASSISTANT_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: return USER_PLACEHOLDER + " ASSISTANT:"; default: diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java index 466d3303e28..b71799981b2 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ -38,12 +38,12 @@ public String getFormattedSystemAndUserPrompt(String prompt) { return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt); } - private String getFormattedSystemPrompt() { + public String getFormattedSystemPrompt() { return PromptFormat.getSystemPromptTemplate(modelType) .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); } - private String getFormattedUserPrompt(String prompt) { + public String getFormattedUserPrompt(String prompt) { return userPrompt.replace(PromptFormat.USER_PLACEHOLDER, prompt); } From cd9d5361fa46a1037a85711b7b2717120112b525 Mon Sep 17 00:00:00 2001 From: Max Ren <40742183+mcr229@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:59:27 -0700 Subject: [PATCH 264/531] Make convert to linear an export pass Differential Revision: D62266927 Pull Request resolved: https://github.com/pytorch/executorch/pull/5133 --- backends/xnnpack/passes/convert_to_linear.py | 39 +++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/passes/convert_to_linear.py index 69f882523c8..2cef71bf927 100644 --- a/backends/xnnpack/passes/convert_to_linear.py +++ b/backends/xnnpack/passes/convert_to_linear.py @@ -13,9 +13,8 @@ from executorch.backends.transforms.addmm_mm_to_linear import ( apply_addmm_mm_to_linear_transform, 
) -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass from torch.fx.passes.infra.pass_base import PassResult from torch.fx.passes.utils.source_matcher_utils import ( @@ -27,7 +26,7 @@ logger.setLevel(logging.WARNING) -class ConvertToLinearPass(XNNPACKPass): +class ConvertToLinearPass(ExportPass): linear_modules = [ torch.nn.Linear, torch.nn.functional.linear, @@ -71,28 +70,24 @@ def get_arg(node: torch.fx.Node, arg: str): map_ = {"input": 0, "weight": 1} return None if arg == "bias" else node.args[map_[arg]] - def find_bias_for_mm(self, src_partition: SourcePartition, weight: torch.fx.Node): + def find_bias_for_mm(self, src_partition: SourcePartition, mm_node: torch.fx.Node): """ For linear decomposed with mm + add, find bias in src partition """ - out_channels = get_shape(weight)[0] - bias = None - - # Try to find bias node in all nodes - for node in src_partition.nodes: - if is_param_node(self.exported_program, node) and node != weight: - bias = node - - if bias is not None: - assert get_shape(bias) == [ - out_channels - ], f"Expected bias shape {[out_channels]} but got {get_shape(bias)}" - else: - assert exir_ops.edge.aten.add.Tensor not in [ - node.target for node in src_partition.nodes - ], f"Expecting to find bias for Linear module: {src_partition} but could not find it" - return bias + mm_users = list(mm_node.users.keys()) + if len(mm_users) != 1: + return None + + add_node = mm_users[0] + if add_node.target != exir_ops.edge.aten.add.Tensor: + return None + + for arg in add_node.all_input_nodes: + if arg != mm_node and arg in src_partition.input_nodes: + return arg + + return None def create_linear( self, @@ -119,7 +114,7 @@ def create_linear( src_partition.input_nodes + src_partition.params, # bias can be in params ) if linear_bias is None and node.target == exir_ops.edge.aten.mm.default: - linear_bias = self.find_bias_for_mm(src_partition, linear_weight) + linear_bias = self.find_bias_for_mm(src_partition, node) logger.debug(f"Found bias(?): {linear_bias} from node {node}") From b69ae0cd20ad924d80e43bb10da3861b62a0612d Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Mon, 9 Sep 2024 11:37:12 -0700 Subject: [PATCH 265/531] Hide and simplify operator registry internals Differential Revision: D62167345 Pull Request resolved: https://github.com/pytorch/executorch/pull/5061 --- .../RegisterCodegenUnboxedKernels.cpp | 10 +- codegen/templates/RegisterKernels.cpp | 3 +- .../make_boxed_from_unboxed_functor.h | 6 +- .../make_boxed_from_unboxed_functor_test.cpp | 35 ++-- extension/pybindings/pybindings.cpp | 4 +- runtime/executor/method.cpp | 17 +- runtime/executor/test/executor_test.cpp | 40 +++-- .../executor/test/kernel_integration_test.cpp | 2 +- .../executor/test/kernel_resolution_test.cpp | 6 +- runtime/kernel/operator_registry.cpp | 164 +++++++++--------- runtime/kernel/operator_registry.h | 119 ++++++------- .../test/kernel_double_registration_test.cpp | 4 +- .../operator_registry_max_kernel_num_test.cpp | 13 +- .../kernel/test/operator_registry_test.cpp | 113 +++++++----- .../test/test_kernel_manual_registration.cpp | 8 +- runtime/kernel/test/test_util.h | 10 +- 16 files changed, 289 insertions(+), 265 deletions(-) diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index a7790be7fed..3076cde1a99 100644 --- 
a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include "${fn_header}" // Generated Function import headers @@ -21,7 +22,8 @@ // JIT op registry instead of c10 dispatcher. JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. -using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; +using KernelSpan = + ::executorch::runtime::Span; namespace torch { namespace executor { namespace function { @@ -31,15 +33,15 @@ static Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; -// Explicitly convert to ArrayRef, so that the API can take an empty C array of +// Explicitly convert to Span, so that the API can take an empty C array of // Kernels. -static KernelArrayRef kernel_array_ref( +static KernelSpan kernel_span( kernels_to_register, kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // kernels in static initialization time. -static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +static auto success_with_kernel_reg = register_kernels(kernel_span); } // namespace } // namespace function } // namespace executor diff --git a/codegen/templates/RegisterKernels.cpp b/codegen/templates/RegisterKernels.cpp index 2313a30a307..91eac200222 100644 --- a/codegen/templates/RegisterKernels.cpp +++ b/codegen/templates/RegisterKernels.cpp @@ -19,7 +19,8 @@ Error register_all_kernels() { Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; - Error success_with_kernel_reg = register_kernels(kernels_to_register); + Error success_with_kernel_reg = + ::executorch::runtime::register_kernels({kernels_to_register}); if (success_with_kernel_reg != Error::Ok) { ET_LOG(Error, "Failed register all kernels"); return success_with_kernel_reg; diff --git a/extension/kernel_util/make_boxed_from_unboxed_functor.h b/extension/kernel_util/make_boxed_from_unboxed_functor.h index 2b21914f49b..409c981cbb1 100644 --- a/extension/kernel_util/make_boxed_from_unboxed_functor.h +++ b/extension/kernel_util/make_boxed_from_unboxed_functor.h @@ -173,9 +173,9 @@ static executorch::runtime::Kernel make_boxed_kernel( } // namespace extension } // namespace executorch -#define EXECUTORCH_LIBRARY(ns, op_name, func) \ - static auto res_##ns = ::executorch::runtime::register_kernels( \ - ::executorch::extension::make_boxed_kernel( \ +#define EXECUTORCH_LIBRARY(ns, op_name, func) \ + static auto res_##ns = ::executorch::runtime::register_kernel( \ + ::executorch::extension::make_boxed_kernel( \ #ns "::" op_name, EXECUTORCH_FN(func))) namespace torch { diff --git a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp index da9596def70..dce3694d517 100644 --- a/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp +++ b/extension/kernel_util/test/make_boxed_from_unboxed_functor_test.cpp @@ -21,10 +21,11 @@ using exec_aten::ScalarType; using exec_aten::Tensor; using exec_aten::TensorImpl; using executorch::runtime::BoxedEvalueList; +using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::getOpsFn; -using executorch::runtime::hasOpsFn; +using 
executorch::runtime::get_op_function_from_registry; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::registry_has_op_function; Tensor& my_op_out(KernelRuntimeContext& ctx, const Tensor& a, Tensor& out) { (void)ctx; @@ -91,12 +92,12 @@ class MakeBoxedFromUnboxedFunctorTest : public ::testing::Test { TEST_F(MakeBoxedFromUnboxedFunctorTest, Basic) { EXECUTORCH_LIBRARY(my_ns, "my_op.out", my_op_out); - EXPECT_TRUE(hasOpsFn("my_ns::my_op.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::my_op.out")); } TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { EXECUTORCH_LIBRARY(my_ns, "set_1.out", set_1_out); - EXPECT_TRUE(hasOpsFn("my_ns::set_1.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::set_1.out")); // prepare out tensor TensorImpl::SizesType sizes[1] = {5}; @@ -106,7 +107,8 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { auto a = Tensor(&a_impl); // get boxed callable - auto fn = getOpsFn("my_ns::set_1.out"); + auto fn = get_op_function_from_registry("my_ns::set_1.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it KernelRuntimeContext context; @@ -115,7 +117,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { EValue* stack[1]; stack[0] = &values[0]; - fn(context, stack); + (*fn)(context, stack); // check result EXPECT_EQ(a.const_data_ptr()[0], 1); @@ -123,7 +125,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxLogicWorks) { TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { EXECUTORCH_LIBRARY(my_ns, "add_tensor.out", add_tensor_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_tensor.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_tensor.out")); // prepare ArrayRef input. torch::executor::testing::TensorFactory tf; @@ -135,13 +137,14 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { // prepare out tensor. EValue out(tf.zeros({5})); - auto fn = getOpsFn("my_ns::add_tensor.out"); + auto fn = get_op_function_from_registry("my_ns::add_tensor.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; - fn(context, stack); + (*fn)(context, stack); // check result. for (int i = 0; i < 5; i++) { @@ -151,7 +154,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxArrayRef) { TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptional) { EXECUTORCH_LIBRARY(my_ns, "add_optional_scalar.out", add_optional_scalar_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_optional_scalar.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_optional_scalar.out")); // prepare optional input. EValue scalar((int64_t)3); @@ -160,13 +163,14 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptional) { // prepare out tensor. torch::executor::testing::TensorFactory tf; EValue out(tf.ones({1})); - auto fn = getOpsFn("my_ns::add_optional_scalar.out"); + auto fn = get_op_function_from_registry("my_ns::add_optional_scalar.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[3] = {scalar, scalar_none, out}; EValue* stack[3] = {&values[0], &values[1], &values[2]}; - fn(context, stack); + (*fn)(context, stack); // check result. 
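  // out starts as ones({1}) and the present optional scalar is 3, so the value
  // checked below is presumably 1 + 3 = 4, with the empty optional ignored.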
EXPECT_EQ(stack[2]->toTensor().const_data_ptr()[0], 4); @@ -174,7 +178,7 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptional) { TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptionalArrayRef) { EXECUTORCH_LIBRARY(my_ns, "add_optional_tensor.out", add_optional_tensor_out); - EXPECT_TRUE(hasOpsFn("my_ns::add_optional_tensor.out")); + EXPECT_TRUE(registry_has_op_function("my_ns::add_optional_tensor.out")); // prepare optional tensors. torch::executor::testing::TensorFactory tf; @@ -186,13 +190,14 @@ TEST_F(MakeBoxedFromUnboxedFunctorTest, UnboxOptionalArrayRef) { // prepare out tensor. EValue out(tf.zeros({5})); - auto fn = getOpsFn("my_ns::add_optional_tensor.out"); + auto fn = get_op_function_from_registry("my_ns::add_optional_tensor.out"); + ASSERT_EQ(fn.error(), Error::Ok); // run it. KernelRuntimeContext context; EValue values[2] = {boxed_array_ref, out}; EValue* stack[2] = {&values[0], &values[1]}; - fn(context, stack); + (*fn)(context, stack); // check result. for (int i = 0; i < 5; i++) { diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index c605c48c582..000cecf4b23 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -79,7 +79,7 @@ using ::executorch::runtime::DataLoader; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerDebugLogLevel; -using ::executorch::runtime::get_kernels; +using ::executorch::runtime::get_registered_kernels; using ::executorch::runtime::HierarchicalAllocator; using ::executorch::runtime::Kernel; using ::executorch::runtime::MemoryAllocator; @@ -774,7 +774,7 @@ void create_profile_block(const std::string& name) { } py::list get_operator_names() { - ArrayRef kernels = get_kernels(); + Span kernels = get_registered_kernels(); py::list res; for (const Kernel& k : kernels) { if (k.name_ != nullptr) { diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index d39ba875531..4ec02aee921 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -527,19 +527,20 @@ Error Method::resolve_operator( i, static_cast(err)); meta[count].dim_order_ = - ArrayRef(dim_order_ptr, size); + Span(dim_order_ptr, size); count++; } } - // search kernel - if (hasOpsFn(operator_name, ArrayRef(meta, count))) { - kernels[kernel_index] = - getOpsFn(operator_name, ArrayRef(meta, count)); - return Error::Ok; - } else { + + // Find a kernel with the matching name and tensor meta. 
+ Result op_function = + get_op_function_from_registry(operator_name, {meta, count}); + if (!op_function.ok()) { ET_LOG(Error, "Missing operator: [%d] %s", op_index, operator_name); - return Error::OperatorMissing; + return op_function.error(); } + kernels[kernel_index] = op_function.get(); + return Error::Ok; } Result Method::load( diff --git a/runtime/executor/test/executor_test.cpp b/runtime/executor/test/executor_test.cpp index da0d53374f1..15b3982297c 100644 --- a/runtime/executor/test/executor_test.cpp +++ b/runtime/executor/test/executor_test.cpp @@ -24,11 +24,13 @@ using exec_aten::SizesType; using exec_aten::Tensor; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::getOpsFn; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; -using executorch::runtime::register_kernels; +using executorch::runtime::OpFunction; +using executorch::runtime::register_kernel; +using executorch::runtime::registry_has_op_function; +using executorch::runtime::Result; using executorch::runtime::testing::TensorFactory; namespace pytree = ::executorch::extension::pytree; @@ -87,9 +89,9 @@ TEST_F(ExecutorTest, TensorHalf) { TEST_F(ExecutorTest, RegistryLookupAndCall) { const char* op_name = "aten::add.out"; - ASSERT_TRUE(hasOpsFn(op_name)); - auto func = getOpsFn(op_name); - ASSERT_TRUE(func); + Result func = get_op_function_from_registry(op_name); + ASSERT_EQ(func.error(), Error::Ok); + ASSERT_NE(*func, nullptr); TensorFactory tf; constexpr size_t num_evalues = 4; @@ -108,7 +110,7 @@ TEST_F(ExecutorTest, RegistryLookupAndCall) { kernel_args[4] = &evalues[3]; KernelRuntimeContext context{}; - func(context, kernel_args); + (*func)(context, kernel_args); auto c_ptr = evalues[3].toTensor().const_data_ptr(); ASSERT_EQ(c_ptr[3], 12); } @@ -166,15 +168,15 @@ TEST_F(ExecutorTest, EValueToScalar) { void test_op(KernelRuntimeContext& /*unused*/, EValue** /*unused*/) {} TEST_F(ExecutorTest, OpRegistration) { - auto s1 = register_kernels({Kernel("test", test_op)}); - auto s2 = register_kernels({Kernel("test_2", test_op)}); + auto s1 = register_kernel(Kernel("test", test_op)); + auto s2 = register_kernel(Kernel("test_2", test_op)); ASSERT_EQ(Error::Ok, s1); ASSERT_EQ(Error::Ok, s2); ET_EXPECT_DEATH( - []() { (void)register_kernels({Kernel("test", test_op)}); }(), ""); + []() { (void)register_kernel(Kernel("test", test_op)); }(), ""); - ASSERT_TRUE(hasOpsFn("test")); - ASSERT_TRUE(hasOpsFn("test_2")); + ASSERT_TRUE(registry_has_op_function("test")); + ASSERT_TRUE(registry_has_op_function("test_2")); } TEST_F(ExecutorTest, OpRegistrationWithContext) { @@ -184,25 +186,27 @@ TEST_F(ExecutorTest, OpRegistrationWithContext) { (void)context; *(values[0]) = Scalar(100); }); - auto s1 = register_kernels({op}); + auto s1 = register_kernel(op); ASSERT_EQ(Error::Ok, s1); - ASSERT_TRUE(hasOpsFn("test_op_with_context")); - auto func = getOpsFn("test_op_with_context"); + Result func = + get_op_function_from_registry("test_op_with_context"); + ASSERT_EQ(func.error(), Error::Ok); + EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); } TEST_F(ExecutorTest, AddMulAlreadyRegistered) { - ASSERT_TRUE(hasOpsFn("aten::add.out")); - ASSERT_TRUE(hasOpsFn("aten::mul.out")); + 
ASSERT_TRUE(registry_has_op_function("aten::add.out")); + ASSERT_TRUE(registry_has_op_function("aten::mul.out")); } TEST(PyTreeEValue, List) { diff --git a/runtime/executor/test/kernel_integration_test.cpp b/runtime/executor/test/kernel_integration_test.cpp index 3e7da810933..616398b7416 100644 --- a/runtime/executor/test/kernel_integration_test.cpp +++ b/runtime/executor/test/kernel_integration_test.cpp @@ -94,7 +94,7 @@ struct KernelControl { executorch::runtime::KernelKey("v1/6;0,1|6;0,1|6;0,1|6;0,1"); Kernel kernel = executorch::runtime::Kernel( "aten::add.out", key, KernelControl::kernel_hook); - Error err = executorch::runtime::register_kernels({kernel}); + Error err = executorch::runtime::register_kernel(kernel); EXPECT_EQ(err, Error::Ok); registered_ = true; diff --git a/runtime/executor/test/kernel_resolution_test.cpp b/runtime/executor/test/kernel_resolution_test.cpp index 7ce16a8e9f3..aae0ff9b7ea 100644 --- a/runtime/executor/test/kernel_resolution_test.cpp +++ b/runtime/executor/test/kernel_resolution_test.cpp @@ -34,7 +34,7 @@ using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::Method; using executorch::runtime::Program; -using executorch::runtime::register_kernels; +using executorch::runtime::register_kernel; using executorch::runtime::Result; using executorch::runtime::TensorMeta; using executorch::runtime::testing::ManagedMemoryManager; @@ -77,7 +77,7 @@ TEST_F(KernelResolutionTest, InitExecutionPlanSuccess) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernel(kernel_1); EXPECT_EQ(s1, executorch::runtime::Error::Ok); ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); @@ -109,7 +109,7 @@ TEST_F(KernelResolutionTest, ResolveKernelKeySuccess) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernel(kernel_1); EXPECT_EQ(s1, executorch::runtime::Error::Ok); ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp index a8fd50d7b91..78aa0a51732 100644 --- a/runtime/kernel/operator_registry.cpp +++ b/runtime/kernel/operator_registry.cpp @@ -8,53 +8,63 @@ #include -#include -#include #include #include +#include +#include namespace executorch { namespace runtime { -OperatorRegistry& getOperatorRegistry(); -OperatorRegistry& getOperatorRegistry() { - static OperatorRegistry operator_registry; - return operator_registry; -} - -Error register_kernels(const ArrayRef& kernels) { - Error success = getOperatorRegistry().register_kernels(kernels); - if (success == Error::InvalidArgument || success == Error::Internal) { - ET_CHECK_MSG( - false, - "Kernel registration failed with error %" PRIu32 - ", see error log for details.", - static_cast(success)); - } - return success; -} - -Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { - // Operator registration happens in static initialization time when PAL init - // may or may not happen already. Here we are assuming et_pal_init() doesn't - // have any side effect even if falled multiple times. +namespace { + +// Maximum number of operators and their associated kernels that can be +// registered. 
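With the defaults in the block that follows, the cap works out to 250 operators × 8 kernels each = 2,000 table slots, each occupying sizeof(Kernel) bytes of statically allocated storage; defining MAX_KERNEL_NUM at build time replaces that limit with a caller-chosen one.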
+#ifdef MAX_KERNEL_NUM +constexpr uint32_t kMaxRegisteredKernels = MAX_KERNEL_NUM; +#else +constexpr uint32_t kMaxOperators = 250; +constexpr uint32_t kMaxKernelsPerOp = 8; +constexpr uint32_t kMaxRegisteredKernels = kMaxOperators * kMaxKernelsPerOp; +#endif + +// Data that backs the kernel table. Since Kernel has a custom default +// constructor (implicitly, because it contains KernelKey, which has a custom +// ctor), some toolchains don't like having a global array of them: it would +// require constructing them at init time. Since we don't care about the values +// until we add each entry to the table, allocate static zeroed memory instead +// and point the table at it. +// @lint-ignore CLANGTIDY facebook-hte-CArray +alignas(sizeof(Kernel)) uint8_t + registered_kernels_data[kMaxRegisteredKernels * sizeof(Kernel)]; + +/// Global table of registered kernels. +Kernel* registered_kernels = reinterpret_cast(registered_kernels_data); + +/// The number of kernels registered in the table. +size_t num_registered_kernels = 0; + +// Registers the kernels, but may return an error. +Error register_kernels_internal(const Span kernels) { + // Operator registration happens in static initialization time before or after + // PAL init, so call it here. It is safe to call multiple times. ::et_pal_init(); - if (kernels.size() + this->num_kernels_ > kMaxNumOfKernels) { + if (kernels.size() + num_registered_kernels > kMaxRegisteredKernels) { ET_LOG( Error, - "The total number of kernels to be registered is larger than the limit %" PRIu32 - ". %" PRIu32 - " kernels are already registered and we're trying to register another %" PRIu32 - " kernels.", - kMaxNumOfKernels, - (uint32_t)this->num_kernels_, + "The total number of kernels to be registered is larger than the limit " + "%" PRIu32 ". %" PRIu32 + " kernels are already registered and we're trying to register another " + "%" PRIu32 " kernels.", + kMaxRegisteredKernels, + (uint32_t)num_registered_kernels, (uint32_t)kernels.size()); ET_LOG(Error, "======== Kernels already in the registry: ========"); - for (size_t i = 0; i < this->num_kernels_; i++) { - ET_LOG(Error, "%s", this->kernels_[i].name_); - ET_LOG_KERNEL_KEY(this->kernels_[i].kernel_key_); + for (size_t i = 0; i < num_registered_kernels; i++) { + ET_LOG(Error, "%s", registered_kernels[i].name_); + ET_LOG_KERNEL_KEY(registered_kernels[i].kernel_key_); } ET_LOG(Error, "======== Kernels being registered: ========"); for (size_t i = 0; i < kernels.size(); i++) { @@ -67,9 +77,9 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { const char* lib_name = et_pal_get_shared_library_name(kernels.data()); for (const auto& kernel : kernels) { - // linear search. This is fine if the number of kernels are small. - for (int32_t i = 0; i < this->num_kernels_; i++) { - Kernel k = this->kernels_[i]; + // Linear search. This is fine if the number of kernels is small. 
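The zeroed-storage idiom used for the table above can be shown in isolation. A minimal sketch with hypothetical names, where `Entry` stands in for Kernel (which has an implicitly non-trivial default constructor via KernelKey):

    #include <cstddef>
    #include <cstdint>

    struct Entry {                   // stand-in for Kernel; NSDMIs make the
      const char* name = nullptr;    // default constructor non-trivial
      void (*fn)() = nullptr;
    };
    // Raw zeroed bytes instead of a global Entry array, so no constructors
    // have to run during static initialization.
    alignas(Entry) static uint8_t entry_storage[8 * sizeof(Entry)];
    static Entry* entries = reinterpret_cast<Entry*>(entry_storage);
    static size_t entry_count = 0;
    // Appending is a plain assignment into the preallocated slots:
    //   entries[entry_count++] = Entry{"demo::op.out", some_fn};

The loop that follows then scans this table linearly for name/key collisions before appending the new kernel.
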
+ for (int32_t i = 0; i < num_registered_kernels; i++) { + Kernel k = registered_kernels[i]; if (strcmp(kernel.name_, k.name_) == 0 && kernel.kernel_key_ == k.kernel_key_) { ET_LOG(Error, "Re-registering %s, from %s", k.name_, lib_name); @@ -77,7 +87,7 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { return Error::InvalidArgument; } } - this->kernels_[this->num_kernels_++] = kernel; + registered_kernels[num_registered_kernels++] = kernel; } ET_LOG( Debug, @@ -87,11 +97,23 @@ Error OperatorRegistry::register_kernels(const ArrayRef& kernels) { return Error::Ok; } -bool hasOpsFn(const char* name, ArrayRef kernel_key) { - return getOperatorRegistry().hasOpsFn(name, kernel_key); +} // namespace + +// Registers the kernels, but panics if an error occurs. Always returns Ok. +Error register_kernels(const Span kernels) { + Error success = register_kernels_internal(kernels); + if (success == Error::InvalidArgument || success == Error::Internal) { + ET_CHECK_MSG( + false, + "Kernel registration failed with error %" PRIu32 + ", see error log for details.", + static_cast(success)); + } + return success; } -static int copy_char_as_number_to_buf(char num, char* buf) { +namespace { +int copy_char_as_number_to_buf(char num, char* buf) { if ((char)num < 10) { *buf = '0' + (char)num; buf += 1; @@ -104,10 +126,10 @@ static int copy_char_as_number_to_buf(char num, char* buf) { return 2; } } +} // namespace -void make_kernel_key_string(ArrayRef key, char* buf); - -void make_kernel_key_string(ArrayRef key, char* buf) { +namespace internal { +void make_kernel_key_string(Span key, char* buf) { if (key.empty()) { // If no tensor is present in an op, kernel key does not apply return; @@ -130,61 +152,43 @@ void make_kernel_key_string(ArrayRef key, char* buf) { buf += 1; } } +} // namespace internal -bool OperatorRegistry::hasOpsFn( +bool registry_has_op_function( const char* name, - ArrayRef meta_list) { - char buf[KernelKey::MAX_SIZE] = {0}; - make_kernel_key_string(meta_list, buf); - KernelKey kernel_key = KernelKey(buf); - - for (size_t idx = 0; idx < this->num_kernels_; idx++) { - if (strcmp(this->kernels_[idx].name_, name) == 0) { - if (this->kernels_[idx].kernel_key_.is_fallback() || - this->kernels_[idx].kernel_key_ == kernel_key) { - return true; - } - } - } - - return false; + Span meta_list) { + return get_op_function_from_registry(name, meta_list).ok(); } -const OpFunction& getOpsFn(const char* name, ArrayRef kernel_key) { - return getOperatorRegistry().getOpsFn(name, kernel_key); -} - -const OpFunction& OperatorRegistry::getOpsFn( +Result get_op_function_from_registry( const char* name, - ArrayRef meta_list) { + Span meta_list) { + // @lint-ignore CLANGTIDY facebook-hte-CArray char buf[KernelKey::MAX_SIZE] = {0}; - make_kernel_key_string(meta_list, buf); + internal::make_kernel_key_string(meta_list, buf); KernelKey kernel_key = KernelKey(buf); int32_t fallback_idx = -1; - for (size_t idx = 0; idx < this->num_kernels_; idx++) { - if (strcmp(this->kernels_[idx].name_, name) == 0) { - if (this->kernels_[idx].kernel_key_ == kernel_key) { - return this->kernels_[idx].op_; + for (size_t idx = 0; idx < num_registered_kernels; idx++) { + if (strcmp(registered_kernels[idx].name_, name) == 0) { + if (registered_kernels[idx].kernel_key_ == kernel_key) { + return registered_kernels[idx].op_; } - if (this->kernels_[idx].kernel_key_.is_fallback()) { + if (registered_kernels[idx].kernel_key_.is_fallback()) { fallback_idx = idx; } } } if (fallback_idx != -1) { - return 
this->kernels_[fallback_idx].op_; + return registered_kernels[fallback_idx].op_; } - ET_CHECK_MSG(false, "kernel '%s' not found.", name); + ET_LOG(Error, "kernel '%s' not found.", name); ET_LOG_TENSOR_META(meta_list); + return Error::OperatorMissing; } -ArrayRef get_kernels() { - return getOperatorRegistry().get_kernels(); -} - -ArrayRef OperatorRegistry::get_kernels() { - return ArrayRef(this->kernels_, this->num_kernels_); +Span get_registered_kernels() { + return {registered_kernels, num_registered_kernels}; } } // namespace runtime diff --git a/runtime/kernel/operator_registry.h b/runtime/kernel/operator_registry.h index f1be83306f8..4b71f436d41 100644 --- a/runtime/kernel/operator_registry.h +++ b/runtime/kernel/operator_registry.h @@ -14,8 +14,11 @@ #include #include #include +#include +#include #include #include + // Debug switch for operator registry #if defined(ET_OP_REGISTRY_DEBUG) #include @@ -48,12 +51,10 @@ using OpFunction = void (*)(KernelRuntimeContext&, EValue**); */ struct TensorMeta { exec_aten::ScalarType dtype_; - ArrayRef dim_order_; + Span dim_order_; TensorMeta() = default; - TensorMeta( - exec_aten::ScalarType dtype, - ArrayRef order) + TensorMeta(exec_aten::ScalarType dtype, Span order) : dtype_(dtype), dim_order_(order) {} bool operator==(const TensorMeta& other) const { @@ -190,73 +191,49 @@ struct Kernel { Kernel() {} }; -// Maximum number of operators and their associated kernels that can be -// registered. -constexpr uint32_t kOperatorTableMaxSize = 250; -constexpr uint32_t kMaxNumOfKernelPerOp = 8; -#ifdef MAX_KERNEL_NUM -constexpr uint32_t kMaxNumOfKernels = MAX_KERNEL_NUM; -#else -constexpr uint32_t kMaxNumOfKernels = - kOperatorTableMaxSize * kMaxNumOfKernelPerOp; -#endif +namespace internal { +void make_kernel_key_string(Span key, char* buf); +} // namespace internal + /** - * See OperatorRegistry::hasOpsFn() + * Checks whether an operator exists with a given name and TensorMeta list. When + * TensorMeta is empty, it means this op does not have specialized kernels, so + * it checks whether it has any fallback kernels. */ -bool hasOpsFn(const char* name, ArrayRef meta_list = {}); +bool registry_has_op_function( + const char* name, + Span meta_list = {}); /** - * See OperatorRegistry::getOpsFn() + * Returns the operator with a given name and TensorMeta list, if present. */ -const OpFunction& getOpsFn( +::executorch::runtime::Result get_op_function_from_registry( const char* name, - ArrayRef meta_list = {}); + Span meta_list = {}); /** - * See OperatorRegistry::get_kernels() + * Returns all registered kernels. */ -ArrayRef get_kernels(); +Span get_registered_kernels(); /** - * See OperatorRegistry::register_kernels(). Notice that the returned Error - * object should be handled internally and the reason for keep returning is to - * satisfy the requirement to run this in static initialization time. + * Registers the provided kernels. + * + * @param[in] kernels Kernel objects to register. + * @retval Error::Ok always. Panics on error. This function needs to return a + * non-void type to run at static initialization time. */ -ET_NODISCARD Error register_kernels(const ArrayRef&); - -struct OperatorRegistry { - public: - OperatorRegistry() : num_kernels_(0) {} - - /** - * Registers the Kernels object (i.e. string name and function reference - * pair). The kernels will be merged into Operators based on the op name. - * - * @param[in] kernels Kernel object - * @retval Error code representing whether registration was successful. 
- */ - ET_NODISCARD Error register_kernels(const ArrayRef&); - - /** - * Checks whether an operator with a given name and TensorMeta list. - * When TensorMeta is empty, it means this op does not have specialized - * kernels, so it checks whether it has any fallback kernels. - */ - bool hasOpsFn(const char* name, ArrayRef meta_list); +ET_NODISCARD Error register_kernels(const Span); - /** - * Get the operator with a given name and TensorMeta list - */ - const OpFunction& getOpsFn(const char* name, ArrayRef meta_list); - - /** - * Return all registered operators. - */ - ArrayRef get_kernels(); - - private: - Kernel kernels_[kMaxNumOfKernels]; - uint32_t num_kernels_; +/** + * Registers a single kernel. + * + * @param[in] kernel Kernel object to register. + * @retval Error::Ok always. Panics on error. This function needs to return a + * non-void type to run at static initialization time. + */ +ET_NODISCARD inline Error register_kernel(const Kernel& kernel) { + return register_kernels({&kernel, 1}); }; } // namespace runtime @@ -266,16 +243,32 @@ namespace torch { namespace executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. -using ::executorch::runtime::get_kernels; -using ::executorch::runtime::getOpsFn; -using ::executorch::runtime::hasOpsFn; using ::executorch::runtime::Kernel; using ::executorch::runtime::KernelKey; using ::executorch::runtime::KernelRuntimeContext; -using ::executorch::runtime::OperatorRegistry; using ::executorch::runtime::OpFunction; -using ::executorch::runtime::register_kernels; using ::executorch::runtime::TensorMeta; using RuntimeContext = ::executorch::runtime::KernelRuntimeContext; + +inline ::executorch::runtime::Error register_kernels(ArrayRef kernels) { + return ::executorch::runtime::register_kernels( + {kernels.data(), kernels.size()}); +} +inline OpFunction getOpsFn( + const char* name, + ArrayRef meta_list = {}) { + auto result = ::executorch::runtime::get_op_function_from_registry( + name, {meta_list.data(), meta_list.size()}); + ET_CHECK(result.ok()); // get_op_function_from_registry() logs details. 
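These inline shims keep existing call sites compiling while the crash-on-missing behavior moves from the registry into the caller's hands. Side by side, the old and new spellings compare roughly as in this sketch (op name taken from the tests in this patch):

    // Old spelling, still valid through the shim being defined here:
    OpFunction f_old = torch::executor::getOpsFn("aten::add.out");
    // New spelling, with explicit error handling instead of a hard ET_CHECK:
    Result<OpFunction> f_new =
        executorch::runtime::get_op_function_from_registry("aten::add.out");

After the check, the shim simply unwraps the Result, as the return on the next line shows.
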
+ return *result; +} +inline bool hasOpsFn(const char* name, ArrayRef meta_list = {}) { + return ::executorch::runtime::registry_has_op_function( + name, {meta_list.data(), meta_list.size()}); +} +inline ArrayRef get_kernels() { + Span kernels = ::executorch::runtime::get_registered_kernels(); + return ArrayRef(kernels.data(), kernels.size()); +} } // namespace executor } // namespace torch diff --git a/runtime/kernel/test/kernel_double_registration_test.cpp b/runtime/kernel/test/kernel_double_registration_test.cpp index bef3b46f46b..1739dffd31b 100644 --- a/runtime/kernel/test/kernel_double_registration_test.cpp +++ b/runtime/kernel/test/kernel_double_registration_test.cpp @@ -20,6 +20,7 @@ using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::register_kernels; class KernelDoubleRegistrationTest : public ::testing::Test { public: @@ -33,10 +34,9 @@ TEST_F(KernelDoubleRegistrationTest, Basic) { "aten::add.out", "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); Error err = Error::InvalidArgument; ET_EXPECT_DEATH( - { auto res = register_kernels(kernels_array); }, + { (void)register_kernels({kernels}); }, std::to_string(static_cast(err))); } diff --git a/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp b/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp index 16520358c75..6f6fe4b9e1b 100644 --- a/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp +++ b/runtime/kernel/test/operator_registry_max_kernel_num_test.cpp @@ -19,9 +19,10 @@ using namespace ::testing; using executorch::runtime::ArrayRef; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::hasOpsFn; using executorch::runtime::Kernel; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::register_kernels; +using executorch::runtime::registry_has_op_function; class OperatorRegistryMaxKernelNumTest : public ::testing::Test { public: @@ -33,11 +34,10 @@ class OperatorRegistryMaxKernelNumTest : public ::testing::Test { // Register one kernel when max_kernel_num=1; success TEST_F(OperatorRegistryMaxKernelNumTest, RegisterOneOp) { Kernel kernels[] = {Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - auto s1 = register_kernels(kernels_array); + auto s1 = register_kernels({kernels}); EXPECT_EQ(s1, Error::Ok); - EXPECT_FALSE(hasOpsFn("fpp")); - EXPECT_TRUE(hasOpsFn("foo")); + EXPECT_FALSE(registry_has_op_function("fpp")); + EXPECT_TRUE(registry_has_op_function("foo")); } // Register two kernels when max_kernel_num=1; fail @@ -45,8 +45,7 @@ TEST_F(OperatorRegistryMaxKernelNumTest, RegisterTwoOpsFail) { Kernel kernels[] = { Kernel("foo1", [](KernelRuntimeContext&, EValue**) {}), Kernel("foo2", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); ET_EXPECT_DEATH( - { (void)register_kernels(kernels_array); }, + { (void)register_kernels({kernels}); }, "The total number of kernels to be registered is larger than the limit 1"); } diff --git a/runtime/kernel/test/operator_registry_test.cpp b/runtime/kernel/test/operator_registry_test.cpp index 60cd5723cd0..57439a2bd0f 100644 --- a/runtime/kernel/test/operator_registry_test.cpp +++ b/runtime/kernel/test/operator_registry_test.cpp @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ 
-20,15 +22,17 @@ using namespace ::testing; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; -using executorch::runtime::ArrayRef; using executorch::runtime::Error; using executorch::runtime::EValue; -using executorch::runtime::hasOpsFn; +using executorch::runtime::get_op_function_from_registry; using executorch::runtime::Kernel; using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::OpFunction; using executorch::runtime::register_kernels; +using executorch::runtime::registry_has_op_function; +using executorch::runtime::Result; +using executorch::runtime::Span; using executorch::runtime::TensorMeta; using executorch::runtime::testing::make_kernel_key; @@ -41,18 +45,18 @@ class OperatorRegistryTest : public ::testing::Test { TEST_F(OperatorRegistryTest, Basic) { Kernel kernels[] = {Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - auto s1 = register_kernels(kernels_array); - EXPECT_FALSE(hasOpsFn("fpp")); - EXPECT_TRUE(hasOpsFn("foo")); + Span kernels_span(kernels); + (void)register_kernels(kernels_span); + EXPECT_FALSE(registry_has_op_function("fpp")); + EXPECT_TRUE(registry_has_op_function("foo")); } TEST_F(OperatorRegistryTest, RegisterOpsMoreThanOnceDie) { Kernel kernels[] = { Kernel("foo", [](KernelRuntimeContext&, EValue**) {}), Kernel("foo", [](KernelRuntimeContext&, EValue**) {})}; - ArrayRef kernels_array = ArrayRef(kernels); - ET_EXPECT_DEATH({ auto res = register_kernels(kernels_array); }, ""); + Span kernels_span = Span(kernels); + ET_EXPECT_DEATH({ (void)register_kernels(kernels_span); }, ""); } constexpr int BUF_SIZE = KernelKey::MAX_SIZE; @@ -91,24 +95,31 @@ TEST_F(OperatorRegistryTest, RegisterKernels) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key = ArrayRef(meta, 1); - EXPECT_TRUE(hasOpsFn("test::boo", user_kernel_key)); + Span user_kernel_key(meta); + // no fallback kernel is registered - EXPECT_FALSE(hasOpsFn("test::boo", {})); - OpFunction func = getOpsFn("test::boo", user_kernel_key); + EXPECT_FALSE(registry_has_op_function("test::boo", {})); + Result fallback_func = + get_op_function_from_registry("test::boo", {}); + EXPECT_NE(fallback_func.error(), Error::Ok); + + EXPECT_TRUE(registry_has_op_function("test::boo", user_kernel_key)); + Result func = + get_op_function_from_registry("test::boo", user_kernel_key); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); @@ -136,18 +147,18 @@ TEST_F(OperatorRegistryTest, RegisterTwoKernels) { auto s1 = register_kernels(kernels); // has both kernels Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); + Span user_kernel_key_1(meta); TensorMeta meta_2[] = {TensorMeta(ScalarType::Float, dim_order_type)}; - ArrayRef user_kernel_key_2 = ArrayRef(meta_2, 1); - - 
EXPECT_TRUE(hasOpsFn("test::bar", user_kernel_key_1)); - EXPECT_TRUE(hasOpsFn("test::bar", user_kernel_key_2)); + Span user_kernel_key_2(meta_2); // no fallback kernel is registered - EXPECT_FALSE(hasOpsFn("test::bar", {})); + EXPECT_FALSE(registry_has_op_function("test::bar", {})); + Result fallback_func = + get_op_function_from_registry("test::bar", {}); + EXPECT_NE(fallback_func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); @@ -156,16 +167,22 @@ TEST_F(OperatorRegistryTest, RegisterTwoKernels) { KernelRuntimeContext context{}; // test kernel_1 - OpFunction func_1 = getOpsFn("test::bar", user_kernel_key_1); - func_1(context, evalues); + EXPECT_TRUE(registry_has_op_function("test::bar", user_kernel_key_1)); + Result func_1 = + get_op_function_from_registry("test::bar", user_kernel_key_1); + EXPECT_EQ(func_1.error(), Error::Ok); + (*func_1)(context, evalues); auto val_1 = values[0].toScalar().to(); ASSERT_EQ(val_1, 100); // test kernel_2 + EXPECT_TRUE(registry_has_op_function("test::bar", user_kernel_key_2)); + Result func_2 = + get_op_function_from_registry("test::bar", user_kernel_key_2); + EXPECT_EQ(func_2.error(), Error::Ok); values[0] = Scalar(0); - OpFunction func_2 = getOpsFn("test::bar", user_kernel_key_2); - func_2(context, evalues); + (*func_2)(context, evalues); auto val_2 = values[0].toScalar().to(); ASSERT_EQ(val_2, 50); @@ -202,27 +219,26 @@ TEST_F(OperatorRegistryTest, ExecutorChecksKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); - EXPECT_TRUE(hasOpsFn("test::qux", user_kernel_key_1)); + Span user_kernel_key_1(meta); + EXPECT_TRUE(registry_has_op_function("test::qux", user_kernel_key_1)); Tensor::DimOrderType dims_channel_first[] = {0, 3, 1, 2}; auto dim_order_type_channel_first = - ArrayRef(dims_channel_first, 4); + Span(dims_channel_first, 4); TensorMeta meta_channel_first[] = { TensorMeta(ScalarType::Long, dim_order_type_channel_first)}; - ArrayRef user_kernel_key_2 = - ArrayRef(meta_channel_first, 1); - EXPECT_FALSE(hasOpsFn("test::qux", user_kernel_key_2)); + Span user_kernel_key_2(meta_channel_first); + EXPECT_FALSE(registry_has_op_function("test::qux", user_kernel_key_2)); TensorMeta meta_float[] = {TensorMeta(ScalarType::Float, dim_order_type)}; - ArrayRef user_kernel_key_3 = ArrayRef(meta_float, 1); - EXPECT_FALSE(hasOpsFn("test::qux", ArrayRef(user_kernel_key_3))); + Span user_kernel_key_3(meta_float); + EXPECT_FALSE(registry_has_op_function("test::qux", user_kernel_key_3)); } TEST_F(OperatorRegistryTest, ExecutorUsesKernel) { @@ -235,23 +251,25 @@ TEST_F(OperatorRegistryTest, ExecutorUsesKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); Tensor::DimOrderType dims[] = {0, 1, 2, 3}; - auto dim_order_type = ArrayRef(dims, 4); + auto dim_order_type = Span(dims, 4); TensorMeta meta[] = {TensorMeta(ScalarType::Long, dim_order_type)}; - ArrayRef user_kernel_key_1 = ArrayRef(meta, 1); - EXPECT_TRUE(hasOpsFn("test::quux", ArrayRef(meta))); + Span user_kernel_key_1(meta); - OpFunction func = getOpsFn("test::quux", ArrayRef(meta)); + EXPECT_TRUE(registry_has_op_function("test::quux", 
user_kernel_key_1)); + Result func = + get_op_function_from_registry("test::quux", user_kernel_key_1); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); @@ -265,20 +283,21 @@ TEST_F(OperatorRegistryTest, ExecutorUsesFallbackKernel) { (void)context; *(stack[0]) = Scalar(100); }); - auto s1 = register_kernels({kernel_1}); + auto s1 = register_kernels({&kernel_1, 1}); EXPECT_EQ(s1, Error::Ok); - EXPECT_TRUE(hasOpsFn("test::corge")); - EXPECT_TRUE(hasOpsFn("test::corge", ArrayRef())); + EXPECT_TRUE(registry_has_op_function("test::corge")); + EXPECT_TRUE(registry_has_op_function("test::corge", {})); - OpFunction func = getOpsFn("test::corge", ArrayRef()); + Result func = get_op_function_from_registry("test::corge", {}); + EXPECT_EQ(func.error(), Error::Ok); EValue values[1]; values[0] = Scalar(0); EValue* kernels[1]; kernels[0] = &values[0]; KernelRuntimeContext context{}; - func(context, kernels); + (*func)(context, kernels); auto val = values[0].toScalar().to(); ASSERT_EQ(val, 100); diff --git a/runtime/kernel/test/test_kernel_manual_registration.cpp b/runtime/kernel/test/test_kernel_manual_registration.cpp index c150b61ad73..de8853c7813 100644 --- a/runtime/kernel/test/test_kernel_manual_registration.cpp +++ b/runtime/kernel/test/test_kernel_manual_registration.cpp @@ -15,7 +15,7 @@ using namespace ::testing; using executorch::runtime::Error; -using executorch::runtime::hasOpsFn; +using executorch::runtime::registry_has_op_function; class KernelManualRegistrationTest : public ::testing::Test { public: @@ -26,15 +26,15 @@ class KernelManualRegistrationTest : public ::testing::Test { TEST_F(KernelManualRegistrationTest, ManualRegister) { // Before registering, we can't find the add operator. - EXPECT_FALSE(hasOpsFn("aten::add.out")); + EXPECT_FALSE(registry_has_op_function("aten::add.out")); // Call the generated registration function. Error result = torch::executor::register_all_kernels(); EXPECT_EQ(result, Error::Ok); // We can now find the registered add operator. - EXPECT_TRUE(hasOpsFn("aten::add.out")); + EXPECT_TRUE(registry_has_op_function("aten::add.out")); // We can't find a random other operator. - EXPECT_FALSE(hasOpsFn("fpp")); + EXPECT_FALSE(registry_has_op_function("fpp")); } diff --git a/runtime/kernel/test/test_util.h b/runtime/kernel/test/test_util.h index 23993fd39d6..0c6c651af32 100644 --- a/runtime/kernel/test/test_util.h +++ b/runtime/kernel/test/test_util.h @@ -16,9 +16,6 @@ namespace executorch { namespace runtime { -// Defined in //executorch/runtime/kernel/operator_registry.cpp. 
-void make_kernel_key_string(ArrayRef key, char* buf); - namespace testing { inline void make_kernel_key( @@ -28,12 +25,11 @@ inline void make_kernel_key( char* buf) { std::vector meta; for (auto& t : tensors) { - ArrayRef dim_order( - t.second.data(), t.second.size()); + Span dim_order(t.second.data(), t.second.size()); meta.emplace_back(t.first, dim_order); } - auto meatadata = ArrayRef(meta.data(), meta.size()); - make_kernel_key_string(meatadata, buf); + Span metadata(meta.data(), meta.size()); + internal::make_kernel_key_string(metadata, buf); } } // namespace testing From 6b1e3287a0bfb6671d1a5515fa5f328e18d45152 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 9 Sep 2024 12:32:53 -0700 Subject: [PATCH 266/531] [ExecuTorch] Support BFloat16 in CPUBlas gemm Differential Revision: D62151658 Pull Request resolved: https://github.com/pytorch/executorch/pull/5122 --- kernels/optimized/blas/CPUBlas.cpp | 23 +++++++++++++++++++++++ kernels/optimized/blas/CPUBlas.h | 10 ++++++++++ kernels/optimized/test/libblas_test.cpp | 4 +++- 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/kernels/optimized/blas/CPUBlas.cpp b/kernels/optimized/blas/CPUBlas.cpp index 35b208d30fc..99003f8f0ea 100644 --- a/kernels/optimized/blas/CPUBlas.cpp +++ b/kernels/optimized/blas/CPUBlas.cpp @@ -173,5 +173,28 @@ void gemm( } // clang-format on +// clang-format off +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const BFloat16 alpha, + const BFloat16 *a, int64_t lda, + const BFloat16 *b, int64_t ldb, + const BFloat16 beta, + BFloat16 *c, int64_t ldc) { + normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); + + using acc_type = utils::compute_dtype; + gemm_impl( + transa, transb, + m, n, k, + static_cast(alpha), + a, lda, + b, ldb, + static_cast(beta), + c, ldc); +} +// clang-format on + } // namespace cpublas } // namespace executorch diff --git a/kernels/optimized/blas/CPUBlas.h b/kernels/optimized/blas/CPUBlas.h index dd4a24cbce0..71e50601238 100644 --- a/kernels/optimized/blas/CPUBlas.h +++ b/kernels/optimized/blas/CPUBlas.h @@ -17,6 +17,7 @@ namespace executorch { namespace cpublas { +using BFloat16 = torch::executor::BFloat16; using Half = torch::executor::Half; enum class TransposeType { @@ -104,6 +105,15 @@ void gemm( const Half *b, int64_t ldb, const Half beta, Half *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const BFloat16 alpha, + const BFloat16 *a, int64_t lda, + const BFloat16 *b, int64_t ldb, + const BFloat16 beta, + BFloat16 *c, int64_t ldc); // clang-format on // clang-format off diff --git a/kernels/optimized/test/libblas_test.cpp b/kernels/optimized/test/libblas_test.cpp index 8f30a357e1a..24aeaba776a 100644 --- a/kernels/optimized/test/libblas_test.cpp +++ b/kernels/optimized/test/libblas_test.cpp @@ -9,6 +9,7 @@ #include #include +#include #include @@ -17,7 +18,8 @@ _(); \ _(); \ _(); \ - _(); + _(); \ + _(); namespace { From eca9ed501c36a60ff54e0d0735a6f9261188adae Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Mon, 9 Sep 2024 14:13:11 -0700 Subject: [PATCH 267/531] q to s start ops | add dim order sanity check Differential Revision: D59984028 Pull Request resolved: https://github.com/pytorch/executorch/pull/4332 --- kernels/portable/cpu/op_reflection_pad1d.cpp | 5 +++++ kernels/portable/cpu/op_reflection_pad2d.cpp | 5 +++++ kernels/portable/cpu/op_reflection_pad3d.cpp | 5 +++++ kernels/portable/cpu/op_relu.cpp | 3 +++ 
kernels/portable/cpu/op_remainder.cpp | 6 ++++++ kernels/portable/cpu/op_repeat.cpp | 5 +++++ kernels/portable/cpu/op_roll.cpp | 3 +++ kernels/portable/cpu/op_round.cpp | 3 +++ kernels/portable/cpu/op_rsub.cpp | 3 +++ kernels/portable/cpu/op_scatter_add.cpp | 9 +++++++++ kernels/portable/cpu/op_select_scatter.cpp | 3 +++ kernels/portable/cpu/op_sigmoid.cpp | 3 +++ kernels/portable/cpu/op_sign.cpp | 3 +++ kernels/portable/cpu/op_slice_copy.cpp | 3 +++ kernels/portable/cpu/op_slice_scatter.cpp | 3 +++ kernels/portable/cpu/op_softmax.cpp | 3 +++ kernels/portable/cpu/op_split_copy.cpp | 5 +++++ kernels/portable/cpu/op_split_with_sizes_copy.cpp | 5 +++++ kernels/portable/cpu/op_squeeze_copy.cpp | 10 ++++++++++ kernels/portable/cpu/op_stack.cpp | 10 ++++++++++ kernels/portable/cpu/op_sub.cpp | 6 ++++++ kernels/portable/cpu/op_sum.cpp | 5 +++++ kernels/portable/cpu/util/select_copy_util.cpp | 4 ++++ 23 files changed, 110 insertions(+) diff --git a/kernels/portable/cpu/op_reflection_pad1d.cpp b/kernels/portable/cpu/op_reflection_pad1d.cpp index 66a2333619f..53fbbc9c56a 100644 --- a/kernels/portable/cpu/op_reflection_pad1d.cpp +++ b/kernels/portable/cpu/op_reflection_pad1d.cpp @@ -28,6 +28,11 @@ Tensor& reflection_pad1d_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType target_sizes[kTensorDimensionLimit]; size_t target_ndim = 0; get_padding_out_target_size(1, in, padding, target_sizes, &target_ndim); diff --git a/kernels/portable/cpu/op_reflection_pad2d.cpp b/kernels/portable/cpu/op_reflection_pad2d.cpp index a16d92ff1ce..8de0baba43b 100644 --- a/kernels/portable/cpu/op_reflection_pad2d.cpp +++ b/kernels/portable/cpu/op_reflection_pad2d.cpp @@ -28,6 +28,11 @@ Tensor& reflection_pad2d_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType target_sizes[kTensorDimensionLimit]; size_t target_ndim = 0; get_padding_out_target_size(2, in, padding, target_sizes, &target_ndim); diff --git a/kernels/portable/cpu/op_reflection_pad3d.cpp b/kernels/portable/cpu/op_reflection_pad3d.cpp index 9629b9e4c4e..4ba78733046 100644 --- a/kernels/portable/cpu/op_reflection_pad3d.cpp +++ b/kernels/portable/cpu/op_reflection_pad3d.cpp @@ -28,6 +28,11 @@ Tensor& reflection_pad3d_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType target_sizes[kTensorDimensionLimit]; size_t target_ndim = 0; get_padding_out_target_size(3, in, padding, target_sizes, &target_ndim); diff --git a/kernels/portable/cpu/op_relu.cpp b/kernels/portable/cpu/op_relu.cpp index b9136cb3392..e59aec3ae64 100644 --- a/kernels/portable/cpu/op_relu.cpp +++ b/kernels/portable/cpu/op_relu.cpp @@ -35,6 +35,9 @@ Tensor& relu_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "relu.out", CTYPE, [&]() { apply_unary_map_fn( [](const CTYPE val_in) { diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index 7c858c1c08a..3a641829773 100644 --- 
a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -80,6 +80,9 @@ Tensor& remainder_Tensor_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); @@ -124,6 +127,9 @@ Tensor& remainder_Scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType common_type = utils::promote_type_with_scalar(a_type, b); diff --git a/kernels/portable/cpu/op_repeat.cpp b/kernels/portable/cpu/op_repeat.cpp index 644ebc98420..3b5596b2163 100644 --- a/kernels/portable/cpu/op_repeat.cpp +++ b/kernels/portable/cpu/op_repeat.cpp @@ -62,6 +62,11 @@ Tensor& repeat_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out); + // Resize for dynamic shape ET_KERNEL_CHECK_MSG( ctx, diff --git a/kernels/portable/cpu/op_roll.cpp b/kernels/portable/cpu/op_roll.cpp index 4eff081eec4..09c7667c812 100644 --- a/kernels/portable/cpu/op_roll.cpp +++ b/kernels/portable/cpu/op_roll.cpp @@ -60,6 +60,9 @@ Tensor& roll_out( ET_KERNEL_CHECK( ctx, check_roll_args(in, shifts, dims, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + if (in.numel() == 0) { return out; } diff --git a/kernels/portable/cpu/op_round.cpp b/kernels/portable/cpu/op_round.cpp index 0b28ba41887..33af6508be2 100644 --- a/kernels/portable/cpu/op_round.cpp +++ b/kernels/portable/cpu/op_round.cpp @@ -45,6 +45,9 @@ Tensor& round_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + auto in_scalar_type = in.scalar_type(); ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "round.out", CTYPE, [&] { diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 6a5ef598ef4..442221d6693 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -31,6 +31,9 @@ Tensor& rsub_scalar_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); diff --git a/kernels/portable/cpu/op_scatter_add.cpp b/kernels/portable/cpu/op_scatter_add.cpp index e10d87f9193..b4cf0d84f04 100644 --- a/kernels/portable/cpu/op_scatter_add.cpp +++ b/kernels/portable/cpu/op_scatter_add.cpp @@ -65,6 +65,15 @@ Tensor& scatter_add_out( InvalidArgument, out); + ET_KERNEL_CHECK( + context, + tensors_have_same_dim_order(self, src, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + context, tensor_is_default_dim_order(index), InvalidArgument, out); + if (dim < 0) { dim += nonzero_dim(self); } diff --git a/kernels/portable/cpu/op_select_scatter.cpp b/kernels/portable/cpu/op_select_scatter.cpp index 71e7d9dfefd..db3ef8b1d29 100644 --- a/kernels/portable/cpu/op_select_scatter.cpp +++ b/kernels/portable/cpu/op_select_scatter.cpp @@ -33,6 +33,9 @@ Tensor& 
select_scatter_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, src, out), InvalidArgument, out); + // Account for negative indices if (dim < 0) { dim += in.dim(); diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index b696c29518b..919d42a721a 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -24,6 +24,9 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, in.scalar_type() != ScalarType::Bool, InvalidArgument, out); ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + // Resize for dynamic shape ET_KERNEL_CHECK_MSG( ctx, diff --git a/kernels/portable/cpu/op_sign.cpp b/kernels/portable/cpu/op_sign.cpp index 6dc6f3d015e..1c18788404d 100644 --- a/kernels/portable/cpu/op_sign.cpp +++ b/kernels/portable/cpu/op_sign.cpp @@ -30,6 +30,9 @@ Tensor& sign_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); diff --git a/kernels/portable/cpu/op_slice_copy.cpp b/kernels/portable/cpu/op_slice_copy.cpp index 41a76567906..2b5c48737d6 100644 --- a/kernels/portable/cpu/op_slice_copy.cpp +++ b/kernels/portable/cpu/op_slice_copy.cpp @@ -33,6 +33,9 @@ Tensor& slice_copy_Tensor_out( dim += in.dim(); } + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + // If user do not set value to end_val, set end to in.size(dim) (largest // value available) int64_t end = end_val.has_value() ? end_val.value() : in.size(dim); diff --git a/kernels/portable/cpu/op_slice_scatter.cpp b/kernels/portable/cpu/op_slice_scatter.cpp index 47374716b4e..97f75553c1d 100644 --- a/kernels/portable/cpu/op_slice_scatter.cpp +++ b/kernels/portable/cpu/op_slice_scatter.cpp @@ -40,6 +40,9 @@ Tensor& slice_scatter_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(input, out), InvalidArgument, out); + if (input.numel() == 0) { return out; } diff --git a/kernels/portable/cpu/op_softmax.cpp b/kernels/portable/cpu/op_softmax.cpp index 9f1565ff161..544887bed62 100644 --- a/kernels/portable/cpu/op_softmax.cpp +++ b/kernels/portable/cpu/op_softmax.cpp @@ -36,6 +36,9 @@ Tensor& softmax_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + // Adjust for negative dim dim = dim < 0 ? 
dim + nonzero_dim(in) : dim; diff --git a/kernels/portable/cpu/op_split_copy.cpp b/kernels/portable/cpu/op_split_copy.cpp index a604e76b51c..1829b356ff2 100644 --- a/kernels/portable/cpu/op_split_copy.cpp +++ b/kernels/portable/cpu/op_split_copy.cpp @@ -46,6 +46,11 @@ void split_copy_Tensor_out( check_split_copy_args(input, split_size, dim, out), InvalidArgument, ); + for (size_t i = 0; i < out.size(); ++i) { + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(input, out[i]), InvalidArgument, ); + } + const size_t leading_dims = getLeadingDims(input, dim); const size_t trailing_dims = getTrailingDims(input, dim); const size_t step = input.size(dim) * trailing_dims; diff --git a/kernels/portable/cpu/op_split_with_sizes_copy.cpp b/kernels/portable/cpu/op_split_with_sizes_copy.cpp index 7d1b485e7a4..623394e8013 100644 --- a/kernels/portable/cpu/op_split_with_sizes_copy.cpp +++ b/kernels/portable/cpu/op_split_with_sizes_copy.cpp @@ -38,6 +38,11 @@ void split_with_sizes_copy_out( check_split_with_sizes_copy_args(in, split_sizes, dim, out), InvalidArgument, ); + for (size_t i = 0; i < out.size(); ++i) { + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out[i]), InvalidArgument, ); + } + // If out is empty, then nothing needs to be done after checking the args. // Valid args implies that in.size(dim) == 0 and split_sizes is also empty. if (out.size() == 0) { diff --git a/kernels/portable/cpu/op_squeeze_copy.cpp b/kernels/portable/cpu/op_squeeze_copy.cpp index 5be91ff827d..11489e31729 100644 --- a/kernels/portable/cpu/op_squeeze_copy.cpp +++ b/kernels/portable/cpu/op_squeeze_copy.cpp @@ -29,6 +29,11 @@ Tensor& squeeze_copy_dim_out( ET_KERNEL_CHECK( ctx, check_squeeze_copy_dim_args(in, dim, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + if (dim < 0) { dim += nonzero_dim(in); } @@ -62,6 +67,11 @@ Tensor& squeeze_copy_dims_out( ET_KERNEL_CHECK( ctx, check_squeeze_copy_dims_args(in, dims, out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_squeeze_copy_dims_out_target_size( diff --git a/kernels/portable/cpu/op_stack.cpp b/kernels/portable/cpu/op_stack.cpp index f241120ae2f..6859f2a8746 100644 --- a/kernels/portable/cpu/op_stack.cpp +++ b/kernels/portable/cpu/op_stack.cpp @@ -31,6 +31,16 @@ Tensor& stack_out( ET_KERNEL_CHECK( ctx, check_stack_args(tensors, dim, out), InvalidArgument, out); + for (size_t i = 0; i < tensors.size(); ++i) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(tensors[i], out), + InvalidArgument, + out); + } + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out); + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index 04254653a43..b97b7b490f3 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -78,6 +78,9 @@ Tensor& sub_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); ScalarType 
a_type = a.scalar_type(); @@ -131,6 +134,9 @@ Tensor& sub_scalar_out( ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); ScalarType alpha_type = utils::get_scalar_dtype(alpha); diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp index dfa897206a9..c9a4260344e 100644 --- a/kernels/portable/cpu/op_sum.cpp +++ b/kernels/portable/cpu/op_sum.cpp @@ -38,6 +38,11 @@ Tensor& sum_dim_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_SWITCH_REAL_TYPES_AND( Bool, in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] { ET_SWITCH_REAL_TYPES_AND( diff --git a/kernels/portable/cpu/util/select_copy_util.cpp b/kernels/portable/cpu/util/select_copy_util.cpp index cf56b3e4ca2..2564317b043 100644 --- a/kernels/portable/cpu/util/select_copy_util.cpp +++ b/kernels/portable/cpu/util/select_copy_util.cpp @@ -38,6 +38,10 @@ Error select_copy_util( return Error::InvalidArgument; } + if (!tensors_have_same_dim_order(in, out)) { + return Error::InvalidArgument; + } + // If the input is a empty tensor, no other operation could be done. We just // return the output. if (in.numel() == 0) { From 85410e4010ee3940c5ae95931e4869d3ae502d00 Mon Sep 17 00:00:00 2001 From: shewu-quic <138087975+shewu-quic@users.noreply.github.com> Date: Tue, 10 Sep 2024 06:11:03 +0800 Subject: [PATCH 268/531] Qualcomm AI Engine Direct - Optimization and fix mutable buffer issue (#5072) * Qualcomm AI Engine Direct - Optimization and fix mutable buffer issue Summary: - Add a pass to convert linear to conv2d: We found the accuracy drop because of QNN Linear op in llama3. And it will be fixed with convert linear to conv2d pass. - Workaround the issue about mutable buffer for index_put op: We add a pass to replace the input of index_put op. Under the workaround, it will result in performance regression. 
- Insert copy op for int64 inputs to convert int64 to int32 in i64toi32 pass - Support QNN RMS Norm and use native rms norm in llama_transformer - Add a pass to compose rms norm * Use transform to replace rms_norm * temporarily remove test-llama-runner-qnn-linux --------- Co-authored-by: Sheng Feng Wu --- .github/workflows/pull.yml | 35 ----- backends/qualcomm/builders/__init__.py | 2 + backends/qualcomm/builders/node_visitor.py | 2 +- backends/qualcomm/builders/op_conv2d.py | 86 ++++-------- backends/qualcomm/builders/op_rms_norm.py | 127 ++++++++++++++++++ backends/qualcomm/builders/qnn_constants.py | 7 + .../passes/annotate_and_quant_scalar.py | 1 + backends/qualcomm/passes/i64_to_i32.py | 24 ++++ .../qualcomm/passes/recompose_rms_norm.py | 76 +++++++++++ .../passes/replace_index_put_input.py | 54 ++++++++ .../qualcomm/quantizer/custom_annotation.py | 10 +- backends/qualcomm/quantizer/utils.py | 25 ++++ backends/qualcomm/tests/models.py | 10 ++ backends/qualcomm/tests/test_qnn_delegate.py | 13 ++ backends/qualcomm/utils/utils.py | 7 + examples/models/llama2/TARGETS | 1 + examples/models/llama2/export_llama_lib.py | 18 ++- examples/models/llama2/llama_transformer.py | 1 + .../llama2/source_transformation/rms_norm.py | 23 ++++ .../llama2/source_transformation/sdpa.py | 5 +- extension/llm/export/partitioner_lib.py | 4 +- extension/llm/export/quantizer_lib.py | 9 +- 22 files changed, 431 insertions(+), 109 deletions(-) create mode 100644 backends/qualcomm/builders/op_rms_norm.py create mode 100644 backends/qualcomm/passes/recompose_rms_norm.py create mode 100644 backends/qualcomm/passes/replace_index_put_input.py create mode 100644 examples/models/llama2/source_transformation/rms_norm.py diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ca13d9bbd22..259ebb19863 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -372,38 +372,3 @@ jobs: # Run pytest with coverage pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test - - - test-llama-runner-qnn-linux: - name: test-llama-runner-qnn-linux - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - dtype: [fp32] - build-tool: [cmake] - mode: [qnn] - fail-fast: false - with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12-android - submodules: 'true' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 900 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - DTYPE=${{ matrix.dtype }} - BUILD_TOOL=${{ matrix.build-tool }} - MODE=${{ matrix.mode }} - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - - # Setup executorch - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 - # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh - # Test llama2 - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index d3bf98bae72..79c02e22072 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -38,6 +38,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, @@ -92,6 +93,7 @@ 
op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index e07a745df5f..514bc6efd78 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -202,7 +202,7 @@ def get_quant_tensor_value( dtype = quant_configs[QCOM_DTYPE] - tensor = tensor.div(scale + 1e-6).add(zero_point).round().to(dtype) + tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly if quant_configs.get(QCOM_BITWIDTH) == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index 909cc6a21f6..4b58edbac63 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -10,16 +10,7 @@ import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_DATA, - QCOM_DTYPE, - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.qualcomm.utils.constants import QCOM_DATA from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import ( @@ -94,52 +85,6 @@ def _add_conv_op_parameter( return conv_op - def _get_bias_tensor( - self, - node: torch.fx.Node, - nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], - num_output_channel: int, - ) -> PyQnnWrapper.PyQnnOpWrapper: - # build dummy node if bias is not given - bias_node = ( - node.args[2] - if node.args[2] is not None - else torch.fx.Node( - node.graph, - node.name + "_runtime_bias", - "call_function", - exir_ops.edge.aten.full.default, - (), # args - {}, # kwargs - ) - ) - # zeros tensor to meet HTP constraint if bias is not given - bias_tensor = ( - get_parameter(bias_node, self.edge_program) - if node.args[2] is not None - else torch.zeros(num_output_channel) - ) - # insert quant attribute to meet HTP constraint if bias is not given - if ( - node.args[2] is None - and (bias_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS)) is not None - ): - quant_attrs = bias_quant_attrs.copy() - quant_attrs[QCOM_ZERO_POINT] = 0 - quant_attrs[QCOM_SCALE] = 0 - quant_attrs[QCOM_DTYPE] = torch.int32 - quant_attrs[QCOM_QUANT_MAX] = torch.iinfo(torch.int32).max - quant_attrs[QCOM_QUANT_MIN] = torch.iinfo(torch.int32).min + 1 - bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - - return self.define_tensor( - bias_node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - is_input_tensor=False, - ) - def _define_conv1d( self, node: torch.fx.Node, @@ -204,9 +149,17 @@ def _define_conv1d( is_input_tensor=False, ) conv_input_tensors = [unsqueeze_output_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) stride = [1] + cast(List[int], node.args[3]) padding = [0] + cast(List[int], node.args[4]) @@ -312,9 +265,18 @@ def define_node( is_input_tensor=False, ) conv_input_tensors = 
[input_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py new file mode 100644 index 00000000000..e99b1f47ba1 --- /dev/null +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -0,0 +1,127 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper +import numpy as np + +import torch +from executorch.backends.qualcomm.builders.utils import get_parameter +from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpRmsNorm, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class RmsNormVisitor(NodeVisitor): + target = ["aten.rms_norm.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + # args of node : ['input', 'normalized_shape', 'weight', 'eps'] + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + + # should be a immutable list + normalized_shapes = node.args[1] + if ( + len(normalized_shapes) != 1 + and normalized_shapes[0] != input_tensor.shape[-1] + ): + print("Only supports normalization with last input dimension") + return + axes = [node.args[0].meta["val"].dim() - 1] + axes_shape = [len(axes)] + + weight_node = node.args[2] + weight_tensor = get_parameter(weight_node, self.edge_program) + weight_tensor_wrapper = self.define_tensor( + weight_node, + weight_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + # Fake node, nn moudle seems to be inconsistant with document + bias_tensor = torch.zeros(weight_tensor.shape) + bias_node = torch.fx.Node( + node.graph, + node.name + "_runtime_bias", + "call_function", + exir_ops.edge.aten.tensor.default, + (), # args + {}, # kwargs + ) + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + epsilon = node.args[3] + if isinstance(epsilon, torch.fx.Node): + epsilon = get_parameter(epsilon, self.edge_program) + epsilon = ( + epsilon + if isinstance(epsilon, float) + else torch.finfo(epsilon.dtype).eps + ) + + output_tensor = 
self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + + rms_nrom_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpRmsNorm.op_name, + ) + + rms_nrom_op.AddInputTensors( + [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper] + ) + rms_nrom_op.AddOutputTensors([output_tensor_wrapper]) + rms_nrom_op.AddScalarParam( + OpRmsNorm.param_epsilon, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, + {QCOM_DATA: np.float32(epsilon)}, + ) + rms_nrom_op.AddTensorParam( + OpRmsNorm.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(axes_shape), + axes_shape, + np.array(axes, dtype=np.uint32), + True, + ) + + return rms_nrom_op diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 4a87e5dbbb3..8ac702f2ad5 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -278,6 +278,13 @@ class OpResizeNearestNeighbor: param_half_pixel_centers: str = "half_pixel_centers" +@dataclass(init=False, frozen=True) +class OpRmsNorm: + op_name: str = "RmsNorm" + param_epsilon: str = "epsilon" + param_axes: str = "axes" + + @dataclass(init=False, frozen=True) class OpScatterNd: op_name: str = "ScatterNd" diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py index 5f111ee9c8b..1ec2ac64b5a 100644 --- a/backends/qualcomm/passes/annotate_and_quant_scalar.py +++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py @@ -78,6 +78,7 @@ def _annotate_scalar_node( float, torch.float32, torch.int32, + torch.int64, ]: return diff --git a/backends/qualcomm/passes/i64_to_i32.py b/backends/qualcomm/passes/i64_to_i32.py index 7814a3ff0d6..1d2171cc37a 100644 --- a/backends/qualcomm/passes/i64_to_i32.py +++ b/backends/qualcomm/passes/i64_to_i32.py @@ -5,7 +5,9 @@ # LICENSE file in the root directory of this source tree. 
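As an aside on what the new RmsNorm builder and recompose pass are lowering: aten.rms_norm normalizes by the root-mean-square over the last (normalized) dimension, y = x / sqrt(mean(x^2) + eps) * weight, with no mean subtraction and no bias, which is why the visitor above feeds the QNN op a zero bias tensor. A minimal sanity check against torch.nn.RMSNorm, assuming a PyTorch build that ships the module (the same module the new tests and source transformation rely on); illustrative only, not part of this diff:

import torch

x = torch.randn(2, 3, 4)
rms = torch.nn.RMSNorm([4], eps=1e-5)  # weight is initialized to ones
manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5) * rms.weight
assert torch.allclose(rms(x), manual, atol=1e-6)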
import torch from executorch.backends.qualcomm.builders.utils import get_parameter, is_constant +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult +from torch._subclasses.fake_tensor import FakeTensor class I64toI32(ExportPass): @@ -16,6 +18,8 @@ class I64toI32(ExportPass): def __init__(self, edge_program: torch.export.ExportedProgram): super(I64toI32, self).__init__() self.edge_program = edge_program + # pyre-ignore[4] + self.copy_op = exir_ops.edge.aten._to_copy.default def _update_meta(self, node: torch.fx.node) -> None: meta_val = node.meta["val"] @@ -32,6 +36,10 @@ def _update_meta(self, node: torch.fx.node) -> None: if meta_val.dtype == torch.int64: node.meta["val"] = meta_val.to(torch.float) + # pyre-ignore[2] + def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool: + return isinstance(node_val, FakeTensor) and node_val.dtype == dtype + def _cast_to_int32(self, graph_module: torch.fx.GraphModule): for n in graph_module.graph.nodes: if is_constant(n, self.edge_program): @@ -39,6 +47,22 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule): if param.dtype == torch.int64: # QNN does not support int64 self._update_meta(n) + elif n.op == "placeholder": + node_val = n.meta["val"] + if self._is_tensor_of_dtype(node_val, torch.int64): + with graph_module.graph.inserting_after(n): + args = (n,) + to_dst_node = graph_module.graph.create_node( + "call_function", + self.copy_op, + args, + {"dtype": torch.int32}, + ) + to_dst_node.meta["val"] = node_val.to(torch.int32) + + # Replace usage of the src dtype result with the dst dtype result. + n.replace_all_uses_with(to_dst_node) + to_dst_node.args = (n,) def call(self, graph_module: torch.fx.GraphModule): self._cast_to_int32(graph_module) diff --git a/backends/qualcomm/passes/recompose_rms_norm.py b/backends/qualcomm/passes/recompose_rms_norm.py new file mode 100644 index 00000000000..b26de8bd794 --- /dev/null +++ b/backends/qualcomm/passes/recompose_rms_norm.py @@ -0,0 +1,76 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + +from .utils import dq_ops + + +class RecomposeRmsNorm(ExportPass): + """ + Merge decomposed operators back to one super node. 
+ """ + + def __init__(self): + super().__init__() + + def _get_eps_node(self, nodes): + # eps: one of inputs of add node + add_node = [n for n in nodes if hasattr(n, "name") and "add" in n.name][0] + for a in add_node.args: + if isinstance(a, float) or a.op != "call_function": + return a + + def _get_gamma_node(self, output_node): + # gamma: one of inputs of output node + for a in output_node.args: + if a.op != "call_function" or a.target in dq_ops: + return a + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + partitions = get_source_partitions(graph, [torch.nn.RMSNorm]) + for _, src_partitions in partitions.items(): + for src_partition in src_partitions: + input_len = len(src_partition.input_nodes) + if input_len == 1: + input_node = src_partition.input_nodes[0] + elif input_len == 2: + inp_0, inp_1 = src_partition.input_nodes + input_node = inp_0 if len(inp_0.users) == 2 else inp_1 + else: + raise RuntimeError( + f"Found a edge case of rms_node partitoin {src_partition}, which has {input_len} inputs" + ) + + output_node = src_partition.output_nodes[0] + eps_node = self._get_eps_node(src_partition.nodes) + gamma_node = self._get_gamma_node(output_node) + + with graph.inserting_before(output_node): + # args schema + # (Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + rms_node = graph.create_node( + "call_function", + exir_ops.edge.aten.rms_norm.default, + ( + input_node, + list(gamma_node.meta["val"].shape), + gamma_node, + eps_node, + ), + ) + users = output_node.users.copy() + for user in users: + user.replace_input_with(output_node, rms_node) + # copy metadata + rms_node.meta = output_node.meta + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/replace_index_put_input.py b/backends/qualcomm/passes/replace_index_put_input.py new file mode 100644 index 00000000000..1eb210cf67e --- /dev/null +++ b/backends/qualcomm/passes/replace_index_put_input.py @@ -0,0 +1,54 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import torch +from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceIndexPutInput(ExportPass): + """ + Index put input workaround for quantized module + """ + + dq_q_map = { + # per tensor + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: exir_ops.edge.quantized_decomposed.quantize_per_channel.default, + } + + def __init__(self, edge_program: torch.export.ExportedProgram): + super(ReplaceIndexPutInput, self).__init__() + self.edge_program = edge_program + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target == exir_ops.edge.aten.index_put.default: + if ( + copy_node := list(node.users)[0] + ) and copy_node.target == exir_ops.edge.aten.copy.default: + m_buffer_node = copy_node.args[0] + bad_frozen_node = node.args[0] + if QCOM_QUANT_ATTRS in bad_frozen_node.meta: + m_buffer_node.meta[QCOM_QUANT_ATTRS] = bad_frozen_node.meta[ + QCOM_QUANT_ATTRS + ] + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] = ( + self.dq_q_map[ + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] + ] + ) + with graph.inserting_after(bad_frozen_node): + node.replace_input_with(bad_frozen_node, m_buffer_node) + else: + continue + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index b2c86e50d33..9cde50b9c70 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -91,15 +91,17 @@ def is_edge_condition(node: Node): def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig): if is_edge_condition(node): return - if node.target == torch.ops.aten.index_put_.default: + if node.target in [ + torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + ]: annotate_index_put(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) elif node.target == torch.ops.aten.cat.default: annotate_cat(node, quantization_config) # Expect that the inputs of the cat op are select ops - for arg in node.args[0][1:]: - annotate_single_in_single_out(arg, quantization_config) - annotate_matmul_input1(node.args[0][0], quantization_config) + for arg in node.args[0]: + annotate_matmul_input1(arg, quantization_config) else: annotate_single_in_single_out(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index d31b4753a3d..5f299f9bc65 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -684,6 +684,31 @@ def annotate_squeeze(node: Node, quantization_config: QuantizationConfig) -> Non annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act_node = node.args[0] + weight_node = node.args[2] + + if _is_annotated([node]): + 
return + + # TODO current only support 16a16w + _annotate_input_qspec_map( + node, + act_node, + quantization_config.input_activation, + ) + + _annotate_input_qspec_map( + node, + weight_node, + quantization_config.input_activation, + ) + nodes_to_mark_annotated = [node] + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + @register_annotator([torch.ops.aten.rsqrt.default]) def annotate_rsqrt(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 319cc6092cd..127f704e8c9 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -734,6 +734,16 @@ def forward(self, x): ) +class RmsNorm(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1e-5 + self.rms = torch.nn.RMSNorm([4], 1e-5) + + def forward(self, x): + return self.rms(x) + + class Rsqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index cba23f935c2..71e3b13ff8e 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -393,6 +393,11 @@ def test_qnn_backend_reshape(self): sample_input = (torch.randn([3, 4]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -1000,6 +1005,14 @@ def test_qnn_backend_reshape(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 6dc0c4c3c8d..3e274a0ce77 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -38,7 +38,11 @@ from executorch.backends.qualcomm.passes.recompose_pixel_unshuffle import ( RecomposePixelUnshuffle, ) +from executorch.backends.qualcomm.passes.recompose_rms_norm import RecomposeRmsNorm from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy +from executorch.backends.qualcomm.passes.replace_index_put_input import ( + ReplaceIndexPutInput, +) from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( _soc_info_table, QcomChipset, @@ -56,6 +60,7 @@ convert_to_option, ) from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule @@ -201,6 +206,7 @@ def _transform(edge_program: ExportedProgram) -> None: graph_module = edge_program.graph_module RemoveRedundancy()(graph_module) RecomposePixelUnshuffle()(graph_module) + 
RecomposeRmsNorm()(graph_module) ConvertToLinear()(graph_module) ConvertPReLU(edge_program)(graph_module) ConvertBmmToMatmul()(graph_module) @@ -211,6 +217,7 @@ def _transform(edge_program: ExportedProgram) -> None: AnnotateDecomposed(edge_program)(graph_module) FoldQDQ()(graph_module) LayoutTransform(edge_program)(graph_module) + ReplaceIndexPutInput(edge_program)(graph_module) # Since QDQ nodes are stripped, update graph signature again to validate program edge_program._graph_signature = _get_updated_graph_signature( diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index 467949a5ebf..18a10fb9fdb 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -71,6 +71,7 @@ runtime.python_library( "export_llama_lib.py", "model.py", "source_transformation/quantize.py", + "source_transformation/rms_norm.py", "source_transformation/rope.py", "source_transformation/sdpa.py", ], diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index f6abc3aaf4e..968117eef20 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -49,6 +49,7 @@ get_quant_embedding_transform, get_quant_weight_transform, ) +from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis from .source_transformation.sdpa import ( replace_causal_mask, @@ -406,9 +407,16 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: if args.use_kv_cache: if args.qnn: + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` + from executorch.backends.qualcomm.utils.utils import ( + convert_linear_to_conv2d, + ) + transforms.append(replace_kv_cache_with_simple_kv_cache) transforms.append(replace_sdpa_with_flex_sdpa) transforms.append(replace_causal_mask) + transforms.append(replace_rms_norm_with_native_rms_norm) + transforms.append(convert_linear_to_conv2d) elif args.coreml or args.mps: # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition @@ -552,7 +560,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - canonicalize_program(builder.edge_manager.exported_program()) + # TODO: Need to remove this once we have better way to handle buffer size + canonicalize_program( + builder.edge_manager.exported_program(), custom_buffer_size=542048256 + ) builder = builder.to_executorch() @@ -569,7 +580,10 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - canonicalize_program(builder.edge_manager.exported_program()) + # TODO: Need to remove this once we have better way to handle buffer size + canonicalize_program( + builder.edge_manager.exported_program(), custom_buffer_size=542048256 + ) builder = builder.to_executorch() diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index 0c93115ee3b..534d90c6ed9 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -39,6 +39,7 @@ def __init__(self, dim: int, eps: float = 1e-6): """ super().__init__() + self.dim = dim self.eps = eps self.weight = nn.Parameter(torch.ones(dim)) diff --git 
a/examples/models/llama2/source_transformation/rms_norm.py b/examples/models/llama2/source_transformation/rms_norm.py new file mode 100644 index 00000000000..ff7e8b67457 --- /dev/null +++ b/examples/models/llama2/source_transformation/rms_norm.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.examples.models.llama2.llama_transformer import RMSNorm + + +def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module): + for name, child in module.named_children(): + if isinstance(child, RMSNorm): + rms_norm = torch.nn.RMSNorm(child.dim, eps=child.eps) + rms_norm.weight = child.weight + setattr( + module, + name, + rms_norm, + ) + else: + replace_rms_norm_with_native_rms_norm(child) + return module diff --git a/examples/models/llama2/source_transformation/sdpa.py b/examples/models/llama2/source_transformation/sdpa.py index 8e5de7d97ae..c48fdf0ae58 100644 --- a/examples/models/llama2/source_transformation/sdpa.py +++ b/examples/models/llama2/source_transformation/sdpa.py @@ -118,8 +118,9 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) """ - if n_rep == 1: - return hidden_states + # TODO: Encounter the bug about source partition, need to investigate more on it. + # if n_rep == 1: + # return hidden_states new_kv = [] batch, n_heads, seqlen, head_dim = hidden_states.shape diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index e75d5bef3fb..2f4c87d6fd8 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -130,11 +130,11 @@ def get_qnn_partitioner( ) except ImportError: raise ImportError( - "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" + "Please install the Qualcomm backend following https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html" ) use_fp16 = True - skip_node_op_set = {"llama.fallback.default"} + skip_node_op_set = {"llama.fallback.default", "aten.embedding.default"} if pt2e_quantize is not None: use_fp16 = False diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 7fc53358c50..45d9932724e 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -180,8 +180,9 @@ def get_qnn_quantizer( # Due to the error with 16a16w in Qnn Htp, we need to disable per channel linear quantization when use 16a16w # TODO: enable it after the issue is fixed logging.warning( - "Disable per channel quantization for linear due to the error with QNN HTP 16a16w." + "Disable per channel quantization for linear and conv due to the error with QNN HTP 16a16w." 
) + qnn_quantizer.set_per_channel_conv_quant(enable=False) qnn_quantizer.set_per_channel_linear_quant(enable=False) qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS) qnn_quantizer.set_bit16_op_quant_config( @@ -208,6 +209,12 @@ def get_qnn_quantizer( quantization_mode is None ), "Currently qnn backend only supports QnnQuantizer via pt2e flow" qnn_quantizer.add_custom_quant_annotations(custom_annotations) + qnn_quantizer.add_discard_ops( + [ + torch.ops.aten.embedding.default, + ] + ) + return qnn_quantizer, quant_dtype From d2014e3a531f03a500e2919de052640d0b27b788 Mon Sep 17 00:00:00 2001 From: Yi Li <47999440+LeeOHzzZ@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:37:53 -0700 Subject: [PATCH 269/531] Add a target rule for ops_registrations (#5083) Differential Revision: D62206605 Pull Request resolved: https://github.com/pytorch/executorch/pull/5191 --- backends/cadence/aot/TARGETS | 11 +++++++++++ backends/cadence/aot/ops_registrations.py | 17 +++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index d077169022a..08093efe317 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -60,6 +60,17 @@ python_library( ], ) +python_library( + name = "ops_registrations", + srcs = [ + "ops_registrations.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/backends/cadence/aot:utils", + ], +) + export_file(name = "functions.yaml") executorch_generated_lib( diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a4d856ebed2..a5e00573918 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,11 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-strict + from math import prod from typing import Optional, Tuple import torch -from executorch.exir.scalar_type import ScalarType from torch.library import impl, Library from .utils import get_conv1d_output_size, get_conv2d_output_size @@ -74,8 +75,8 @@ def quantize_per_tensor_meta( zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=dtype) @@ -86,8 +87,8 @@ def dequantize_per_tensor_meta( zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.float) @@ -102,7 +103,7 @@ def quantized_linear_meta( out_shift: torch.Tensor, out_zero_point: int, offset: Optional[torch.Tensor], -): +) -> torch.Tensor: # src comes in shape [leading_dims, in_dim] # weight comes in shape [out_dim, in_dim] # output comes in empty with shape [leading_dims, out_dim] @@ -162,7 +163,7 @@ def quantized_layer_norm_meta( eps: float, output_scale: float, output_zero_point: int, -): +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.uint8) @@ -173,7 +174,7 @@ def quantized_relu_meta( out_zero_point: int, out_multiplier: torch.Tensor, out_shift: torch.Tensor, -): +) -> torch.Tensor: return X.new_empty(X.size(), dtype=torch.uint8) From b23ee01ba03b3cf7f32976f02e19c91bb28b4aef Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:54:17 -0700 Subject: [PATCH 270/531] Register LLM prefill native method in JNI We added API in Java. 
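Looking back at the ops_registrations change above: a Meta kernel only propagates shapes and dtypes for fake/meta tensors, which is why annotating the dtype argument as torch.dtype (and the return as torch.Tensor) is the meaningful part of that cleanup. A self-contained sketch of the same registration pattern, using a hypothetical "toy" namespace rather than the Cadence library; illustrative only, not part of this diff:

import torch
from torch.library import Library, impl

lib = Library("toy", "DEF")  # hypothetical namespace for illustration
lib.define(
    "quantize_per_tensor(Tensor input, float scale, int zero_point, "
    "int quant_min, int quant_max, ScalarType dtype) -> Tensor"
)

@impl(lib, "quantize_per_tensor", "Meta")
def quantize_per_tensor_meta(input, scale, zero_point, quant_min, quant_max, dtype):
    # Meta kernels allocate an empty result with the right shape/dtype; no data is computed.
    return input.new_empty(input.size(), dtype=dtype)

out = torch.ops.toy.quantize_per_tensor(
    torch.empty(2, 3, device="meta"), 0.1, 0, -128, 127, torch.int8
)
assert out.shape == (2, 3) and out.dtype == torch.int8 and out.device.type == "meta"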
Need to register in JNI as well Pull Request resolved: https://github.com/pytorch/executorch/pull/5201 --- extension/android/jni/jni_layer_llama.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 0d43317c3ca..1b993341e54 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -285,6 +285,12 @@ class ExecuTorchLlamaJni makeNativeMethod("generate", ExecuTorchLlamaJni::generate), makeNativeMethod("stop", ExecuTorchLlamaJni::stop), makeNativeMethod("load", ExecuTorchLlamaJni::load), + makeNativeMethod( + "prefillImagesNative", ExecuTorchLlamaJni::prefill_images), + makeNativeMethod( + "prefillPromptNative", ExecuTorchLlamaJni::prefill_prompt), + makeNativeMethod( + "generateFromPos", ExecuTorchLlamaJni::generate_from_pos), }); } }; From 28beeff0daee6ffdd0dac2fad2c85718c56ea38c Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Mon, 9 Sep 2024 16:16:06 -0700 Subject: [PATCH 271/531] Clean up devtools/etdump Differential Revision: D62394222 Pull Request resolved: https://github.com/pytorch/executorch/pull/5180 --- devtools/etdump/emitter.cpp | 70 ++--- devtools/etdump/emitter.h | 29 +- devtools/etdump/etdump_flatcc.cpp | 434 +++++++++++++------------- devtools/etdump/etdump_flatcc.h | 121 ++++--- devtools/etdump/scalar_type.fbs | 2 +- devtools/etdump/targets.bzl | 4 +- devtools/etdump/tests/etdump_test.cpp | 47 +-- 7 files changed, 364 insertions(+), 343 deletions(-) diff --git a/devtools/etdump/emitter.cpp b/devtools/etdump/emitter.cpp index dfca6295306..653c75cb084 100644 --- a/devtools/etdump/emitter.cpp +++ b/devtools/etdump/emitter.cpp @@ -6,16 +6,25 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include + #include +#include + +#include +#include + +#include -#include "executorch/devtools/etdump/emitter.h" -#include "executorch/runtime/platform/assert.h" +using executorch::etdump::internal::ETDumpStaticAllocator; -namespace torch { -namespace executor { +namespace executorch { +namespace etdump { +namespace internal { -static int _allocator_fn( +namespace { + +int allocator_fn( void* alloc_context, flatcc_iovec_t* b, size_t request, @@ -24,8 +33,8 @@ static int _allocator_fn( void* p; size_t n; - struct etdump_static_allocator* state = - (struct etdump_static_allocator*)alloc_context; + ETDumpStaticAllocator* state = + reinterpret_cast(alloc_context); // This allocator doesn't support freeing memory. if (request == 0) { @@ -113,14 +122,14 @@ static int _allocator_fn( // This emitter implementation emits to a fixed size buffer and will fail if it // runs out of room on either end. -static int _emitter_fn( +int emitter_fn( void* emit_context, const flatcc_iovec_t* iov, int iov_count, flatbuffers_soffset_t offset, size_t len) { - struct etdump_static_allocator* E = - (struct etdump_static_allocator*)emit_context; + ETDumpStaticAllocator* E = + reinterpret_cast(emit_context); uint8_t* p; if (offset < 0) { @@ -144,40 +153,15 @@ static int _emitter_fn( return 0; } -/******************************************************************************* - * Public Functions - ******************************************************************************/ - -int etdump_static_allocator_builder_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { - ET_CHECK(builder != nullptr); - ET_CHECK(alloc != nullptr); - - // Ensure data size is multiple of 32 (minimum allocation size). 
- ET_CHECK((alloc->data_size & 0x1F) == 0); - // Ensure out_size is divisable by 2 to ensure front/back sizes are equal for - // emitter.. - ET_CHECK((alloc->out_size & 0x1) == 0); - - return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); -} - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc) { - ET_CHECK(alloc != nullptr); - alloc->allocated = 0; - size_t n = alloc->out_size / 2; - alloc->front_cursor = &alloc->data[alloc->data_size + n]; - alloc->front_left = n; -} +} // namespace -int et_flatcc_custom_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { + struct ETDumpStaticAllocator* alloc) { return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); + builder, emitter_fn, alloc, allocator_fn, alloc); } -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/emitter.h b/devtools/etdump/emitter.h index bf8ab0b1e1c..09c1b56aa56 100644 --- a/devtools/etdump/emitter.h +++ b/devtools/etdump/emitter.h @@ -6,26 +6,23 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#pragma once -#include -#include +#include +#include -#pragma once +#include -namespace torch { -namespace executor { +typedef struct flatcc_builder flatcc_builder_t; -int et_flatcc_custom_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc); +namespace executorch { +namespace etdump { +namespace internal { -int etdump_static_allocator_builder_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc); - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc); + internal::ETDumpStaticAllocator* alloc); -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp index ca46c12f51c..4c05bb5acee 100644 --- a/devtools/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -6,19 +6,33 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "executorch/devtools/etdump/etdump_flatcc.h" +#include + +#include + +#include #include #include +#include +#include +#include + #include -#include -#include -#include "executorch/devtools/etdump/emitter.h" -#include "executorch/runtime/core/exec_aten/exec_aten.h" -#include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" -#include "executorch/runtime/platform/assert.h" -namespace torch { -namespace executor { +using ::exec_aten::Tensor; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::ChainID; +using ::executorch::runtime::DebugHandle; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; + +namespace executorch { +namespace etdump { namespace { @@ -50,30 +64,30 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( } etdump_Tensor_ref_t add_tensor_entry( - flatcc_builder_t* builder, + flatcc_builder_t* builder_, const exec_aten::Tensor& tensor, long offset) { - etdump_Tensor_start(builder); + etdump_Tensor_start(builder_); etdump_Tensor_scalar_type_add( - builder, get_flatbuffer_scalar_type(tensor.scalar_type())); - etdump_Tensor_sizes_start(builder); + builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + etdump_Tensor_sizes_start(builder_); for (auto dim : tensor.sizes()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_sizes_push(builder, &cast_dim); + etdump_Tensor_sizes_push(builder_, &cast_dim); } - etdump_Tensor_sizes_end(builder); + etdump_Tensor_sizes_end(builder_); - etdump_Tensor_strides_start(builder); + etdump_Tensor_strides_start(builder_); for (auto dim : tensor.strides()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_strides_push(builder, &cast_dim); + etdump_Tensor_strides_push(builder_, &cast_dim); } - etdump_Tensor_strides_end(builder); - etdump_Tensor_offset_add(builder, offset); + etdump_Tensor_strides_end(builder_); + etdump_Tensor_offset_add(builder_, offset); - return etdump_Tensor_end(builder); + return etdump_Tensor_end(builder_); } static uint8_t* alignPointer(void* ptr, size_t alignment) { @@ -88,71 +102,71 @@ static uint8_t* alignPointer(void* ptr, size_t alignment) { } // namespace -constexpr size_t max_alloc_buf_size = 128 * 1024; - // Constructor implementation ETDumpGen::ETDumpGen(Span buffer) { - // Initialize the flatcc builder using the buffer and buffer size. + constexpr size_t max_alloc_buf_size = 128 * 1024; + + // Initialize the flatcc builder_ using the buffer and buffer size. if (buffer.data() != nullptr) { - builder = (struct flatcc_builder*)alignPointer(buffer.data(), 64); + builder_ = (struct flatcc_builder*)alignPointer(buffer.data(), 64); uintptr_t buffer_with_builder = - (uintptr_t)alignPointer(builder + sizeof(struct flatcc_builder), 64); + (uintptr_t)alignPointer(builder_ + sizeof(struct flatcc_builder), 64); size_t buffer_size = buffer.size() - (size_t)(buffer_with_builder - (uintptr_t)buffer.data()); - alloc.set_buffer( + alloc_.set_buffer( (uint8_t*)buffer_with_builder, buffer_size, (size_t)((buffer_size / 4 > max_alloc_buf_size) ? 
max_alloc_buf_size : buffer_size / 4)); - et_flatcc_custom_init(builder, &alloc); + internal::etdump_flatcc_custom_init(builder_, &alloc_); } else { - builder = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); + builder_ = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); ET_CHECK_MSG( - builder != nullptr, "Failed to allocate memory for flatcc builder."); - flatcc_builder_init(builder); + builder_ != nullptr, "Failed to allocate memory for flatcc builder_."); + flatcc_builder_init(builder_); } reset(); } ETDumpGen::~ETDumpGen() { - flatcc_builder_clear(builder); + flatcc_builder_clear(builder_); if (!is_static_etdump()) { - free(builder); + free(builder_); } } void ETDumpGen::reset() { - etdump_gen_state = ETDumpGen_Init; - num_blocks = 0; - flatcc_builder_reset(builder); - flatbuffers_buffer_start(builder, etdump_ETDump_file_identifier); - etdump_ETDump_start_as_root_with_size(builder); - etdump_ETDump_version_add(builder, ETDUMP_VERSION); - etdump_ETDump_run_data_start(builder); - etdump_ETDump_run_data_push_start(builder); + state_ = State::Init; + num_blocks_ = 0; + flatcc_builder_reset(builder_); + flatbuffers_buffer_start(builder_, etdump_ETDump_file_identifier); + etdump_ETDump_start_as_root_with_size(builder_); + etdump_ETDump_version_add(builder_, ETDUMP_VERSION); + etdump_ETDump_run_data_start(builder_); + etdump_ETDump_run_data_push_start(builder_); } void ETDumpGen::create_event_block(const char* name) { - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Done) { + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::Done) { reset(); } - if (num_blocks > 0) { - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_push_start(builder); + if (num_blocks_ > 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_push_start(builder_); } - ++num_blocks; - etdump_RunData_name_create_strn(builder, name, strlen(name)); - if (bundled_input_index != -1) { - etdump_RunData_bundled_input_index_add(builder, bundled_input_index); + ++num_blocks_; + etdump_RunData_name_create_strn(builder_, name, strlen(name)); + if (bundled_input_index_ != -1) { + etdump_RunData_bundled_input_index_add(builder_, bundled_input_index_); } - etdump_gen_state = ETDumpGen_Block_Created; + state_ = State::BlockCreated; } int64_t ETDumpGen::create_string_entry(const char* name) { - return flatbuffers_string_create_str(builder, name); + return flatbuffers_string_create_str(builder_, name); } // ETDumpGen has the following possible states, ETDumpGen_Init, @@ -169,16 +183,15 @@ int64_t ETDumpGen::create_string_entry(const char* name) { // type again. In this case once we close the allocators table and start pushing // to the events table we cannot push to the allocators table again. void ETDumpGen::check_ready_to_add_events() { - if (etdump_gen_state != ETDumpGen_Adding_Events) { + if (state_ != State::AddingEvents) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Adding_Allocators || - etdump_gen_state == ETDumpGen_Block_Created), + (state_ == State::AddingAllocators || state_ == State::BlockCreated), "ETDumpGen in an invalid state. 
Cannot add new events now."); - if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); + if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); } - etdump_RunData_events_start(builder); - etdump_gen_state = ETDumpGen_Adding_Events; + etdump_RunData_events_start(builder_); + state_ = State::AddingEvents; } } @@ -231,29 +244,29 @@ void ETDumpGen::end_profiling_delegate( check_ready_to_add_events(); // Start building the ProfileEvent entry. - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, event_tracer_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, event_tracer_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); // Delegate debug identifier can either be of a string type or an integer // type. If it's a string type then it's a value of type // flatbuffers_string_ref_t type, whereas if it's an integer type then we // write the integer value directly. if (event_tracer_entry.delegate_event_id_type == DelegateDebugIdType::kInt) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } else { etdump_ProfileEvent_delegate_debug_id_str_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_profiling_delegate( @@ -268,24 +281,24 @@ void ETDumpGen::log_profiling_delegate( "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, delegate_debug_index); + builder_, delegate_debug_index); } else { - etdump_ProfileEvent_delegate_debug_id_str_add(builder, string_id); + etdump_ProfileEvent_delegate_debug_id_str_add(builder_, string_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_intermediate_output_delegate( @@ -331,7 +344,7 @@ void ETDumpGen::log_intermediate_output_delegate_helper( ET_CHECK_MSG( (name == nullptr) ^ (delegate_debug_index == -1), "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { ET_CHECK_MSG(0, "Must pre-set debug buffer with set_debug_buffer()\n"); return; } @@ -339,71 +352,71 @@ void ETDumpGen::log_intermediate_output_delegate_helper( check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { - etdump_DebugEvent_delegate_debug_id_int_add(builder, delegate_debug_index); + etdump_DebugEvent_delegate_debug_id_int_add(builder_, delegate_debug_index); } else { - etdump_DebugEvent_delegate_debug_id_str_add(builder, string_id); + etdump_DebugEvent_delegate_debug_id_str_add(builder_, string_id); } // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { long offset = copy_tensor_to_debug_buffer(output); - etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder, output, offset); + etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); } else if constexpr (std::is_same>::value) { - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { long offset = copy_tensor_to_debug_buffer(output[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, output[i], offset)); + builder_, add_tensor_entry(builder_, output[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); } else if constexpr (std::is_same::value) { - auto int_ref = etdump_Int_create(builder, output); + auto int_ref = etdump_Int_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); } else if constexpr (std::is_same::value) { - auto double_ref = etdump_Double_create(builder, output); + auto double_ref = etdump_Double_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); } else if constexpr (std::is_same::value) { flatbuffers_bool_t flatbuffer_bool_val = output ? 
FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); } else { ET_CHECK_MSG(0, "Unsupported output type for intermediate logging\n"); } - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { @@ -413,32 +426,31 @@ void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); check_ready_to_add_events(); - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, prof_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, prof_entry.chain_id); - etdump_ProfileEvent_instruction_id_add(builder, prof_entry.debug_handle); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, prof_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, prof_entry.chain_id); + etdump_ProfileEvent_instruction_id_add(builder_, prof_entry.debug_handle); if (prof_entry.event_id != -1) { - etdump_ProfileEvent_name_add(builder, prof_entry.event_id); + etdump_ProfileEvent_name_add(builder_, prof_entry.event_id); } - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } AllocatorID ETDumpGen::track_allocator(const char* name) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Block_Created || - etdump_gen_state == ETDumpGen_Adding_Allocators), + (state_ == State::BlockCreated || state_ == State::AddingAllocators), "Allocators can only be added immediately after a new block is created and before any events are added."); - if (etdump_gen_state != ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_start(builder); - etdump_gen_state = ETDumpGen_Adding_Allocators; + if (state_ != State::AddingAllocators) { + etdump_RunData_allocators_start(builder_); + state_ = State::AddingAllocators; } flatbuffers_string_ref_t ref = create_string_entry(name); - etdump_RunData_allocators_push_create(builder, ref); - return etdump_RunData_allocators_reserved_len(builder); + etdump_RunData_allocators_push_create(builder_, ref); + return 
etdump_RunData_allocators_reserved_len(builder_); } void ETDumpGen::track_allocation( @@ -446,43 +458,43 @@ void ETDumpGen::track_allocation( size_t allocation_size) { check_ready_to_add_events(); - etdump_RunData_events_push_start(builder); - etdump_Event_allocation_event_create(builder, allocator_id, allocation_size); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_allocation_event_create(builder_, allocator_id, allocation_size); + etdump_RunData_events_push_end(builder_); } -etdump_result ETDumpGen::get_etdump_data() { - etdump_result result; - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); - } else if (etdump_gen_state == ETDumpGen_Init) { +ETDumpResult ETDumpGen::get_etdump_data() { + ETDumpResult result; + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); + } else if (state_ == State::Init) { result.buf = nullptr; result.size = 0; return result; } - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_end(builder); - etdump_ETDump_ref_t root = etdump_ETDump_end(builder); - flatbuffers_buffer_end(builder, root); - if (num_blocks == 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_end(builder_); + etdump_ETDump_ref_t root = etdump_ETDump_end(builder_); + flatbuffers_buffer_end(builder_, root); + if (num_blocks_ == 0) { result = {nullptr, 0}; } else { - if (alloc.data) { - result.buf = alloc.front_cursor; - result.size = alloc.out_size - alloc.front_left; + if (alloc_.data) { + result.buf = alloc_.front_cursor; + result.size = alloc_.out_size - alloc_.front_left; } else { result.buf = - flatcc_builder_finalize_aligned_buffer(builder, &result.size); + flatcc_builder_finalize_aligned_buffer(builder_, &result.size); } } - etdump_gen_state = ETDumpGen_Done; + state_ = State::Done; return result; } void ETDumpGen::set_debug_buffer(Span buffer) { - debug_buffer = buffer; + debug_buffer_ = buffer; } size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { @@ -490,94 +502,94 @@ size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { return static_cast(-1); } uint8_t* offset_ptr = - alignPointer(debug_buffer.data() + debug_buffer_offset, 64); - debug_buffer_offset = (offset_ptr - debug_buffer.data()) + tensor.nbytes(); + alignPointer(debug_buffer_.data() + debug_buffer_offset_, 64); + debug_buffer_offset_ = (offset_ptr - debug_buffer_.data()) + tensor.nbytes(); ET_CHECK_MSG( - debug_buffer_offset <= debug_buffer.size(), + debug_buffer_offset_ <= debug_buffer_.size(), "Ran out of space to store intermediate outputs."); memcpy(offset_ptr, tensor.const_data_ptr(), tensor.nbytes()); - return (size_t)(offset_ptr - debug_buffer.data()); + return (size_t)(offset_ptr - debug_buffer_.data()); } void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { return; } check_ready_to_add_events(); - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); switch (evalue.tag) { case 
Tag::Tensor: { exec_aten::Tensor tensor = evalue.toTensor(); long offset = copy_tensor_to_debug_buffer(tensor); etdump_Tensor_ref_t tensor_ref = - add_tensor_entry(builder, tensor, offset); + add_tensor_entry(builder_, tensor, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::ListTensor: { exec_aten::ArrayRef tensors = evalue.toTensorList(); - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { long offset = copy_tensor_to_debug_buffer(tensors[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, tensors[i], offset)); + builder_, add_tensor_entry(builder_, tensors[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Int: { int64_t val = evalue.toInt(); - auto int_ref = etdump_Int_create(builder, val); + auto int_ref = etdump_Int_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Double: { double val = evalue.toDouble(); - auto double_ref = etdump_Double_create(builder, val); + auto double_ref = etdump_Double_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, 
value_ref); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -585,13 +597,13 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { case Tag::Bool: { flatbuffers_bool_t flatbuffer_bool_val = evalue.toBool() ? FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -604,20 +616,20 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { break; } - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } size_t ETDumpGen::get_num_blocks() { - return num_blocks; + return num_blocks_; } bool ETDumpGen::is_static_etdump() { - return alloc.data != nullptr; + return alloc_.data != nullptr; } -} // namespace executor -} // namespace torch +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h index e56d09f8107..0bd891a0970 100644 --- a/devtools/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -8,33 +8,22 @@ #pragma once -#include #include -#include "executorch/runtime/core/event_tracer.h" -#include "executorch/runtime/platform/platform.h" + +#include +#include +#include #define ETDUMP_VERSION 0 struct flatcc_builder; -namespace torch { -namespace executor { - -enum ETDumpGen_State { - ETDumpGen_Init, - ETDumpGen_Block_Created, - ETDumpGen_Adding_Allocators, - ETDumpGen_Adding_Events, - ETDumpGen_Done, -}; +namespace executorch { +namespace etdump { -struct etdump_result { - void* buf; - size_t size; -}; - -struct etdump_static_allocator { - etdump_static_allocator() {} +namespace internal { +struct ETDumpStaticAllocator { + ETDumpStaticAllocator() = default; void set_buffer(uint8_t* buffer, size_t total_buf_size, size_t alloc_buf_size) { @@ -64,61 +53,72 @@ struct etdump_static_allocator { // Bytes left in front of front_cursor. 
size_t front_left{0}; }; +} // namespace internal + +struct ETDumpResult { + void* buf; + size_t size; +}; -class ETDumpGen : public EventTracer { +class ETDumpGen : public ::executorch::runtime::EventTracer { public: - ETDumpGen(Span buffer = {nullptr, (size_t)0}); + ETDumpGen(::executorch::runtime::Span buffer = {nullptr, (size_t)0}); ~ETDumpGen() override; void clear_builder(); void create_event_block(const char* name) override; - virtual EventTracerEntry start_profiling( + virtual ::executorch::runtime::EventTracerEntry start_profiling( const char* name, - ChainID chain_id = -1, - DebugHandle debug_handle = 0) override; - virtual void end_profiling(EventTracerEntry prof_entry) override; - virtual EventTracerEntry start_profiling_delegate( + ::executorch::runtime::ChainID chain_id = -1, + ::executorch::runtime::DebugHandle debug_handle = 0) override; + virtual void end_profiling( + ::executorch::runtime::EventTracerEntry prof_entry) override; + virtual ::executorch::runtime::EventTracerEntry start_profiling_delegate( const char* name, - DebugHandle delegate_debug_index) override; + ::executorch::runtime::DebugHandle delegate_debug_index) override; virtual void end_profiling_delegate( - EventTracerEntry prof_entry, + ::executorch::runtime::EventTracerEntry prof_entry, const void* metadata, size_t metadata_len) override; virtual void log_profiling_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, et_timestamp_t start_time, et_timestamp_t end_time, const void* metadata, size_t metadata_len) override; - virtual void track_allocation(AllocatorID id, size_t size) override; - virtual AllocatorID track_allocator(const char* name) override; + virtual void track_allocation( + ::executorch::runtime::AllocatorID id, + size_t size) override; + virtual ::executorch::runtime::AllocatorID track_allocator( + const char* name) override; virtual void log_evalue( - const EValue& evalue, - LoggedEValueType evalue_type = - LoggedEValueType::kIntermediateOutput) override; + const ::executorch::runtime::EValue& evalue, + ::executorch::runtime::LoggedEValueType evalue_type = + ::executorch::runtime::LoggedEValueType::kIntermediateOutput) + override; /** * Log an intermediate tensor output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const Tensor& output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const exec_aten::Tensor& output) override; /** * Log an intermediate tensor array output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const ArrayRef output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const ::executorch::runtime::ArrayRef output) override; /** * Log an intermediate int output from a delegate. 
*/ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const int& output) override; /** @@ -126,7 +126,7 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const bool& output) override; /** @@ -134,22 +134,22 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const double& output) override; - void set_debug_buffer(Span buffer); - etdump_result get_etdump_data(); + void set_debug_buffer(::executorch::runtime::Span buffer); + ETDumpResult get_etdump_data(); size_t get_num_blocks(); bool is_static_etdump(); void reset(); private: - struct flatcc_builder* builder; - size_t num_blocks = 0; - Span debug_buffer; - size_t debug_buffer_offset = 0; - int bundled_input_index = -1; - ETDumpGen_State etdump_gen_state = ETDumpGen_Init; - struct etdump_static_allocator alloc; + enum class State { + Init, + BlockCreated, + AddingAllocators, + AddingEvents, + Done, + }; void check_ready_to_add_events(); int64_t create_string_entry(const char* name); @@ -162,9 +162,26 @@ class ETDumpGen : public EventTracer { template void log_intermediate_output_delegate_helper( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const T& output); + + struct flatcc_builder* builder_; + size_t num_blocks_ = 0; + ::executorch::runtime::Span debug_buffer_; + size_t debug_buffer_offset_ = 0; + int bundled_input_index_ = -1; + State state_ = State::Init; + struct internal::ETDumpStaticAllocator alloc_; }; +} // namespace etdump +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using etdump_result = ::executorch::etdump::ETDumpResult; +using ::executorch::etdump::ETDumpGen; } // namespace executor } // namespace torch diff --git a/devtools/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs index fdfe550e9e3..a8da080c679 100644 --- a/devtools/etdump/scalar_type.fbs +++ b/devtools/etdump/scalar_type.fbs @@ -14,6 +14,7 @@ enum ScalarType : byte { SHORT = 2, INT = 3, LONG = 4, + HALF = 5, FLOAT = 6, DOUBLE = 7, BOOL = 11, @@ -24,7 +25,6 @@ enum ScalarType : byte { QUINT4X2 = 16, QUINT2X4 = 17, // Types currently not implemented. 
- // Half = 5, // COMPLEXHALF = 8, // COMPLEXFLOAT = 9, // COMPLEXDOUBLE = 10, diff --git a/devtools/etdump/targets.bzl b/devtools/etdump/targets.bzl index 6d548ce650f..ddbb35eab74 100644 --- a/devtools/etdump/targets.bzl +++ b/devtools/etdump/targets.bzl @@ -95,9 +95,11 @@ def define_common_targets(): "etdump_flatcc.cpp", "emitter.cpp", ], + headers = [ + "emitter.h", + ], exported_headers = [ "etdump_flatcc.h", - "emitter.h", ], deps = [ "//executorch/runtime/platform:platform", diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp index de8c0abc39d..b750e21eb07 100644 --- a/devtools/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -20,8 +20,20 @@ #include #include -namespace torch { -namespace executor { +using ::exec_aten::ScalarType; +using ::exec_aten::Tensor; +using ::executorch::etdump::ETDumpGen; +using ::executorch::etdump::ETDumpResult; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::BoxedEvalueList; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using ::executorch::runtime::testing::TensorFactory; class ProfilerETDumpTest : public ::testing::Test { protected: @@ -49,7 +61,7 @@ TEST_F(ProfilerETDumpTest, SingleProfileEvent) { EventTracerEntry entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -105,7 +117,7 @@ TEST_F(ProfilerETDumpTest, EmptyBlocks) { etdump_gen[i]->start_profiling("test_event_1", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -160,7 +172,7 @@ TEST_F(ProfilerETDumpTest, AllocationEvents) { TEST_F(ProfilerETDumpTest, DebugEvent) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); etdump_gen[i]->create_event_block("test_block"); @@ -189,7 +201,7 @@ TEST_F(ProfilerETDumpTest, DebugEvent) { TEST_F(ProfilerETDumpTest, DebugEventTensorList) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; exec_aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})}; EValue evalue_1(storage[0]); EValue evalue_2(storage[1]); @@ -212,7 +224,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { } TEST_F(ProfilerETDumpTest, VerifyLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -225,7 +237,7 @@ TEST_F(ProfilerETDumpTest, VerifyLogging) { etdump_gen[i]->log_evalue(evalue); etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -297,7 +309,7 @@ TEST_F(ProfilerETDumpTest, MultipleBlocksWithEvents) { entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result 
= etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -363,7 +375,7 @@ TEST_F(ProfilerETDumpTest, VerifyData) { entry = etdump_gen[i]->start_profiling("test_event2", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -421,7 +433,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { Span buffer((uint8_t*)ptr, 2048); etdump_gen[i]->create_event_block("test_block"); - testing::TensorFactory tf; + TensorFactory tf; ET_EXPECT_DEATH( etdump_gen[i]->log_intermediate_output_delegate( @@ -462,7 +474,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { static_cast(-1), true); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -474,7 +486,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { } TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -492,7 +504,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { etdump_gen[i]->log_intermediate_output_delegate( nullptr, 258, tf.ones({5, 6})); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -603,7 +615,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateEvents) { etdump_gen[i]->end_profiling(entry), "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -681,7 +693,7 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -712,6 +724,3 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } - -} // namespace executor -} // namespace torch From 6ce9f5216615dad2a2f30c352a4c773edefd81b2 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Mon, 9 Sep 2024 16:24:44 -0700 Subject: [PATCH 272/531] t to z start ops | add dim order sanity check Differential Revision: D59990127 Pull Request resolved: https://github.com/pytorch/executorch/pull/4328 --- kernels/portable/cpu/op_t_copy.cpp | 5 +++++ kernels/portable/cpu/op_to_copy.cpp | 5 +++++ kernels/portable/cpu/op_transpose_copy.cpp | 3 +++ kernels/portable/cpu/op_tril.cpp | 5 +++++ kernels/portable/cpu/op_unbind_copy.cpp | 7 +++++++ kernels/portable/cpu/op_unsqueeze_copy.cpp | 5 +++++ kernels/portable/cpu/op_var.cpp | 5 +++++ kernels/portable/cpu/op_view_copy.cpp | 5 +++++ kernels/portable/cpu/op_where.cpp | 3 +++ 9 files changed, 43 insertions(+) diff --git a/kernels/portable/cpu/op_t_copy.cpp b/kernels/portable/cpu/op_t_copy.cpp index c6a2ad5fdb5..46807a42f22 100644 --- a/kernels/portable/cpu/op_t_copy.cpp +++ b/kernels/portable/cpu/op_t_copy.cpp @@ -47,6 +47,11 @@ Tensor& t_copy_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { return out; } 
+ ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; get_transpose_out_target_size(in, 1, 0, expected_out_size, &expected_out_dim); diff --git a/kernels/portable/cpu/op_to_copy.cpp b/kernels/portable/cpu/op_to_copy.cpp index c0c04e65e93..46bd0bf987e 100644 --- a/kernels/portable/cpu/op_to_copy.cpp +++ b/kernels/portable/cpu/op_to_copy.cpp @@ -46,6 +46,11 @@ Tensor& to_copy_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out); + ET_SWITCH_REALHBBF16_TYPES(self.scalar_type(), ctx, "to_copy", CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES( out.scalar_type(), ctx, "to_copy", CTYPE_OUT, [&] { diff --git a/kernels/portable/cpu/op_transpose_copy.cpp b/kernels/portable/cpu/op_transpose_copy.cpp index 79c04646a73..d2456b8592e 100644 --- a/kernels/portable/cpu/op_transpose_copy.cpp +++ b/kernels/portable/cpu/op_transpose_copy.cpp @@ -57,6 +57,9 @@ Tensor& transpose_copy_int_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_SWITCH_ALL_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] { transpose_tensors(in, dim0, dim1, out); }); diff --git a/kernels/portable/cpu/op_tril.cpp b/kernels/portable/cpu/op_tril.cpp index cdf87bea4ba..46a91e8c627 100644 --- a/kernels/portable/cpu/op_tril.cpp +++ b/kernels/portable/cpu/op_tril.cpp @@ -145,6 +145,11 @@ Tensor& tril_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out); + if (self.numel() == 0) { return out; } diff --git a/kernels/portable/cpu/op_unbind_copy.cpp b/kernels/portable/cpu/op_unbind_copy.cpp index da5a73d624c..cea4ccce345 100644 --- a/kernels/portable/cpu/op_unbind_copy.cpp +++ b/kernels/portable/cpu/op_unbind_copy.cpp @@ -36,6 +36,13 @@ void unbind_copy_int_out( ET_KERNEL_CHECK( ctx, check_unbind_copy_args(input, dim, out), InvalidArgument, ); + for (int i = 0; i < out.size(); ++i) { + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(input, out[i]), InvalidArgument, ); + } + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(input), InvalidArgument, ); + if (input.numel() == 0) { return; } diff --git a/kernels/portable/cpu/op_unsqueeze_copy.cpp b/kernels/portable/cpu/op_unsqueeze_copy.cpp index f6d25a04983..1c0a5c79990 100644 --- a/kernels/portable/cpu/op_unsqueeze_copy.cpp +++ b/kernels/portable/cpu/op_unsqueeze_copy.cpp @@ -38,6 +38,11 @@ Tensor& unsqueeze_copy_out( ET_KERNEL_CHECK(ctx, self.dim() + 1 == out.dim(), InvalidArgument, out); ET_KERNEL_CHECK(ctx, dim <= self.dim(), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out); + for (size_t i = 0; i < out.dim(); ++i) { if (i < dim) { expected_output_size[i] = self.size(i); diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp index 52019e381c0..fa49269196e 100644 --- a/kernels/portable/cpu/op_var.cpp +++ b/kernels/portable/cpu/op_var.cpp @@ -74,6 +74,11 @@ Tensor& var_out( ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out); ET_KERNEL_CHECK(ctx, 
tensor_is_floating_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok, diff --git a/kernels/portable/cpu/op_view_copy.cpp b/kernels/portable/cpu/op_view_copy.cpp index f7174caac1e..ba72396b44f 100644 --- a/kernels/portable/cpu/op_view_copy.cpp +++ b/kernels/portable/cpu/op_view_copy.cpp @@ -44,6 +44,11 @@ Tensor& view_copy_out( out, "Failed to resize output tensor."); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(self), InvalidArgument, out); + ET_KERNEL_CHECK( ctx, check_view_copy_args(self, size_int64_t, out), InvalidArgument, out); diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index 6ff4cb85fb3..90f6e3df92b 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -35,6 +35,9 @@ Tensor& where_out( InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(cond, a, b, out), InvalidArgument, out); + constexpr auto name = "where.self_out"; ET_CHECK_MSG( From 542ecb59abd30edd0c4840af8eb8a513872e88fd Mon Sep 17 00:00:00 2001 From: Chirag Modi <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 9 Sep 2024 16:26:23 -0700 Subject: [PATCH 273/531] Add Echo parameter to multimodal runner (llava) and jni layer (#5181) * Add Echo parameter to multimodal runner (llava) and jni layer * Rebasing - Unify order of echo parameter to be last in all layers --- .../executorchllamademo/MainActivity.java | 12 ++++----- examples/models/llava/runner/llava_runner.cpp | 14 +++++++---- examples/models/llava/runner/llava_runner.h | 7 ++++-- extension/android/jni/jni_layer_llama.cpp | 13 ++++++---- .../org/pytorch/executorch/LlamaModule.java | 25 ++++++++++--------- extension/llm/runner/multimodal_runner.h | 7 ++++-- 6 files changed, 46 insertions(+), 32 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 308f5fac50a..e9f32a927cc 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -675,8 +675,8 @@ public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, rawPrompt, ModelUtils.VISION_MODEL_SEQ_LEN, - false, - MainActivity.this); + MainActivity.this, + false); } else { // no image selected, we pass in empty int array mModule.generate( @@ -686,8 +686,8 @@ public void run() { ModelUtils.VISION_MODEL_IMAGE_CHANNELS, rawPrompt, ModelUtils.VISION_MODEL_SEQ_LEN, - false, - MainActivity.this); + MainActivity.this, + false); } } else { String finalPrompt = @@ -696,8 +696,8 @@ public void run() { mModule.generate( finalPrompt, (int) (finalPrompt.length() * 0.75) + 64, - false, - MainActivity.this); + MainActivity.this, + false); } long generateDuration = System.currentTimeMillis() - generateStartTime; diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 64763c72576..20c45009da7 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ 
b/examples/models/llava/runner/llava_runner.cpp @@ -99,9 +99,12 @@ Error LlavaRunner::generate_from_pos( int64_t start_pos, std::function token_callback, std::function - stats_callback) { + stats_callback, + bool echo) { // prefill user prompt. No BOS because preset prompt already has it. - token_callback(prompt); + if (echo) { + token_callback(prompt); + } uint64_t prefill_next_token = ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); @@ -125,7 +128,8 @@ Error LlavaRunner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback) { + std::function stats_callback, + bool echo) { ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -160,8 +164,8 @@ Error LlavaRunner::generate( util::get_rss_bytes() / 1024.0 / 1024.0); // Generate tokens - Error err = - generate_from_pos(prompt, seq_len, pos, wrapped_callback, stats_callback); + Error err = generate_from_pos( + prompt, seq_len, pos, wrapped_callback, stats_callback, echo); ET_LOG( Info, diff --git a/examples/models/llava/runner/llava_runner.h b/examples/models/llava/runner/llava_runner.h index 923f8180a83..e671718ae5e 100644 --- a/examples/models/llava/runner/llava_runner.h +++ b/examples/models/llava/runner/llava_runner.h @@ -36,7 +36,8 @@ class LlavaRunner : public MultimodalRunner { int32_t seq_len = 1024, std::function token_callback = {}, std::function - stats_callback = {}); + stats_callback = {}, + bool echo = true); /** * Prefill an LLaVA Module with the given images input. @@ -70,6 +71,7 @@ class LlavaRunner : public MultimodalRunner { * @param start_pos The starting position in KV cache of the input in the LLM. * @param token_callback What to do after a token is generated. * @param stats_callback What to do with Stats. + * @param echo Whether to echo the input prompt or not. * @return The error code. 
*/ Error generate_from_pos( @@ -78,7 +80,8 @@ class LlavaRunner : public MultimodalRunner { int64_t start_pos = 0, std::function token_callback = {}, std::function - stats_callback = {}); + stats_callback = {}, + bool echo = true); private: inline static const std::string kPresetPrompt = diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 1b993341e54..6194853fe7c 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -150,8 +150,8 @@ class ExecuTorchLlamaJni jint channels, facebook::jni::alias_ref prompt, jint seq_len, - jboolean echo, - facebook::jni::alias_ref callback) { + facebook::jni::alias_ref callback, + jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); std::vector images; @@ -170,7 +170,8 @@ class ExecuTorchLlamaJni prompt->toStdString(), seq_len, [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }); + [callback](const Stats& result) { callback->onStats(result); }, + echo); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { runner_->generate( prompt->toStdString(), @@ -248,7 +249,8 @@ class ExecuTorchLlamaJni facebook::jni::alias_ref prompt, jint seq_len, jlong start_pos, - facebook::jni::alias_ref callback) { + facebook::jni::alias_ref callback, + jboolean echo) { if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { return static_cast(Error::NotSupported); } @@ -259,7 +261,8 @@ class ExecuTorchLlamaJni [callback](const std::string& result) { callback->onResult(result); }, [callback](const ::executorch::extension::llm::Stats& stats) { callback->onStats(stats); - })); + }, + echo)); } void stop() { diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index c4de23df0ee..e3ba11b8505 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -60,7 +60,7 @@ public void resetNative() { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, LlamaCallback llamaCallback) { - return generate(prompt, DEFAULT_SEQ_LEN, DEFAULT_ECHO, llamaCallback); + return generate(prompt, DEFAULT_SEQ_LEN, llamaCallback, DEFAULT_ECHO); } /** @@ -71,18 +71,18 @@ public int generate(String prompt, LlamaCallback llamaCallback) { * @param llamaCallback callback object to receive results. */ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, DEFAULT_ECHO, llamaCallback); + return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, DEFAULT_ECHO); } /** * Start generating tokens from the module. * * @param prompt Input prompt + * @param llamaCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param llamaCallback callback object to receive results. 
*/ - public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, echo, llamaCallback); + public int generate(String prompt, LlamaCallback llamaCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, llamaCallback, echo); } /** @@ -90,11 +90,11 @@ public int generate(String prompt, boolean echo, LlamaCallback llamaCallback) { * * @param prompt Input prompt * @param seqLen sequence length + * @param llamaCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param llamaCallback callback object to receive results. */ - public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, echo, llamaCallback); + public int generate(String prompt, int seqLen, LlamaCallback llamaCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, echo); } /** @@ -106,8 +106,8 @@ public int generate(String prompt, int seqLen, boolean echo, LlamaCallback llama * @param channels Input image number of channels * @param prompt Input prompt * @param seqLen sequence length - * @param echo indicate whether to echo the input prompt or not (text completion vs chat) * @param llamaCallback callback object to receive results. + * @param echo indicate whether to echo the input prompt or not (text completion vs chat) */ @DoNotStrip public native int generate( @@ -117,8 +117,8 @@ public native int generate( int channels, String prompt, int seqLen, - boolean echo, - LlamaCallback llamaCallback); + LlamaCallback llamaCallback, + boolean echo); /** * Prefill an LLaVA Module with the given images input. @@ -172,10 +172,11 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { * @param seqLen The total sequence length, including the prompt tokens and new tokens. * @param startPos The starting position in KV cache of the input in the LLM. * @param llamaCallback callback object to receive results. + * @param echo indicate whether to echo the input prompt or not. * @return The error code. */ public native int generateFromPos( - String prompt, int seqLen, long startPos, LlamaCallback callback); + String prompt, int seqLen, long startPos, LlamaCallback callback, boolean echo); /** Stop current generate() before it finishes. */ @DoNotStrip diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 70ecafee810..6798f648a0c 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -59,7 +59,8 @@ class MultimodalRunner { const std::string& prompt, int32_t seq_len = 1024, std::function token_callback = {}, - std::function stats_callback = {}) = 0; + std::function stats_callback = {}, + bool echo = true) = 0; /** * Prefill an LLaVA Module with the given images input. @@ -95,6 +96,7 @@ class MultimodalRunner { * @param start_pos The starting position in KV cache of the input in the LLM. * @param token_callback What to do after a token is generated. * @param stats_callback What to do with Stats. + * @param echo Whether to echo the input prompt or not. * @return The error code. 
*/ virtual runtime::Error generate_from_pos( @@ -103,7 +105,8 @@ class MultimodalRunner { int64_t start_pos = 0, std::function token_callback = {}, std::function - stats_callback = {}) = 0; + stats_callback = {}, + bool echo = true) = 0; inline void stop() { text_token_generator_->stop(); From 59d9bad823ed5c824427be8d53e3270a7d3bcc9b Mon Sep 17 00:00:00 2001 From: lucylq Date: Mon, 9 Sep 2024 16:29:24 -0700 Subject: [PATCH 274/531] Use c++17 for size test Differential Revision: D62389344 Pull Request resolved: https://github.com/pytorch/executorch/pull/5178 --- extension/llm/custom_ops/op_sdpa.cpp | 4 ++-- .../core/exec_aten/util/scalar_type_util.h | 4 ++++ runtime/core/portable_type/half.h | 4 ++-- runtime/core/portable_type/string_view.h | 21 +++++++------------ test/CMakeLists.txt | 3 +-- test/build_size_test.sh | 17 --------------- 6 files changed, 16 insertions(+), 37 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index 56db1c208ea..c5ac365825b 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -158,7 +158,7 @@ static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { template < typename scalar_t, typename std::enable_if_t< - ::executorch::runtime::is_reduced_floating_point::value, + ::executorch::runtime::is_reduced_floating_point_v, int> = 0> static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { (void)ptr; @@ -247,7 +247,7 @@ void cpu_flash_attention( "KV_split_size must be greater than q_split_size"); constexpr bool is_reduced_type = - ::executorch::runtime::is_reduced_floating_point::value; + ::executorch::runtime::is_reduced_floating_point_v; ET_CHECK_MSG( !is_reduced_type, "FlashAttention does not support reduced types."); diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index 4d8712c1590..7c576f889fb 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -73,6 +73,10 @@ struct is_reduced_floating_point bool, std::is_same::value || std::is_same::value> {}; + +template +constexpr bool is_reduced_floating_point_v = + is_reduced_floating_point::value; #endif /// Maps ScalarTypes to C++ types. diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h index 5aded68270b..8987d82804b 100644 --- a/runtime/core/portable_type/half.h +++ b/runtime/core/portable_type/half.h @@ -62,7 +62,7 @@ struct alignas(2) Half { namespace internal { inline float fp32_from_bits(uint32_t w) { - static_assert(sizeof(float) == sizeof(uint32_t), ""); + static_assert(sizeof(float) == sizeof(uint32_t)); union { uint32_t as_bits; float as_value; @@ -71,7 +71,7 @@ inline float fp32_from_bits(uint32_t w) { } inline uint32_t fp32_to_bits(float f) { - static_assert(sizeof(float) == sizeof(uint32_t), ""); + static_assert(sizeof(float) == sizeof(uint32_t)); union { float as_value; uint32_t as_bits; diff --git a/runtime/core/portable_type/string_view.h b/runtime/core/portable_type/string_view.h index 4036539ccc5..47a9f335eb5 100644 --- a/runtime/core/portable_type/string_view.h +++ b/runtime/core/portable_type/string_view.h @@ -79,13 +79,10 @@ class basic_string_view final { } constexpr const_reference at(size_type pos) const { - return (pos >= size_) - ? 
(ET_ASSERT_MESSAGE_EMIT( - " (%s): " - "string_view::operator[] or string_view::at() out of range", - pos >= size_), - torch::executor::runtime_abort()) - : at_(pos); + ET_CHECK_MSG( + pos >= size_, + "string_view::operator[] or string_view::at() out of range"); + return at_(pos); } constexpr const_reference front() const { @@ -140,13 +137,9 @@ class basic_string_view final { constexpr basic_string_view substr(size_type pos = 0, size_type count = npos) const { - return (pos > size_) - ? (ET_ASSERT_MESSAGE_EMIT( - " (%s): " - "basic_string_view::substr parameter out of bounds.", - pos > size_), - torch::executor::runtime_abort()) - : substr_(pos, count); + ET_CHECK_MSG( + pos > size_, "basic_string_view::substr parameter out of bounds."); + return substr_(pos, count); } constexpr int compare(basic_string_view rhs) const noexcept { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5dbe47c8671..b651bd2dd93 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -19,8 +19,7 @@ cmake_minimum_required(VERSION 3.19) project(size_test) -# Use C++11 for size test. -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) diff --git a/test/build_size_test.sh b/test/build_size_test.sh index 540b78e9f05..428e351cf08 100644 --- a/test/build_size_test.sh +++ b/test/build_size_test.sh @@ -11,29 +11,12 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" -# Set compile flags for Clang and GCC. -# -Wno-gnu allows us to use gnu statement-expressions. -# -Werror -Wc++17* ensure we do not use features from C++17. -CXX_FLAGS="-Wno-gnu" -compiler=$(cc --version) -if [[ $compiler == *"clang"* ]]; then - CXX_FLAGS="$CXX_FLAGS -Werror -Wc++17-extensions -Wc++14-extensions" -elif [[ $compiler == *"cc"* ]]; then - CXX_FLAGS="$CXX_FLAGS -Werror -Wc++17-compat -Wc++14-compat" -else - echo "Unknown compiler: $compiler" - exit 1 -fi -echo "Using compiler $compiler with flags $CXX_FLAGS" - cmake_install_executorch_lib() { echo "Installing libexecutorch.a" rm -rf cmake-out retry cmake -DBUCK2="$BUCK2" \ - -DCMAKE_CXX_STANDARD=11 \ -DCMAKE_CXX_STANDARD_REQUIRED=ON \ - -DCMAKE_CXX_FLAGS="$CXX_FLAGS" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ From 7650667b21eb98bfc1a5664cff866328e43582d8 Mon Sep 17 00:00:00 2001 From: Peixuan Liu Date: Mon, 9 Sep 2024 17:12:04 -0700 Subject: [PATCH 275/531] Add a default delegate time scale converter Differential Revision: D62160650 Pull Request resolved: https://github.com/pytorch/executorch/pull/5076 --- devtools/inspector/_inspector.py | 27 +++++++++++----- devtools/inspector/_inspector_utils.py | 11 +++++++ devtools/inspector/tests/inspector_test.py | 32 ++++++++++++++++++- .../inspector/tests/inspector_utils_test.py | 17 ++++++++++ 4 files changed, 78 insertions(+), 9 deletions(-) diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py index f98e3cd3a56..82b1ffe1f73 100644 --- a/devtools/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import dataclasses import logging import sys @@ -39,6 +41,7 @@ ) from executorch.devtools.etrecord import ETRecord, parse_etrecord from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, EXCLUDED_COLUMNS_WHEN_PRINTING, @@ -52,7 +55,6 @@ is_inference_output_equal, ProgramOutput, RESERVED_FRAMEWORK_EVENT_NAMES, - TIME_SCALE_DICT, TimeScale, verify_debug_data_equivalence, ) @@ -799,9 +801,7 @@ class GroupedRunInstances: # Construct the EventBlocks event_blocks = [] - scale_factor = ( - TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] - ) + scale_factor = calculate_time_scale_factor(source_time_scale, target_time_scale) for run_signature, grouped_run_instance in run_groups.items(): run_group: OrderedDict[EventSignature, List[InstructionEvent]] = ( grouped_run_instance.events @@ -966,6 +966,9 @@ def __init__( debug_buffer_path: Debug buffer file path that contains the debug data referenced by ETDump for intermediate and program outputs. delegate_metadata_parser: Optional function to parse delegate metadata from an Profiling Event. Expected signature of the function is: (delegate_metadata_list: List[bytes]) -> Union[List[str], Dict[str, Any]] + delegate_time_scale_converter: Optional function to convert the time scale of delegate profiling data. If not given, use the conversion ratio of + target_time_scale/source_time_scale. + enable_module_hierarchy: Enable submodules in the operator graph. Defaults to False. Returns: None @@ -980,6 +983,14 @@ def __init__( self._source_time_scale = source_time_scale self._target_time_scale = target_time_scale + if delegate_time_scale_converter is None: + scale_factor = calculate_time_scale_factor( + source_time_scale, target_time_scale + ) + delegate_time_scale_converter = ( + lambda event_name, input_time: input_time / scale_factor + ) + if etrecord is None: self._etrecord = None elif isinstance(etrecord, ETRecord): @@ -1002,10 +1013,10 @@ def __init__( ) self.event_blocks = EventBlock._gen_from_etdump( - etdump, - self._source_time_scale, - self._target_time_scale, - output_buffer, + etdump=etdump, + source_time_scale=self._source_time_scale, + target_time_scale=self._target_time_scale, + output_buffer=output_buffer, delegate_metadata_parser=delegate_metadata_parser, delegate_time_scale_converter=delegate_time_scale_converter, ) diff --git a/devtools/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py index 98b5fdc722f..5f04e2d0413 100644 --- a/devtools/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe
+
 import math
 from enum import Enum
 from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union
@@ -63,6 +65,15 @@ class TimeScale(Enum):
 }
+def calculate_time_scale_factor(
+    source_time_scale: TimeScale, target_time_scale: TimeScale
+) -> float:
+    """
+    Calculate the factor (source divided by target) between two time scales
+    """
+    return TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale]
+
+
 # Model Debug Output
 InferenceOutput: TypeAlias = Union[
     torch.Tensor, List[torch.Tensor], int, float, str, bool, None
diff --git a/devtools/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py
index 55f0cd10ae9..e801557cabd 100644
--- a/devtools/inspector/tests/inspector_test.py
+++ b/devtools/inspector/tests/inspector_test.py
@@ -4,13 +4,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+
 import random
 import statistics
 import tempfile
 import unittest
 from contextlib import redirect_stdout
-from typing import List
+from typing import Callable, List
 from unittest.mock import patch
@@ -32,6 +34,7 @@
     InstructionEvent,
     InstructionEventSignature,
     ProfileEventSignature,
+    TimeScale,
 )
 from executorch.exir import ExportedProgram
@@ -88,6 +91,33 @@ def test_inspector_constructor(self):
         # Because we mocked parse_etrecord() to return None, this method shouldn't be called
         mock_gen_graphs_from_etrecord.assert_not_called()
+    def test_default_delegate_time_scale_converter(self):
+        # Create a context manager to patch functions called by Inspector.__init__
+        with patch.object(
+            _inspector, "parse_etrecord", return_value=None
+        ), patch.object(
+            _inspector, "gen_etdump_object", return_value=None
+        ), patch.object(
+            EventBlock, "_gen_from_etdump"
+        ) as mock_gen_from_etdump, patch.object(
+            _inspector, "gen_graphs_from_etrecord"
+        ), patch.object(
+            _inspector, "create_debug_handle_to_op_node_mapping"
+        ):
+            # Call the constructor of Inspector
+            Inspector(
+                etdump_path=ETDUMP_PATH,
+                etrecord=ETRECORD_PATH,
+                source_time_scale=TimeScale.US,
+                target_time_scale=TimeScale.S,
+            )
+
+            # Verify delegate_time_scale_converter is set to be a callable
+            self.assertIsInstance(
+                mock_gen_from_etdump.call_args.get("delegate_time_scale_converter"),
+                Callable,
+            )
+
     def test_inspector_print_data_tabular(self):
         # Create a context manager to patch functions called by Inspector.__init__
         with patch.object(
diff --git a/devtools/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py
index d853732fcc7..27e2cb0647f 100644
--- a/devtools/inspector/tests/inspector_utils_test.py
+++ b/devtools/inspector/tests/inspector_utils_test.py
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+
 import tempfile
 import unittest
 from typing import Dict, Tuple
@@ -23,11 +25,13 @@
 from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord
 from executorch.devtools.inspector._inspector_utils import (
+    calculate_time_scale_factor,
     create_debug_handle_to_op_node_mapping,
     EDGE_DIALECT_GRAPH_KEY,
     find_populated_event,
     gen_graphs_from_etrecord,
     is_inference_output_equal,
+    TimeScale,
 )
@@ -170,6 +174,19 @@ def test_is_inference_output_equal_returns_true_for_same_strs(self):
             )
         )
+    def test_calculate_time_scale_factor_second_based(self):
+        self.assertEqual(
+            calculate_time_scale_factor(TimeScale.NS, TimeScale.MS), 1000000
+        )
+        self.assertEqual(
+            calculate_time_scale_factor(TimeScale.MS, TimeScale.NS), 1 / 1000000
+        )
+
+    def test_calculate_time_scale_factor_cycles(self):
+        self.assertEqual(
+            calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES), 1
+        )
+
 def gen_mock_operator_graph_with_expected_map() -> (
     Tuple[OperatorGraph, Dict[int, OperatorNode]]
From f4126309fef0d104f192556130776f0b7f2a6601 Mon Sep 17 00:00:00 2001
From: shewu-quic <138087975+shewu-quic@users.noreply.github.com>
Date: Tue, 10 Sep 2024 10:18:19 +0800
Subject: [PATCH 276/531] Qualcomm AI Engine Direct - Uplevel QNN version for ci test (#5174)

* Qualcomm AI Engine Direct - Uplevel QNN version for ci test

Summary:
- For rms norm, we need to uplevel to QNN sdk 2.25 or above
- There is no libc++.so in ${QNN_SDK_ROOT}/lib/x86_64-linux-clang for QNN sdk 2.23 or above. So, we need to install it manually.

* enable llama test for qnn
---
 .ci/scripts/build-qnn-sdk.sh  |  2 +-
 .ci/scripts/setup-qnn-deps.sh | 26 ++++++++++++++++++++++++--
 .ci/scripts/test_llama.sh     |  2 +-
 .github/workflows/pull.yml    | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh
index c48ac2056aa..2492b1fd3d6 100644
--- a/.ci/scripts/build-qnn-sdk.sh
+++ b/.ci/scripts/build-qnn-sdk.sh
@@ -11,7 +11,7 @@ set -o xtrace
 build_qnn_backend() {
   echo "Start building qnn backend."
   export ANDROID_NDK_ROOT=/opt/ndk
-  export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531
+  export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
   bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release
diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh
index 3b39e1aafe3..92ffd07bccc 100644
--- a/.ci/scripts/setup-qnn-deps.sh
+++ b/.ci/scripts/setup-qnn-deps.sh
@@ -7,14 +7,18 @@
 set -ex
+verify_pkg_installed() {
+  echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed")
+}
+
 install_qnn() {
   echo "Start installing qnn."
   QNN_INSTALLATION_DIR=/tmp/qnn
   mkdir -p "${QNN_INSTALLATION_DIR}"
-  curl -Lo /tmp/v2.23.0.24.06.24.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip"
+  curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip"
   echo "Finishing downloading qnn sdk."
-  unzip -qo /tmp/v2.23.0.24.06.24.zip -d /tmp
+  unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp
   echo "Finishing unzip qnn sdk."
@@ -26,4 +30,22 @@ install_qnn() { ls -lah "${QNN_INSTALLATION_DIR}" } +setup_libc++() { + sudo apt-get update + pkgs_to_check=('libc++-dev') + j=0 + while [ $j -lt ${#pkgs_to_check[*]} ]; do + install_status=$(verify_pkg_installed ${pkgs_to_check[$j]}) + if [ "$install_status" == "" ]; then + sudo apt-get install -y ${pkgs_to_check[$j]} + if [[ $? -ne 0 ]]; then + echo "ERROR: Failed to install required packages for libc++" + exit 1 + fi + fi + j=$(( $j +1)); + done +} + +setup_libc++ install_qnn diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 290ece7b8e6..5721b7fd607 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -75,7 +75,7 @@ echo "COREML option ${COREML}" if [[ "${MODE}" =~ .*qnn.* ]]; then QNN=ON export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" export PYTHONPATH=".." cp schema/program.fbs exir/_serialize/program.fbs diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 259ebb19863..ca13d9bbd22 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -372,3 +372,38 @@ jobs: # Run pytest with coverage pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test + + + test-llama-runner-qnn-linux: + name: test-llama-runner-qnn-linux + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + matrix: + dtype: [fp32] + build-tool: [cmake] + mode: [qnn] + fail-fast: false + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12-android + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + DTYPE=${{ matrix.dtype }} + BUILD_TOOL=${{ matrix.build-tool }} + MODE=${{ matrix.mode }} + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + # Test llama2 + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" From c5a385e3b98a33989898d103c57bd65c00855b63 Mon Sep 17 00:00:00 2001 From: lucylq Date: Mon, 9 Sep 2024 19:39:16 -0700 Subject: [PATCH 277/531] Update schema to include infinity for double values Differential Revision: D62393242 Pull Request resolved: https://github.com/pytorch/executorch/pull/5203 --- exir/_serialize/_dataclass.py | 7 +++++++ exir/_serialize/_flatbuffer.py | 29 ++++++++++++++++------------- exir/emit/test/test_emit.py | 32 ++++++++++++++++++++++++++++++++ exir/schema.py | 18 +++++++++++++++++- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/exir/_serialize/_dataclass.py b/exir/_serialize/_dataclass.py index 8f6ef1c172b..013d733bcda 100644 --- a/exir/_serialize/_dataclass.py +++ b/exir/_serialize/_dataclass.py @@ -129,6 +129,13 @@ class Example data[key] = [_json_to_dataclass(e, T) for e in value] continue + # If T is a Union, then check which type in the Union it is and initialize. + # eg. 
Double type in schema.py + if get_origin(T) is Union: + res = [x for x in get_args(get_type_hints(cls)[key]) if x == type(value)] + data[key] = res[0](value) + continue + # If T is an enum then lookup the value in the enum otherwise try to # cast value to whatever type is required if isinstance(T, enum.EnumMeta): diff --git a/exir/_serialize/_flatbuffer.py b/exir/_serialize/_flatbuffer.py index 93006612c73..4599249f00c 100644 --- a/exir/_serialize/_flatbuffer.py +++ b/exir/_serialize/_flatbuffer.py @@ -29,14 +29,6 @@ def _is_valid_alignment(alignment: int) -> bool: return alignment > 0 and (alignment & (alignment - 1)) == 0 -# TODO(T182299196): Replace this hack with a proper flatc binary. -def _replace_infinity_in_json_file(content: str) -> str: - content = re.sub( - r'"double_val"\s*:\s*(-)?Infinity', r'"double_val": "\g<1>inf"', content - ) - return content - - def _patch_schema_alignment( schema: bytes, constant_tensor_alignment: Optional[int], @@ -291,11 +283,8 @@ def _program_json_to_flatbuffer( json_path = os.path.join(temp_dir, file_stem + ".json") output_path = os.path.join(temp_dir, file_stem + ".pte") - # TODO(T182299196): Replace this hack with a proper flatc binary. - replaced_program_json = _replace_infinity_in_json_file(program_json) - with open(json_path, "wb") as json_file: - json_file.write(replaced_program_json.encode("ascii")) + json_file.write(program_json.encode("ascii")) try: _flatc_compile(temp_dir, schema_info.root_path, json_path) @@ -330,6 +319,19 @@ def _program_json_to_flatbuffer( ) +def _replace_infinity_in_json_file(content: bytes) -> bytes: + """Replace -inf and inf with "inf" and "-inf" in the JSON file. program.fbs + is used to convert from flatbuffer to JSON. +-inf float values are not + supported by JSON, so we replace them with the string equivalent. When + converting from JSON to python dataclasses, the string is read as a Union + of float and string (see schema.py). + """ + content = re.sub( + rb'"double_val"\s*:\s*(-)?inf', rb'"double_val": "\g<1>inf"', content + ) + return content + + def _program_flatbuffer_to_json(program_flatbuffer: bytes) -> bytes: """Converts binary flatbuffer data into Program-compatible JSON. 
@@ -348,4 +350,5 @@ def _program_flatbuffer_to_json(program_flatbuffer: bytes) -> bytes: _flatc_decompile(temp_dir, schema_info.root_path, bin_path) with open(json_path, "rb") as output_file: - return output_file.read() + json_data = output_file.read() + return _replace_infinity_in_json_file(json_data) diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index f1b980a9aea..123896ecdba 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -23,6 +23,7 @@ ExecutorchProgramManager, to_edge, ) +from executorch.exir._serialize._program import deserialize_pte_binary from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult from executorch.exir.dialects._ops import ops as exir_ops @@ -35,6 +36,7 @@ from executorch.exir.schema import ( Bool, DelegateCall, + Double, EValue, ExecutionPlan, Int, @@ -1620,3 +1622,33 @@ def forward(self, x): executorch_module = _load_for_executorch_from_buffer(model.buffer) self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1)) self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1) + 1) + + def test_infinity_in_model(self) -> None: + class InfinityMaskModel(nn.Module): + def __init__(self): + super().__init__() + self.mask = torch.tensor([[1, 0], [0, 1]], dtype=torch.float32) + + def forward(self, x): + masked_weights = x.masked_fill(self.mask == 0, float("-inf")) + return masked_weights + + model = to_edge( + export( + InfinityMaskModel(), + (torch.randn(2, 2),), + ) + ) + + # Confirm that we can serialize the model with infinity in it. + model = model.to_executorch() + + # Assert that the infinity is stored as a string "-inf". + values = model.executorch_program.execution_plan[0].values + self.assertEqual(values[5].val, Double(double_val=float("-inf"))) + + # Confirm that we can also deserialize the model with infinity in it. + pte_data = deserialize_pte_binary(model.buffer) + self.assertEqual( + pte_data.execution_plan, model.executorch_program.execution_plan + ) diff --git a/exir/schema.py b/exir/schema.py index 706bc611403..9436465459a 100644 --- a/exir/schema.py +++ b/exir/schema.py @@ -75,7 +75,23 @@ class Bool: @dataclass class Double: - double_val: float + double_val: Union[float, str] + + def __init__(self, double_val: float) -> None: + if double_val == float("inf"): + self.double_val = "inf" + elif double_val == float("-inf"): + self.double_val = "-inf" + else: + self.double_val = double_val + + def __post_init__(self) -> None: + if isinstance(self.double_val, str): + assert self.double_val in ["inf", "-inf"] + else: + assert isinstance(self.double_val, float) + assert not self.double_val == float("inf") + assert not self.double_val == float("-inf") @dataclass From f471556c05a26de383435cbf9f9896bb24f8ca0d Mon Sep 17 00:00:00 2001 From: Yifan Shen Date: Mon, 9 Sep 2024 19:50:46 -0700 Subject: [PATCH 278/531] Partition Mutable Buffer as Core ML State (#5165) * partition mutable buffer to coreml state * delegate llama mutable buffer to coreml * fix lint * support embedding quantize * try fix CI: 1. pin coremltools 8.0b2; 2. refrain from defaulting stateful llama until CI machine upgraded to MacOS 15 * address review comments: 1. add arg help info; 2. 
add mutable buffer partition log * fix CI: executorch example model test env is using older transformers, that does not support numpy 2.0 --------- Co-authored-by: yifan_shen3 --- .../coreml/partition/coreml_partitioner.py | 13 ++++- .../coreml/scripts/install_requirements.sh | 7 ++- .../coreml/test/test_coreml_partitioner.py | 49 +++++++++++++++++++ examples/models/llama2/export_llama_lib.py | 9 +++- exir/backend/utils.py | 34 +++++++++++++ extension/llm/export/partitioner_lib.py | 30 ++++++------ 6 files changed, 124 insertions(+), 18 deletions(-) diff --git a/backends/apple/coreml/partition/coreml_partitioner.py b/backends/apple/coreml/partition/coreml_partitioner.py index ecf6d44b19c..c0b6663f729 100644 --- a/backends/apple/coreml/partition/coreml_partitioner.py +++ b/backends/apple/coreml/partition/coreml_partitioner.py @@ -17,7 +17,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase @@ -61,6 +61,7 @@ def __init__( self, skip_ops_for_coreml_delegation: Optional[List[str]] = None, compile_specs: Optional[List[CompileSpec]] = None, + take_over_mutable_buffer: Optional[bool] = True, ) -> None: if skip_ops_for_coreml_delegation is None: skip_ops_for_coreml_delegation = [] @@ -69,6 +70,7 @@ def __init__( backend_id=CoreMLBackend.__name__, compile_specs=compile_specs if compile_specs is not None else [], ) + self.take_over_mutable_buffer = take_over_mutable_buffer def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible @@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags[tag] = self.delegation_spec tag_constant_data(exported_program) + if self.take_over_mutable_buffer: + logger.info( + "Core ML partitioner will take over torch mutable buffer as Core ML state, " + "so if your model contains mutable buffer, " + "then you will need MacOS15+/iOS18+ to execute. " + "If you want your mutable buffer model to be compatible with older OS, " + "then please set `take_over_mutable_buffer=False`" + ) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0018b5ffc2d..b6c9a073e08 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 8.0b2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? @@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel echo "${green}ExecuTorch: Installing coremltools." 
pip install "$COREMLTOOLS_DIR_PATH" +# CoreMLTools have started supporting numpy 2.0, +# but ExecuTorch example model test env is still using older transformers, +# so for now we will need to downgrade numpy to 1.x +# TODO: Remove this numpy downgrade once later transformers starts to be used +pip install numpy==1.26.4 STATUS=$? if [ $STATUS -ne 0 ]; then echo "${red}ExecuTorch: Failed to install coremltools." diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py index 34cf531b261..72a7fbf0932 100644 --- a/backends/apple/coreml/test/test_coreml_partitioner.py +++ b/backends/apple/coreml/test/test_coreml_partitioner.py @@ -4,11 +4,14 @@ import unittest +import coremltools as ct + import executorch.exir import torch import torchvision +from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner @@ -86,8 +89,54 @@ def test_vit_skip_conv(self): if node.op == "call_function" ] == total + def test_buffer(self): + embedding_dim = 3 + max_seq_len = 2 + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer( + "cache", + torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32), + ) + + def forward(self, q, k_val, input_pos): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + model = Model() + model.eval() + + q = torch.randn((1, embedding_dim)) + k_val = torch.randn((1, embedding_dim)) + input_pos = torch.tensor([0]) + example_inputs = (q, k_val, input_pos) + exir_program_aten = torch.export.export(model, example_inputs) + + compile_specs = CoreMLBackend.generate_compile_specs( + minimum_deployment_target=ct.target.iOS18 + ) + partitioner = CoreMLPartitioner(compile_specs=compile_specs) + edge_program_manager = executorch.exir.to_edge( + exir_program_aten, compile_config=self.edge_compile_config + ) + delegated_program_manager = edge_program_manager.to_backend(partitioner) + + assert [ + node.target.__name__ + for node in delegated_program_manager.exported_program().graph.nodes + if node.op == "call_function" + ] == [ + "executorch_call_delegate", + "getitem", + ] + if __name__ == "__main__": test_runner = TestCoreMLPartitioner() test_runner.test_add_sub_skip_mm() test_runner.test_vit_skip_conv() + test_runner.test_buffer() diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 968117eef20..60ebb979164 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -288,6 +288,11 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument("-V", "--vulkan", action="store_true") parser.add_argument("--mps", action="store_true") parser.add_argument("--coreml", action="store_true") + parser.add_argument( + "--coreml-enable-state", + action="store_true", + help="This option is only for coreml, and is only supported for MacOS15+/iOS18+", + ) parser.add_argument( "--qnn", action="store_true", @@ -523,7 +528,9 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 if args.coreml: coreml_partitioner = get_coreml_partitioner( - args.use_kv_cache, args.pt2e_quantize + args.use_kv_cache and args.coreml_enable_state, + args.embedding_quantize, + args.pt2e_quantize, ) partitioners.append(coreml_partitioner) modelname = f"coreml_{modelname}" diff --git a/exir/backend/utils.py b/exir/backend/utils.py index 
2b768fe7c23..fb5e16c6bd0 100644 --- a/exir/backend/utils.py +++ b/exir/backend/utils.py @@ -383,6 +383,40 @@ def tag_constant_data(edge_program: ExportedProgram) -> None: node.meta["delegation_tag"] = user_tags.pop() +def tag_mutated_buffer(edge_program: ExportedProgram) -> None: + """ + Util function for partitioners. This function tags the mutated buffer nodes + whose users all belong within the same partition. This should be called after tagging all other nodes. + Any buffer which is used as input to a subgraph, will be tagged with the same tag as that + subgraph. Throw error when buffers is used across different partitions. That is the + underlying data will be owned by multiple delegates. + """ + for node in edge_program.graph.nodes: + # Determine whether this node is a mutated buffer + is_mutated_buffer_node = False + if node.op == "placeholder" and is_buffer(edge_program, node): + for node_user in node.users: + if node_user.name in edge_program.graph_signature.buffers_to_mutate: + is_mutated_buffer_node = True + break + # This node is mutated buffer, tag it + if is_mutated_buffer_node: + user_tags = set() + for user in node.users: + user_tag = user.meta.get("delegation_tag", None) + if user_tag is not None: + user_tags.add(user_tag) + if len(user_tags) > 1: + logging.info( + f"The data node is used across multiple partitions, including {user_tags}. " + "If the data is too large and it's not preferred to copy, please tag the " + "constant node like node.['no_copy'] = True and they won't be copied." + ) + # tag the data node with the same tag as the last user + if len(user_tags) > 0: + node.meta["delegation_tag"] = user_tags.pop() + + # TODO - style: use templated types class DelegateMappingBuilder: """ diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 2f4c87d6fd8..2bd087ac119 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -56,11 +56,10 @@ def get_mps_partitioner(use_kv_cache: bool = False): def get_coreml_partitioner( - use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None + enable_state: bool = False, + embedding_quantize: Optional[str] = None, + pt2e_quantize: Optional[str] = None, ): - assert ( - use_kv_cache is True - ), "CoreML backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" try: import coremltools as ct from executorch.backends.apple.coreml.compiler import ( # pyre-ignore @@ -75,22 +74,22 @@ def get_coreml_partitioner( ) minimum_deployment_target = ct.target.iOS15 - # In Core ML, quantization in introduced in iOS 16 - if pt2e_quantize is not None: + # In Core ML, stateful execution is introduced in iOS 18 + if enable_state: + minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) + # In Core ML, quantization is introduced in iOS 16 + if embedding_quantize is not None or pt2e_quantize is not None: minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS16) # In Core ML, 8-bit activation quantization is introduced in iOS 17 - if pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"): + if ( + embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 8 + ) or pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"): minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17) # In Core ML, 4-bit weight compression is introduced in iOS 18 - if pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"): + if ( + 
embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4 + ) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"): minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) - # In Core ML, stateful execution is introduced in iOS 18 - # TODO (https://github.com/pytorch/executorch/issues/4209) - # For now, since mutable buffer is kept in executorch runtime, - # state is out of place and can be handled by older iOS. - # Once mutable buffer can be handed over to delegate, i.e. state becomes in-place, we will have - # if use_kv_cache: - # minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] minimum_deployment_target=minimum_deployment_target, @@ -101,6 +100,7 @@ def get_coreml_partitioner( ) return CoreMLPartitioner( # pyre-fixme[16] compile_specs=compile_specs, + take_over_mutable_buffer=enable_state, ) From 67ae762f6db62bb7b43692155879b8af65d7f355 Mon Sep 17 00:00:00 2001 From: shewu-quic <138087975+shewu-quic@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:27:36 +0800 Subject: [PATCH 279/531] Qualcomm AI Engine Direct - Add the argument to specify soc model (#5211) * Qualcomm AI Engine Direct - Add the argument to specify soc model * address review --- examples/models/llama2/export_llama_lib.py | 10 +++++++++- examples/qualcomm/utils.py | 9 +-------- extension/llm/export/partitioner_lib.py | 10 +++++++++- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 60ebb979164..4e42e047dab 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -321,6 +321,14 @@ def build_args_parser() -> argparse.ArgumentParser: default=False, help="Generate logits for all inputs.", ) + + parser.add_argument( + "--soc_model", + help="[QNN backend] SoC model of current device. e.g. 
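As an aside (not part of this patch), the reworked helper might be called as below; the import path and the "4,32" embedding-quantize value are illustrative assumptions based on how export_llama_lib wires these arguments.

    from executorch.extension.llm.export.partitioner_lib import get_coreml_partitioner

    partitioner = get_coreml_partitioner(
        enable_state=True,          # --coreml-enable-state; raises the target to iOS18
        embedding_quantize="4,32",  # 4-bit embeddings also require iOS18
        pt2e_quantize=None,
    )
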
'SM8650' for Snapdragon 8 Gen 3.", + type=str, + required=False, + default="SM8650", + ) return parser @@ -540,7 +548,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 partitioners.append( get_qnn_partitioner( - args.use_kv_cache, args.pt2e_quantize, args.num_sharding + args.use_kv_cache, args.pt2e_quantize, args.num_sharding, args.soc_model ) ) # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 1a748bb45e1..5ecd7dd4591 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -230,19 +230,12 @@ def build_executorch_binary( else: edge_prog = capture_program(model, inputs) - arch_table = { - "SM8650": QcomChipset.SM8650, - "SM8550": QcomChipset.SM8550, - "SM8475": QcomChipset.SM8475, - "SM8450": QcomChipset.SM8450, - } - backend_options = generate_htp_compiler_spec( use_fp16=False if quant_dtype else True ) qnn_partitioner = QnnPartitioner( generate_qnn_executorch_compiler_spec( - soc_model=arch_table[soc_model], + soc_model=getattr(QcomChipset, soc_model), backend_options=backend_options, debug=False, saver=False, diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 2bd087ac119..29c7b3731fb 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -108,6 +108,7 @@ def get_qnn_partitioner( use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None, num_sharding: int = 0, + soc_model: str = "SM8650", # default to SM8650 ): assert ( use_kv_cache is True @@ -138,9 +139,16 @@ def get_qnn_partitioner( if pt2e_quantize is not None: use_fp16 = False + soc_chip_table = { + "SM8650": QcomChipset.SM8650, + "SM8550": QcomChipset.SM8550, + "SM8475": QcomChipset.SM8475, + "SM8450": QcomChipset.SM8450, + } + return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=QcomChipset.SM8650, # default to SM8650 # pyre-fixme[16] + soc_model=soc_chip_table[soc_model], # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, From 63e794aa627b04288437c704aecebf8ff6c41227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Per=20=C3=85strand?= Date: Tue, 10 Sep 2024 05:29:19 +0200 Subject: [PATCH 280/531] Add pass to convert special case of mean.dim to averagepool2d Differential Revision: D62034655 Pull Request resolved: https://github.com/pytorch/executorch/pull/4900 --- backends/arm/operators/op_mean_dim.py | 26 ------- backends/arm/passes/arm_pass_manager.py | 4 + .../arm/passes/meandim_to_averagepool_pass.py | 52 +++++++++++++ backends/arm/test/ops/test_mean_dim.py | 7 +- .../passes/test_meandim_to_averagepool2d.py | 75 +++++++++++++++++++ 5 files changed, 137 insertions(+), 27 deletions(-) create mode 100644 backends/arm/passes/meandim_to_averagepool_pass.py create mode 100644 backends/arm/test/passes/test_meandim_to_averagepool2d.py diff --git a/backends/arm/operators/op_mean_dim.py b/backends/arm/operators/op_mean_dim.py index 20e1b2b8d76..339aa62719f 100644 --- a/backends/arm/operators/op_mean_dim.py +++ b/backends/arm/operators/op_mean_dim.py @@ -11,7 +11,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common @register_node_visitor @@ -30,29 +29,4 @@ def define_node( is_quant_node: bool, ) -> None: - input_tensor = inputs[0] - dim = 
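Similarly (not part of this patch), a hedged sketch of selecting a different SoC through the new argument; the pt2e_quantize value is only an example.

    from executorch.extension.llm.export.partitioner_lib import get_qnn_partitioner

    partitioner = get_qnn_partitioner(
        use_kv_cache=True,         # the helper asserts this is True
        pt2e_quantize="qnn_16a4w",
        num_sharding=0,
        soc_model="SM8550",        # Snapdragon 8 Gen 2; the default remains "SM8650"
    )
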
node.args[1] - keep_dim = node.args[2] - - # mean.dim(-1, -2) is the same as avg_pool2d when just computing mean over HW dimensions. - # Since tosa doesn't have mean.dim operation, lowers it to average pooling instead. - if dim == [-1, -2]: - if keep_dim is True: - # Given the shape format of input is (N, C, H, W) - kernel_size = [input_tensor.shape[2], input_tensor.shape[3]] - stride = [1, 1] - padding = [0, 0, 0, 0] - - build_avg_pool_2d_common( - node, - tosa_graph, - input_tensor, - kernel_size, - stride, - padding, - is_quant_node, - output, - ) - return - raise AssertionError("unsupported") diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 914bf57aabc..db8511df613 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -15,6 +15,9 @@ from executorch.backends.arm.passes.convert_split_to_slice import ( ConvertSplitToSlicePass, ) +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -33,6 +36,7 @@ def transform_to_backend_pipeline( self.add_pass(SizeAdjustConv2DPass()) self.add_pass(RemoveClonePass()) self.add_pass(ConvertExpandCopyToRepeatPass()) + self.add_pass(ConvertMeanDimToAveragePool()) self.add_pass(ConvertSplitToSlicePass()) for spec in compile_spec: if spec.key == "permute_memory_format": diff --git a/backends/arm/passes/meandim_to_averagepool_pass.py b/backends/arm/passes/meandim_to_averagepool_pass.py new file mode 100644 index 00000000000..3f57e8023ca --- /dev/null +++ b/backends/arm/passes/meandim_to_averagepool_pass.py @@ -0,0 +1,52 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, cast, Dict, Tuple + +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue + +Argument = Any + + +class ConvertMeanDimToAveragePool(ExportPass): + """ + Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation. 
+ """ + + def call_operator( + self, + op: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != exir_ops.edge.aten.mean.dim: + return super().call_operator(op, args, kwargs, meta) + + input_value = cast(ProxyValue, args[0]) + dim = cast(list, args[1]) + keep_dim = cast(bool, args[2]) if len(args) > 2 else False + + # averagepool2d gets converted to a mean operation with dim = [-1, -2] and keep_dim = True + # so check the dim argument for this case + if dim == [-1, -2] and keep_dim is True: + # Given the shape format of input is (N, C, H, W) + kernel_size = [ + input_value.to_tensor().size()[2], + input_value.to_tensor().size()[3], + ] + stride = [1, 1] + return super().call_operator( + exir_ops.edge.aten.avg_pool2d.default, + (input_value, kernel_size, stride), + {}, + meta, + ) + else: + return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e0db958f743..e48d749c194 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -106,7 +106,12 @@ def _test_meandim_tosa_u55_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", + ] + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() ) diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py new file mode 100644 index 00000000000..1cd63e6e52e --- /dev/null +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -0,0 +1,75 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class MeanDim(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=[-1, -2], keepdim=True) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class MeanDim2(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=1) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class TestMeandimToAveragePool2dPass(unittest.TestCase): + """ + Tests the MeanDimToAveragePool2dPass which converts mean.dim to average_pool2d + for the special case where dim is [-1, -2] and keepdim is True. 
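To see why the rewrite is safe (illustration only, not part of this patch): for an (N, C, H, W) input, the mean over the last two dims with keepdim=True is numerically identical to a full-window average pool, which is exactly the substitution the pass performs.

    import torch

    x = torch.rand(1, 1280, 7, 7)                       # (N, C, H, W)
    mean_out = torch.mean(x, dim=[-1, -2], keepdim=True)
    pool_out = torch.nn.functional.avg_pool2d(x, kernel_size=(7, 7), stride=1)
    assert torch.allclose(mean_out, pool_out)           # both are (1, 1280, 1, 1)
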
+ """ + + def test_tosa_BI_meandim_to_averagepool(self): + module = MeanDim() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) + + def test_tosa_BI_meandim_no_modification(self): + module = MeanDim2() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) From 370f30416dc8e5ba478b82ea97ee38e20a85528d Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Mon, 9 Sep 2024 23:41:21 -0400 Subject: [PATCH 281/531] Add slice_scatter test: large end value Differential Revision: D62309150 Pull Request resolved: https://github.com/pytorch/executorch/pull/5138 --- kernels/test/op_slice_scatter_test.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernels/test/op_slice_scatter_test.cpp b/kernels/test/op_slice_scatter_test.cpp index 1d5c8a43b10..1d5e972ef2e 100644 --- a/kernels/test/op_slice_scatter_test.cpp +++ b/kernels/test/op_slice_scatter_test.cpp @@ -863,3 +863,24 @@ TEST_F(OpSliceScatterTensorOutTest, DynamicShapeTest) { EXPECT_TENSOR_EQ(ret_default_end, out); EXPECT_TENSOR_EQ(ret_default_end, expected); } + +TEST_F(OpSliceScatterTensorOutTest, LargeEndValue) { + TensorFactory tf; + + Tensor input = tf.zeros({1, 1, 2, 5, 3, 3}); + Tensor src = tf.ones({1, 1, 2, 5, 3, 3}); + + Tensor out = tf.zeros({1, 1, 2, 5, 3, 3}); + Tensor expected = tf.ones({1, 1, 2, 5, 3, 3}); + + Tensor ret = op_slice_scatter_out( + input, + src, + /*dim=*/1, + /*start=*/0, + /*end=*/9223372036854775807, + /*step=*/1, + out); + EXPECT_TENSOR_EQ(ret, out); + EXPECT_TENSOR_EQ(ret, expected); +} From 083b9e65b522e445c081ba27cec674ab3045db53 Mon Sep 17 00:00:00 2001 From: Jorge Pineda <32918197+jorgep31415@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:50:26 -0700 Subject: [PATCH 282/531] [ET-VK] Fix gpuinfo CI Differential Revision: D62403691 Pull Request resolved: https://github.com/pytorch/executorch/pull/5202 --- backends/vulkan/tools/gpuinfo/include/architecture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 20c6254e1a0..9af908eb170 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -242,7 +242,7 @@ void warp_size(const App& app, const bool verbose = false) { }); std::vector data(app.nthread_logic); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + out_buf.copy_to(data.data(), out_buf.nbytes()); if (verbose) { std::stringstream ss; From 1eeded16a45eb58ca9703f783e9aa91376f665a9 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 9 Sep 2024 21:02:15 -0700 Subject: [PATCH 283/531] Let the app check "aatp/data" subdir for AWS. 
Differential Revision: D62409615 Pull Request resolved: https://github.com/pytorch/executorch/pull/5208 --- .../Benchmark.xcodeproj/project.pbxproj | 64 +++---- extension/apple/Benchmark/Tests/Tests.mm | 173 ++++++++++-------- 2 files changed, 130 insertions(+), 107 deletions(-) diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index 4dcffaffbf6..1bc3188fe17 100644 --- a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -10,14 +10,14 @@ 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; 03B2D37A2C8A515C0046936E /* Tests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* Tests.mm */; }; 03C7FA382C8AA3EC00E6E9AE /* Models in Resources */ = {isa = PBXBuildFile; fileRef = 03C7FA322C8AA24200E6E9AE /* Models */; }; - 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */; }; - 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */; }; - 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */; }; - 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */; }; - 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */; }; - 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */; }; - 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */; }; - 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */; }; + 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */; }; + 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */; }; + 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */; }; + 03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */; }; + 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */; }; + 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */; }; + 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A32C8FE44600FE4619 /* executorch.xcframework */; settings = {ATTRIBUTES 
= (Required, ); }; }; + 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */; }; 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */; }; 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */; }; 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */; }; @@ -45,14 +45,14 @@ 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 03B2D3792C8A515C0046936E /* Tests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = Tests.mm; sourceTree = ""; }; 03C7FA322C8AA24200E6E9AE /* Models */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Models; sourceTree = SOURCE_ROOT; }; - 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; - 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; - 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; - 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; - 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; - 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; - 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; - 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; + 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; + 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; + 03DD009F2C8FE44600FE4619 /* 
kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; + 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; + 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; + 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; + 03DD00A32C8FE44600FE4619 /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; + 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.0.tbd; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/usr/lib/libsqlite3.0.tbd; sourceTree = DEVELOPER_DIR; }; 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = DEVELOPER_DIR; }; 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = DEVELOPER_DIR; }; @@ -79,14 +79,14 @@ 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */, 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */, 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */, - 03ED6CFF2C8AAFB300F2D6EE /* backend_coreml.xcframework in Frameworks */, - 03ED6D032C8AAFB300F2D6EE /* backend_xnnpack.xcframework in Frameworks */, - 03ED6D092C8AAFB300F2D6EE /* kernels_optimized.xcframework in Frameworks */, - 03ED6D012C8AAFB300F2D6EE /* backend_mps.xcframework in Frameworks */, - 03ED6D0D2C8AAFB300F2D6EE /* kernels_quantized.xcframework in Frameworks */, - 03ED6D0B2C8AAFB300F2D6EE /* kernels_portable.xcframework in Frameworks */, - 03ED6D052C8AAFB300F2D6EE /* executorch.xcframework in Frameworks */, - 03ED6D072C8AAFB300F2D6EE /* kernels_custom.xcframework in Frameworks */, + 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */, + 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */, + 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */, + 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */, + 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */, + 
03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */, + 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */, + 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -141,14 +141,14 @@ 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */, 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */, 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */, - 03ED6CFE2C8AAFB300F2D6EE /* backend_coreml.xcframework */, - 03ED6D002C8AAFB300F2D6EE /* backend_mps.xcframework */, - 03ED6D022C8AAFB300F2D6EE /* backend_xnnpack.xcframework */, - 03ED6D042C8AAFB300F2D6EE /* executorch.xcframework */, - 03ED6D062C8AAFB300F2D6EE /* kernels_custom.xcframework */, - 03ED6D082C8AAFB300F2D6EE /* kernels_optimized.xcframework */, - 03ED6D0A2C8AAFB300F2D6EE /* kernels_portable.xcframework */, - 03ED6D0C2C8AAFB300F2D6EE /* kernels_quantized.xcframework */, + 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */, + 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */, + 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */, + 03DD00A32C8FE44600FE4619 /* executorch.xcframework */, + 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */, + 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */, + 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */, + 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */, ); name = Frameworks; sourceTree = SOURCE_ROOT; diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm index 5cf958765d3..dd85cb69542 100644 --- a/extension/apple/Benchmark/Tests/Tests.mm +++ b/extension/apple/Benchmark/Tests/Tests.mm @@ -22,82 +22,105 @@ @interface Tests : XCTestCase @implementation Tests + (void)initialize { - if (self == [Tests class]) { - NSString *modelsDir = [[NSBundle bundleForClass:[self class]].resourcePath - stringByAppendingPathComponent:@"Models"]; - NSArray *models = - [NSFileManager.defaultManager contentsOfDirectoryAtPath:modelsDir - error:nil]; - for (NSString *model in models) { - NSString *modelName = model.stringByDeletingPathExtension; - NSString *modelPath = [modelsDir stringByAppendingPathComponent:model]; - XCTAssertGreaterThan(modelPath.length, 0); - - SEL testLoadSelector = NSSelectorFromString( - [NSString stringWithFormat:@"test_load_%@", modelName]); - IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - [_self - measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->load_method("forward"), - Error::Ok); - }]; - }); - class_addMethod( - [self class], testLoadSelector, testLoadImplementation, "v@:"); - - SEL testForwardSelector = NSSelectorFromString( - [NSString stringWithFormat:@"test_forward_%@", modelName]); - IMP testForwardImplementation = imp_implementationWithBlock(^(id _self) { - auto __block module = std::make_unique(modelPath.UTF8String); - XCTAssertEqual(module->load_method("forward"), Error::Ok); - - const auto method_meta = module->method_meta("forward"); - XCTAssertEqual(method_meta.error(), Error::Ok); - - const auto num_inputs = method_meta->num_inputs(); - XCTAssertGreaterThan(num_inputs, 0); - - std::vector> buffers; - buffers.reserve(num_inputs); - std::vector tensors; - tensors.reserve(num_inputs); - std::vector __block inputs; - inputs.reserve(num_inputs); - - for (auto 
index = 0; index < num_inputs; ++index) { - auto input_tag = method_meta->input_tag(index); - XCTAssertEqual(input_tag.error(), Error::Ok); - - switch (*input_tag) { - case Tag::Tensor: { - const auto tensor_meta = method_meta->input_tensor_meta(index); - XCTAssertEqual(tensor_meta.error(), Error::Ok); - - const auto sizes = tensor_meta->sizes(); - buffers.emplace_back(tensor_meta->nbytes(), - 0b01010101); // Set all bytes to be non-zero. - tensors.emplace_back(from_blob(buffers.rbegin()->data(), - {sizes.begin(), sizes.end()}, - tensor_meta->scalar_type())); - inputs.emplace_back(tensors.back()); - } break; - default: - XCTFail("Unsupported tag %i at input %d", *input_tag, index); - } + if (self != [self class]) { + return; + } + for (NSBundle *bundle in @[ + [NSBundle mainBundle], + [NSBundle bundleForClass:[self class]], + ]) { + for (NSString *directory in @[ + @"Models", + @"aatp/data", + ]) { + NSString *directoryPath = + [bundle.resourcePath stringByAppendingPathComponent:directory]; + NSArray *filePaths = + [NSFileManager.defaultManager contentsOfDirectoryAtPath:directoryPath + error:nil]; + for (NSString *filePath in filePaths) { + if (![filePath hasSuffix:@".pte"]) { + continue; } - [_self - measureWithMetrics:@[ [XCTClockMetric new], [XCTMemoryMetric new] ] - options:XCTMeasureOptions.defaultOptions - block:^{ - XCTAssertEqual(module->forward(inputs).error(), - Error::Ok); - }]; - }); - class_addMethod( - [self class], testForwardSelector, testForwardImplementation, "v@:"); + NSString *modelPath = + [directoryPath stringByAppendingPathComponent:filePath]; + NSString *directoryName = + [directory stringByReplacingOccurrencesOfString:@"/" + withString:@"_"] + .lowercaseString; + NSString *modelName = + modelPath.lastPathComponent.stringByDeletingPathExtension; + + SEL testLoadSelector = NSSelectorFromString([NSString + stringWithFormat:@"test_load_%@_%@", directoryName, modelName]); + IMP testLoadImplementation = imp_implementationWithBlock(^(id _self) { + auto __block module = std::make_unique(modelPath.UTF8String); + [_self measureWithMetrics:@[ + [XCTClockMetric new], + [XCTMemoryMetric new], + ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->load_method("forward"), + Error::Ok); + }]; + }); + class_addMethod( + [self class], testLoadSelector, testLoadImplementation, "v@:"); + + SEL testForwardSelector = NSSelectorFromString([NSString + stringWithFormat:@"test_forward_%@_%@", directoryName, modelName]); + IMP testForwardImplementation = imp_implementationWithBlock(^( + id _self) { + auto __block module = std::make_unique(modelPath.UTF8String); + XCTAssertEqual(module->load_method("forward"), Error::Ok); + + const auto method_meta = module->method_meta("forward"); + XCTAssertEqual(method_meta.error(), Error::Ok); + + const auto num_inputs = method_meta->num_inputs(); + XCTAssertGreaterThan(num_inputs, 0); + + std::vector __block tensors; + tensors.reserve(num_inputs); + std::vector __block inputs; + inputs.reserve(num_inputs); + + for (auto index = 0; index < num_inputs; ++index) { + const auto input_tag = method_meta->input_tag(index); + XCTAssertEqual(input_tag.error(), Error::Ok); + + switch (*input_tag) { + case Tag::Tensor: { + const auto tensor_meta = method_meta->input_tensor_meta(index); + XCTAssertEqual(tensor_meta.error(), Error::Ok); + + const auto sizes = tensor_meta->sizes(); + tensors.emplace_back(make_tensor_ptr( + tensor_meta->scalar_type(), + {sizes.begin(), sizes.end()}, + std::vector(tensor_meta->nbytes(), 0b01010101))); + 
inputs.emplace_back(tensors.back()); + } break; + default: + XCTFail("Unsupported tag %i at input %d", *input_tag, index); + } + } + [_self measureWithMetrics:@[ + [XCTClockMetric new], + [XCTMemoryMetric new], + ] + options:XCTMeasureOptions.defaultOptions + block:^{ + XCTAssertEqual(module->forward(inputs).error(), + Error::Ok); + }]; + }); + class_addMethod([self class], + testForwardSelector, + testForwardImplementation, + "v@:"); + } } } } From 126abb5f63bf08ebce56f141cc16815c43b6024a Mon Sep 17 00:00:00 2001 From: Yi Li <47999440+LeeOHzzZ@users.noreply.github.com> Date: Mon, 9 Sep 2024 21:20:46 -0700 Subject: [PATCH 284/531] Update the API of registering fake kernels to new standard (#5084) Differential Revision: D62206602 Pull Request resolved: https://github.com/pytorch/executorch/pull/5190 --- backends/cadence/aot/ops_registrations.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a5e00573918..e73de6ab7ce 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -10,7 +10,7 @@ from typing import Optional, Tuple import torch -from torch.library import impl, Library +from torch.library import Library, register_fake from .utils import get_conv1d_output_size, get_conv2d_output_size @@ -68,7 +68,7 @@ m = Library("cadence", "IMPL", "Meta") -@impl(m, "quantize_per_tensor") +@register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( input: torch.Tensor, scale: float, @@ -80,7 +80,7 @@ def quantize_per_tensor_meta( return input.new_empty(input.size(), dtype=dtype) -@impl(m, "dequantize_per_tensor") +@register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, scale: float, @@ -92,7 +92,7 @@ def dequantize_per_tensor_meta( return input.new_empty(input.size(), dtype=torch.float) -@impl(m, "quantized_linear") +@register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, weight: torch.Tensor, @@ -114,7 +114,7 @@ def quantized_linear_meta( return src.new_empty(out_size, dtype=torch.uint8) -@impl(m, "quantized_conv") +@register_fake("cadence::quantized_conv") def quantized_conv_meta( input: torch.Tensor, weight: torch.Tensor, @@ -152,7 +152,7 @@ def quantized_conv_meta( return input.new_empty(output_size, dtype=input.dtype) -@impl(m, "quantized_layer_norm") +@register_fake("cadence::quantized_layer_norm") def quantized_layer_norm_meta( input: torch.Tensor, X_scale: torch.Tensor, @@ -167,7 +167,7 @@ def quantized_layer_norm_meta( return input.new_empty(input.size(), dtype=torch.uint8) -@impl(m, "quantized_relu") +@register_fake("cadence::quantized_relu") def quantized_relu_meta( X: torch.Tensor, X_zero_point: torch.Tensor, @@ -178,7 +178,7 @@ def quantized_relu_meta( return X.new_empty(X.size(), dtype=torch.uint8) -@impl(m, "quantized_matmul") +@register_fake("cadence::quantized_matmul") def quantized_matmul_meta( X: torch.Tensor, X_zero_point: int, From 657789e97f99b51c29375efeb23b7b2fbdefbe30 Mon Sep 17 00:00:00 2001 From: shewu-quic <138087975+shewu-quic@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:00:19 +0800 Subject: [PATCH 285/531] Qualcomm AI Engine Direct - Apply spin quant R1 and R2 (#5175) * Qualcomm AI Engine Direct - Apply spin quant R1 and R2 Summary: - Add a argument optimized_rotation_path to specify the optimized rotation file - Refer to https://github.com/facebookresearch/SpinQuant?tab=readme-ov-file to apply R1 
R2 * remove not used * address review * rename the rotation file to apply_spin_quant_r1_r2 * fix name in TARGETS --------- Co-authored-by: Sheng Feng Wu --- examples/models/llama2/TARGETS | 1 + examples/models/llama2/export_llama_lib.py | 15 ++ .../apply_spin_quant_r1_r2.py | 179 ++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index 18a10fb9fdb..ae3e1e00f98 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -70,6 +70,7 @@ runtime.python_library( "export_llama.py", "export_llama_lib.py", "model.py", + "source_transformation/apply_spin_quant_r1_r2.py", "source_transformation/quantize.py", "source_transformation/rms_norm.py", "source_transformation/rope.py", diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 4e42e047dab..977348946b3 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -45,6 +45,10 @@ from executorch.util.activation_memory_profiler import generate_memory_trace from ..model_factory import EagerModelFactory +from .source_transformation.apply_spin_quant_r1_r2 import ( + fuse_layer_norms, + get_model_with_r1_r2, +) from .source_transformation.quantize import ( get_quant_embedding_transform, get_quant_weight_transform, @@ -225,6 +229,13 @@ def build_args_parser() -> argparse.ArgumentParser: default=f"{ckpt_dir}/params/demo_config.json", help="config.json", ) + parser.add_argument( + "--optimized_rotation_path", + default=None, + required=False, + help="[QNN Backend] Optimized rotation checkpoint path. Just apply R1/R2 here." + "You can download the optimized rotation matrices from https://github.com/facebookresearch/SpinQuant/tree/main", + ) parser.add_argument( "-m", "--metadata", @@ -436,6 +447,10 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: # to get free perf gain. transforms.append(replace_sdpa_with_simple_sdpa) transforms.append(replace_causal_mask) + + if args.optimized_rotation_path: + transforms.append(fuse_layer_norms) + transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) return ( _load_llama_model( modelname=modelname, diff --git a/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py b/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py new file mode 100644 index 00000000000..e71007b1958 --- /dev/null +++ b/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py @@ -0,0 +1,179 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import typing + +import torch + + +def rotate_embeddings(model, R1: torch.Tensor) -> None: + # Rotate the embeddings. + for W in [model.tok_embeddings]: + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_attention_inputs(layer, R1) -> None: + # Rotate the WQ, WK and WV matrices of the self-attention layer. 
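As background (not part of this patch), these weight rotations preserve the network's output because R1 is orthogonal, so rotating both the activations and the input-projection weights cancels out; a toy check with a random orthogonal matrix standing in for the optimized R1:

    import torch

    torch.manual_seed(0)
    dim = 8
    R1, _ = torch.linalg.qr(torch.randn(dim, dim))  # random orthogonal stand-in for R1

    x = torch.randn(2, dim)    # activations (e.g. already-rotated token embeddings)
    W = torch.randn(dim, dim)  # a wq/wk/wv weight in nn.Linear (out, in) layout

    y_ref = x @ W.T                # original projection
    y_rot = (x @ R1) @ (W @ R1).T  # rotated inputs with rotated weight
    assert torch.allclose(y_ref, y_rot, atol=1e-5)
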
+ for W in [layer.attention.wq, layer.attention.wk, layer.attention.wv]: + dtype = W.weight.dtype + W_ = W.weight.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_attention_output(layer, R1) -> None: + # Rotate output matrix of the self-attention layer. + W = layer.attention.wo + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(R1.T, W_).to(device="cpu", dtype=dtype) + if W.bias is not None: + b = W.bias.data.to(device="cpu", dtype=torch.float32) + W.bias.data = torch.matmul(R1.T, b).to(device="cpu", dtype=dtype) + + +def rotate_mlp_input(layer, R1): + # Rotate the MLP input weights. + mlp_inputs = [layer.feed_forward.w3, layer.feed_forward.w1] + for W in mlp_inputs: + dtype = W.weight.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_mlp_output(layer, R1): + # Rotate the MLP output weights and bias. + W = layer.feed_forward.w2 + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(R1.T, W_).to(device="cpu", dtype=dtype) + + if W.bias is not None: + b = W.bias.data.to(device="cpu", dtype=torch.float32) + W.bias.data = torch.matmul(R1.T, b).to(device="cpu", dtype=dtype) + + +def rotate_head(model, R1: torch.Tensor) -> None: + # Rotate the head. + W = model.output + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + W.weight.data = torch.matmul(W_, R1).to(device="cpu", dtype=dtype) + + +def rotate_ov_proj(layer, head_dim, R2=None): + W = layer.attention.wv + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32).t() + transposed_shape = W_.shape + temp = W_.reshape(-1, transposed_shape[-1] // head_dim, head_dim) + temp = temp.to(torch.float32) @ R2 + W_ = temp.reshape(transposed_shape).t() + W.weight.data = W_.to(device="cpu", dtype=dtype) + + W = layer.attention.wo + dtype = W.weight.data.dtype + W_ = W.weight.data.to(device="cpu", dtype=torch.float32) + init_shape = W_.shape + temp = W_.reshape(-1, init_shape[-1] // head_dim, head_dim) + temp = temp.to(torch.float32) @ R2 + W_ = temp.reshape(init_shape) + W.weight.data = W_.to(device="cpu", dtype=dtype) + + +def cleanup_memory() -> None: + """Run GC and clear GPU memory.""" + import gc + + # gc.collect and empty cache are necessary to clean up GPU memory if the model was distributed + gc.collect() + + +def get_model_with_r1_r2(optimized_rotation_path: str): + return lambda model: apply_spin_quant_r1_r2(model, optimized_rotation_path) + + +def apply_spin_quant_r1_r2(model: torch.nn.Module, optimized_rotation_path: str): + optimized_rotation = torch.load(optimized_rotation_path, weights_only=True) + R1 = optimized_rotation["R1"].to(torch.float32) + config = model.params + num_heads = config.n_heads + head_dim = config.dim // num_heads + + rotate_embeddings(model, R1) + rotate_head(model, R1) + cleanup_memory() + + for idx, layer in enumerate(model.layers): + key = f"model.layers.{idx}.self_attn.R2" + R2 = optimized_rotation[key].to(torch.float32) + rotate_attention_inputs(layer, R1) + rotate_attention_output(layer, R1) + rotate_mlp_input(layer, R1) + rotate_mlp_output(layer, R1) + rotate_ov_proj(layer, head_dim, R2=R2) + return model + + +def fuse_ln_linear( + layernorm: torch.nn.Module, linear_layers: typing.Iterable[torch.nn.Linear] +) -> None: + """ + fuse the linear 
operations in Layernorm into the adjacent linear blocks. + """ + for linear in linear_layers: + linear_dtype = linear.weight.dtype + + # Calculating new weight and bias + W_ = linear.weight.data.to(dtype=torch.float32) + linear.weight.data = (W_ * layernorm.weight.to(dtype=torch.float32)).to( + linear_dtype + ) + + if hasattr(layernorm, "bias"): + if linear.bias is None: + linear.bias = torch.nn.Parameter( + torch.zeros(linear.out_features, dtype=torch.float32) + ) + linear.bias.data = linear.bias.data.to(dtype=torch.float32) + torch.matmul( + W_, layernorm.bias.to(dtype=torch.float32) + ) + linear.bias.data = linear.bias.data.to(linear_dtype) + + +def fuse_layer_norms(model: torch.nn.Module): + # Embedding fusion + for W in [model.tok_embeddings]: + W_ = W.weight.data.to(dtype=torch.float32) + W.weight.data = (W_ - W_.mean(dim=-1, keepdim=True)).to(W.weight.data.dtype) + + # Fuse the linear operations in Layernorm into the adjacent linear blocks. + for layer in model.layers: + # fuse the input layernorms into the linear layers + fuse_ln_linear(layer.ffn_norm, [layer.feed_forward.w3, layer.feed_forward.w1]) + fuse_ln_linear( + layer.attention_norm, + [ + layer.attention.wq, + layer.attention.wk, + layer.attention.wv, + ], + ) + + W_norm = layer.ffn_norm.weight.data + layer.ffn_norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32) + W_norm = layer.attention_norm.weight.data + layer.attention_norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32) + + fuse_ln_linear( + model.norm, + [model.output], + ) + W_norm = model.norm.weight.data + model.norm.weight.data = torch.ones_like(W_norm, dtype=torch.float32) + + return model From 549f14b555c5be9462b0a2b4af5c43a04180f829 Mon Sep 17 00:00:00 2001 From: lucylq Date: Tue, 10 Sep 2024 08:48:13 -0700 Subject: [PATCH 286/531] Restore constant segment Differential Revision: D62278416 Pull Request resolved: https://github.com/pytorch/executorch/pull/5141 --- exir/_serialize/_program.py | 18 ++++++++++++++++++ exir/_serialize/test/test_program.py | 19 ++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 2256d5fcc99..00a3d4700f0 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -553,6 +553,24 @@ def _restore_segments(program: Program, segment_data: bytes) -> Program: location=DataLocation.INLINE, index=data_index ) + # Replace constants from constant_segment into constant_buffer. + if program.constant_segment and len(program.constant_segment.offsets) > 0: + buffers: List[Buffer] = [] + constant_segment = segments[program.constant_segment.segment_index] + for i in range(len(program.constant_segment.offsets)): + start_offset = program.constant_segment.offsets[i] + # Note: this is the original end offset plus any padding between + # it and the next start offset. + end_offset = ( + program.constant_segment.offsets[i + 1] + if i < len(program.constant_segment.offsets) - 1 + else len(constant_segment) + ) + buffers.append(Buffer(storage=constant_segment[start_offset:end_offset])) + program.constant_buffer = buffers + program.constant_segment.segment_index = 0 + program.constant_segment.offsets = [] + # Clear out the segments list since the original Program didn't have one. 
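To make the offset handling concrete (illustration only, not part of this patch): each constant's bytes run from its offset to the next offset, or to the end of the segment for the last one, so trailing alignment padding stays attached to the preceding buffer, mirroring what _restore_segments does above. The byte values below are made up.

    segment = b"\x11\x11\x22\x22\x22\x00\x00\x33"  # hypothetical packed constants + padding
    offsets = [0, 2, 7]                            # start of each constant tensor

    buffers = [
        segment[offsets[i] : (offsets[i + 1] if i + 1 < len(offsets) else len(segment))]
        for i in range(len(offsets))
    ]
    # buffers == [b"\x11\x11", b"\x22\x22\x22\x00\x00", b"\x33"]
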
program.segments = [] return program diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index afd8e3d282e..f20c0b39798 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -272,6 +272,15 @@ def constant_segment_with_tensor_alignment( f"{segment_table}", ) + # Convert back. + program2 = deserialize_pte_binary(pte_data) + # Programs are the same besides constant_buffer, as deserialization + # does not preserve constant segment; padding may be added + # during serialization. + self.assertEqual(program2.execution_plan, program.execution_plan) + # Number of constant tensors should be the same. + self.assertEqual(len(program2.constant_buffer), len(program.constant_buffer)) + def test_canonicalize_delegate_indices(self) -> None: def make_execution_plan( name: str, delegates: List[BackendDelegate] @@ -462,7 +471,6 @@ def gen_blob_data(size: int, pattern: bytes) -> bytes: assert len(ret) == size return ret - @unittest.skip("TODO(T181362263): Update restore segments to restore cords") def test_round_trip_with_segments(self) -> None: # Create a program with some delegate data blobs. program = get_test_program() @@ -803,6 +811,15 @@ def test_constant_segment_and_delegate_segment(self) -> None: + b"\x40\x44\x44", ) + # Convert back. + program2 = deserialize_pte_binary(pte_data) + # Programs are the same besides constant_buffer, as deserialization + # does not preserve constant segment; padding may be added + # during serialization. + self.assertEqual(program2.execution_plan, program.execution_plan) + # Number of constant tensors should be the same. + self.assertEqual(len(program2.constant_buffer), len(program.constant_buffer)) + # Common data for extended header tests. The two example values should produce # the example data. From e826de3e3f4b997a1dd589a4a3cbdbb73ec2cbbb Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 10 Sep 2024 12:51:07 -0400 Subject: [PATCH 287/531] Add Half/BFloat16 tests for op_mul Differential Revision: D62417216 Pull Request resolved: https://github.com/pytorch/executorch/pull/5213 --- kernels/portable/cpu/op_mul.cpp | 6 +++++- kernels/test/op_mul_test.cpp | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 8fc4f9d4593..34e7e085687 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -123,7 +123,11 @@ Tensor& mul_scalar_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); - ET_KERNEL_CHECK(ctx, tensor_is_realhb_type(out), InvalidArgument, out); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_realhbbf16_type(out), + InvalidArgument, + out); ScalarType a_type = a.scalar_type(); ScalarType b_type = utils::get_scalar_dtype(b); diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 84a7e8dedc4..f8205ea601e 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -586,3 +586,29 @@ TEST_F(OpMulScalarOutTest, OptimizedSanityCheck) { // Check that it matches the expected output. EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4})); } + +TEST_F(OpMulScalarOutTest, HalfSanityCheck) { + TensorFactory tf; + + const std::vector sizes = {2, 2}; + + Tensor out = tf.zeros(sizes); + + op_mul_scalar_out(tf.make(sizes, {1.3, 2.1, 4.6, 8.2}), 2.0, out); + + // Check that it matches the expected output. 
+ EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4})); +} + +TEST_F(OpMulScalarOutTest, BFloat16SanityCheck) { + TensorFactory tf; + + const std::vector sizes = {2, 2}; + + Tensor out = tf.zeros(sizes); + + op_mul_scalar_out(tf.make(sizes, {1.3, 2.1, 4.6, 8.2}), 2.0, out); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE(out, tf.make(sizes, {2.6, 4.2, 9.2, 16.4})); +} From 43e2f2d5095a05a0fef64c9855146fcc8c741eca Mon Sep 17 00:00:00 2001 From: haowhsu-quic <111341466+haowhsu-quic@users.noreply.github.com> Date: Wed, 11 Sep 2024 01:35:51 +0800 Subject: [PATCH 288/531] Qualcomm AI Engine Direct - support skip quantization (#5070) Summary: - Utility to skip operator annotation, unskipped nodes will be gathered into submodules and lowered with quantization annotation. Skipped nodes could either fallback to cpu or delegated with HTP fp16. - Fix uplevel breakage. - Refactor & retire some outdated implmentation. --- backends/qualcomm/builders/op_batch_norm.py | 15 + backends/qualcomm/builders/op_softmax.py | 2 +- .../passes/annotate_and_quant_scalar.py | 55 +-- .../passes/recompose_pixel_shuffle.py | 46 --- .../passes/recompose_pixel_unshuffle.py | 25 -- backends/qualcomm/quantizer/utils.py | 47 ++- backends/qualcomm/tests/models.py | 10 + backends/qualcomm/tests/test_qnn_delegate.py | 171 +++++++-- backends/qualcomm/tests/utils.py | 6 +- backends/qualcomm/utils/utils.py | 347 ++++++++++++++++++ examples/qualcomm/oss_scripts/llama2/llama.py | 41 +-- .../qualcomm/scripts/mobilebert_fine_tune.py | 82 +++-- examples/qualcomm/utils.py | 77 ++-- 13 files changed, 710 insertions(+), 214 deletions(-) delete mode 100644 backends/qualcomm/passes/recompose_pixel_shuffle.py diff --git a/backends/qualcomm/builders/op_batch_norm.py b/backends/qualcomm/builders/op_batch_norm.py index 13b24c0d722..6b2e9ab91d8 100644 --- a/backends/qualcomm/builders/op_batch_norm.py +++ b/backends/qualcomm/builders/op_batch_norm.py @@ -8,6 +8,11 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch +from executorch.backends.qualcomm.utils.constants import ( + QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_SCALE, +) from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -21,6 +26,14 @@ class BatchNorm(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) + def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor): + if isinstance(tensor, torch._subclasses.FakeTensor): + return + + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + diff = max(abs(tensor.max()), abs(tensor.min())) + quant_attrs[QCOM_SCALE] = diff / quant_attrs[QCOM_QUANT_MAX] + def define_node( self, node: torch.fx.Node, @@ -48,6 +61,7 @@ def define_node( amount = (filter_tensor * mean_tensor) / torch.sqrt(var_tensor + eps) bias_tensor = bias_tensor - amount + self.update_encoding(bias_node, bias_tensor) bias_tensor_wrapper = self.define_tensor( bias_node, bias_tensor, @@ -57,6 +71,7 @@ def define_node( ) filter_tensor = filter_tensor / torch.sqrt(var_tensor + eps) + self.update_encoding(filter_node, filter_tensor) filter_tensor_wrapper = self.define_tensor( filter_node, filter_tensor, diff --git a/backends/qualcomm/builders/op_softmax.py b/backends/qualcomm/builders/op_softmax.py index ae4c89bbb96..cda40aed458 100644 --- a/backends/qualcomm/builders/op_softmax.py +++ b/backends/qualcomm/builders/op_softmax.py @@ -17,7 +17,7 @@ @register_node_visitor class Softmax(NodeVisitor): - target 
= ["aten._softmax.default"] + target = ["aten._softmax.default", "aten._safe_softmax.default"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py index 1ec2ac64b5a..1db50694ece 100644 --- a/backends/qualcomm/passes/annotate_and_quant_scalar.py +++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py @@ -14,7 +14,7 @@ from executorch.exir.passes import dead_code_elimination_pass from torch.fx.passes.utils.source_matcher_utils import get_source_partitions -from .utils import get_quant_attrs +from .utils import dq_ops, get_quant_attrs class AnnotateAndQuantScalar(ExportPass): @@ -89,30 +89,43 @@ def _traverse_binary_node(self, graph_module: torch.fx.GraphModule): graph_module.graph, self.binary_op_sources ) src_partitions = list(itertools.chain(*src_partitions.values())) + processed = set() for src_partition in src_partitions: - output = src_partition.output_nodes[0] - if ( - output.meta.get(QCOM_QUANT_ATTRS) - and len(src_partition.input_nodes) == 1 - ): - dq_node = src_partition.input_nodes[0] - q_node = dq_node.args[0] - q_node_attrs = get_quant_attrs(graph_module, q_node) - - scalar_nodes = [n for n in output.args if n != dq_node] - if len(scalar_nodes) == 0: + # need post process here to identify partitioned nodes: + src_fn_dict = {} + for n in src_partition.nodes: + # e.g. + # meta["source_fn_stack"]: [('mul', )] + # we'll use as grouping key + node_list = src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], []) + node_list.append(n) + + for nodes in src_fn_dict.values(): + output = [n for n in nodes if n in src_partition.output_nodes][0] + # if all args have been annotated, it shouldn't be a scalar operation + if all(arg.target in dq_ops for arg in output.args): continue - scalar_node = scalar_nodes[0] - source_scalar_node = self._get_source_scalar_node(scalar_node) - # we'll abandon cast op here, since the constant scalar will - # be pre-loaded into QNN context binary - output.replace_input_with(scalar_node, source_scalar_node) + if output not in processed and QCOM_QUANT_ATTRS in output.meta: + dq_node = [n for n in output.args if n.target in dq_ops][0] + q_node = dq_node.args[0] + q_node_attrs = get_quant_attrs(graph_module, q_node) + + scalar_nodes = [n for n in output.args if n != dq_node] + if len(scalar_nodes) == 0: + continue + + scalar_node = scalar_nodes[0] + source_scalar_node = self._get_source_scalar_node(scalar_node) + # we'll abandon cast op here, since the constant scalar will + # be pre-loaded into QNN context binary + output.replace_input_with(scalar_node, source_scalar_node) - scalar_quant_attrs = self._update_scalar_node_attrs( - source_scalar_node, q_node_attrs - ) - self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + scalar_quant_attrs = self._update_scalar_node_attrs( + source_scalar_node, q_node_attrs + ) + self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) + processed.add(output) def call(self, graph_module: torch.fx.GraphModule): self._traverse_binary_node(graph_module) diff --git a/backends/qualcomm/passes/recompose_pixel_shuffle.py b/backends/qualcomm/passes/recompose_pixel_shuffle.py deleted file mode 100644 index 9eec6bfa264..00000000000 --- a/backends/qualcomm/passes/recompose_pixel_shuffle.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. 
-# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - - -class RecomposePixelShuffle(ExportPass): - """ - Merge decomposed operators back to one super node. - """ - - def __init__(self): - super().__init__() - - def call(self, graph_module: torch.fx.GraphModule): - graph = graph_module.graph - # decomposed core aten ops - partitions = get_source_partitions(graph, [torch.nn.PixelShuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - upscale_factor = h_out_shape / h_in_shape - - pixel_shuffle_node = graph.create_node( - "call_function", - exir_ops.edge.aten.pixel_shuffle.default, - (input_node, int(upscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_shuffle_node) - # copy metadata - pixel_shuffle_node.meta = output_node.meta - - graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index a47f3d119a5..00d46639089 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -6,7 +6,6 @@ import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class RecomposePixelUnshuffle(ExportPass): @@ -85,30 +84,6 @@ def call(self, graph_module: torch.fx.GraphModule): # copy metadata pixel_unshuffle_node.meta = node.meta - # decomposed core aten ops - if not self.quantization_capture: - partitions = get_source_partitions(graph, [torch.nn.PixelUnshuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - downscale_factor = h_in_shape / h_out_shape - - op = self.op - pixel_unshuffle_node = graph.create_node( - "call_function", - op, - (input_node, int(downscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_unshuffle_node) - # copy metadata - pixel_unshuffle_node.meta = output_node.meta - graph.eliminate_dead_code() graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index 5f299f9bc65..d3ae1194acd 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import numbers +import operator from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple @@ -77,7 +78,7 @@ def _derive_bias_qparams_fn( def get_default_8bit_qnn_ptq_config( - act_symmetric: bool = False, act_observer=MinMaxObserver + act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-12} @@ -96,7 +97,7 @@ def get_default_8bit_qnn_ptq_config( quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, ch_axis=0, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) bias_quantization_spec = QuantizationSpec( @@ -104,7 +105,7 @@ def get_default_8bit_qnn_ptq_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) quantization_config = QuantizationConfig( @@ -619,7 +620,13 @@ def annotate_upsample_nearest2d( annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.softmax.int, torch.ops.aten._softmax.default]) +@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + ] +) def annotate_softmax(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -1000,6 +1007,38 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None node.meta["source_fn_stack"] = [(node, torch.nn.Linear)] +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act, weight, bias = node.args[0:3] + if _is_annotated([node]): + return + + _annotate_input_qspec_map( + node, + act, + quantization_config.input_activation, + ) + # QNN requires uint8 instead of int8 in 'weight' config + _annotate_input_qspec_map( + node, + weight, + quantization_config.input_activation, + ) + _annotate_input_qspec_map( + node, + bias, + quantization_config.bias, + ) + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node, *node.args[0:3]]) + + +@register_annotator([operator.getitem]) +def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None: + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node]) + + @register_annotator([torch.ops.aten.layer_norm.default]) def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 127f704e8c9..e448a219284 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -55,6 +55,16 @@ def forward(self, x): return self.avgPool(x) +class BatchNorm(torch.nn.Module): + def __init__(self, n_features): + super().__init__() + self.native_batchnorm = torch.nn.BatchNorm2d(n_features) + self.eval() + + def forward(self, x): + return self.native_batchnorm(x) + + class Bmm(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 71e3b13ff8e..d17fce2b839 100644 
--- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -16,6 +16,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, QnnPartitioner, + QnnQuantizer, QuantDtype, TestQNN, to_backend, @@ -33,6 +34,7 @@ from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + skip_annotation, ) from executorch.examples.qualcomm.utils import setup_common_args_and_variables @@ -50,8 +52,8 @@ from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel +from executorch.exir import to_edge from executorch.exir.backend.backend_api import disable_validation -from executorch.exir.program._program import EdgeCompileConfig, ExirExportedProgram class TestQNNFloatingPointOperator(TestQNN): @@ -81,6 +83,11 @@ def test_qnn_backend_avg_pool2d(self): sample_input = (torch.randn(1, 3, 2, 2),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -291,7 +298,6 @@ def test_qnn_backend_layer_norm(self): sample_input = (torch.randn(196, 768),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_leaky_relu(self): test_comb = [ { @@ -334,7 +340,7 @@ def test_qnn_backend_mean_dim(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - @unittest.skip("it will hang in runtime") + @unittest.skip("failed to lower in QNN 2.25") def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 sample_input = (torch.randn(1, 197, 96),) @@ -362,7 +368,6 @@ def test_qnn_backend_pow_tensor_scalar(self): sample_input = (torch.rand([2, 4, 3, 3]),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_prelu(self): test_comb = [ { @@ -660,6 +665,12 @@ def test_qnn_backend_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -667,13 +678,6 @@ def test_qnn_backend_bmm(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("not applicable") - def test_qnn_backend_cast(self): - module = Cast() # noqa: F405 - sample_input = (10 * torch.rand((9, 4, 5, 3)),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cat(self): modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) @@ -1342,16 +1346,10 @@ def test_qnn_backend_multi_contexts_composite(self): lowered_method=to_backend, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - 
).to_edge(
-            EdgeCompileConfig(
-                _check_ir_validity=False,
-                _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-            )
         )
-        canonicalize_program(edge_prog.exported_program)
+        canonicalize_program(edge_prog.exported_program())
         exec_prog = edge_prog.to_executorch()
         self.verify_output(module.get_reference_module(), sample_input, exec_prog)
 
@@ -1401,6 +1399,7 @@ def test_qnn_backend_online_prepare(self):
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
         self.lower_module_and_test_output(module, sample_input)
 
+    @unittest.skip("segfault happens in recent torch.export.export")
     def test_qnn_backend_context_direct(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             module = ContextBinaryExample()  # noqa: F405
@@ -1444,7 +1443,7 @@ def setUp(self):
             saver=False,
         )
 
-    def test_qnn_backend_skip_node_id(self):
+    def test_qnn_backend_skip_node_id_partitioner(self):
         module = SimpleModel()  # noqa: F405
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
         module = self.get_qdq_module(module, sample_input)
@@ -1455,7 +1454,43 @@ def test_qnn_backend_skip_node_id(self):
             skip_node_id_set={"aten_add_tensor", "aten_mean_dim"},
         )
 
-    def test_qnn_backend_skip_node_op(self):
+    def test_qnn_backend_skip_node_id_quantizer(self):
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+
+        # define partitioner
+        backend_options = generate_htp_compiler_spec(
+            use_fp16=False,
+        )
+        compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.arch_table[TestQNN.model],
+            backend_options=backend_options,
+        )
+        partitioner = QnnPartitioner(compiler_specs)
+        # define quantizer
+        quantizer = QnnQuantizer()
+
+        # define calibration method
+        def calibrator(gm):
+            gm(*sample_input)
+
+        # get partially lowered graph module
+        graph_module, exported_progs = skip_annotation(
+            nn_module=module,
+            quantizer=quantizer,
+            partitioner=partitioner,
+            sample_input=sample_input,
+            calibration_cb=calibrator,
+            fp_node_id_set={"conv2d"},
+        )
+        self.assertEqual(len(exported_progs), 1)
+        # lower all graph again, the skipped operators will be left in CPU
+        exec_prog = to_edge(
+            torch.export.export(graph_module, sample_input),
+        ).to_executorch()
+        self.verify_output(module, sample_input, exec_prog)
+
+    def test_qnn_backend_skip_node_op_partitioner(self):
         module = SimpleModel()  # noqa: F405
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
         module = self.get_qdq_module(module, sample_input)
@@ -1466,6 +1501,79 @@ def test_qnn_backend_skip_node_op(self):
             skip_node_op_set={"aten.add.Tensor"},
         )
 
+    def test_qnn_backend_skip_node_op_quantizer(self):
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+
+        # define partitioner
+        backend_options = generate_htp_compiler_spec(
+            use_fp16=False,
+        )
+        compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.arch_table[TestQNN.model],
+            backend_options=backend_options,
+        )
+        partitioner = QnnPartitioner(compiler_specs)
+        # define quantizer
+        quantizer = QnnQuantizer()
+
+        # define calibration method
+        def calibrator(gm):
+            gm(*sample_input)
+
+        # get partially lowered graph module
+        graph_module, exported_progs = skip_annotation(
+            nn_module=module,
+            quantizer=quantizer,
+            partitioner=partitioner,
+            sample_input=sample_input,
+            calibration_cb=calibrator,
+            fp_node_op_set={torch.ops.aten.add.Tensor},
+        )
+        self.assertEqual(len(exported_progs), 2)
+        # lower all graph again, the skipped 
operators will be left in CPU
+        exec_prog = to_edge(
+            torch.export.export(graph_module, sample_input),
+        ).to_executorch()
+        self.verify_output(module, sample_input, exec_prog)
+
+    def test_qnn_backend_graph_level_mixed_precision(self):
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+
+        # define partitioner
+        backend_options = generate_htp_compiler_spec(
+            use_fp16=False,
+        )
+        compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.arch_table[TestQNN.model],
+            backend_options=backend_options,
+        )
+        partitioner = QnnPartitioner(compiler_specs)
+        # define quantizer
+        quantizer = QnnQuantizer()
+
+        # define calibration method
+        def calibrator(gm):
+            gm(*sample_input)
+
+        # get partially lowered graph module
+        graph_module, exported_progs = skip_annotation(
+            nn_module=module,
+            quantizer=quantizer,
+            partitioner=partitioner,
+            sample_input=sample_input,
+            calibration_cb=calibrator,
+            fp_node_id_set={"add", "mean"},
+            fallback_to_cpu=False,
+        )
+        self.assertEqual(len(exported_progs), 5)
+        # lower all graph again, the skipped operators will be delegated with fp16
+        exec_prog = to_edge(
+            torch.export.export(graph_module, sample_input),
+        ).to_executorch()
+        self.verify_output(module, sample_input, exec_prog)
+
     def test_qnn_backend_multi_contexts(self):
         module = SimpleModel()  # noqa: F405
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
@@ -1506,16 +1614,10 @@ def test_qnn_backend_multi_contexts_composite(self):
             quantize_method=self.get_qdq_module,
         )
         sample_input = module.get_random_input()
-        edge_prog = ExirExportedProgram(
+        edge_prog = to_edge(
             torch.export.export(module, sample_input),
-            after_to_edge_passes=False,
-        ).to_edge(
-            EdgeCompileConfig(
-                _check_ir_validity=False,
-                _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-            )
         )
-        canonicalize_program(edge_prog.exported_program)
+        canonicalize_program(edge_prog.exported_program())
         exec_prog = edge_prog.to_executorch()
         self.verify_output(module.get_reference_module(), sample_input, exec_prog)
 
@@ -1568,6 +1670,7 @@ def test_qnn_backend_online_prepare(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    @unittest.skip("segfault happens in recent torch.export.export")
     def test_qnn_backend_context_direct(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             module = ContextBinaryExample()  # noqa: F405
@@ -2431,6 +2534,7 @@ def test_stories_single_llama(self):
                 model_out = msg["result"][0]
                 self.assertTrue(model_out.startswith(golden_start_with))
 
+    @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
         if not self.required_envs([self.pretrained_weight]):
             self.skipTest("missing required envs")
@@ -2471,13 +2575,8 @@ def test_mobilebert(self):
             for k, v in cpu.items():
                 self.assertLessEqual(abs(v[0] - htp[k][0]), 2)
 
-    @unittest.skip("will be enabled after TODOs got resolved")
+    @unittest.skip("eager mode fake quant works well, needs further investigation")
     def test_ptq_mobilebert(self):
-        # TODO: 2 approaches to resolve accuracy issue
-        # 1. fallback embedding layers:
-        #    - skip annotation in quantizer (need PR to provide helper funciton)
-        #    - skip operators in partitioner (use existent "skip_node_op_set")
-        # 2. 
investigate different quantization configurations / mechanisms if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2494,6 +2593,8 @@ def test_ptq_mobilebert(self): self.model, "--pretrained_weight", self.pretrained_weight, + "--ptq", + "16a16w", "--ip", self.ip, "--port", diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index b206a7e1330..0d9e1a69679 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -41,7 +41,7 @@ from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import ExecutorchProgram +from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -192,7 +192,9 @@ def verify_output( with tempfile.TemporaryDirectory() as tmp_dir: buffer = ( executorch_prog.buffer - if isinstance(executorch_prog, ExecutorchProgram) + if isinstance( + executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) + ) else executorch_prog.buffer() ) ( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 3e274a0ce77..2a954f90d24 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import operator from collections import OrderedDict from typing import Callable, Dict, List, Tuple @@ -68,9 +69,74 @@ from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions from torch.export.exported_program import ExportedProgram from torch.fx import passes +from torch.fx.passes.operator_support import OperatorSupportBase from torch.library import Library +class _AnnotationSkipper(OperatorSupportBase): + """ + Class used to partition out unwanted graph nodes. + e.g. 
- nodes are prevented from quantization annotation
+         - nodes have been grouped together as a submodule
+
+    Attributes
+    ----------
+    fp_node_id_set : set
+        a set containing names of nodes to be left in fp precision
+    fp_node_op_set : set
+        a set containing targets (aten dialect) of nodes to be left in fp precision
+    skip_annotated_submodule : bool
+        flag indicating whether to skip annotated submodules
+
+    Methods
+    -------
+    should_delegate(n: torch.fx.Node)
+        identify residual nodes that have not been lowered with fixed precision
+    should_skip(n: torch.fx.Node)
+        identify whether a node should be kept out of fixed-precision lowering
+    is_node_supported(_, node: torch.fx.Node)
+        overridden method for graph partitioning
+    """
+
+    def __init__(
+        self,
+        fp_node_id_set: set = None,
+        fp_node_op_set: set = None,
+        skip_annotated_submodule: bool = False,
+    ):
+        self.fp_node_id_set = fp_node_id_set
+        self.fp_node_op_set = fp_node_op_set
+        self.skip_annotated_submodule = skip_annotated_submodule
+
+    def should_delegate(self, n: torch.fx.Node):
+        return n.op == "call_function" and n.target != operator.getitem
+
+    def should_skip(self, n: torch.fx.Node):
+        return n.name in self.fp_node_id_set or n.target in self.fp_node_op_set
+
+    def is_node_supported(self, _, node: torch.fx.Node) -> bool:
+        if self.skip_annotated_submodule:
+            if node.op == "get_attr":
+                return all(self.should_delegate(user) for user in node.users)
+            return self.should_delegate(node)
+
+        if any(
+            [
+                node.op in ("placeholder", "output"),
+                self.should_skip(node),
+                # check if parameters belong to a fallback operator
+                (
+                    node.op == "get_attr"
+                    and all(self.should_skip(user) for user in node.users)
+                ),
+            ]
+        ):
+            print(f"[QNN Quantizer Annotation]: {node.name} | Skipped")
+            return False
+
+        return True
+
+
 def qnn_capture_config():
     return exir.CaptureConfig(enable_aot=True)
 
@@ -189,8 +255,10 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]:
     # The below super ops are supported by QNN
     remove_decompositions = [
         torch.ops.aten.pixel_shuffle.default,
+        torch.ops.aten.pixel_unshuffle.default,
         torch.ops.aten.hardsigmoid.default,
         torch.ops.aten.hardswish.default,
+        torch.ops.aten._safe_softmax.default,
     ]
 
     for key in remove_decompositions:
@@ -245,6 +313,285 @@ def capture_program(
     return edge_ep
 
 
+def _partition_graph_into_submodules(gm, subgm_tag, subgm_cb, ptn):
+    from torch.fx.passes.utils.fuser_utils import (
+        erase_nodes,
+        fuse_as_graphmodule,
+        insert_subgm,
+        legalize_graph,
+        topo_sort,
+    )
+
+    partitions = ptn.propose_partitions()
+    # insert meta for each partition group
+    for i, partition in enumerate(partitions):
+        for node in partition.nodes:
+            node.meta[subgm_tag] = i
+
+    for i in range(len(partitions)):
+        # find nodes with same group id in current graph
+        node_list = [
+            node for node in gm.graph.nodes if node.meta.get(subgm_tag, "") == i
+        ]
+        # fuse group nodes into submodule
+        sorted_nodes = topo_sort(node_list)
+        submodule_name = f"{subgm_tag}_{i}"
+        subgm, orig_inputs, orig_outputs = fuse_as_graphmodule(
+            gm, sorted_nodes, submodule_name
+        )
+        # insert submodule & trim group nodes
+        gm = insert_subgm(
+            gm,
+            subgm_cb(subgm, submodule_name),
+            orig_inputs,
+            orig_outputs,
+        )
+        erase_nodes(gm, sorted_nodes)
+        legalize_graph(gm)
+
+    gm.recompile()
+    return gm
+
+
+def _canonicalize_graph_with_lowered_module(gm, subgm_tag, ptn):
+    from executorch.exir.backend.backend_api import to_backend
+
+    # return lowered program for user to debug
+    exported_progs = []
+    # partition each submodule which went through convert_pt2e
+    for node 
in gm.graph.nodes: + if node.op == "call_module" and subgm_tag in node.name: + # obtain sample inputs through meta + subgm_input = [ + torch.ones(arg.meta["val"].shape, dtype=arg.meta["val"].dtype) + for arg in node.args + ] + # program meets QNN backend requirement + sub_prog = capture_program(gm.get_submodule(node.name), tuple(subgm_input)) + # start lowering with given partitioner + exported_progs.append(to_backend(sub_prog.exported_program, ptn)) + # replace submodule with lowered module + gm.set_submodule( + node.name, + exported_progs[-1].graph_module, + ) + # if node has multiple outputs, getitems will be default generated + if all(n.target != operator.getitem for n in node.users): + with gm.graph.inserting_after(node): + getitem_node = gm.graph.call_function( + operator.getitem, + (node, 0), + ) + getitem_node.meta = node.meta + node.replace_all_uses_with( + replace_with=getitem_node, + delete_user_cb=lambda user: user.target != operator.getitem, + ) + + gm.recompile() + return gm, exported_progs + + +def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude speific operators from quantizer annotation. + Skipped operators will defaultly stay in CPU, set 'fallback_to_cpu' + to False for trying to delegate them with FP16 precision. + + e.g.: consider following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If user wants to skip convolution op by names with + 'skip_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to skip convolution op by target with + 'skip_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \__ __/ + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to delegate the skipped conv2d from above graph + with 'fallback_to_cpu' = False: + + [Generated graph] + input_1 input_2 + (placeholder) (placeholder) + | | + \ / + lowered_module_2 + (QNN fp16 precision) + | + | + lowered_module_1 + (QNN fixed precision) + | + output + + Args: + nn_module (torch.nn.Module): The module to be lowered. + quantizer (QnnQuantizer): Instance of QnnQuantizer. + partitioner (QnnPartitioner): Instance of QnnPartitioner. + sample_input ((torch.Tensor, ...)): Sample input tensors for graph exporting. + calibration_cb (callable): Callback function for user-defined calibration. + fp_node_id_set ({str, ...}): Set of operator names to be left in fp precision. + fp_node_op_set ({torch.ops.aten.xxx, ...}): Set of operator targets to be left in fp precision. 
+ fallback_to_cpu (bool): Whether to lower skipped nodes to fp16 or not. + + Returns: + exported_programs: List of programs lowered to QnnBackend (quantized graphs only). + """ + from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QnnExecuTorchHtpPrecision, + ) + from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( + convert_to_option, + ) + from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner + + def prepare_subgm(subgm, subgm_name): + # prepare current submodule for quantization annotation + subgm_prepared = prepare_pt2e(subgm, quantizer) + # overwrite this attribute or name will be set to "GraphModule" + # we could not identify each submodule if action is not performed + subgm_prepared.__class__.__name__ = subgm_name + return subgm_prepared + + fp_node_id_set = fp_node_id_set if fp_node_id_set is not None else set() + fp_node_op_set = fp_node_op_set if fp_node_op_set is not None else set() + graph_module = torch.export.export(nn_module, sample_input).module() + # define node support type + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(fp_node_id_set, fp_node_op_set), + allows_single_node_partition=True, + ) + subgm_tag = "annotated_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=prepare_subgm, + ptn=capability_partitioner, + ) + # perform calibration + calibration_cb(graph_module) + # convert sub modules which went through prepare_pt2e + for node in graph_module.graph.nodes: + if node.op == "call_module": + graph_module.set_submodule( + node.name, convert_pt2e(graph_module.get_submodule(node.name)) + ) + # canonicalize graph for lowering again + graph_module, exported_progs = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + + if not fallback_to_cpu: + try: + from executorch.exir.backend.partitioner import DelegationSpec + + # change HTP compiler spec for hardware to enable fp16 + qnn_option = generate_qnn_executorch_option( + partitioner.compiler_specs_snapshot + ) + compile_option = convert_to_option(qnn_option) + htp_options = compile_option.backend_options.htp_options + htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 + partitioner.delegation_spec = DelegationSpec( + "QnnBackend", + [ + CompileSpec( + QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + ) + ], + ) + except: + print( + "Failed to change HTP compiler spec with 'use_fp16' as True," + " skipped operators will fallback to cpu," + ) + return graph_module, exported_progs + + # try lowering skipped operator into fp16 + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(skip_annotated_submodule=True), + allows_single_node_partition=True, + ) + subgm_tag = "skipped_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=lambda subgm, _: subgm, + ptn=capability_partitioner, + ) + graph_module, exported_progs_fp = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + exported_progs.extend(exported_progs_fp) + + return graph_module, exported_progs + + def from_context_binary( ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 ): diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py 
b/examples/qualcomm/oss_scripts/llama2/llama.py index f7fda3b9849..df8c876abf2 100644 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ b/examples/qualcomm/oss_scripts/llama2/llama.py @@ -16,8 +16,7 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.passes.build_quant_io import BuildQuantIo -from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype -from executorch.backends.qualcomm.quantizer.utils import get_16a4w_qnn_ptq_config +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( QcomChipset, ) @@ -34,13 +33,13 @@ ) from executorch.examples.qualcomm.utils import ( make_output_dir, + make_quantizer, setup_common_args_and_variables, SimpleADB, ) from executorch.exir import EdgeCompileConfig, EdgeProgramManager from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import _get_updated_graph_signature from executorch.extension.llm.export.builder import DType from sentencepiece import SentencePieceProcessor @@ -274,20 +273,12 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type): def quantize(self, quant_dtype, custom_annotations=()): self.quant_dtype = quant_dtype - quantizer = QnnQuantizer() - quantizer.set_per_channel_linear_quant(True) - quantizer.set_per_channel_conv_quant(True) - - if quant_dtype == QuantDtype.use_8a8w: - pass # default setting - elif quant_dtype == QuantDtype.use_16a4w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_16a4w_qnn_ptq_config(act_observer=MinMaxObserver) - ) - quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") - else: - raise AssertionError(f"No support for QuantDtype {quant_dtype}.") + quantizer = make_quantizer( + quant_dtype=quant_dtype, + per_channel_conv=True, + per_channel_linear=True, + act_observer=MinMaxObserver, + ) quantizer.add_custom_quant_annotations(custom_annotations) self.has_quant_io = True @@ -367,6 +358,7 @@ def compile(args): ) end_load_ts = time.time() print("torch.load checkpoint", end_load_ts - start_ts) + llama_instance = None with torch.device("meta"): llama_instance = LlamaModel(config, output_new_cache_only=True) @@ -383,16 +375,13 @@ def compile(args): for layer in llama_instance.layers: if getattr(layer.attention, "prepare_sha", None): layer.attention.prepare_sha() - kv_type = torch.uint8 - if args.ptq == "8a8w": - quant_dtype = QuantDtype.use_8a8w - elif args.ptq == "16a4w": - quant_dtype = QuantDtype.use_16a4w - else: - raise AssertionError( - f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." - ) + kv_type = torch.uint8 + assert args.ptq in [ + "8a8w", + "16a4w", + ], f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." 
+ quant_dtype = getattr(QuantDtype, f"use_{args.ptq}") assert args.tokenizer_model is not None, "Need tokenizer model for calibration" if args.dtype_override is not None: diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 278ab8e8c02..605bb27d330 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -13,13 +13,24 @@ import torch from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QcomChipset, +) +from executorch.backends.qualcomm.utils.utils import ( + generate_htp_compiler_spec, + generate_qnn_executorch_compiler_spec, + skip_annotation, +) from executorch.examples.qualcomm.utils import ( build_executorch_binary, make_output_dir, + make_quantizer, parse_skip_delegation_node, + QnnPartitioner, setup_common_args_and_variables, SimpleADB, ) +from executorch.exir import to_edge from transformers import BertTokenizer, MobileBertForSequenceClassification @@ -204,8 +215,6 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): ) model.load_state_dict( - # TODO: If possible, it's better to set weights_only to True - # https://pytorch.org/docs/stable/generated/torch.load.html torch.load( ( f"{artifacts_dir}/finetuned_mobilebert_epoch_{epochs}.model" @@ -213,7 +222,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): else pretrained_weight ), map_location=torch.device("cpu"), - weights_only=False, + weights_only=True, ), ) @@ -232,38 +241,65 @@ def main(args): "Please specify a device serial by -s/--device argument." ) - pte_filename = "ptq_mb_qnn" if args.ptq else "mb_qnn" - batch_size = 1 if args.ptq else 3 + batch_size, pte_filename = 1, "ptq_mb_qnn" model, data_val, labels = get_fine_tuned_mobilebert( args.artifact, args.pretrained_weight, batch_size ) inputs, input_list = get_dataset(data_val) - if args.ptq == "8a8w": - quant_dtype = QuantDtype.use_8a8w - elif args.ptq == "16a16w": - quant_dtype = QuantDtype.use_16a16w - elif args.ptq == "16a4w": - quant_dtype = QuantDtype.use_16a4w - else: + try: + quant_dtype = getattr(QuantDtype, f"use_{args.ptq}") + except: raise AssertionError( f"No support for quant type {args.ptq}. Support 8a8w, 16a16w and 16a4w." 
) if args.use_fp16: quant_dtype = None + pte_filename = "mb_qnn" + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=quant_dtype, + shared_buffer=args.shared_buffer, + ) + else: - build_executorch_binary( - model, - inputs[0], - args.model, - f"{args.artifact}/{pte_filename}", - inputs, - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, - quant_dtype=quant_dtype, - shared_buffer=args.shared_buffer, - ) + def calibrator(gm): + for input in inputs: + gm(*input) + + quantizer = make_quantizer(quant_dtype=quant_dtype) + backend_options = generate_htp_compiler_spec(quant_dtype is not None) + partitioner = QnnPartitioner( + generate_qnn_executorch_compiler_spec( + soc_model=getattr(QcomChipset, args.model), + backend_options=backend_options, + ), + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + ) + # skip embedding layer cause it's quantization sensitive + graph_module, _ = skip_annotation( + nn_module=model, + quantizer=quantizer, + partitioner=partitioner, + sample_input=inputs[0], + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.embedding.default}, + ) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = to_edge( + torch.export.export(graph_module, inputs[0]), + ).to_executorch() + + with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: + file.write(exec_prog.buffer) if args.compile_only: sys.exit(0) diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 5ecd7dd4591..5d9a3aef262 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -19,6 +19,7 @@ from executorch.backends.qualcomm.quantizer.quantizer import ( get_16a4w_qnn_ptq_config, get_default_16bit_qnn_ptq_config, + get_default_8bit_qnn_ptq_config, QnnQuantizer, QuantDtype, ) @@ -30,7 +31,7 @@ generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) -from executorch.exir import EdgeCompileConfig, EdgeProgramManager +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -178,6 +179,39 @@ def pull_etdump(self, output_path, callback=None): callback() +def make_quantizer( + quant_dtype: Optional[QuantDtype], + custom_annotations=(), + per_channel_conv=True, + per_channel_linear=False, + act_observer=MovingAverageMinMaxObserver, +): + quantizer = QnnQuantizer() + quantizer.add_custom_quant_annotations(custom_annotations) + quantizer.set_per_channel_conv_quant(per_channel_conv) + quantizer.set_per_channel_linear_quant(per_channel_linear) + + if quant_dtype == QuantDtype.use_8a8w: + quantizer.set_bit8_op_quant_config( + get_default_8bit_qnn_ptq_config(act_observer=act_observer) + ) + elif quant_dtype == QuantDtype.use_16a16w: + quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) + quantizer.set_bit16_op_quant_config( + get_default_16bit_qnn_ptq_config(act_observer=act_observer) + ) + elif quant_dtype == QuantDtype.use_16a4w: + quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) + quantizer.set_bit16_op_quant_config( + get_16a4w_qnn_ptq_config(act_observer=act_observer) + ) + quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") + else: + raise AssertionError(f"No support for QuantDtype {quant_dtype}.") 
+ + return quantizer + + # TODO: refactor to support different backends def build_executorch_binary( model, # noqa: B006 @@ -195,27 +229,13 @@ def build_executorch_binary( act_observer=MovingAverageMinMaxObserver, ): if quant_dtype is not None: - quantizer = QnnQuantizer() - quantizer.add_custom_quant_annotations(custom_annotations) - quantizer.set_per_channel_linear_quant(per_channel_linear) - quantizer.set_per_channel_conv_quant(True) - - if quant_dtype == QuantDtype.use_8a8w: - pass # default setting - elif quant_dtype == QuantDtype.use_16a16w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_default_16bit_qnn_ptq_config(act_observer=act_observer) - ) - elif quant_dtype == QuantDtype.use_16a4w: - quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS) - quantizer.set_bit16_op_quant_config( - get_16a4w_qnn_ptq_config(act_observer=act_observer) - ) - quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4") - else: - raise AssertionError(f"No support for QuantDtype {quant_dtype}.") - + quantizer = make_quantizer( + quant_dtype=quant_dtype, + custom_annotations=custom_annotations, + per_channel_conv=True, + per_channel_linear=per_channel_linear, + act_observer=act_observer, + ) captured_model = torch.export.export(model, inputs).module() annotated_model = prepare_pt2e(captured_model, quantizer) print("Quantizing the model...") @@ -225,6 +245,7 @@ def build_executorch_binary( else: for data in dataset: annotated_model(*data) + quantized_model = convert_pt2e(annotated_model) edge_prog = capture_program(quantized_model, inputs) else: @@ -237,10 +258,7 @@ def build_executorch_binary( generate_qnn_executorch_compiler_spec( soc_model=getattr(QcomChipset, soc_model), backend_options=backend_options, - debug=False, - saver=False, shared_buffer=shared_buffer, - profile=False, ), skip_node_id_set, skip_node_op_set, @@ -256,15 +274,12 @@ def build_executorch_binary( alloc_graph_input=not shared_buffer, alloc_graph_output=not shared_buffer, ), - extract_delegate_segments=True, ) if metadata is None: - edge_prog.exported_program = to_backend( - edge_prog.exported_program, qnn_partitioner - ) - edge_prog.exported_program.graph_module.graph.print_tabular() - exec_prog = edge_prog.to_executorch(config=executorch_config) + exported_program = to_backend(edge_prog.exported_program, qnn_partitioner) + exported_program.graph_module.graph.print_tabular() + exec_prog = to_edge(exported_program).to_executorch(config=executorch_config) with open(f"{file_name}.pte", "wb") as file: file.write(exec_prog.buffer) else: From 30acae55f06ed19521ffd6c82676111546b4b9b3 Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Tue, 10 Sep 2024 10:49:11 -0700 Subject: [PATCH 289/531] Switch over backend tests to export_for_training Differential Revision: D62428363 Pull Request resolved: https://github.com/pytorch/executorch/pull/5220 --- backends/example/test_example_delegate.py | 4 ++-- exir/backend/test/TARGETS | 17 ++++++++--------- exir/backend/test/test_partitioner.py | 19 +++++++++---------- exir/backend/test/test_passes.py | 4 ++-- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py index 973b457bade..d830c1bb312 100644 --- a/backends/example/test_example_delegate.py +++ b/backends/example/test_example_delegate.py @@ -46,7 +46,7 @@ def get_example_inputs(): ) m = model.eval() - m = 
torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() # print("original model:", m) quantizer = ExampleQuantizer() # quantizer = XNNPACKQuantizer() @@ -82,7 +82,7 @@ def test_delegate_mobilenet_v2(self): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() quantizer = ExampleQuantizer() m = prepare_pt2e(m, quantizer) diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS index b99f374d83c..5c3a5e3eb32 100644 --- a/exir/backend/test/TARGETS +++ b/exir/backend/test/TARGETS @@ -82,15 +82,14 @@ python_library( "//executorch/test/...", ], deps = [ - ":backend_with_compiler_demo", - "//caffe2:torch", - "//executorch/exir:graph_module", - "//executorch/exir/backend:compile_spec_schema", - "//executorch/exir/backend:partitioner", - "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", - "//executorch/exir/backend/test/demos/rpc:executor_backend_partitioner", - "//executorch/exir/backend/test/demos/rpc:executor_backend_preprocess", - "//executorch/exir/dialects:lib", + "fbcode//caffe2:torch", + "fbcode//executorch/exir:graph_module", + "fbcode//executorch/exir/backend:compile_spec_schema", + "fbcode//executorch/exir/backend:partitioner", + "fbcode//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", + "fbcode//executorch/exir/backend/test:backend_with_compiler_demo", + "fbcode//executorch/exir/backend/test/demos/rpc:executor_backend_preprocess", + "fbcode//executorch/exir/dialects:lib", ], ) diff --git a/exir/backend/test/test_partitioner.py b/exir/backend/test/test_partitioner.py index 3973011a269..da1ae0444dd 100644 --- a/exir/backend/test/test_partitioner.py +++ b/exir/backend/test/test_partitioner.py @@ -39,9 +39,8 @@ _load_for_executorch_from_buffer, ) from executorch.extension.pytree import tree_flatten -from torch._export import capture_pre_autograd_graph from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param -from torch.export import export +from torch.export import export, export_for_training from torch.fx.passes.operator_support import any_chain @@ -77,7 +76,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() aten = export(model, example_inputs) spec_key = "path" spec_value = "/a/b/c/d" @@ -138,7 +137,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() aten = export(model, example_inputs) edge = exir.to_edge(aten) @@ -178,7 +177,7 @@ def partition( mlp = MLP() example_inputs = mlp.get_random_inputs() - model = capture_pre_autograd_graph(mlp, example_inputs) + model = export_for_training(mlp, example_inputs).module() edge = exir.to_edge(export(model, example_inputs)) with self.assertRaisesRegex( @@ -230,7 +229,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = edge.to_backend(PartitionerNoTagData()) @@ -309,7 +308,7 @@ def partition( partition_tags=partition_tags, ) - model = 
capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = edge.to_backend(PartitionerTagData()) @@ -384,7 +383,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(self.AddConst(), (torch.ones(2, 2),)) + model = export_for_training(self.AddConst(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) delegated = edge.to_backend(PartitionerTagData()) @@ -472,7 +471,7 @@ def partition( ) inputs = (torch.ones(2, 2),) - model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),)) + model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) exec_prog = edge.to_backend(PartitionerTagData()).to_executorch() executorch_module = _load_for_executorch_from_buffer(exec_prog.buffer) @@ -532,7 +531,7 @@ def partition( partition_tags=partition_tags, ) - model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),)) + model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() edge = exir.to_edge(export(model, (torch.ones(2, 2),))) with self.assertRaises(RuntimeError) as error: _ = edge.to_backend(PartitionerTagData()) diff --git a/exir/backend/test/test_passes.py b/exir/backend/test/test_passes.py index 8a43431520d..4dcc7757faa 100644 --- a/exir/backend/test/test_passes.py +++ b/exir/backend/test/test_passes.py @@ -11,8 +11,8 @@ from executorch.exir.backend.canonical_partitioners.duplicate_constant_node_pass import ( duplicate_constant_node, ) -from torch._export import capture_pre_autograd_graph from torch._export.utils import is_buffer +from torch.export import export_for_training from torch.testing import FileCheck @@ -29,7 +29,7 @@ def forward(self, x): z = x - self.const return y, z - model = capture_pre_autograd_graph(ReuseConstData(), (torch.ones(2, 2),)) + model = export_for_training(ReuseConstData(), (torch.ones(2, 2),)).module() edge = exir.to_edge(torch.export.export(model, (torch.ones(2, 2),))) const_nodes = [ From db342399a6bb1317d05a7a1f13f324ef50982aac Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 10 Sep 2024 12:56:29 -0500 Subject: [PATCH 290/531] [LLava] Fix stats for C++ runner Differential Revision: D62420000 Pull Request resolved: https://github.com/pytorch/executorch/pull/5147 --- .ci/scripts/test_llava.sh | 1 + examples/models/llava/runner/llava_runner.cpp | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 7dc6d15e407..8ac87b2302d 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -33,6 +33,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index 20c45009da7..1924b057ec4 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -108,6 +108,8 @@ Error LlavaRunner::generate_from_pos( uint64_t prefill_next_token = ET_UNWRAP(prefill_prompt(prompt, start_pos, /*bos=*/0, /*eos*/ 0)); + stats_.first_token_ms = 
util::time_in_ms(); + stats_.prompt_eval_end_ms = util::time_in_ms(); stats_.num_prompt_tokens = start_pos; // Generate tokens @@ -116,7 +118,6 @@ Error LlavaRunner::generate_from_pos( // Bookkeeping stats_.num_generated_tokens = num_generated_tokens; - ::executorch::llm::print_report(stats_); if (stats_callback) { stats_callback(stats_); } @@ -151,6 +152,7 @@ Error LlavaRunner::generate( }; int64_t pos = 0; + stats_.inference_start_ms = util::time_in_ms(); // prefill preset prompt prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0); @@ -167,6 +169,9 @@ Error LlavaRunner::generate( Error err = generate_from_pos( prompt, seq_len, pos, wrapped_callback, stats_callback, echo); + stats_.inference_end_ms = util::time_in_ms(); + ::executorch::llm::print_report(stats_); + ET_LOG( Info, "RSS after finishing text generation: %f MiB (0 if unsupported)", From 02304d7c003ea2974901afab27075f288355b451 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Tue, 10 Sep 2024 11:02:57 -0700 Subject: [PATCH 291/531] Update bundled_program to use new namespace Differential Revision: D62402292 Pull Request resolved: https://github.com/pytorch/executorch/pull/5200 --- devtools/bundled_program/bundled_program.cpp | 41 ++++++----- devtools/bundled_program/bundled_program.h | 71 +++++++++++++++++--- extension/pybindings/pybindings.cpp | 14 ++-- 3 files changed, 93 insertions(+), 33 deletions(-) diff --git a/devtools/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp index d174cbdcdad..54f84f6fef1 100644 --- a/devtools/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -23,13 +23,21 @@ #include #include -namespace torch { -namespace executor { +using exec_aten::ArrayRef; +using exec_aten::Half; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Method; +using ::executorch::runtime::Result; + +namespace executorch { namespace bundled_program { namespace { -#define kMaxDim 16 +constexpr size_t kMaxDim = 16; #ifdef USE_ATEN_LIB @@ -53,6 +61,7 @@ at::Tensor tensor_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { } #else // !USE_ATEN_LIB +using torch::executor::TensorImpl; // Create a tensorimpl with same content using bundled tensor TensorImpl impl_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { ScalarType scalar_type = @@ -234,9 +243,9 @@ get_method_test_suite( } // namespace // Load testset_idx-th bundled data into the Method -ET_NODISCARD Error LoadBundledInput( +ET_NODISCARD Error load_bundled_input( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx) { ET_CHECK_OR_RETURN_ERROR( bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( @@ -319,19 +328,19 @@ ET_NODISCARD Error LoadBundledInput( ET_CHECK_OR_RETURN_ERROR( status == Error::Ok, NotSupported, - "set_input failed during load bundled inputs with status %" PRIu32, - static_cast(status)); + "set_input failed during load bundled inputs with status 0%" PRIx32, + static_cast(status)); } - internal::event_tracer_set_bundled_input_index( + ::executorch::runtime::internal::event_tracer_set_bundled_input_index( method.get_event_tracer(), testset_idx); return Error::Ok; } -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( +ET_NODISCARD Error verify_method_outputs( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* 
bundled_program_ptr, size_t testset_idx, double rtol, double atol) { @@ -390,12 +399,12 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( return Error::Ok; } -ET_NODISCARD Error GetProgramData( +ET_NODISCARD Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, size_t* out_program_data_len) { - if (IsBundledProgram(file_data)) { + if (is_bundled_program(file_data, file_data_len)) { auto program_bundled = bundled_program_flatbuffer::GetBundledProgram(file_data); *out_program_data = program_bundled->program()->data(); @@ -410,11 +419,13 @@ ET_NODISCARD Error GetProgramData( return Error::Ok; } -bool IsBundledProgram(void* file_data) { +bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { + // Even though the flatbuffer API doesn't accept a length, it's important to + // require one so that we could change the internal representation, or use a + // future API that does require a length. return bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( file_data); } } // namespace bundled_program -} // namespace executor -} // namespace torch +} // namespace executorch diff --git a/devtools/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h index 8b42923866e..884ca6f21bc 100644 --- a/devtools/bundled_program/bundled_program.h +++ b/devtools/bundled_program/bundled_program.h @@ -11,14 +11,13 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { namespace bundled_program { /** * An opaque pointer to a serialized bundled program. */ -using serialized_bundled_program = const void; +using SerializedBundledProgram = const void; /** * Load testset_idx-th bundled input of method_idx-th Method test in @@ -31,9 +30,9 @@ using serialized_bundled_program = const void; * @returns Return Error::Ok if load successfully, or the error happens during * execution. */ -ET_NODISCARD Error LoadBundledInput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error load_bundled_input( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx); /** @@ -49,9 +48,9 @@ ET_NODISCARD Error LoadBundledInput( * @returns Return Error::Ok if two outputs match, or the error happens during * execution. */ -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8); @@ -73,7 +72,7 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( * in it, and out_program_data/out_program_data_len point to the data. Other * values on failure. */ -ET_NODISCARD Error GetProgramData( +ET_NODISCARD ::executorch::runtime::Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, @@ -83,11 +82,61 @@ ET_NODISCARD Error GetProgramData( * Checks whether the given file is a bundled program. * * @param[in] file_data The contents of the given file. + * @param[in] file_data_len The length of file_data, in bytes. * * @returns true if the given file is a bundled program, false otherwise */ -bool IsBundledProgram(void* file_data); +bool is_bundled_program(void* file_data, size_t file_data_len); + +/// DEPRECATED: Use the version with the file_data_len parameter. 
+ET_DEPRECATED inline bool is_bundled_program(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return is_bundled_program(file_data, 128); +} + +} // namespace bundled_program +} // namespace executorch + +namespace torch { +namespace executor { +namespace bundled_program { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using serialized_bundled_program = + ::executorch::bundled_program::SerializedBundledProgram; + +ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx) { + return ::executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); +} + +ET_NODISCARD inline ::executorch::runtime::Error +VerifyResultWithBundledExpectedOutput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8) { + return ::executorch::bundled_program::verify_method_outputs( + method, bundled_program_ptr, testset_idx, rtol, atol); +} + +ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( + void* file_data, + size_t file_data_len, + const void** out_program_data, + size_t* out_program_data_len) { + return ::executorch::bundled_program::get_program_data( + file_data, file_data_len, out_program_data, out_program_data_len); +} +inline bool IsBundledProgram(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return ::executorch::bundled_program::is_bundled_program(file_data, 128); +} } // namespace bundled_program } // namespace executor } // namespace torch diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 000cecf4b23..57bc44d1394 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -71,6 +71,7 @@ void et_pal_emit_log_message( } namespace py = pybind11; +using executorch::bundled_program::verify_method_outputs; using ::executorch::extension::BufferDataLoader; using ::executorch::extension::MallocMemoryAllocator; using ::executorch::extension::MmapDataLoader; @@ -92,8 +93,6 @@ using ::executorch::runtime::Span; using ::executorch::runtime::Tag; using torch::executor::etdump_result; using torch::executor::ETDumpGen; -using torch::executor::bundled_program::LoadBundledInput; -using torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput; #ifndef USE_ATEN_LIB using ::executorch::extension::alias_attensor_to_etensor; @@ -655,11 +654,11 @@ struct PyModule final { const std::string method_name, size_t testset_idx) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); - Error status = LoadBundledInput( + Error status = executorch::bundled_program::load_bundled_input( module_->get_method(method_name), bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, - "LoadBundledInput failed with status %" PRIu32, + "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); } @@ -671,13 +670,14 @@ struct PyModule final { double atol = 1e-8) { const void* bundled_program_ptr = m.get_bundled_program_ptr(); auto& method = module_->get_method(method_name); - Error status = LoadBundledInput(method, bundled_program_ptr, testset_idx); + Error status = executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); THROW_IF_ERROR( status, - 
"LoadBundledInput failed with status %" PRIu32, + "load_bundled_input failed with status 0x%" PRIx32, static_cast(status)); py::list outputs = plan_execute(method_name); - status = VerifyResultWithBundledExpectedOutput( + status = executorch::bundled_program::verify_method_outputs( method, bundled_program_ptr, testset_idx, rtol, atol); THROW_IF_ERROR( status, From c76b22fc95b2c8f8e91d5a2be34491e9a82e5810 Mon Sep 17 00:00:00 2001 From: shewu-quic <138087975+shewu-quic@users.noreply.github.com> Date: Wed, 11 Sep 2024 02:08:43 +0800 Subject: [PATCH 292/531] Qualcomm AI Engine Direct - Fixed the order of the transforms for llama (#5221) * Qualcomm AI Engine Direct - Fixed the order of the transforms for llama * fixed ci --------- Co-authored-by: Sheng Feng Wu --- examples/models/llama2/eval_llama_lib.py | 7 ++++++- examples/models/llama2/export_llama_lib.py | 9 +++++---- extension/llm/export/builder.py | 10 +++++++++- extension/llm/export/partitioner_lib.py | 9 +-------- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 2d10f5edc0a..b8987ac5d49 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -41,6 +41,7 @@ def __init__( tokenizer: Union[SentencePieceTokenizer, Tiktoken], max_seq_length: Optional[int] = None, use_kv_cache: bool = False, + generate_full_logits: bool = False, enable_dynamic_shape: bool = True, ): super().__init__( @@ -48,6 +49,7 @@ def __init__( ) self._model = model.to(self.device) self._use_kv_cache = use_kv_cache + self._generate_full_logits = generate_full_logits self._enable_dynamic_shape = enable_dynamic_shape def _model_call(self, inps): @@ -60,7 +62,10 @@ def _model_call(self, inps): pos_tensor = torch.tensor([pos], dtype=torch.int64) logits = self._model(inps[:, pos : pos + 1], pos_tensor) result_logits.append(logits) - return torch.cat(result_logits, dim=1) + if self._generate_full_logits: + return torch.cat(result_logits, dim=1) + else: + return torch.stack(result_logits, dim=1) else: pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) # Batch process the whole sequence. diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 977348946b3..611bf16428d 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -233,7 +233,7 @@ def build_args_parser() -> argparse.ArgumentParser: "--optimized_rotation_path", default=None, required=False, - help="[QNN Backend] Optimized rotation checkpoint path. Just apply R1/R2 here." + help="[QNN backend] Optimized rotation checkpoint path. Just apply R1/R2 here." 
"You can download the optimized rotation matrices from https://github.com/facebookresearch/SpinQuant/tree/main", ) parser.add_argument( @@ -440,6 +440,9 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: transforms.append(replace_sdpa_with_flex_sdpa) transforms.append(replace_causal_mask) transforms.append(replace_rms_norm_with_native_rms_norm) + if args.optimized_rotation_path: + transforms.append(fuse_layer_norms) + transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) transforms.append(convert_linear_to_conv2d) elif args.coreml or args.mps: @@ -448,9 +451,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: transforms.append(replace_sdpa_with_simple_sdpa) transforms.append(replace_causal_mask) - if args.optimized_rotation_path: - transforms.append(fuse_layer_norms) - transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) return ( _load_llama_model( modelname=modelname, @@ -744,6 +744,7 @@ def _load_llama_model( max_seq_len=model.params.max_seq_len, dtype=dtype, use_kv_cache=use_kv_cache, + generate_full_logits=generate_full_logits, example_inputs=example_inputs, enable_dynamic_shape=enable_dynamic_shape, calibration_tasks=calibration_tasks, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index bc64ae869fc..4237ae7b3a7 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -69,6 +69,7 @@ def __init__( example_inputs, args: Optional[Any] = None, enable_dynamic_shape: bool = False, + generate_full_logits: bool = False, calibration_tasks: Optional[List[str]] = None, calibration_limit: Optional[int] = None, calibration_seq_length: Optional[int] = None, @@ -86,6 +87,7 @@ def __init__( self.dtype = dtype self.example_inputs = example_inputs self.use_kv_cache = use_kv_cache + self.generate_full_logits = generate_full_logits self.enable_dynamic_shape = enable_dynamic_shape self.verbose = verbose self.metadata = metadata @@ -229,7 +231,12 @@ def calibrate_template( ) pos += 1 if pos >= len(token_list): - token_list.append(torch.argmax(logits[:], dim=-1).item()) + if self.generate_full_logits: + token_list.append( + torch.argmax(logits[:, -1], dim=-1).item() + ) + else: + token_list.append(torch.argmax(logits[:], dim=-1).item()) calibrate_template( module=prepared_module, @@ -243,6 +250,7 @@ def calibrate_template( tokenizer=tokenizer, max_seq_length=calibration_seq_length, use_kv_cache=self.use_kv_cache, + generate_full_logits=self.generate_full_logits, enable_dynamic_shape=self.enable_dynamic_shape, ) eval_results = evaluate_model( diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 29c7b3731fb..f5cc04ead48 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -139,16 +139,9 @@ def get_qnn_partitioner( if pt2e_quantize is not None: use_fp16 = False - soc_chip_table = { - "SM8650": QcomChipset.SM8650, - "SM8550": QcomChipset.SM8550, - "SM8475": QcomChipset.SM8475, - "SM8450": QcomChipset.SM8450, - } - return QnnPartitioner( # pyre-fixme[16] generate_qnn_executorch_compiler_spec( # pyre-fixme[16] - soc_model=soc_chip_table[soc_model], # pyre-fixme[16] + soc_model=getattr(QcomChipset, soc_model), # pyre-fixme[16] # pyre-fixme[16] backend_options=generate_htp_compiler_spec( use_fp16=use_fp16, From d38ca81dbbfe641f71229ab1057cd34a881b3e10 Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:42:36 -0700 
Subject: [PATCH 293/531] Android refactor cmake build Differential Revision: D62408596 Pull Request resolved: https://github.com/pytorch/executorch/pull/5204 --- build/build_android_llm_demo.sh | 17 +--- .../android/LlamaDemo/setup-with-qnn.sh | 4 +- examples/demo-apps/android/LlamaDemo/setup.sh | 2 +- extension/android/CMakeLists.txt | 99 +++++++------------ extension/android/jni/BUCK | 2 +- extension/android/jni/jni_layer.cpp | 12 ++- extension/android/jni/jni_layer_llama.cpp | 32 +----- .../org/pytorch/executorch/LlamaModule.java | 2 +- 8 files changed, 57 insertions(+), 113 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 3c076cc5bdf..917512d71b6 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -54,20 +54,6 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release - cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - - cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -75,6 +61,7 @@ build_android_native_library() { -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -110,7 +97,7 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml + zip -r executorch-llama.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml popd } diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 87d0f47c956..4deafb83487 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -37,6 +37,7 @@ cmake examples/models/llama2 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 @@ -47,6 +48,7 @@ cmake extension/android \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -59,7 +61,7 @@ mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" BUILD_AAR_DIR="$(mktemp -d)" mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" "${BUILD_AAR_DIR}/libs" JNI_LIBS_PATH="${BUILD_AAR_DIR}/jni" -cp "${CMAKE_OUT}"/extension/android/libexecutorch_llama_jni.so 
"${JNI_LIBS_PATH}/${ANDROID_ABI}/" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/libexecutorch_jni.so" cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 91a68d4b88b..78816680bc7 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -56,7 +56,7 @@ cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Relea BUILD_AAR_DIR="$(mktemp -d)" mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" "${BUILD_AAR_DIR}/libs" -cp "${CMAKE_OUT}"/extension/android/libexecutorch_llama_jni.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" +cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/libexecutorch.so" cp extension/android/build/libs/executorch.jar "${BUILD_AAR_DIR}/libs" echo \ \ diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 74f98960002..c9396a55879 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -10,7 +10,6 @@ project(executorch_jni) if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) - # Can't set to 11 due to executor_runner.cpp make_unique endif() if(NOT ANDROID) @@ -71,78 +70,54 @@ if(TARGET vulkan_backend) list(APPEND link_libraries vulkan_backend) endif() +if(EXECUTORCH_BUILD_KERNELS_CUSTOM) + add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/custom_ops + ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops + ) + list(APPEND link_libraries custom_ops) + target_link_options_shared_lib(custom_ops) +endif() + add_library(executorch_jni SHARED jni/jni_layer.cpp) -target_link_libraries(executorch_jni ${link_libraries}) -target_include_directories( - executorch_jni PRIVATE ${_common_include_directories} -) -target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) if(EXECUTORCH_BUILD_LLAMA_JNI) - set(LLAMA_RUNNER_PATH - ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner/libllama_runner.a - ) - add_library(llama_runner STATIC IMPORTED) - set_property( - TARGET llama_runner PROPERTY IMPORTED_LOCATION ${LLAMA_RUNNER_PATH} - ) - + target_sources(executorch_jni PRIVATE jni/jni_layer_llama.cpp) + list(APPEND link_libraries llama_runner llava_runner) + target_compile_definitions(executorch_jni PUBLIC EXECUTORCH_BUILD_LLAMA_JNI=1) add_subdirectory( ${EXECUTORCH_ROOT}/examples/models/llava/runner ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llava/runner ) - set(CUSTOM_OPS_PATH - ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops/libcustom_ops.a + add_subdirectory( + ${EXECUTORCH_ROOT}/examples/models/llama2/runner + ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner ) - add_library(custom_ops STATIC IMPORTED) - set_property(TARGET custom_ops PROPERTY IMPORTED_LOCATION ${CUSTOM_OPS_PATH}) - target_link_options_shared_lib(custom_ops) +endif() - target_link_options_shared_lib(quantized_ops_lib) - - set(LLAMA_JNI_SRCS jni/jni_layer_llama.cpp) - add_library(executorch_llama_jni SHARED ${LLAMA_JNI_SRCS}) - if(TARGET pthreadpool) - target_compile_definitions(executorch_llama_jni PRIVATE ET_USE_THREADPOOL=1) - target_include_directories( - executorch_llama_jni - PUBLIC - 
${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include - ) - target_include_directories( - executorch_llama_jni - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include - ) - endif() +if(TARGET quantized_kernels) + list(APPEND link_libraries quantized_kernels quantized_ops_lib) +endif() + +target_include_directories( + executorch_jni PRIVATE ${_common_include_directories} +) + +target_compile_options(executorch_jni PUBLIC ${_common_compile_options}) + +target_link_libraries(executorch_jni ${link_libraries}) + +if(TARGET pthreadpool) + target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( - executorch_llama_jni PRIVATE ${_common_include_directories} - ) - target_link_libraries( - executorch_llama_jni - ${link_libraries} - llama_runner - llava_runner - custom_ops - cpublas - eigen_blas - quantized_kernels - quantized_ops_lib + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/cpuinfo/include ) - target_compile_options(executorch_llama_jni PUBLIC ${_common_compile_options}) - # link re2 - set(ABSL_ENABLE_INSTALL ON) - set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp - ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp - ) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2 - ${CMAKE_CURRENT_BINARY_DIR}/re2 + target_include_directories( + executorch_jni + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../backends/xnnpack/third-party/pthreadpool/include ) - set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - target_link_libraries(executorch_llama_jni re2::re2) endif() diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index 7cdf8ef7ec4..f7e7932a21b 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -77,7 +77,7 @@ fb_android_cxx_library( "-fexceptions", "-Wno-format", ], - soname = "libexecutorch_llama_jni.$(ext)", + soname = "libexecutorch.$(ext)", visibility = ["PUBLIC"], deps = [ "//fbandroid/libraries/fbjni:fbjni", diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index f2cfc4a5cff..1ef81b20b08 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -386,7 +386,15 @@ class ExecuTorchJni : public facebook::jni::HybridClass { }; } // namespace executorch::extension +#ifdef EXECUTORCH_BUILD_LLAMA_JNI +extern void register_natives_for_llama(); +#else +// No op if we don't build llama +void register_natives_for_llama() {} +#endif JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { - return facebook::jni::initialize( - vm, [] { executorch::extension::ExecuTorchJni::registerNatives(); }); + return facebook::jni::initialize(vm, [] { + executorch::extension::ExecuTorchJni::registerNatives(); + register_natives_for_llama(); + }); } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 6194853fe7c..e6a9b5de58c 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -30,33 +30,6 @@ #include #include -#ifdef __ANDROID__ -#include - -// For Android, write to logcat -void et_pal_emit_log_message( - et_timestamp_t timestamp, - et_pal_log_level_t level, - const char* filename, - const char* function, - size_t line, - const char* message, - size_t length) { - int android_log_level = 
ANDROID_LOG_UNKNOWN; - if (level == 'D') { - android_log_level = ANDROID_LOG_DEBUG; - } else if (level == 'I') { - android_log_level = ANDROID_LOG_INFO; - } else if (level == 'E') { - android_log_level = ANDROID_LOG_ERROR; - } else if (level == 'F') { - android_log_level = ANDROID_LOG_FATAL; - } - - __android_log_print(android_log_level, "LLAMA", "%s", message); -} -#endif - using namespace torch::executor; namespace executorch_jni { @@ -300,7 +273,6 @@ class ExecuTorchLlamaJni } // namespace executorch_jni -JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { - return facebook::jni::initialize( - vm, [] { executorch_jni::ExecuTorchLlamaJni::registerNatives(); }); +void register_natives_for_llama() { + executorch_jni::ExecuTorchLlamaJni::registerNatives(); } diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java index e3ba11b8505..7c77dbae08f 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java @@ -28,7 +28,7 @@ public class LlamaModule { if (!NativeLoader.isInitialized()) { NativeLoader.init(new SystemDelegate()); } - NativeLoader.loadLibrary("executorch_llama_jni"); + NativeLoader.loadLibrary("executorch"); } private final HybridData mHybridData; From a4d67e2d31c70d602616bb9b3e7e4e7e5758ca10 Mon Sep 17 00:00:00 2001 From: Riandy Date: Tue, 10 Sep 2024 12:23:48 -0700 Subject: [PATCH 294/531] Android: Leverage prefillPrompt and prefillImage on Llava Differential Revision: D62411342 Pull Request resolved: https://github.com/pytorch/executorch/pull/5224 --- .../executorchllamademo/MainActivity.java | 94 ++++++++++--------- .../executorchllamademo/PromptFormat.java | 5 + 2 files changed, 53 insertions(+), 46 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index e9f32a927cc..ac14270ed51 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -19,6 +19,7 @@ import android.os.Bundle; import android.os.Handler; import android.os.Looper; +import android.os.Process; import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; @@ -44,6 +45,8 @@ import java.lang.reflect.Type; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; @@ -71,15 +74,16 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa private Handler mMemoryUpdateHandler; private Runnable memoryUpdater; private int promptID = 0; - + private long startPos = 0; private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; + private Executor executor; @Override public void onResult(String result) { if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { return; } - if (result.equals("\n\n")) { + if (result.equals("\n\n") || result.equals("\n")) { if (!mResultMessage.getText().isEmpty()) { mResultMessage.appendText(result); run(); @@ -150,6 +154,12 @@ private void setLocalModel(String modelPath, String 
tokenizerPath, float tempera + (float) loadDuration / 1000 + " sec." + " You can send text or image for inference"; + + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + ETLogging.getInstance().log("Llava start prefill prompt"); + startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); + ETLogging.getInstance().log("Llava completes prefill prompt"); + } } Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); @@ -241,6 +251,7 @@ protected void onCreate(Bundle savedInstanceState) { setupCameraRoll(); startMemoryUpdate(); setupShowLogsButton(); + executor = Executors.newSingleThreadExecutor(); } @Override @@ -546,6 +557,32 @@ private void showMediaPreview(List uris) { imageViews.get(i).setVisibility(View.VISIBLE); imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); } + + // For LLava, we want to call prefill_image as soon as an image is selected + // Llava only support 1 image for now + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + if (!processedImageList.isEmpty()) { + mMessageAdapter.add( + new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); + mMessageAdapter.notifyDataSetChanged(); + Runnable runnable = + () -> { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("Starting runnable prefill image"); + ETImage img = processedImageList.get(0); + ETLogging.getInstance().log("Llava start prefill image"); + startPos = + mModule.prefillImages( + img.getInts(), + img.getWidth(), + img.getHeight(), + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + startPos); + }; + executor.execute(runnable); + } + } } private void addSelectedImagesToChatThread(List selectedImageUri) { @@ -618,24 +655,6 @@ private void onModelRunStopped() { mSendButton.setOnClickListener( view -> { addSelectedImagesToChatThread(mSelectedImageUri); - List processedImageList = getProcessedImagesForModel(mSelectedImageUri); - processedImageList.forEach( - image -> { - ETLogging.getInstance() - .log( - "Image preprocessed:" - + " uri = " - + image.getUri().getLastPathSegment() - + "," - + " width = " - + image.getWidth() - + "," - + " height = " - + image.getHeight() - + "," - + " bytes size = " - + image.getBytes().length); - }); String rawPrompt = mEditTextMessage.getText().toString(); // We store raw prompt into message adapter, because we don't want to show the extra // tokens from system prompt @@ -654,6 +673,8 @@ private void onModelRunStopped() { new Runnable() { @Override public void run() { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("starting runnable generate()"); runOnUiThread( new Runnable() { @Override @@ -664,31 +685,12 @@ public void run() { long generateStartTime = System.currentTimeMillis(); if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) == ModelUtils.VISION_MODEL) { - ETLogging.getInstance().log("Running inference.. prompt=" + rawPrompt); - if (!processedImageList.isEmpty()) { - // For now, Llava only support 1 image. 
- ETImage img = processedImageList.get(0); - mModule.generate( - processedImageList.get(0).getInts(), - img.getWidth(), - img.getHeight(), - ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - rawPrompt, - ModelUtils.VISION_MODEL_SEQ_LEN, - MainActivity.this, - false); - } else { - // no image selected, we pass in empty int array - mModule.generate( - new int[0], - 0, - 0, - ModelUtils.VISION_MODEL_IMAGE_CHANNELS, - rawPrompt, - ModelUtils.VISION_MODEL_SEQ_LEN, - MainActivity.this, - false); - } + mModule.generateFromPos( + mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt), + ModelUtils.VISION_MODEL_SEQ_LEN, + startPos, + MainActivity.this, + false); } else { String finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); @@ -712,7 +714,7 @@ public void run() { ETLogging.getInstance().log("Inference completed"); } }; - new Thread(runnable).start(); + executor.execute(runnable); }); mMessageAdapter.notifyDataSetChanged(); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 4b450553236..640d3782128 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -66,4 +66,9 @@ public static String getStopToken(ModelType modelType) { return ""; } } + + public static String getLlavaPresetPrompt() { + return "A chat between a curious human and an artificial intelligence assistant. The assistant" + + " gives helpful, detailed, and polite answers to the human's questions. USER: "; + } } From b54206d78fbcf9ea2839280c31a7b84127d8fc5d Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Tue, 10 Sep 2024 12:38:09 -0700 Subject: [PATCH 295/531] Update the minimum C++ version to C++17 Differential Revision: D62329462 Pull Request resolved: https://github.com/pytorch/executorch/pull/5158 --- CONTRIBUTING.md | 4 +--- docs/source/getting-started-setup.md | 8 +++---- docs/source/runtime-overview.md | 2 +- runtime/platform/compiler.h | 31 +++++++++++++++++++++------- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ad23f84d17..d434c1fe198 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,9 +131,7 @@ for detailed advice. #### C++ language version -**C++11.** - -NOTE: The code does not yet fully conform to this, and some files require C++17. +**C++17.** Rationale: This is a compromise between being compatible with older, proprietary toolchains, and having access to relatively modern C++ features. diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index d610f020ef2..1fbe35c72bc 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -59,13 +59,11 @@ also work in similar environments. - We recommend `conda` as it provides cross-language support and integrates smoothly with `pip` (Python's built-in package manager) - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. -* `g++` version 8 or higher, `clang++` version 8 or higher, or another - C++17-compatible toolchain that supports GNU C-style [statement - expressions](https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html) (`({ ... - })` syntax). 
+* `g++` version 7 or higher, `clang++` version 5 or higher, or another + C++17-compatible toolchain. Note that the cross-compilable core runtime code supports a wider range of -toolchains, down to C++11. See the [Runtime Overview](./runtime-overview.md) for +toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for portability details. ## Quick Setup: Colab/Jupyter Notebook Prototype diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 7bc8b4dd8b4..6766e678e0e 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -96,7 +96,7 @@ can build it for a wide variety of target systems. #### C++ Language Considerations -* The code is C++11-compatible to work with older toolchains. +* The code is C++17-compatible to work with older toolchains. * The runtime does not use exceptions or RTTI, although it is not antagonistic to them. * The code is compatible with GCC and Clang, and has also been built with diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h index c7f603756c8..9a8e18c0f1e 100644 --- a/runtime/platform/compiler.h +++ b/runtime/platform/compiler.h @@ -13,17 +13,32 @@ #pragma once -// Compiler support checks. +/* + * Compiler support checks. Follows the logic used by pytorch/c10/util/C++17.h + * but may support older versions. + */ + +// https://gcc.gnu.org/projects/cxx-status.html#cxx17 +#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \ + __GNUC__ < 7 +#error \ + "You're trying to build ExecuTorch with a too old version of GCC. We need GCC 7 or later." +#endif + +// https://clang.llvm.org/cxx_status.html#cxx17 +#if defined(__clang__) && __clang_major__ < 5 +#error \ + "You're trying to build ExecuTorch with a too old version of Clang. We need Clang 5 or later." +#endif -#if !defined(__cplusplus) -#error ExecuTorch must be compiled using a C++ compiler. +#if (defined(_MSC_VER) && (!defined(_MSVC_LANG) || _MSVC_LANG < 201703L)) || \ + (!defined(_MSC_VER) && __cplusplus < 201703L) +#error "You need C++17 to compile ExecuTorch" #endif -#if __cplusplus < 201103L && (!defined(_MSC_VER) || _MSC_VER < 1600) && \ - (!defined(__GNUC__) || \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ < 40400)) -#error ExecuTorch must use a compiler supporting at least the C++11 standard. 
-#error __cplusplus _MSC_VER __GNUC__ __GNUC_MINOR__ __GNUC_PATCHLEVEL__ +#if defined(_WIN32) && (defined(min) || defined(max)) +#error \ + "Macro clash with min and max -- define NOMINMAX when compiling your program on Windows" #endif /* From 4ce0f9d3e9fcd05ad7eef90c193a8c982219ae9a Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:49:53 -0400 Subject: [PATCH 296/531] Introduce PlatformMemoryAllocator Differential Revision: D60601742 Pull Request resolved: https://github.com/pytorch/executorch/pull/5121 --- runtime/executor/method.cpp | 26 ++- runtime/executor/method.h | 6 +- runtime/executor/platform_memory_allocator.h | 111 ++++++++++++ runtime/executor/program.h | 3 +- runtime/executor/targets.bzl | 3 + .../executor/test/kernel_integration_test.cpp | 159 +++++++++++++++++- .../executor/test/managed_memory_manager.h | 5 +- runtime/platform/default/minimal.cpp | 6 + runtime/platform/default/posix.cpp | 23 +++ runtime/platform/platform.h | 19 +++ .../test/executor_pal_override_test.cpp | 46 +++++ runtime/platform/test/stub_platform.cpp | 10 ++ runtime/platform/test/stub_platform.h | 6 + 13 files changed, 410 insertions(+), 13 deletions(-) create mode 100644 runtime/executor/platform_memory_allocator.h diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 4ec02aee921..a6ed7e354a9 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,8 @@ namespace executorch { namespace runtime { +using internal::PlatformMemoryAllocator; + /** * Runtime state for a backend delegate. */ @@ -548,7 +551,16 @@ Result Method::load( const Program* program, MemoryManager* memory_manager, EventTracer* event_tracer) { - Method method(program, memory_manager, event_tracer); + MemoryAllocator* temp_allocator = memory_manager->temp_allocator(); + if (temp_allocator == nullptr) { + PlatformMemoryAllocator* platform_allocator = + ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( + memory_manager->method_allocator(), PlatformMemoryAllocator); + new (platform_allocator) PlatformMemoryAllocator(); + temp_allocator = platform_allocator; + } + Method method(program, memory_manager, event_tracer, temp_allocator); + Error err = method.init(s_plan); if (err != Error::Ok) { return err; @@ -1039,16 +1051,14 @@ Error Method::execute_instruction() { auto instruction = instructions->Get(step_state_.instr_idx); size_t next_instr_idx = step_state_.instr_idx + 1; Error err = Error::Ok; + switch (instruction->instr_args_type()) { case executorch_flatbuffer::InstructionArguments::KernelCall: { EXECUTORCH_SCOPE_PROF("OPERATOR_CALL"); internal::EventTracerProfileScope event_tracer_scope = internal::EventTracerProfileScope(event_tracer_, "OPERATOR_CALL"); // TODO(T147221312): Also expose tensor resizer via the context. 
- // The temp_allocator passed can be null, but calling allocate_temp will - // fail - KernelRuntimeContext context( - event_tracer_, memory_manager_->temp_allocator()); + KernelRuntimeContext context(event_tracer_, temp_allocator_); auto args = chain.argument_lists_[step_state_.instr_idx]; chain.kernels_[step_state_.instr_idx](context, args.data()); // We reset the temp_allocator after the switch statement @@ -1096,7 +1106,7 @@ Error Method::execute_instruction() { step_state_.instr_idx); BackendExecutionContext backend_execution_context( /*event_tracer*/ event_tracer_, - /*temp_allocator*/ memory_manager_->temp_allocator()); + /*temp_allocator*/ temp_allocator_); err = delegates_[delegate_idx].Execute( backend_execution_context, chain.argument_lists_[step_state_.instr_idx].data()); @@ -1168,8 +1178,8 @@ Error Method::execute_instruction() { err = Error::InvalidProgram; } // Reset the temp allocator for every instruction. - if (memory_manager_->temp_allocator() != nullptr) { - memory_manager_->temp_allocator()->reset(); + if (temp_allocator_ != nullptr) { + temp_allocator_->reset(); } if (err == Error::Ok) { step_state_.instr_idx = next_instr_idx; diff --git a/runtime/executor/method.h b/runtime/executor/method.h index 7d96096accf..0a35d6b9282 100644 --- a/runtime/executor/method.h +++ b/runtime/executor/method.h @@ -53,6 +53,7 @@ class Method final { : step_state_(rhs.step_state_), program_(rhs.program_), memory_manager_(rhs.memory_manager_), + temp_allocator_(rhs.temp_allocator_), serialization_plan_(rhs.serialization_plan_), event_tracer_(rhs.event_tracer_), n_value_(rhs.n_value_), @@ -273,10 +274,12 @@ class Method final { Method( const Program* program, MemoryManager* memory_manager, - EventTracer* event_tracer) + EventTracer* event_tracer, + MemoryAllocator* temp_allocator) : step_state_(), program_(program), memory_manager_(memory_manager), + temp_allocator_(temp_allocator), serialization_plan_(nullptr), event_tracer_(event_tracer), n_value_(0), @@ -319,6 +322,7 @@ class Method final { StepState step_state_; const Program* program_; MemoryManager* memory_manager_; + MemoryAllocator* temp_allocator_; executorch_flatbuffer::ExecutionPlan* serialization_plan_; EventTracer* event_tracer_; diff --git a/runtime/executor/platform_memory_allocator.h b/runtime/executor/platform_memory_allocator.h new file mode 100644 index 00000000000..09195a460ac --- /dev/null +++ b/runtime/executor/platform_memory_allocator.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace executorch { +namespace runtime { +namespace internal { + +/** + * PlatformMemoryAllocator is a memory allocator that uses a linked list to + * manage allocated nodes. It overrides the allocate method of MemoryAllocator + * using the PAL fallback allocator method `et_pal_allocate`. + */ +class PlatformMemoryAllocator final : public MemoryAllocator { + private: + // We allocate a little more than requested and use that memory as a node in + // a linked list, pushing the allocated buffers onto a list that's iterated + // and freed when the KernelRuntimeContext is destroyed. 
+ struct AllocationNode { + void* data; + AllocationNode* next; + }; + + AllocationNode* head_ = nullptr; + + public: + PlatformMemoryAllocator() : MemoryAllocator(0, nullptr) {} + + void* allocate(size_t size, size_t alignment = kDefaultAlignment) override { + if (!isPowerOf2(alignment)) { + ET_LOG(Error, "Alignment %zu is not a power of 2", alignment); + return nullptr; + } + + // Allocate enough memory for the node, the data and the alignment bump. + size_t alloc_size = sizeof(AllocationNode) + size + alignment; + void* node_memory = et_pal_allocate(alloc_size); + + // If allocation failed, log message and return nullptr. + if (node_memory == nullptr) { + ET_LOG(Error, "Failed to allocate %zu bytes", alloc_size); + return nullptr; + } + + // Compute data pointer. + uint8_t* data_ptr = + reinterpret_cast(node_memory) + sizeof(AllocationNode); + + // Align the data pointer. + void* aligned_data_ptr = alignPointer(data_ptr, alignment); + + // Assert that the alignment didn't overflow the allocated memory. + ET_DCHECK_MSG( + reinterpret_cast(aligned_data_ptr) + size <= + reinterpret_cast(node_memory) + alloc_size, + "aligned_data_ptr %p + size %zu > node_memory %p + alloc_size %zu", + aligned_data_ptr, + size, + node_memory, + alloc_size); + + // Construct the node. + AllocationNode* new_node = reinterpret_cast(node_memory); + new_node->data = aligned_data_ptr; + new_node->next = head_; + head_ = new_node; + + // Return the aligned data pointer. + return head_->data; + } + + void reset() override { + AllocationNode* current = head_; + while (current != nullptr) { + AllocationNode* next = current->next; + et_pal_free(current); + current = next; + } + head_ = nullptr; + } + + ~PlatformMemoryAllocator() override { + reset(); + } + + private: + // Disable copy and move. + PlatformMemoryAllocator(const PlatformMemoryAllocator&) = delete; + PlatformMemoryAllocator& operator=(const PlatformMemoryAllocator&) = delete; + PlatformMemoryAllocator(PlatformMemoryAllocator&&) noexcept = delete; + PlatformMemoryAllocator& operator=(PlatformMemoryAllocator&&) noexcept = + delete; +}; + +} // namespace internal +} // namespace runtime +} // namespace executorch diff --git a/runtime/executor/program.h b/runtime/executor/program.h index a599cc958e0..f7469eb2192 100644 --- a/runtime/executor/program.h +++ b/runtime/executor/program.h @@ -123,7 +123,8 @@ class Program final { * * @param[in] method_name The name of the method to load. * @param[in] memory_manager The allocators to use during initialization and - * execution of the loaded method. + * execution of the loaded method. If `memory_manager.temp_allocator()` is + * null, the runtime will allocate temp memory using `et_pal_allocate()`. * @param[in] event_tracer The event tracer to use for this method run. * * @returns The loaded method on success, or an error on failure. 
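For reference, the fallback described in the Program::load_method() documentation above can be exercised with a minimal sketch like the following (illustrative only, not part of this patch; the "model.pte" file name, the "forward" method name, and the pool sizes are assumptions). When no temp allocator is passed to the MemoryManager, kernel calls to context.allocate_temp() are served by the new et_pal_allocate()-backed PlatformMemoryAllocator, which is reset after each instruction.

    // Minimal sketch, assuming "model.pte" exists and exposes a "forward" method.
    #include <executorch/extension/data_loader/file_data_loader.h>
    #include <executorch/runtime/executor/memory_manager.h>
    #include <executorch/runtime/executor/program.h>

    using executorch::extension::FileDataLoader;
    using executorch::runtime::HierarchicalAllocator;
    using executorch::runtime::MemoryAllocator;
    using executorch::runtime::MemoryManager;
    using executorch::runtime::Program;
    using executorch::runtime::Span;

    int main() {
      auto loader = FileDataLoader::from("model.pte");
      auto program = Program::load(&loader.get());
      if (!loader.ok() || !program.ok()) {
        return 1;
      }

      // Pool sizes are arbitrary for the example.
      static uint8_t method_pool[4 * 1024 * 1024];
      MemoryAllocator method_allocator(sizeof(method_pool), method_pool);

      static uint8_t planned_pool[1024 * 1024];
      Span<uint8_t> planned_spans[] = {{planned_pool, sizeof(planned_pool)}};
      HierarchicalAllocator planned_memory({planned_spans, 1});

      // No third argument: temp_allocator defaults to nullptr, so temp
      // allocations fall back to et_pal_allocate()/et_pal_free().
      MemoryManager memory_manager(&method_allocator, &planned_memory);

      auto method = program->load_method("forward", &memory_manager);
      if (method.ok()) {
        method->execute();
      }
      return 0;
    }

Passing a caller-owned MemoryAllocator as the third MemoryManager argument restores the previous behavior, with allocate_temp() served from that allocator instead of the platform fallback.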
diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index 46f997a80ad..cc91255d7b5 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -65,6 +65,9 @@ def define_common_targets(): "tensor_parser_exec_aten.cpp", "tensor_parser{}.cpp".format(aten_suffix if aten_mode else "_portable"), ], + headers = [ + "platform_memory_allocator.h", + ], exported_headers = [ "method.h", "method_meta.h", diff --git a/runtime/executor/test/kernel_integration_test.cpp b/runtime/executor/test/kernel_integration_test.cpp index 616398b7416..4f1ac0240b9 100644 --- a/runtime/executor/test/kernel_integration_test.cpp +++ b/runtime/executor/test/kernel_integration_test.cpp @@ -34,6 +34,7 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::Kernel; using executorch::runtime::KernelKey; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::MemoryAllocator; using executorch::runtime::Method; using executorch::runtime::Program; using executorch::runtime::Result; @@ -59,10 +60,26 @@ struct KernelControl { // returning. Error fail_value = Error::Ok; + // If true, the kernel should allocate temporary memory. + bool allocate_temp_memory = false; + + // If true, the kernel should simulate allocating temporary memory. + bool simulate_temp_memory_allocation = false; + + // The size of the temporary memory to allocate. + int temp_memory_size = 0; + + // The total size of all allocations. + int total_allocated_size = 0; + void reset() { call_count = 0; call_context_fail = false; fail_value = Error::Ok; + allocate_temp_memory = false; + simulate_temp_memory_allocation = false; + temp_memory_size = 0; + total_allocated_size = 0; } /** @@ -117,6 +134,33 @@ struct KernelControl { if (control->call_context_fail) { context.fail(control->fail_value); } + + // Allocate temporary memory. + if (control->allocate_temp_memory) { + Result temp_mem_res = + context.allocate_temp(control->temp_memory_size); + if (temp_mem_res.ok()) { + control->total_allocated_size += control->temp_memory_size; + // We actually use the memory, to test default memory allocation was + // successful. + uint8_t* array = (uint8_t*)(temp_mem_res.get()); + for (int i = 0; i < control->temp_memory_size; i++) { + array[i] = i % 256; + } + } + } + + // Simulate allocating temporary memory. We use this, for testing that when + // a temp allocator is provided, the kernel will use it, instead of + // allocating memory with the default platform memory allocator. + // The provided TempMemoryAllocator class in this file, simulates allocating + // memory instead of actually allocating anything. + if (control->simulate_temp_memory_allocation) { + Result temp_mem_res = + context.allocate_temp(control->temp_memory_size); + control->total_allocated_size += control->temp_memory_size; + EXPECT_EQ(temp_mem_res.error(), Error::Ok); + } } static bool registered_; @@ -126,6 +170,44 @@ struct KernelControl { bool KernelControl::registered_ = false; KernelControl KernelControl::singleton_; +/** + * MemoryAllocator that keeps track of the number/sizes of its allocations, + * to test the case where the user provides a temp allocator. + */ +class TempMemoryAllocator final : public MemoryAllocator { + public: + TempMemoryAllocator() : MemoryAllocator(0, nullptr) {} + + // The number of times allocate() has been called. + int number_of_allocations = 0; + + // The number of times reset() has been called. 
+ int number_of_resets = 0; + + // The amount of memory currently allocated (should go to 0 when reset is + // called). + int currently_allocated_size = 0; + + // The total size of all allocations. + int total_allocated_size = 0; + + void* allocate(size_t size, ET_UNUSED size_t alignment = kDefaultAlignment) + override { + number_of_allocations += 1; + currently_allocated_size += size; + total_allocated_size += size; + // This is a simulation, we don't actually allocate memory. But we need to + // return a non-null pointer, so we return a bad, non-zero address that will + // crash if anyone tries to dereference it. + return (void*)1; + } + + void reset() override { + number_of_resets += 1; + currently_allocated_size = 0; + } +}; + class KernelIntegrationTest : public ::testing::Test { protected: void SetUp() override { @@ -152,7 +234,9 @@ class KernelIntegrationTest : public ::testing::Test { // Load the forward method. mmm_ = std::make_unique( - kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); + kDefaultNonConstMemBytes, + kDefaultRuntimeMemBytes, + temp_allocator_.get()); Result method = program_->load_method("forward", &mmm_->get()); ASSERT_EQ(method.error(), Error::Ok); method_ = std::make_unique(std::move(method.get())); @@ -185,6 +269,19 @@ class KernelIntegrationTest : public ::testing::Test { // The KernelControl associated with method_. KernelControl* control_; + + // The temp memory allocator provided by the user. By default, none is + // provided. + std::unique_ptr temp_allocator_ = nullptr; +}; + +class KernelTempMemoryAllocatorIntegrationTest : public KernelIntegrationTest { + protected: + void SetUp() override { + // Create a temp allocator for the test before calling the parent SetUp. + temp_allocator_ = std::make_unique(); + KernelIntegrationTest::SetUp(); + } }; TEST_F(KernelIntegrationTest, KernelHookIsCalled) { @@ -222,3 +319,63 @@ TEST_F(KernelIntegrationTest, FailurePropagates) { EXPECT_EQ(err, Error::Ok); EXPECT_EQ(control_->call_count, 3); } + +TEST_F(KernelIntegrationTest, DefaultPlatformMemoryAllocator) { + // Tell the kernel to allocate memory. Since no temp allocator is provided, + // this will allocate memory using the default platform memory allocator. + control_->allocate_temp_memory = true; + + control_->temp_memory_size = 4; + // This is not a simulation. This actually allocates memory, using the + // default platform memory allocator. + Error err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 1); + EXPECT_EQ(control_->total_allocated_size, 4); + + control_->temp_memory_size = 8; + // This is not a simulation. This actually allocates memory, using the + // default platform memory allocator. + err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 2); + EXPECT_EQ(control_->total_allocated_size, 12); +} + +TEST_F(KernelTempMemoryAllocatorIntegrationTest, UsingTempMemoryAllocator) { + // In this test we provide a temp allocator to the method, and tell the kernel + // to allocate memory using it. We want to make sure that the kernel uses the + // temp allocator, and that the temp allocator is reset after the execution. + // Since we are testing that the kernel uses the temp allocator, and not the + // temp allocator itself, we don't need to test the actual allocation of + // memory. Therefore, we set simulate_temp_memory_allocation to true, so that + // the kernel will not actually allocate memory, but will instead simulate + // allocating memory. 
+ // The provided TempMemoryAllocator, simulates allocating memory by increasing + // total_allocated_size and currently_allocated_size by the requested size. + // We simulate resetting the allocator by setting currently_allocated_size + // back to 0. + control_->simulate_temp_memory_allocation = true; + + control_->temp_memory_size = 4; + Error err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 1); + EXPECT_EQ(control_->total_allocated_size, 4); + EXPECT_EQ(temp_allocator_->number_of_allocations, 1); + EXPECT_EQ(temp_allocator_->total_allocated_size, 4); + // The temp allocator should have been reset after the execution. + EXPECT_EQ(temp_allocator_->number_of_resets, 1); + EXPECT_EQ(temp_allocator_->currently_allocated_size, 0); + + control_->temp_memory_size = 8; + err = method_->execute(); + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(control_->call_count, 2); + EXPECT_EQ(control_->total_allocated_size, 12); + EXPECT_EQ(temp_allocator_->number_of_allocations, 2); + EXPECT_EQ(temp_allocator_->total_allocated_size, 12); + // The temp allocator should have been reset after the execution. + EXPECT_EQ(temp_allocator_->number_of_resets, 2); + EXPECT_EQ(temp_allocator_->currently_allocated_size, 0); +} diff --git a/runtime/executor/test/managed_memory_manager.h b/runtime/executor/test/managed_memory_manager.h index 667aa35ca24..a01091527b0 100644 --- a/runtime/executor/test/managed_memory_manager.h +++ b/runtime/executor/test/managed_memory_manager.h @@ -27,7 +27,8 @@ class ManagedMemoryManager { public: ManagedMemoryManager( size_t planned_memory_bytes, - size_t method_allocator_bytes) + size_t method_allocator_bytes, + MemoryAllocator* temp_allocator = nullptr) : planned_memory_buffer_(new uint8_t[planned_memory_bytes]), planned_memory_span_( planned_memory_buffer_.get(), @@ -35,7 +36,7 @@ class ManagedMemoryManager { planned_memory_({&planned_memory_span_, 1}), method_allocator_pool_(new uint8_t[method_allocator_bytes]), method_allocator_(method_allocator_bytes, method_allocator_pool_.get()), - memory_manager_(&method_allocator_, &planned_memory_) {} + memory_manager_(&method_allocator_, &planned_memory_, temp_allocator) {} MemoryManager& get() { return memory_manager_; diff --git a/runtime/platform/default/minimal.cpp b/runtime/platform/default/minimal.cpp index e1db2083f4a..8236f993188 100644 --- a/runtime/platform/default/minimal.cpp +++ b/runtime/platform/default/minimal.cpp @@ -47,3 +47,9 @@ void et_pal_emit_log_message( ET_UNUSED size_t line, ET_UNUSED const char* message, ET_UNUSED size_t length) {} + +void* et_pal_allocate(ET_UNUSED size_t size) { + return nullptr; +} + +void et_pal_free(ET_UNUSED void* ptr) {} diff --git a/runtime/platform/default/posix.cpp b/runtime/platform/default/posix.cpp index cfc8cafc491..aba504f53e0 100644 --- a/runtime/platform/default/posix.cpp +++ b/runtime/platform/default/posix.cpp @@ -170,3 +170,26 @@ void et_pal_emit_log_message( message); fflush(ET_LOG_OUTPUT_FILE); } + +/** + * NOTE: Core runtime code must not call this directly. It may only be called by + * a MemoryAllocator wrapper. + * + * Allocates size bytes of memory via malloc. + * + * @param[in] size Number of bytes to allocate. + * @returns the allocated memory, or nullptr on failure. Must be freed using + * et_pal_free(). + */ +void* et_pal_allocate(size_t size) { + return malloc(size); +} + +/** + * Frees memory allocated by et_pal_allocate(). + * + * @param[in] ptr Pointer to memory to free. May be nullptr. 
+ */ +void et_pal_free(void* ptr) { + free(ptr); +} diff --git a/runtime/platform/platform.h b/runtime/platform/platform.h index e29dad8e9a8..03cdef8eb2f 100644 --- a/runtime/platform/platform.h +++ b/runtime/platform/platform.h @@ -115,4 +115,23 @@ void et_pal_emit_log_message( const char* message, size_t length) ET_INTERNAL_PLATFORM_WEAKNESS; +/** + * NOTE: Core runtime code must not call this directly. It may only be called by + * a MemoryAllocator wrapper. + * + * Allocates size bytes of memory. + * + * @param[in] size Number of bytes to allocate. + * @returns the allocated memory, or nullptr on failure. Must be freed using + * et_pal_free(). + */ +void* et_pal_allocate(size_t size) ET_INTERNAL_PLATFORM_WEAKNESS; + +/** + * Frees memory allocated by et_pal_allocate(). + * + * @param[in] ptr Pointer to memory to free. May be nullptr. + */ +void et_pal_free(void* ptr) ET_INTERNAL_PLATFORM_WEAKNESS; + } // extern "C" diff --git a/runtime/platform/test/executor_pal_override_test.cpp b/runtime/platform/test/executor_pal_override_test.cpp index bb9ea2ce589..9bc500e652e 100644 --- a/runtime/platform/test/executor_pal_override_test.cpp +++ b/runtime/platform/test/executor_pal_override_test.cpp @@ -53,12 +53,29 @@ class PalSpy : public PlatformIntercept { last_log_message_args.length = length; } + void* allocate(size_t size) override { + ++allocate_call_count; + last_allocated_size = size; + last_allocated_ptr = (void*)0x1234; + return nullptr; + } + + void free(void* ptr) override { + ++free_call_count; + last_freed_ptr = ptr; + } + virtual ~PalSpy() = default; size_t init_call_count = 0; size_t current_ticks_call_count = 0; size_t emit_log_message_call_count = 0; et_tick_ratio_t tick_ns_multiplier = {1, 1}; + size_t allocate_call_count = 0; + size_t free_call_count = 0; + size_t last_allocated_size = 0; + void* last_allocated_ptr = nullptr; + void* last_freed_ptr = nullptr; /// The args that were passed to the most recent call to emit_log_message(). struct { @@ -158,4 +175,33 @@ TEST(ExecutorPalOverrideTest, TickToNsMultiplier) { EXPECT_EQ(et_pal_ticks_to_ns_multiplier().denominator, 1); } +TEST(ExecutorPalOverrideTest, AllocateSmokeTest) { + PalSpy spy; + InterceptWith iw(spy); + + // Validate that et_pal_allocate is overridden. + EXPECT_EQ(spy.allocate_call_count, 0); + EXPECT_EQ(spy.last_allocated_ptr, nullptr); + et_pal_allocate(4); + EXPECT_EQ(spy.allocate_call_count, 1); + EXPECT_EQ(spy.last_allocated_size, 4); + EXPECT_EQ(spy.last_allocated_ptr, (void*)0x1234); +} + +TEST(ExecutorPalOverrideTest, FreeSmokeTest) { + PalSpy spy; + InterceptWith iw(spy); + + et_pal_allocate(4); + EXPECT_EQ(spy.last_allocated_size, 4); + EXPECT_EQ(spy.last_allocated_ptr, (void*)0x1234); + + // Validate that et_pal_free is overridden. 
+ EXPECT_EQ(spy.free_call_count, 0); + EXPECT_EQ(spy.last_freed_ptr, nullptr); + et_pal_free(spy.last_allocated_ptr); + EXPECT_EQ(spy.free_call_count, 1); + EXPECT_EQ(spy.last_freed_ptr, (void*)0x1234); +} + #endif diff --git a/runtime/platform/test/stub_platform.cpp b/runtime/platform/test/stub_platform.cpp index f7ad2f9ee63..8cee404e4e1 100644 --- a/runtime/platform/test/stub_platform.cpp +++ b/runtime/platform/test/stub_platform.cpp @@ -75,6 +75,16 @@ void et_pal_emit_log_message( timestamp, level, filename, function, line, message, length); } +void* et_pal_allocate(size_t size) { + ASSERT_INTERCEPT_INSTALLED(); + return platform_intercept->allocate(size); +} + +void et_pal_free(void* ptr) { + ASSERT_INTERCEPT_INSTALLED(); + platform_intercept->free(ptr); +} + } // extern "C" #include diff --git a/runtime/platform/test/stub_platform.h b/runtime/platform/test/stub_platform.h index af3756f3136..de5599b53b0 100644 --- a/runtime/platform/test/stub_platform.h +++ b/runtime/platform/test/stub_platform.h @@ -45,6 +45,12 @@ class PlatformIntercept { ET_UNUSED const char* message, ET_UNUSED size_t length) {} + virtual void* allocate(ET_UNUSED size_t size) { + return nullptr; + } + + virtual void free(ET_UNUSED void* ptr) {} + virtual ~PlatformIntercept() = default; }; From 2b50c76a33b37bb6b0f57f9624eae74d45548f1b Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 10 Sep 2024 13:11:45 -0700 Subject: [PATCH 297/531] Use dynamic bound by default. Differential Revision: D62459696 Pull Request resolved: https://github.com/pytorch/executorch/pull/5234 --- extension/llm/runner/text_token_generator.h | 7 ++----- extension/tensor/tensor_impl_ptr.h | 8 ++++---- extension/tensor/tensor_ptr.h | 8 ++++---- extension/tensor/tensor_ptr_maker.h | 10 +++++----- extension/tensor/test/tensor_impl_ptr_test.cpp | 4 ++-- extension/tensor/test/tensor_ptr_test.cpp | 6 +++--- 6 files changed, 20 insertions(+), 23 deletions(-) diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 01887e75600..1726750ece5 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -70,11 +70,8 @@ class TextTokenGenerator { } // initialize tensor wrappers - auto tokens_managed = from_blob( - token_data.data(), - token_shape, - exec_aten::ScalarType::Long, - exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + auto tokens_managed = + from_blob(token_data.data(), token_shape, exec_aten::ScalarType::Long); auto start_pos_managed = from_blob(&pos, {1}, exec_aten::ScalarType::Long); diff --git a/extension/tensor/tensor_impl_ptr.h b/extension/tensor/tensor_impl_ptr.h index f336faf07b0..5f34f929b96 100644 --- a/extension/tensor/tensor_impl_ptr.h +++ b/extension/tensor/tensor_impl_ptr.h @@ -66,7 +66,7 @@ TensorImplPtr make_tensor_impl_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, std::function deleter = nullptr); /** @@ -93,7 +93,7 @@ TensorImplPtr make_tensor_impl_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { constexpr exec_aten::ScalarType scalar_type = runtime::CppTypeToScalarType::value; const auto raw_data_ptr = data.data(); @@ -126,7 +126,7 @@ template TensorImplPtr make_tensor_impl_ptr( std::vector data, exec_aten::TensorShapeDynamism 
dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { constexpr exec_aten::ScalarType scalar_type = runtime::CppTypeToScalarType::value; std::vector sizes{exec_aten::SizesType(data.size())}; @@ -165,7 +165,7 @@ TensorImplPtr make_tensor_impl_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC); + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); } // namespace extension } // namespace executorch diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index ef29d598b84..c760de4f038 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -125,7 +125,7 @@ inline TensorPtr make_tensor_ptr( std::vector dim_order = {}, std::vector strides = {}, const exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, std::function deleter = nullptr) { return make_tensor_ptr(make_tensor_impl_ptr( type, @@ -160,7 +160,7 @@ TensorPtr make_tensor_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return make_tensor_ptr(make_tensor_impl_ptr( std::move(sizes), std::move(data), @@ -186,7 +186,7 @@ template TensorPtr make_tensor_ptr( std::vector data, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return make_tensor_ptr(make_tensor_impl_ptr(std::move(data), dynamism)); } @@ -212,7 +212,7 @@ inline TensorPtr make_tensor_ptr( std::vector dim_order = {}, std::vector strides = {}, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return make_tensor_ptr(make_tensor_impl_ptr( scalar_type, std::move(sizes), diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h index a08f04c2101..fd97e53dbca 100644 --- a/extension/tensor/tensor_ptr_maker.h +++ b/extension/tensor/tensor_ptr_maker.h @@ -138,7 +138,7 @@ class TensorPtrMaker final { void* data_ = nullptr; exec_aten::ScalarType type_ = exec_aten::ScalarType::Float; exec_aten::TensorShapeDynamism dynamism_ = - exec_aten::TensorShapeDynamism::STATIC; + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND; }; /** @@ -182,7 +182,7 @@ inline TensorPtr from_blob( std::vector sizes, exec_aten::ScalarType type = exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .dynamism(dynamism) .make_tensor_ptr(); @@ -210,7 +210,7 @@ inline TensorPtr from_blob( std::vector strides, exec_aten::ScalarType type = exec_aten::ScalarType::Float, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .strides(std::move(strides)) .dynamism(dynamism) @@ -239,7 +239,7 @@ inline TensorPtr from_blob( exec_aten::ScalarType type, std::function&& deleter, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .deleter(std::move(deleter)) .dynamism(dynamism) @@ -270,7 +270,7 @@ inline TensorPtr from_blob( 
exec_aten::ScalarType type, std::function&& deleter, exec_aten::TensorShapeDynamism dynamism = - exec_aten::TensorShapeDynamism::STATIC) { + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { return for_blob(data, std::move(sizes), type) .strides(std::move(strides)) .deleter(std::move(deleter)) diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp index 09d55de3e8e..1330dfa60f5 100644 --- a/extension/tensor/test/tensor_impl_ptr_test.cpp +++ b/extension/tensor/test/tensor_impl_ptr_test.cpp @@ -145,7 +145,7 @@ TEST_F(TensorImplPtrTest, TensorImplDataDeleterReleasesCapturedSharedPtr) { data_ptr.get(), {}, {}, - exec_aten::TensorShapeDynamism::STATIC, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); EXPECT_EQ(data_ptr.use_count(), 2); @@ -280,7 +280,7 @@ TEST_F(TensorImplPtrTest, CustomDeleterWithSharedData) { data->data(), {}, {}, - exec_aten::TensorShapeDynamism::STATIC, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [data, &deleter_called](void*) mutable { deleter_called = true; data.reset(); diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp index 24aa20a8355..3f5e7ff58e2 100644 --- a/extension/tensor/test/tensor_ptr_test.cpp +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -98,7 +98,7 @@ TEST_F(TensorPtrTest, TensorWithCustomDataDeleter) { data, {}, {}, - exec_aten::TensorShapeDynamism::STATIC, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [&deleter_called](void* ptr) { deleter_called = true; delete[] static_cast(ptr); @@ -118,7 +118,7 @@ TEST_F(TensorPtrTest, TensorManagesMovedVector) { data_ptr, {}, {}, - exec_aten::TensorShapeDynamism::STATIC, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [moved_data = std::move(data), &deleter_called](void*) mutable { deleter_called = true; }); @@ -140,7 +140,7 @@ TEST_F(TensorPtrTest, TensorDeleterReleasesCapturedSharedPtr) { data_ptr.get(), {}, {}, - exec_aten::TensorShapeDynamism::STATIC, + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND, [data_ptr, &deleter_called](void*) mutable { deleter_called = true; }); EXPECT_EQ(data_ptr.use_count(), 2); From ced40f4fa08b91fc6712a9936727d432d866a638 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:31:20 -0700 Subject: [PATCH 298/531] Fix models in benchinfra (#5226) Co-authored-by: Guang Yang --- .ci/scripts/test_model.sh | 8 +++++++- .github/workflows/apple-perf.yml | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index e589337666d..0b8574573fb 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -209,7 +209,13 @@ elif [[ "${BACKEND}" == "coreml" ]]; then fi elif [[ "${BACKEND}" == "xnnpack" ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." - test_model_with_xnnpack true true + WITH_QUANTIZATION=true + WITH_DELEGATION=true + if [[ "$MODEL_NAME" == "mobilebert" ]]; then + # TODO(T197452682) + WITH_QUANTIZATION=false + fi + test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}" if [[ $? 
-eq 0 ]]; then prepare_artifacts_upload fi diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 416d1ca805e..bb7fd7b9761 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -165,6 +165,8 @@ jobs: # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "coreml" ]]; then + DELEGATE_CONFIG="coreml" fi PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" From e245590d88d542a06c55ab9c17e22fada03a791c Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:00:43 -0700 Subject: [PATCH 299/531] App side change Differential Revision: D62458651 Pull Request resolved: https://github.com/pytorch/executorch/pull/5205 --- .../android/benchmark/app/build.gradle.kts | 1 + .../app/src/main/AndroidManifest.xml | 8 ++ .../minibench/LlmBenchmarkActivity.java | 114 ++++++++++++++++++ .../org/pytorch/minibench/ModelRunner.java | 97 +++++++++++++++ .../minibench/ModelRunnerCallback.java | 24 ++++ 5 files changed, 244 insertions(+) create mode 100644 extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java create mode 100644 extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java create mode 100644 extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java diff --git a/extension/android/benchmark/app/build.gradle.kts b/extension/android/benchmark/app/build.gradle.kts index b716f2e8bd0..dcf99ca9cd0 100644 --- a/extension/android/benchmark/app/build.gradle.kts +++ b/extension/android/benchmark/app/build.gradle.kts @@ -38,6 +38,7 @@ dependencies { implementation(files("libs/executorch.aar")) implementation("com.facebook.soloader:soloader:0.10.5") implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.google.code.gson:gson:2.8.6") testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.2.1") androidTestImplementation("androidx.test.espresso:espresso-core:3.6.1") diff --git a/extension/android/benchmark/app/src/main/AndroidManifest.xml b/extension/android/benchmark/app/src/main/AndroidManifest.xml index 49711b6830e..098905c052c 100644 --- a/extension/android/benchmark/app/src/main/AndroidManifest.xml +++ b/extension/android/benchmark/app/src/main/AndroidManifest.xml @@ -16,6 +16,14 @@ + + + + + + diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java new file mode 100644 index 00000000000..496cbde53d6 --- /dev/null +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmarkActivity.java @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package org.pytorch.minibench; + +import android.app.Activity; +import android.content.Intent; +import android.os.Bundle; +import android.util.Log; +import com.google.gson.Gson; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Arrays; + +public class LlmBenchmarkActivity extends Activity implements ModelRunnerCallback { + ModelRunner mModelRunner; + + String mPrompt; + StatsInfo mStatsInfo; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + + Intent intent = getIntent(); + + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); + String tokenizerPath = intent.getStringExtra("tokenizer_path"); + + float temperature = intent.getFloatExtra("temperature", 0.8f); + mPrompt = intent.getStringExtra("prompt"); + if (mPrompt == null) { + mPrompt = "The ultimate answer"; + } + + mStatsInfo = new StatsInfo(); + mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); + mStatsInfo.loadStart = System.currentTimeMillis(); + } + + @Override + public void onModelLoaded(int status) { + mStatsInfo.loadEnd = System.currentTimeMillis(); + if (status != 0) { + Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); + onGenerationStopped(); + return; + } + mStatsInfo.generateStart = System.currentTimeMillis(); + mModelRunner.generate(mPrompt); + } + + @Override + public void onTokenGenerated(String token) {} + + @Override + public void onStats(String stats) { + mStatsInfo.tokens = stats; + } + + @Override + public void onGenerationStopped() { + mStatsInfo.generateEnd = System.currentTimeMillis(); + + // TODO (huydhn): Remove txt files here once the JSON format is ready + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { + writer.write(mStatsInfo.toString()); + } catch (IOException e) { + e.printStackTrace(); + } + + // TODO (huydhn): Figure out on what the final JSON results looks like, we need something + // with the same number of fields as https://github.com/pytorch/pytorch/pull/135042 + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(mStatsInfo)); + } catch (IOException e) { + e.printStackTrace(); + } + } +} + +class StatsInfo { + long loadStart; + long loadEnd; + long generateStart; + long generateEnd; + String tokens; + + @Override + public String toString() { + return "loadStart: " + + loadStart + + "\nloadEnd: " + + loadEnd + + "\ngenerateStart: " + + generateStart + + "\ngenerateEnd: " + + generateEnd + + "\n" + + tokens; + } +} diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java new file mode 100644 index 00000000000..9e9b9e003d8 --- /dev/null +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package org.pytorch.minibench; + +import android.os.Handler; +import android.os.HandlerThread; +import android.os.Looper; +import android.os.Message; +import org.pytorch.executorch.LlamaCallback; +import org.pytorch.executorch.LlamaModule; + +/** A helper class to handle all model running logic within this class. */ +public class ModelRunner implements LlamaCallback { + LlamaModule mModule = null; + + String mModelFilePath = ""; + String mTokenizerFilePath = ""; + + ModelRunnerCallback mCallback = null; + + HandlerThread mHandlerThread = null; + Handler mHandler = null; + + /** + * ] Helper class to separate between UI logic and model runner logic. Automatically handle + * generate() request on worker thread. + * + * @param modelFilePath + * @param tokenizerFilePath + * @param callback + */ + ModelRunner( + String modelFilePath, + String tokenizerFilePath, + float temperature, + ModelRunnerCallback callback) { + mModelFilePath = modelFilePath; + mTokenizerFilePath = tokenizerFilePath; + mCallback = callback; + + mModule = new LlamaModule(mModelFilePath, mTokenizerFilePath, 0.8f); + mHandlerThread = new HandlerThread("ModelRunner"); + mHandlerThread.start(); + mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this); + + mHandler.sendEmptyMessage(ModelRunnerHandler.MESSAGE_LOAD_MODEL); + } + + int generate(String prompt) { + Message msg = Message.obtain(mHandler, ModelRunnerHandler.MESSAGE_GENERATE, prompt); + msg.sendToTarget(); + return 0; + } + + void stop() { + mModule.stop(); + } + + @Override + public void onResult(String result) { + mCallback.onTokenGenerated(result); + } + + @Override + public void onStats(float tps) { + mCallback.onStats("tokens/second: " + tps); + } +} + +class ModelRunnerHandler extends Handler { + public static int MESSAGE_LOAD_MODEL = 1; + public static int MESSAGE_GENERATE = 2; + + private final ModelRunner mModelRunner; + + public ModelRunnerHandler(Looper looper, ModelRunner modelRunner) { + super(looper); + mModelRunner = modelRunner; + } + + @Override + public void handleMessage(android.os.Message msg) { + if (msg.what == MESSAGE_LOAD_MODEL) { + int status = mModelRunner.mModule.load(); + mModelRunner.mCallback.onModelLoaded(status); + } else if (msg.what == MESSAGE_GENERATE) { + mModelRunner.mModule.generate((String) msg.obj, mModelRunner); + mModelRunner.mCallback.onGenerationStopped(); + } + } +} diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java new file mode 100644 index 00000000000..63701a7bbc6 --- /dev/null +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunnerCallback.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.minibench; + +/** + * A helper interface within the app for MainActivity and Benchmarking to handle callback from + * ModelRunner. + */ +public interface ModelRunnerCallback { + + void onModelLoaded(int status); + + void onTokenGenerated(String token); + + void onStats(String token); + + void onGenerationStopped(); +} From 4cce62007a8d76b073ba8333e19ce8121ff9a71a Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Tue, 10 Sep 2024 23:11:19 +0200 Subject: [PATCH 300/531] Minor fix: Create root dir when it doesn't exist. 
(#5075) realpath works differently on MacOS Change-Id: I17e114cd289692aa6de8a5b4e6f29fc1734aca08 --- examples/arm/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 272ddcfc0c5..9cef98e6227 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -91,6 +91,7 @@ fi ### Optional user args ######## root_dir=${1:-"${script_dir}/ethos-u-scratch"} +mkdir -p ${root_dir} root_dir=$(realpath ${root_dir}) ######## @@ -246,7 +247,6 @@ fi cd "${script_dir}" # Setup the root dir -mkdir -p "${root_dir}" cd "${root_dir}" echo "[main] Using root dir ${root_dir}" From ab6d91c5c2d9ffa2dd01695e48fdc2981f16085e Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:50:15 -0700 Subject: [PATCH 301/531] Fix internal executorch_llama_jni Differential Revision: D62458604 Pull Request resolved: https://github.com/pytorch/executorch/pull/5231 --- extension/android/jni/BUCK | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index f7e7932a21b..3c8f00b2bdc 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -70,11 +70,16 @@ fb_android_cxx_library( fb_android_cxx_library( name = "executorch_llama_jni", - srcs = ["jni_layer_llama.cpp"], + srcs = [ + "jni_layer.cpp", + "jni_layer_llama.cpp", + ], + headers = ["jni_layer_constants.h"], allow_jni_merging = False, compiler_flags = [ "-frtti", "-fexceptions", + "-DEXECUTORCH_BUILD_LLAMA_JNI", "-Wno-format", ], soname = "libexecutorch.$(ext)", @@ -83,8 +88,12 @@ fb_android_cxx_library( "//fbandroid/libraries/fbjni:fbjni", "//fbandroid/native/fb:fb", "//third-party/glog:glog", + "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", "//xplat/executorch/examples/models/llama2/runner:runner_static", "//xplat/executorch/examples/models/llava/runner:runner_static", + "//xplat/executorch/extension/module:module_static", + "//xplat/executorch/extension/runner_util:inputs_static", + "//xplat/executorch/extension/tensor:tensor_static", "//xplat/executorch/extension/threadpool:cpuinfo_utils_static", "//xplat/executorch/extension/threadpool:threadpool_static", ], From f07e4d5cfa5233dbfe53074c4039bdd532a3884b Mon Sep 17 00:00:00 2001 From: Chester Hu Date: Tue, 10 Sep 2024 14:52:56 -0700 Subject: [PATCH 302/531] Update setup-with-qnn.sh with runner util flag (#5210) --- examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 4deafb83487..68d191685d3 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -16,6 +16,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ @@ -37,6 +38,7 @@ cmake examples/models/llama2 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 @@ -50,6 +52,7 @@ cmake extension/android \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android From cac2c05d8c344637c6dc8452749226c42d107a92 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Tue, 10 Sep 2024 18:32:10 -0400 Subject: [PATCH 303/531] [ET-VK] Integrate axis mapping into optimized matrix multiplication shaders + massive code cleanup Differential Revision: D62444923 Pull Request resolved: https://github.com/pytorch/executorch/pull/5223 --- .../vulkan/runtime/api/containers/Tensor.cpp | 8 + .../vulkan/runtime/api/containers/Tensor.h | 15 + backends/vulkan/runtime/graph/ComputeGraph.h | 4 + .../graph/ops/glsl/addmm_optimized.glsl | 267 +++++++++++++----- .../graph/ops/glsl/addmm_optimized.yaml | 25 +- .../graph/ops/glsl/matmul_optimized.glsl | 87 ------ .../graph/ops/glsl/matmul_optimized.yaml | 30 -- .../vulkan/runtime/graph/ops/impl/Linear.cpp | 23 +- .../vulkan/runtime/graph/ops/impl/MatMul.cpp | 22 +- 9 files changed, 279 insertions(+), 202 deletions(-) delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 6fe6746ec0d..dc507f91626 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -356,6 +356,14 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_.buffer_; } +utils::uvec3 vTensor::mapped_extents() const { + utils::uvec3 m_extents; + m_extents[0] = storage_.image_extents_[axis_mapping_.at(0)]; + m_extents[1] = storage_.image_extents_[axis_mapping_.at(1)]; + m_extents[2] = storage_.image_extents_[axis_mapping_.at(2)]; + return m_extents; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { if (!sizes_uniform_.buffer()) { sizes_uniform_ = diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 70f363796fd..31052b351de 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -347,10 +347,25 @@ class vTensor final { return storage_.storage_type_ == utils::kBuffer; } + /* + * Returns the raw image extents of the underlying image texture used to store + * the tensor's data. Note that due to axis mapping, the X, Y, and Z extents + * may not correspond to the width, height, or channels dimension of the + * tensor. 
+ */ inline const utils::uvec3& image_extents() const { return storage_.image_extents_; } + /* + * Returns the image extents of the underlying image texture, but re-ordered + * such that the first element is the extent of the axis used to represent the + * tensor's width dimension, the second element is the extent of the axis used + * to represent the tensor's height dimension, and the third element is the + * extent of the axis used to represent the tensor's channels dimension. + */ + utils::uvec3 mapped_extents() const; + /* * Extract an `vkapi::ScalarType` from the TensorOptions member */ diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index afdc8290cdd..46787955336 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -288,6 +288,10 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().image_extents(); } + inline utils::uvec3 mapped_extents_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().mapped_extents(); + } + inline int32_t numel_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().numel(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index 1698efb0b15..6e964c745e3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -16,90 +16,219 @@ $if MAT2_IS_TRANSPOSED: $if BATCH_MODE: #define BATCH_MODE -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 +$if HAS_BIAS: + #define HAS_BIAS #include "indexing_utils.h" -#include "matmul.h" -// addmm will have additional arguments compared to regular mm -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_mapping")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_mapping")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_mapping")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; +layout(constant_id = 3) const int out_packed_dim = C_DIM; -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec4 self_sizes; -}; +// To convince the SPIR-V compiler to unroll the loops optimally, need this +// macro +#define FOUR 4 -layout(set = 0, binding = 7) uniform PRECISION restrict InLimits { - ivec3 in_limits; +#define TILE_ROWS ${TILE_ROWS} + +// we 
avoid mat4 and vec4 usage here as they compile to much less efficient +// SPIR-V +struct FloatMatrix_2d { + float data[TILE_ROWS][FOUR]; }; -layout(set = 0, binding = 8) uniform PRECISION restrict Params { - float alpha; - float beta; +struct FloatMatrix_3d { + float data[TILE_ROWS][FOUR][FOUR]; }; -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#ifdef BATCH_MODE + #define FloatMatrix FloatMatrix_3d +#else + #define FloatMatrix FloatMatrix_2d +#endif // BATCH_MODE + +#ifdef HAS_BIAS +// get texel from self tensor (channel_packed) in addmm +vec4 get_texel_C_packed(const ivec2 idx) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.x > 1) { + bias_pos[bias_axis_mapping.x] = idx.x; + } + if (bias_sizes.y > 1) { + bias_pos[bias_axis_mapping.y] = idx.y; + } -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +FloatMatrix matmul_partial(const ivec4 out_idx_tl) { + FloatMatrix results; + for (int i = 0; i < TILE_ROWS; i++) { + for (int j = 0; j < FOUR; j++) { +#ifdef BATCH_MODE + for (int k = 0; k < FOUR; k++) { + results.data[i][j][k] = 0.0f; + } +#else + results.data[i][j] = 0.0f; +#endif // BATCH_MODE + } + } + vec4 mat1_tensor_partial_load[TILE_ROWS]; + vec4 mat2_tensor_partial_load[FOUR]; + +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_mapping.x; + const int mat2_row_axis = mat2_axis_mapping.y; +#else + const int mat2_k_axis = mat2_axis_mapping.y; + const int mat2_row_axis = mat2_axis_mapping.x; +#endif // MAT2_IS_TRANSPOSED + +#ifdef BATCH_MODE + for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { + if (out_idx_tl.z + batch_idx >= out_sizes.z) { + break; + } +#endif // BATCH_MODE + for (int k = 0; k < mat1_sizes.x; k+=4) { + const int k_div4 = k >> 2; + // read and cache (4 x TILE_ROWS) tile of mat1 + for (int r = 0; r < TILE_ROWS; r++) { + ivec3 mat1_pos = ivec3(0); + mat1_pos[mat1_axis_mapping.x] = k_div4; + mat1_pos[mat1_axis_mapping.y] = out_idx_tl.y + r; +#ifdef BATCH_MODE + mat1_pos[mat1_axis_mapping.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); + } - if (any(greaterThanEqual(pos, out_limits))) { - return; + // read and cache (4 x 4) tile of mat2 + for (int r = 0; r < FOUR; ++r) { + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = k_div4; + mat2_pos[mat2_row_axis] = out_idx_tl.x + r; +#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) + mat2_pos[mat2_axis_mapping.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); + } + + // perform partial dot products and add partial result to results + for (int out_row = 0; out_row < TILE_ROWS; out_row++) { + for (int out_col = 0; out_col < FOUR; out_col++) { +#ifdef BATCH_MODE + results.data[out_row][out_col][batch_idx] += +#else + results.data[out_row][out_col] += +#endif // BATCH_MODE + dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); + } + } } +#ifdef BATCH_MODE + } +#endif // BATCH_MODE + + return results; +} - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, 
pos.z); - - vec4 self_texel = get_texel_C_packed( - im_self, - out_pos, - self_sizes.x == 1, - self_sizes.y == 1); - - // results is in transposed order w.r.t. the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r][0], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][1], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][2], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); +// +// Write result matrix to output (3D matmul) +// + +void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { + ivec3 out_pos = to_texture_pos( + out_idx_tl, out_sizes, out_axis_mapping, out_packed_dim); + + for (int tile_c = 0; + tile_c < TILE_ROWS; + tile_c++, out_pos[out_axis_mapping.y]++) { + out_pos[out_axis_mapping.x] = out_idx_tl.x; + + for (int tile_r = 0; + tile_r < FOUR; + tile_r++, out_pos[out_axis_mapping.x]++) { + +#ifdef HAS_BIAS + ivec2 bias_idx; + bias_idx[bias_axis_mapping.x] = out_pos[out_axis_mapping.x]; + bias_idx[bias_axis_mapping.y] = out_pos[out_axis_mapping.y]; + float bias_val = get_texel_C_packed(bias_idx).x; +#ifdef BATCH_MODE + vec4 bias_texel = vec4(bias_val); +#else + vec4 bias_texel = vec4(bias_val, 0, 0, 0); +#endif // BATCH_MODE +#endif // HAS_BIAS + +#ifdef BATCH_MODE + vec4 out_texel = vec4( + results.data[tile_c][tile_r][0], + results.data[tile_c][tile_r][1], + results.data[tile_c][tile_r][2], + results.data[tile_c][tile_r][3]); +#else + vec4 out_texel = vec4( + results.data[tile_c][tile_r], + 0.0, + 0.0, + 0.0); +#endif // BATCH_MODE + +#ifdef HAS_BIAS + imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); +#else + imageStore(out_tensor, out_pos, out_texel); +#endif // HAS_BIAS } } } + +void main() { + // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of + // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) + // tile of output elements will be computed. Note the sizes are written in + // (W x H x C) format. 
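  // Worked example (illustrative note, not part of the shader): with
  // TILE_ROWS = 4 and BATCH_MODE off, the thread with tile_idx = (1, 2, 0)
  // gets out_idx_topleft = (1*4, 2*4, 0, 0) = (W=4, H=8, C=0, N=0) and writes
  // the 4 (W) x 4 (H) output block covering W in [4, 7] and H in [8, 11].
  // In BATCH_MODE the same thread additionally covers C in [4*z, 4*z + 3].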
+ const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); + + // Calculate the tensor index of the top left element in the output tile + const ivec4 out_idx_topleft = ivec4( + tile_idx.x * 4, + tile_idx.y * TILE_ROWS, +#ifdef BATCH_MODE + tile_idx.z * 4, +#else + tile_idx.z, +#endif // BATCH_MODE + 0); + + // If the top left element is already out of range, then skip + if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { + return; + } + + FloatMatrix results = matmul_partial(out_idx_topleft); + + write_results_C_packed(out_idx_topleft, results); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml index b958d3b9543..c82c2003d20 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml @@ -7,24 +7,37 @@ addmm_optimized: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - PACKING: C_packed MAT2_IS_TRANSPOSED: false BATCH_MODE: false - TILE_ROW: tile_row_4 + TILE_ROWS: 4 + HAS_BIAS: true generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 + TILE_ROWS: + - VALUE: 4 + SUFFIX: tile_row_4 + - VALUE: 2 + SUFFIX: tile_row_2 DTYPE: - VALUE: float - VALUE: half shader_variants: - NAME: addmm_optimized + - NAME: matmul_optimized + HAS_BIAS: false - NAME: linear_optimized MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false - NAME: batch_addmm_optimized BATCH_MODE: true + - NAME: batch_matmul_optimized + BATCH_MODE: true + HAS_BIAS: false - NAME: batch_linear_optimized MAT2_IS_TRANSPOSED: true BATCH_MODE: true + - NAME: batch_matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + BATCH_MODE: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl deleted file mode 100644 index 8634371a7b4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - // results is in transposed order w.r.t. the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - results.data[idx_c][idx_r][0], - results.data[idx_c][idx_r][1], - results.data[idx_c][idx_r][2], - results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml deleted file mode 100644 index 9268d5a25aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -matmul_optimized: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - PACKING: C_packed - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROW: tile_row_4 - generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_optimized - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 63b60bf52f7..14c814b084a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -174,10 +174,19 @@ void add_addmm_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); utils::uvec3 global_size; + + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. 
The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `mapped_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -191,14 +200,18 @@ void add_addmm_optimized_node( {{mat1_W_packed, mat2_packed, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), + graph.axis_mapping_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_mapping_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_mapping_ubo(mat2_packed), graph.sizes_ubo(self), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_mapping_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index a25a602e38f..07618239a65 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -181,12 +181,21 @@ void add_matmul_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] utils::uvec3 global_size; if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `mapped_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. 
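  // Illustrative note (not part of this diff): mapped_extents() reorders the
  // raw image_extents() through the tensor's axis mapping so that element 0
  // is always the extent along W, element 1 along H, and element 2 along C.
  // For a hypothetical texture with raw extents {6, 8, 2} and axis mapping
  // {1, 0, 2} (W stored along texture axis 1, H along axis 0),
  // mapped_extents() is {8, 6, 2}, and divup_vec(..., {4, 2, 1}) below would
  // yield a global size of {2, 3, 2} workgroups.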
+ global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(graph.mapped_extents_of(out), {4, 4, 1}); } + utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -199,12 +208,15 @@ void add_matmul_optimized_node( {{mat1_W_packed, mat2_packed}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_mapping_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_mapping_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_mapping_ubo(mat2_packed), }, // Specialization Constants - {}, + {graph.packed_dim_whcn_idx_of(out)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); From cba5bee4bae89eebc60958d030cc09e0d9257e36 Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Tue, 10 Sep 2024 17:24:57 -0700 Subject: [PATCH 304/531] fbshipit-source-id: f63634ba171da01328849d84552b125b829403e8 From ca889fb59b06ec8bf07d4c0c56fed2b59d0d0a37 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Tue, 10 Sep 2024 17:15:57 -0700 Subject: [PATCH 305/531] Minibench use model_dir instead (#5250) Summary: We specify a model dir, not model path. It's easier to update test spec Pull Request resolved: https://github.com/pytorch/executorch/pull/5250 Reviewed By: huydhn Differential Revision: D62473641 Pulled By: kirklandsign fbshipit-source-id: 40864831de9960fe29b101683ef7182e2f56fe7b --- .../org/pytorch/minibench/BenchmarkActivity.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java index e9599dd3518..a79f668f80b 100644 --- a/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ b/extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -11,8 +11,10 @@ import android.app.Activity; import android.content.Intent; import android.os.Bundle; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.Arrays; import org.pytorch.executorch.Module; public class BenchmarkActivity extends Activity { @@ -20,13 +22,19 @@ public class BenchmarkActivity extends Activity { protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); Intent intent = getIntent(); - String modelPath = intent.getStringExtra("model_path"); + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); + int numIter = intent.getIntExtra("num_iter", 10); // TODO: Format the string with a parsable format StringBuilder resultText = new StringBuilder(); - Module module = Module.load(modelPath); + Module module = Module.load(model.getPath()); for (int i = 0; i < numIter; i++) { long start = System.currentTimeMillis(); module.forward(); From e4d72ce60c45e382e20e48fad3980dad82abed53 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Tue, 10 Sep 2024 17:34:08 -0700 Subject: [PATCH 306/531] Update setup.sh for LlamaDemo (#5235) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5235 Reviewed By: cmodi-meta, shoumikhin 
Differential Revision: D62468267 Pulled By: kirklandsign fbshipit-source-id: d64f28cb7c6c97853bbb557af63c1f6937b3626d --- examples/demo-apps/android/LlamaDemo/setup.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 78816680bc7..5e65929426b 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -37,6 +38,7 @@ cmake examples/models/llama2 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/examples/models/llama2 @@ -48,6 +50,7 @@ cmake extension/android \ -DANDROID_ABI="${ANDROID_ABI}" \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android From d423131b81393ed8dae6a180ebce81dfe142d812 Mon Sep 17 00:00:00 2001 From: Riandy Riandy Date: Tue, 10 Sep 2024 17:41:35 -0700 Subject: [PATCH 307/531] Android app UI/flow improvements (#5241) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5241 - Add default system prompt - Set temperature to 0 - Load model directly upon click Reviewed By: cmodi-meta, kirklandsign Differential Revision: D62472502 fbshipit-source-id: 8ecc88ee4474afa50658e93955c49ff0f3eef745 --- .../java/com/example/executorchllamademo/PromptFormat.java | 1 + .../com/example/executorchllamademo/SettingsActivity.java | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 640d3782128..36e738c3d0e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -13,6 +13,7 @@ public class PromptFormat { public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; + public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; public static String getSystemPromptTemplate(ModelType modelType) { switch (modelType) { diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 5f1fc96e1ac..0736c8cda94 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -43,7 
+43,7 @@ public class SettingsActivity extends AppCompatActivity { public SettingsFields mSettingsFields; private DemoSharedPreferences mDemoSharedPreferences; - public static double TEMPERATURE_MIN_VALUE = 0.1; + public static double TEMPERATURE_MIN_VALUE = 0.0; @Override protected void onCreate(Bundle savedInstanceState) { @@ -120,6 +120,7 @@ private void setupLoadModelButton() { public void onClick(DialogInterface dialog, int whichButton) { mSettingsFields.saveLoadModelAction(true); mLoadModelButton.setEnabled(false); + onBackPressed(); } }) .setNegativeButton(android.R.string.no, null) @@ -208,8 +209,7 @@ public void afterTextChanged(Editable s) { new DialogInterface.OnClickListener() { public void onClick(DialogInterface dialog, int whichButton) { // Clear the messageAdapter and sharedPreference - mSystemPromptEditText.setText( - PromptFormat.getSystemPromptTemplate(mModelType)); + mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); } }) .setNegativeButton(android.R.string.no, null) From 7942d2cf3e8fd3422896df6f5b53abff5772365c Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 10 Sep 2024 18:14:52 -0700 Subject: [PATCH 308/531] Allow core aten op exception list (#5237) Summary: Currently when a non-core ATen operator shows up in the exported graph, `to_edge()` will fail and the only option is to disable IR validity check by setting `_check_ir_validity=False`. However this is unsafe to do, instead we should still run the rest of the checks. This PR adds support to allow users to bypass core ATen ops check, by passing a list of non-core ATen ops into `to_edge()`. Note that: * This is different than `ops_set_to_not_decompose` in `to_edge_transform_and_lower`, as the ops in `_core_aten_ops_exception_list` are not intended to be kept but more likely showing up because of missing decompositions or missing core ATen tag in `native_functions.yaml`. For this reason, we are combining two lists (`ops_set_to_not_decompose` and `_core_aten_ops_exception_list`) and pass to verifier. * I updated the error log to encourage people to use `_core_aten_ops_exception_list` instead of using `_check_ir_validity=False`. Pull Request resolved: https://github.com/pytorch/executorch/pull/5237 Test Plan: Added unit test Reviewed By: tarun292 Differential Revision: D62469015 Pulled By: larryliu0820 fbshipit-source-id: 1abb1b4fbbfdf3eb5e64e82e2035c7f93cf5b153 --- exir/capture/_config.py | 7 ++- exir/program/_program.py | 31 ++++++--- exir/program/test/test_program.py | 37 ++++++++++- exir/verification/verifier.py | 100 ++++++++++++++++++++---------- 4 files changed, 130 insertions(+), 45 deletions(-) diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 2d0a6c4ca80..11a0d6d069d 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -5,10 +5,11 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe - from dataclasses import dataclass, field from typing import Dict, List, Optional, Union +import torch + from executorch.exir.dynamic_shape import DynamicMemoryPlanningMode from executorch.exir.pass_manager import PassType from executorch.exir.passes import MemoryPlanningPass, ToOutVarPass @@ -38,6 +39,10 @@ class EdgeCompileConfig: _check_ir_validity: bool = True # TODO(larryliu): remove this _use_edge_ops: bool = True + # Allow core ATen ops check to be skipped for certain ops, but continue with the rest of the checks. 
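    # Illustrative usage note (not part of this diff): with this field, a
    # graph containing a non-core ATen op (e.g. aten.linalg_vector_norm) can
    # still go through to_edge() with IR validation enabled, along the lines
    # of the unit test added later in this patch:
    #
    #   edge = to_edge(
    #       exported_program,
    #       compile_config=EdgeCompileConfig(
    #           _check_ir_validity=True,
    #           _core_aten_ops_exception_list=[
    #               torch.ops.aten.linalg_vector_norm.default
    #           ],
    #       ),
    #   )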
+ _core_aten_ops_exception_list: List[torch._ops.OpOverload] = field( + default_factory=list + ) _skip_type_promotion: bool = False # TODO(gasoonjia): remove this # TODO(T192537614): reenanle dim order as default diff --git a/exir/program/_program.py b/exir/program/_program.py index 1339760f215..6b72d190f9d 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -573,6 +573,9 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram": EXIRATenDialectVerifier()(ep.exported_program.graph_module) except ExportError: logging.info( + "If a particular operator failed core ATen IR check, please consider adding it to the exception list. " + "Add the operator to _core_aten_ops_exception_list in EdgeCompileConfig. This is the recommended way " + "to resolve this type of failure, so that the rest of the IR validation check can still be performed.\n" "If you'd like to disable IR validation checking, please set _check_ir_validity in EdgeCompileConfig, " "like *.to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))." ) @@ -590,7 +593,11 @@ def _to_edge(ep, config: EdgeCompileConfig) -> "ExirExportedProgram": module_call_graph=ep.exported_program.module_call_graph, example_inputs=ep.exported_program.example_inputs, constants=ep.exported_program.constants, - verifiers=[get_aten_verifier(enable=config._check_ir_validity)], + verifiers=[ + get_aten_verifier( + config=config, + ) + ], ), False, ) @@ -698,10 +705,13 @@ def _generate_edge_program( program: ExportedProgram, ops_set_to_not_decompose: Optional[List[torch._ops.OpOverload]] = None, ) -> ExportedProgram: - if config._check_ir_validity: try: - EXIRATenDialectVerifier(ops_set_to_not_decompose)(program.graph_module) + EXIRATenDialectVerifier( + edge_compile_config=config, + class_only=False, + exception_list=ops_set_to_not_decompose, + )(program.graph_module) except ExportError as e: logging.info(f"Input program {name} is not in ATen dialect.") raise e @@ -1020,13 +1030,8 @@ def to_edge_transform_and_lower( edge_manager = edge_manager.to_backend({name: curr_partitioner}) for name, program in edge_manager._edge_programs.items(): - if config._check_ir_validity: - EXIREdgeDialectVerifier( - edge_compile_config=config, - class_only=True, - )()(program.graph_module) - ops_set_to_not_decompose = set() + ops_set_to_not_decompose: Set[torch._ops.OpOverload] = set() partitioners = partitioner.get(name, []) for curr_partitioner in partitioners: curr_op_set, check_op_support = curr_partitioner.ops_to_not_decompose( @@ -1042,6 +1047,13 @@ def to_edge_transform_and_lower( generate_error=True, ) + if config._check_ir_validity: + EXIREdgeDialectVerifier( + edge_compile_config=config, + class_only=True, + exception_list=list(ops_set_to_not_decompose), + )()(program.graph_module) + return edge_manager @@ -1107,6 +1119,7 @@ def __init__( self.compile_config = compile_config or EdgeCompileConfig() if not isinstance(edge_programs, dict): edge_programs = {"forward": edge_programs} + for name, program in edge_programs.items(): try: EXIREdgeDialectVerifier( diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py index 4d2f5dfd699..73f023e778b 100644 --- a/exir/program/test/test_program.py +++ b/exir/program/test/test_program.py @@ -531,11 +531,14 @@ def test_edge_manager_dialect(self): ) self.assertTrue(edge_manager.exported_program().dialect == "EDGE") - def _test_edge_dialect_verifier(self, callable, validate_ir=True): + def _test_edge_dialect_verifier( + self, callable, validate_ir=True, exception_list=None + ): 
from executorch.exir import EdgeCompileConfig edge_compile_config = EdgeCompileConfig( _check_ir_validity=validate_ir, + _core_aten_ops_exception_list=exception_list, ) # pre-autograd export. eventually this will become torch.export one = torch.ones(1, dtype=torch.float) @@ -681,3 +684,35 @@ def count_nodes(graph_module, target): ), 1, ) + + def test_edge_dialect_non_core_aten_ops(self): + class LinalgNorm(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.linalg.norm(x) + + from torch._export.verifier import SpecViolationError + + input = torch.arange(9, dtype=torch.float) - 4 + ep = torch.export.export(LinalgNorm(), (input,)) + + # aten::linalg_norm is not a core op, so it should error out + with self.assertRaises(SpecViolationError): + _ = to_edge(ep, compile_config=EdgeCompileConfig(_check_ir_validity=True)) + + # with exception list, it should not error out + try: + # This should not raise error + _ = to_edge( + ep, + compile_config=EdgeCompileConfig( + _check_ir_validity=True, + _core_aten_ops_exception_list=[ + torch.ops.aten.linalg_vector_norm.default + ], + ), + ) + except SpecViolationError: + self.fail("Should not error out on linalg_vector_norm op") diff --git a/exir/verification/verifier.py b/exir/verification/verifier.py index 8b6ec91dd3b..b519e20393a 100644 --- a/exir/verification/verifier.py +++ b/exir/verification/verifier.py @@ -52,12 +52,6 @@ def _check_valid_dim_order_ops(op, use_dim_order) -> None: class EXIRATenDialectVerifierBase(Verifier): dialect = "OLD_EXIR_ATEN_DISABLED" - def __init__( - self, exception_list: Optional[List[torch._ops.OpOverload]] = None - ) -> None: - super().__init__() - self._exception_list = exception_list if exception_list else [] - def allowed_getattr_types(self) -> Tuple[Type[Any], ...]: return ( torch.fx.GraphModule, @@ -78,38 +72,68 @@ def __call__(self, *args, **kwargs): raise RuntimeError("") -class EXIRATenDialectVerifier(EXIRATenDialectVerifierBase): - dialect = "OLD_EXIR_ATEN" +def EXIRATenDialectVerifier( # noqa: C901 + edge_compile_config: Optional[EdgeCompileConfig] = None, + class_only: bool = False, + exception_list: Optional[List[torch._ops.OpOverload]] = None, +): + """ + Returns a verifier class that runs ATen dialect specific checks on the graph module. + """ + # merge the exception list from edge_compile_config and exception_list + if edge_compile_config and edge_compile_config._core_aten_ops_exception_list: + exception_list = edge_compile_config._core_aten_ops_exception_list + ( + exception_list or [] + ) - def _get_exception_list(self) -> List[torch._ops.OpOverload]: - exception_list = [ - torch.ops.aten.mkldnn_rnn_layer.default, - torch.ops.aten._upsample_bilinear2d_aa.default, - torch.ops.aten.quantize_per_tensor.default, - torch.ops.aten.dequantize.self, - torch.ops.aten.max.default, # TODO(T188268054) - torch.ops.aten.min.default, # TODO(T188268054) - torch.ops.aten.full_like.default, # TODO(T183507359) - ] - exception_list += self._exception_list + class _EXIRATenDialectVerifier(EXIRATenDialectVerifierBase): + dialect = "OLD_EXIR_ATEN" - return exception_list + def __init__(self) -> None: + super().__init__() + # Note: here we are using the exception list passed from EXIRATenDialectVerifier function! + self._exception_list = exception_list if exception_list else [] - def check_valid_op(self, op): - if isinstance(op, OpOverload): - # TODO These special ops should be removable easily. 
- if op.namespace != "aten" or op in self._get_exception_list(): - return - if torch.Tag.core not in op.tags and torch.Tag.view_copy not in op.tags: - # NOTE(qihan): whether view_copy operators are marked as canonical is still under - # discussion. - raise SpecViolationError( - f"Operator {op.__module__}.{op.__name__} is not Aten Canonical." - ) + def _get_exception_list(self) -> List[torch._ops.OpOverload]: + exception_list = [ + torch.ops.aten.mkldnn_rnn_layer.default, + torch.ops.aten._upsample_bilinear2d_aa.default, + torch.ops.aten.quantize_per_tensor.default, + torch.ops.aten.dequantize.self, + torch.ops.aten.max.default, # TODO(T188268054) + torch.ops.aten.min.default, # TODO(T188268054) + torch.ops.aten.full_like.default, # TODO(T183507359) + ] + exception_list += self._exception_list + return exception_list -def get_aten_verifier(enable: bool = True): - return EXIRATenDialectVerifier if enable else EXIRATenDialectVerifierBase + def check_valid_op(self, op): + if isinstance(op, OpOverload): + # TODO These special ops should be removable easily. + if op.namespace != "aten" or op in self._get_exception_list(): + return + if torch.Tag.core not in op.tags and torch.Tag.view_copy not in op.tags: + # NOTE(qihan): whether view_copy operators are marked as canonical is still under + # discussion. + raise SpecViolationError( + f"Operator {op.__module__}.{op.__name__} is not Aten Canonical." + ) + + ret = _EXIRATenDialectVerifier + if not class_only: + ret = ret() + return ret + + +def get_aten_verifier(config: EdgeCompileConfig): + return ( + EXIRATenDialectVerifier( + class_only=True, exception_list=config._core_aten_ops_exception_list + ) + if config._check_ir_validity + else EXIRATenDialectVerifierBase + ) def _get_inputs(graph_module: GraphModule) -> List[Optional[FakeTensor]]: @@ -160,6 +184,12 @@ def EXIREdgeDialectVerifier( # noqa: C901 class_only: bool = False, exception_list: Optional[List[torch._ops.OpOverload]] = None, ): + # merge the exception list from edge_compile_config and exception_list + if edge_compile_config and edge_compile_config._core_aten_ops_exception_list: + exception_list = edge_compile_config._core_aten_ops_exception_list + ( + exception_list or [] + ) + class _EXIREdgeDialectVerifier(Verifier): dialect = "EDGE" @@ -170,7 +200,9 @@ def __init__(self) -> None: self.check_edge_ops = _edge_compile_config._use_edge_ops self.use_dim_order = not _edge_compile_config._skip_dim_order - self.aten_op_verifier = EXIRATenDialectVerifier(exception_list) + self.aten_op_verifier = EXIRATenDialectVerifier( + exception_list=exception_list + ) self.check_valid_aten_op = self.aten_op_verifier.check_valid_op if self.check_edge_ops: From 69aed24f09205ebb7d1ec8c12fb0d8b5a1423772 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Tue, 10 Sep 2024 18:16:03 -0700 Subject: [PATCH 309/531] link whole quantized_ops_lib (#5253) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5253 Reviewed By: shoumikhin Differential Revision: D62474497 Pulled By: kirklandsign fbshipit-source-id: 408cd0340dce706b758097bfd6f9606bfe506460 --- extension/android/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index c9396a55879..ab1f3650102 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -98,6 +98,7 @@ endif() if(TARGET quantized_kernels) list(APPEND link_libraries quantized_kernels quantized_ops_lib) + target_link_options_shared_lib(quantized_ops_lib) endif() 
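+# target_link_options_shared_lib(quantized_ops_lib) above asks for the whole
+# archive to be linked (per the commit title "link whole quantized_ops_lib"),
+# presumably so that the quantized kernel registration objects are kept even
+# though nothing references them directly. As a rough sketch, and this is an
+# assumption about the underlying flags rather than something stated in this
+# patch, the effect is similar to:
+#   -Wl,--whole-archive libquantized_ops_lib.a -Wl,--no-whole-archive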
target_include_directories( From 41bc1ce4c0e0f8c341fa7e7738b6210519d880d9 Mon Sep 17 00:00:00 2001 From: Lunwen He Date: Tue, 10 Sep 2024 18:26:36 -0700 Subject: [PATCH 310/531] spinquant in eager mode (#5125) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5125 This PR adds the option to export the model with spin quant on gpu. Reviewed By: mergennachin Differential Revision: D62042861 fbshipit-source-id: 74274fcb3408e5f6b23e0c924272385090da03d2 --- examples/models/llama2/TARGETS | 2 + examples/models/llama2/export_llama_lib.py | 109 +++++++++++------- .../source_transformation/spin_quant.py | 55 +++++++++ 3 files changed, 124 insertions(+), 42 deletions(-) create mode 100644 examples/models/llama2/source_transformation/spin_quant.py diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index ae3e1e00f98..f1c56a5bda3 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -75,6 +75,7 @@ runtime.python_library( "source_transformation/rms_norm.py", "source_transformation/rope.py", "source_transformation/sdpa.py", + "source_transformation/spin_quant.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama2", @@ -85,6 +86,7 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform", "//caffe2:torch", "//executorch/examples/models:model_base", "//executorch/examples/models:models", diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 611bf16428d..dd5822c23f6 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -16,7 +16,7 @@ from enum import Enum from json import JSONDecodeError from pathlib import Path -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union import pkg_resources @@ -340,6 +340,15 @@ def build_args_parser() -> argparse.ArgumentParser: required=False, default="SM8650", ) + + parser.add_argument( + "-sq", + "--use_spin_quant", + type=str, + default=None, + choices=["cuda", "native"], + help="Use SpinQuant for better quantization performance. 
Only support cuda and native.", + ) return parser @@ -411,46 +420,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: else: dtype_override = None - # source transforms - transforms = [] - if args.quantization_mode: - modelname = f"{modelname}_q" - transforms.append( - get_quant_weight_transform(args, dtype_override, verbose_export()) - ) - - if args.embedding_quantize: - modelname = f"{modelname}_e" - transforms.append(get_quant_embedding_transform(args)) - - if args.expand_rope_table: - transforms.append(materialze_broadcast_of_rope_freq_cis) - - if args.use_sdpa_with_kv_cache: - transforms.append(replace_sdpa_with_custom_op) - - if args.use_kv_cache: - if args.qnn: - # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` - from executorch.backends.qualcomm.utils.utils import ( - convert_linear_to_conv2d, - ) - - transforms.append(replace_kv_cache_with_simple_kv_cache) - transforms.append(replace_sdpa_with_flex_sdpa) - transforms.append(replace_causal_mask) - transforms.append(replace_rms_norm_with_native_rms_norm) - if args.optimized_rotation_path: - transforms.append(fuse_layer_norms) - transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) - transforms.append(convert_linear_to_conv2d) - - elif args.coreml or args.mps: - # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition - # to get free perf gain. - transforms.append(replace_sdpa_with_simple_sdpa) - transforms.append(replace_causal_mask) - return ( _load_llama_model( modelname=modelname, @@ -474,7 +443,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: ) .set_output_dir(output_dir_path) .to_dtype(dtype_override) - .source_transform(transforms) + .source_transform(_get_source_transforms(modelname, dtype_override, args)) ) @@ -763,3 +732,59 @@ def _load_llama_model( ), args=args, ) + + +def _get_source_transforms( + modelname: str, dtype_override: Optional[DType], args +) -> List[Callable[[torch.nn.Module], torch.nn.Module]]: + transforms = [] + if args.quantization_mode: + modelname = f"{modelname}_q" + transforms.append( + get_quant_weight_transform(args, dtype_override, verbose_export()) + ) + + if args.embedding_quantize: + modelname = f"{modelname}_e" + transforms.append(get_quant_embedding_transform(args)) + + if args.expand_rope_table: + transforms.append(materialze_broadcast_of_rope_freq_cis) + + if args.use_sdpa_with_kv_cache: + transforms.append(replace_sdpa_with_custom_op) + + if args.use_kv_cache: + if args.qnn: + # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` + from executorch.backends.qualcomm.utils.utils import ( + convert_linear_to_conv2d, + ) + + transforms.append(replace_kv_cache_with_simple_kv_cache) + transforms.append(replace_sdpa_with_flex_sdpa) + transforms.append(replace_causal_mask) + transforms.append(replace_rms_norm_with_native_rms_norm) + if args.optimized_rotation_path: + transforms.append(fuse_layer_norms) + transforms.append(get_model_with_r1_r2(args.optimized_rotation_path)) + transforms.append(convert_linear_to_conv2d) + + elif args.coreml or args.mps: + # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition + # to get free perf gain. 
+ transforms.append(replace_sdpa_with_simple_sdpa) + transforms.append(replace_causal_mask) + + if args.use_spin_quant: + if args.use_spin_quant == "cuda": + from .source_transformation.spin_quant import ( + inject_fast_hadamard_transform_cuda_for_spin_quant, + ) + + transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant) + + elif args.use_spin_quant == "native": + raise NotImplementedError("native SpinQuant is not implemented yet.") + + return transforms diff --git a/examples/models/llama2/source_transformation/spin_quant.py b/examples/models/llama2/source_transformation/spin_quant.py new file mode 100644 index 00000000000..7b38312c182 --- /dev/null +++ b/examples/models/llama2/source_transformation/spin_quant.py @@ -0,0 +1,55 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +# Helper functions for tranforming the model to be able to run SpinQuant. +# See https://github.com/facebookresearch/SpinQuant for more details about SpinQuant. + +import torch + +import torch.nn.functional as F + +from executorch.examples.models.llama2.llama_transformer import FeedForward +from torch import nn + + +def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module): + """ + SpinQuant needs two Hadmard matrixes: R3 and R4. Here we are only injecting R4 in the feed forward layer. + R3 needs to be injected as well when KV cache quantization is enabled. + """ + try: + from fast_hadamard_transform import hadamard_transform + except ImportError: + raise ImportError( + "Please install fast-hadamard-transform: pip install fast-hadamard-transform" + ) + + class FeedForwardCustom(nn.Module): + def __init__(self, w1, w2, w3): + super().__init__() + self.w1 = w1 + self.w2 = w2 + self.w3 = w3 + + def forward(self, x): + w = F.silu(self.w1(x)) * self.w3(x) + n = w.shape[-1] + return self.w2(hadamard_transform(w.contiguous()) / torch.tensor(n).sqrt()) + + for name, child in module.named_children(): + if isinstance(child, FeedForward): + setattr(module, name, FeedForwardCustom(child.w1, child.w2, child.w3)) + else: + _inject_fast_hadamard_transform_cuda_for_spin_quant(child) + + +def inject_fast_hadamard_transform_cuda_for_spin_quant( + module: torch.nn.Module, +) -> torch.nn.Module: + _inject_fast_hadamard_transform_cuda_for_spin_quant(module) + return module From d7a7ec6e1d21d73d7463fbf45586464e298627e8 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Tue, 10 Sep 2024 18:51:57 -0700 Subject: [PATCH 311/531] Updated the workflow to upload models to S3 (#5232) Summary: The upload should not be all or nothing ([example flow](https://github.com/pytorch/executorch/actions/runs/10783442883)). It should upload exported models to S3 if there is at least one artifact. 
Pull Request resolved: https://github.com/pytorch/executorch/pull/5232 Test Plan: - Android: https://github.com/pytorch/executorch/actions/runs/10800212616 - iOS: https://github.com/pytorch/executorch/actions/runs/10799346884 Reviewed By: huydhn Differential Revision: D62459630 Pulled By: guangy10 fbshipit-source-id: cbf6c1c9e030089096d126b91ec10a936030e15b --- .github/workflows/android-perf.yml | 1 + .github/workflows/apple-perf.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c98fa98bb26..ba58435c69a 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -178,6 +178,7 @@ jobs: upload-models: needs: export-models runs-on: linux.2xlarge + if: always() # Continue this job regardless of previous job outcome steps: - name: Download the models from GitHub uses: actions/download-artifact@v3 diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index bb7fd7b9761..cb1b2b6a1b2 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -179,6 +179,7 @@ jobs: upload-models: needs: export-models runs-on: linux.2xlarge + if: always() # Continue this job regardless of previous job outcome steps: - name: Download the models from GitHub uses: actions/download-artifact@v3 From 7e374d762cf40b84cbc09ea8412dd2ac4d61f3b7 Mon Sep 17 00:00:00 2001 From: neuropilot-captain <76544501+neuropilot-captain@users.noreply.github.com> Date: Tue, 10 Sep 2024 19:25:33 -0700 Subject: [PATCH 312/531] Add model execution scripts and runner (#5217) Summary: Add execution scripts and runner for 8 OSS models Pull Request resolved: https://github.com/pytorch/executorch/pull/5217 Reviewed By: kirklandsign Differential Revision: D62479707 Pulled By: cccclai fbshipit-source-id: 81310dbb6b785ec59329110ebacb8208102e8597 --- backends/mediatek/CMakeLists.txt | 10 +- examples/mediatek/CMakeLists.txt | 38 +++ examples/mediatek/README.md | 36 +++ .../mediatek/aot_utils/oss_utils/utils.py | 73 +++++ .../mediatek/eval_utils/eval_oss_result.py | 198 ++++++++++++ .../mtk_oss_executor_runner.cpp | 302 ++++++++++++++++++ .../model_export_scripts/deeplab_v3.py | 124 +++++++ .../mediatek/model_export_scripts/edsr.py | 170 ++++++++++ .../model_export_scripts/inception_v3.py | 120 +++++++ .../model_export_scripts/inception_v4.py | 120 +++++++ .../model_export_scripts/mobilenet_v2.py | 121 +++++++ .../model_export_scripts/mobilenet_v3.py | 121 +++++++ .../mediatek/model_export_scripts/resnet18.py | 122 +++++++ .../mediatek/model_export_scripts/resnet50.py | 121 +++++++ examples/mediatek/requirements.txt | 2 + examples/mediatek/shell_scripts/export_oss.sh | 29 ++ 16 files changed, 1704 insertions(+), 3 deletions(-) create mode 100755 examples/mediatek/aot_utils/oss_utils/utils.py create mode 100755 examples/mediatek/eval_utils/eval_oss_result.py create mode 100755 examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp create mode 100755 examples/mediatek/model_export_scripts/deeplab_v3.py create mode 100755 examples/mediatek/model_export_scripts/edsr.py create mode 100755 examples/mediatek/model_export_scripts/inception_v3.py create mode 100755 examples/mediatek/model_export_scripts/inception_v4.py create mode 100755 examples/mediatek/model_export_scripts/mobilenet_v2.py create mode 100755 examples/mediatek/model_export_scripts/mobilenet_v3.py create mode 100755 examples/mediatek/model_export_scripts/resnet18.py create mode 100755 
examples/mediatek/model_export_scripts/resnet50.py create mode 100755 examples/mediatek/shell_scripts/export_oss.sh diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index 4b233d94f04..744b1193d5a 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -25,9 +25,13 @@ include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) -target_link_libraries( - neuron_backend PRIVATE executorch_no_prim_ops portable_ops_lib android log - ${NEURON_BUFFER_ALLOCATOR_LIB} +target_link_libraries(neuron_backend + PRIVATE + executorch_no_prim_ops + portable_ops_lib + android + log + ${NEURON_BUFFER_ALLOCATOR_LIB} ) target_sources( neuron_backend diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt index 2abee59759f..1d411f07ca7 100644 --- a/examples/mediatek/CMakeLists.txt +++ b/examples/mediatek/CMakeLists.txt @@ -75,6 +75,44 @@ if(${ANDROID}) ) target_compile_options(mtk_executor_runner PUBLIC ${_common_compile_options}) + set(_mtk_oss_executor_runner__srcs ${_executor_runner__srcs}) + list( + TRANSFORM + _mtk_oss_executor_runner__srcs + PREPEND + "${EXECUTORCH_SOURCE_DIR}/" + ) + list( + FILTER + _mtk_oss_executor_runner__srcs + EXCLUDE REGEX + ".*executor_runner.cpp$" + ) + list( + PREPEND + _mtk_oss_executor_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/executor_runner/mtk_oss_executor_runner.cpp + ) + + add_executable(mtk_oss_executor_runner ${_mtk_oss_executor_runner__srcs}) + + target_include_directories(mtk_oss_executor_runner + PUBLIC + ${_common_include_directories} + ${EXECUTORCH_ROOT}/cmake-android-out/third-party/gflags/include + ) + + target_link_libraries(mtk_oss_executor_runner + ${_executor_runner_libs} + executorch + neuron_backend + gflags + ) + target_compile_options(mtk_oss_executor_runner + PUBLIC + ${_common_compile_options} + ) + set(_mtk_llama_executor_runner__srcs ${_mtk_executor_runner__srcs}) list(FILTER _mtk_llama_executor_runner__srcs EXCLUDE REGEX ".*executor_runner.cpp$" diff --git a/examples/mediatek/README.md b/examples/mediatek/README.md index faca42fb50c..9727f2587fd 100644 --- a/examples/mediatek/README.md +++ b/examples/mediatek/README.md @@ -9,6 +9,8 @@ examples/mediatek ├── preformatter_templates # Model specific prompt preformatter templates ├── prompts # Calibration Prompts ├── tokenizers_ # Model tokenizer scripts + ├── oss_utils # Utils for oss models +├── eval_utils # Utils for eval oss models ├── model_export_scripts # Model specifc export scripts ├── models # Model definitions ├── llm_models # LLM model definitions @@ -44,6 +46,7 @@ pip3 install mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylin ``` ## AoT Flow +### llama ##### Note: Verify that localhost connection is available before running AoT Flow 1. Exporting Models to `.pte` - In the `examples/mediatek directory`, run: @@ -72,6 +75,14 @@ source shell_scripts/export_llama.sh +``` +- Argument Options: + - `model_name`: deeplabv3/edsr/inceptionv3/inceptionv4/mobilenetv2/mobilenetv3/resnet18/resnet50 + # Runtime ## Supported Chips @@ -100,6 +111,13 @@ adb push .pte Make sure to replace `` with the actual name of your model file. And, replace the `` with the desired detination on the device. 
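+For example, using one of the models exported by the scripts in this patch and the scratch directory used elsewhere in this guide:
+```bash
+adb push mobilenetV2_mtk.pte /data/local/tmp/
+```
+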
+##### Note: For oss models, please push additional files to your Android device +```bash +adb push mtk_oss_executor_runner +adb push input_list.txt +for i in input*bin; do adb push "$i" ; done; +``` + ### Executing the Model Execute the model on your Android device by running: @@ -111,3 +129,21 @@ adb shell "/data/local/tmp/mtk_executor_runner --model_path /data/local/tmp/` with the name of your model file and `` with the desired number of iterations to run the model. ##### Note: For llama models, please use `mtk_llama_executor_runner`. Refer to `examples/mediatek/executor_runner/run_llama3_sample.sh` for reference. +##### Note: For oss models, please use `mtk_oss_executor_runner`. +```bash +adb shell "/data/local/tmp/mtk_oss_executor_runner --model_path /data/local/tmp/.pte --input_list /data/local/tmp/input_list.txt --output_folder /data/local/tmp/output_" +adb pull "/data/local/tmp/output_ ./" +``` + +### Check oss result on PC +```bash +python3 eval_utils/eval_oss_result.py --eval_type --target_f --output_f +``` +For example: +``` +python3 eval_utils/eval_oss_result.py --eval_type piq --target_f edsr --output_f output_edsr +``` +- Argument Options: + - `eval_type`: topk/piq/segmentation + - `target_f`: folder contain golden data files. file name is `golden__0.bin` + - `output_f`: folder contain model output data files. file name is `output__0.bin` diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py new file mode 100755 index 00000000000..f447b2ac68f --- /dev/null +++ b/examples/mediatek/aot_utils/oss_utils/utils.py @@ -0,0 +1,73 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
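+# Rough usage sketch: the names below come from the model export scripts added
+# in this same patch (e.g. mobilenet_v2.py), not from this module itself.
+#
+#   instance = NhwcWrappedModel()
+#   build_executorch_binary(
+#       instance.eval(),
+#       (torch.randn(1, 224, 224, 3),),      # example NHWC input
+#       f"{args.artifact}/mobilenetV2_mtk",  # writes mobilenetV2_mtk.pte there
+#       inputs,                              # calibration data for PT2E quantization
+#       quant_dtype=Precision.A8W8,
+#   )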
+ +import os +from typing import Optional + +import torch +from executorch import exir +from executorch.backends.mediatek import ( + NeuropilotPartitioner, + NeuropilotQuantizer, + Precision, +) +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + + +def build_executorch_binary( + model, + inputs, + file_name, + dataset, + quant_dtype: Optional[Precision] = None, +): + if quant_dtype is not None: + quantizer = NeuropilotQuantizer() + quantizer.setup_precision(quant_dtype) + if quant_dtype not in Precision: + raise AssertionError(f"No support for Precision {quant_dtype}.") + + captured_model = torch._export.capture_pre_autograd_graph(model, inputs) + annotated_model = prepare_pt2e(captured_model, quantizer) + print("Quantizing the model...") + # calibration + for data in dataset: + annotated_model(*data) + quantized_model = convert_pt2e(annotated_model, fold_quantize=False) + aten_dialect = torch.export.export(quantized_model, inputs) + else: + aten_dialect = torch.export.export(model, inputs) + + from executorch.exir.program._program import to_edge_transform_and_lower + + edge_compile_config = exir.EdgeCompileConfig(_check_ir_validity=False) + # skipped op names are used for deeplabV3 model + neuro_partitioner = NeuropilotPartitioner( + [], + op_names_to_skip={ + "aten_convolution_default_106", + "aten_convolution_default_107", + }, + ) + edge_prog = to_edge_transform_and_lower( + aten_dialect, + compile_config=edge_compile_config, + partitioner=[neuro_partitioner], + ) + + exec_prog = edge_prog.to_executorch( + config=exir.ExecutorchBackendConfig(extract_constant_segment=False) + ) + with open(f"{file_name}.pte", "wb") as file: + file.write(exec_prog.buffer) + + +def make_output_dir(path: str): + if os.path.exists(path): + for f in os.listdir(path): + os.remove(os.path.join(path, f)) + os.removedirs(path) + os.makedirs(path) diff --git a/examples/mediatek/eval_utils/eval_oss_result.py b/examples/mediatek/eval_utils/eval_oss_result.py new file mode 100755 index 00000000000..3e599330b66 --- /dev/null +++ b/examples/mediatek/eval_utils/eval_oss_result.py @@ -0,0 +1,198 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
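+# Compares golden_<sampleId>_<outId>.bin files (written by the export scripts)
+# against output_<sampleId>_<outId>.bin files (written by mtk_oss_executor_runner).
+# --eval_type selects the metric, matching the argparse choices below:
+#   topk         : top-10 / top-50 classification accuracy
+#   piq          : PSNR / SSIM on 448x448x3 outputs (used for EDSR in the README example)
+#   segmentation : PA / MPA / MIoU and per-class IoU (matches the DeepLabV3 goldens)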
+ +import argparse +import json +import os + +import numpy as np +import piq +import torch + + +def check_data(target_f, predict_f): + target_files = os.listdir(target_f) + predict_files = os.listdir(predict_f) + if len(target_files) != len(predict_files): + raise RuntimeError( + "Data number in target folder and prediction folder must be same" + ) + + predict_set = set(predict_files) + for f in target_files: + # target file naming rule is golden_sampleId_outId.bin + # predict file naming rule is output_sampleId_outId.bin + pred_name = f.replace("golden", "output") + try: + predict_set.remove(pred_name) + except KeyError: + raise RuntimeError(f"Cannot find {pred_name} in {predict_f}") + + if predict_set: + target_name = next(predict_set).replace("output", "golden") + raise RuntimeError(f"Cannot find {target_name} in {target_f}") + + +def eval_topk(target_f, predict_f): + def solve(prob, target, k): + _, indices = torch.topk(prob, k=k, sorted=True) + golden = torch.reshape(target, [-1, 1]) + correct = golden == indices + if torch.any(correct): + return 1 + else: + return 0 + + target_files = os.listdir(target_f) + + cnt10 = 0 + cnt50 = 0 + for target_name in target_files: + pred_name = target_name.replace("golden", "output") + + pred_npy = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32) + target_npy = np.fromfile(os.path.join(target_f, target_name), dtype=np.int64)[0] + cnt10 += solve(torch.from_numpy(pred_npy), torch.from_numpy(target_npy), 10) + cnt50 += solve(torch.from_numpy(pred_npy), torch.from_numpy(target_npy), 50) + + print("Top10 acc:", cnt10 * 100.0 / len(target_files)) + print("Top50 acc:", cnt50 * 100.0 / len(target_files)) + + +def eval_piq(target_f, predict_f): + target_files = os.listdir(target_f) + + psnr_list = [] + ssim_list = [] + for target_name in target_files: + pred_name = target_name.replace("golden", "output") + hr = np.fromfile(os.path.join(target_f, target_name), dtype=np.float32) + hr = hr.reshape((1, 448, 448, 3)) + hr = np.moveaxis(hr, 3, 1) + hr = torch.from_numpy(hr) + + sr = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32) + sr = sr.reshape((1, 448, 448, 3)) + sr = np.moveaxis(sr, 3, 1) + sr = torch.from_numpy(sr).clamp(0, 1) + + psnr_list.append(piq.psnr(hr, sr)) + ssim_list.append(piq.ssim(hr, sr)) + + avg_psnr = sum(psnr_list).item() / len(psnr_list) + avg_ssim = sum(ssim_list).item() / len(ssim_list) + + print(f"Avg of PSNR is: {avg_psnr}") + print(f"Avg of SSIM is: {avg_ssim}") + + +def eval_segmentation(target_f, predict_f): + classes = [ + "Backround", + "Aeroplane", + "Bicycle", + "Bird", + "Boat", + "Bottle", + "Bus", + "Car", + "Cat", + "Chair", + "Cow", + "DiningTable", + "Dog", + "Horse", + "MotorBike", + "Person", + "PottedPlant", + "Sheep", + "Sofa", + "Train", + "TvMonitor", + ] + + target_files = os.listdir(target_f) + + def make_confusion(goldens, predictions, num_classes): + def histogram(golden, predict): + mask = golden < num_classes + hist = np.bincount( + num_classes * golden[mask].astype(int) + predict[mask], + minlength=num_classes**2, + ).reshape(num_classes, num_classes) + return hist + + confusion = np.zeros((num_classes, num_classes)) + for g, p in zip(goldens, predictions): + confusion += histogram(g.flatten(), p.flatten()) + + return confusion + + pred_list = [] + target_list = [] + for target_name in target_files: + pred_name = target_name.replace("golden", "output") + target_npy = np.fromfile(os.path.join(target_f, target_name), dtype=np.uint8) + target_npy = target_npy.reshape((224, 224)) + 
target_list.append(target_npy) + + pred_npy = np.fromfile(os.path.join(predict_f, pred_name), dtype=np.float32) + pred_npy = pred_npy.reshape((224, 224, len(classes))) + pred_npy = pred_npy.argmax(2).astype(np.uint8) + pred_list.append(pred_npy) + + eps = 1e-6 + confusion = make_confusion(target_list, pred_list, len(classes)) + + pa = np.diag(confusion).sum() / (confusion.sum() + eps) + mpa = np.mean(np.diag(confusion) / (confusion.sum(axis=1) + eps)) + iou = np.diag(confusion) / ( + confusion.sum(axis=1) + confusion.sum(axis=0) - np.diag(confusion) + eps + ) + miou = np.mean(iou) + cls_iou = dict(zip(classes, iou)) + + print(f"PA : {pa}") + print(f"MPA : {mpa}") + print(f"MIoU : {miou}") + print(f"CIoU : \n{json.dumps(cls_iou, indent=2)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--target_f", + help="folder of target data", + type=str, + required=True, + ) + + parser.add_argument( + "--out_f", + help="folder of model prediction data", + type=str, + required=True, + ) + + parser.add_argument( + "--eval_type", + help="Choose eval type from: topk, piq, segmentation", + type=str, + choices=["topk", "piq", "segmentation"], + required=True, + ) + + args = parser.parse_args() + + check_data(args.target_f, args.out_f) + + if args.eval_type == "topk": + eval_topk(args.target_f, args.out_f) + elif args.eval_type == "piq": + eval_piq(args.target_f, args.out_f) + elif args.eval_type == "segmentation": + eval_segmentation(args.target_f, args.out_f) diff --git a/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp new file mode 100755 index 00000000000..3a1ad1d863b --- /dev/null +++ b/examples/mediatek/executor_runner/mtk_oss_executor_runner.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 MediaTek Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * @file + * + * This tool can run ExecuTorch model files that only use operators that + * are covered by the portable kernels, with possible delegate to the + * test_backend_compiler_lib. + * + * It sets all input tensor data to ones, and assumes that the outputs are + * all fp32 tensors. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +static uint8_t method_allocator_pool[8 * 1024U * 1024U]; // 8 MB + +// Model Path +DEFINE_string( + model_path, + "model.pte", + "Model serialized in flatbuffer format. Default to 'model.pte'"); +DEFINE_string( + input_list, + "input_list.txt", + "Model input list. Default to 'input_list.txt'"); +DEFINE_string( + output_folder, + "outputs", + "Model output folder. 
Default to 'outputs'"); + +using namespace torch::executor; +using torch::executor::MemoryAllocator; +using torch::executor::util::BufferCleanup; +using torch::executor::util::FileDataLoader; +using namespace std::filesystem; + +int main(int argc, char** argv) { + runtime_init(); + + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (argc != 1) { + std::string msg = "Extra commandline args:"; + for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { + msg += std::string(" ") + argv[i]; + } + ET_LOG(Error, "%s", msg.c_str()); + return 1; + } + + // Create output folder + create_directories(FLAGS_output_folder); + + // Create a loader to get the data of the program file. There are other + // DataLoaders that use mmap() or point to data that's already in memory, and + // users can create their own DataLoaders to load from arbitrary sources. + const char* model_path = FLAGS_model_path.c_str(); + Result loader = FileDataLoader::from(model_path); + ET_CHECK_MSG( + loader.ok(), + "FileDataLoader::from() failed: 0x%" PRIx32, + (uint32_t)loader.error()); + + // Parse the program file. This is immutable, and can also be reused between + // multiple execution invocations across multiple threads. + Result program = Program::load(&loader.get()); + if (!program.ok()) { + ET_LOG(Error, "Failed to parse model file %s", model_path); + return 1; + } + ET_LOG(Info, "Model file %s is loaded.", model_path); + + // Use the first method in the program. + const char* method_name = nullptr; + { + const auto method_name_result = program->get_method_name(0); + ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); + method_name = *method_name_result; + } + ET_LOG(Info, "Using method %s", method_name); + + // MethodMeta describes the memory requirements of the method. + Result method_meta_result = program->method_meta(method_name); + ET_CHECK_MSG( + method_meta_result.ok(), + "Failed to get method_meta for %s: 0x%" PRIx32, + method_name, + (uint32_t)method_meta_result.error()); + + // + // The runtime does not use malloc/new; it allocates all memory using the + // MemoryManger provided by the client. Clients are responsible for allocating + // the memory ahead of time, or providing MemoryAllocator subclasses that can + // do it dynamically. + // + + // The method allocator is used to allocate all dynamic C++ metadata/objects + // used to represent the loaded method. This allocator is only used during + // loading a method of the program, which will return an error if there was + // not enough memory. + // + // The amount of memory required depends on the loaded method and the runtime + // code itself. The amount of memory here is usually determined by running the + // method and seeing how much memory is actually used, though it's possible to + // subclass MemoryAllocator so that it calls malloc() under the hood (see + // MallocMemoryAllocator). + // + // In this example we use a statically allocated memory pool. + MemoryAllocator method_allocator{ + MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; + + // The memory-planned buffers will back the mutable tensors used by the + // method. The sizes of these buffers were determined ahead of time during the + // memory-planning pasees. + // + // Each buffer typically corresponds to a different hardware memory bank. Most + // mobile environments will only have a single buffer. 
Some embedded + // environments may have more than one for, e.g., slow/large DRAM and + // fast/small SRAM, or for memory associated with particular cores. + std::vector> planned_buffers; // Owns the memory + std::vector> planned_spans; // Passed to the allocator + size_t num_memory_planned_buffers = + method_meta_result->num_memory_planned_buffers(); + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { + // .get() will always succeed because id < num_memory_planned_buffers. + size_t buffer_size = static_cast( + method_meta_result->memory_planned_buffer_size(id).get()); + ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size); + planned_buffers.push_back(std::make_unique(buffer_size)); + planned_spans.push_back({planned_buffers.back().get(), buffer_size}); + } + HierarchicalAllocator planned_memory( + {planned_spans.data(), planned_spans.size()}); + + // Assemble all of the allocators into the MemoryManager that the Executor + // will use. + MemoryManager memory_manager(&method_allocator, &planned_memory); + + // + // Load the method from the program, using the provided allocators. Running + // the method can mutate the memory-planned buffers, so the method should only + // be used by a single thread at at time, but it can be reused. + // + Result method = program->load_method(method_name, &memory_manager); + ET_CHECK_MSG( + method.ok(), + "Loading of method %s failed with status 0x%" PRIx32, + method_name, + (uint32_t)method.error()); + ET_LOG(Info, "Method loaded."); + + std::ifstream input_list(FLAGS_input_list); + ET_CHECK_MSG( + input_list.is_open(), + "Error: cannot open input file %s", + FLAGS_input_list.c_str()); + + auto split = [](std::string s, std::string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; + }; + + MethodMeta method_meta = method->method_meta(); + size_t num_inputs = method_meta.num_inputs(); + std::string file_path; + int inference_index = 0; + while (std::getline(input_list, file_path)) { + auto input_files = split(file_path, " "); + if (input_files.size() == 0) { + break; + } + ET_CHECK_MSG( + input_files.size() == num_inputs, + "Model expect %zu inputs but get %zu from input files", + num_inputs, + input_files.size()); + + // Prepare the inputs. + size_t num_allocated = 0; + ET_LOG(Info, "Number of inputs: %zu", num_inputs); + void** inputs = (void**)malloc(num_inputs * sizeof(void*)); + + for (size_t i = 0; i < num_inputs; i++) { + auto tag = method_meta.input_tag(i); + if (tag.get() != Tag::Tensor) { + ET_LOG(Debug, "Skipping malloc non-tensor input %zu", i); + continue; + } + Result tensor_meta = method_meta.input_tensor_meta(i); + const auto nbytes = tensor_meta->nbytes(); + // This input is a tensor. Allocate a buffer for it. + void* data_ptr = malloc(nbytes); + + // Read data from file + std::ifstream fin(input_files[i], std::ios::binary); + fin.seekg(0, fin.end); + size_t file_size = fin.tellg(); + + ET_CHECK_MSG( + file_size == nbytes, + "Input %zu size mismatch. 
file bytes: %zu, tensor bytes: %zu", + i, + file_size, + nbytes); + + fin.seekg(0, fin.beg); + fin.read(static_cast(data_ptr), file_size); + fin.close(); + inputs[num_allocated++] = data_ptr; + + // Set backend input + auto scalar_type = tensor_meta->scalar_type(); + auto sizes_raw = tensor_meta->sizes(); + auto dim = sizes_raw.size(); + auto dim_order_raw = tensor_meta->dim_order(); + std::vector sizes(sizes_raw.begin(), sizes_raw.end()); + std::vector dim_order(dim_order_raw.begin(), dim_order_raw.end()); + + TensorImpl impl = TensorImpl( + scalar_type, dim, sizes.data(), data_ptr, dim_order.data()); + + Tensor tensor(&impl); + Error ret = method->set_input(tensor, i); + if (ret != Error::Ok) { + ET_LOG(Error, "Failed to set input %zu: 0x%" PRIx32, i, (uint32_t)ret); + // The BufferCleanup will free the inputs when it goes out of scope. + BufferCleanup cleanup({inputs, num_allocated}); + return 1; + } + } + BufferCleanup({inputs, num_allocated}); + ET_LOG(Info, "Inputs prepared."); + + // Run the model. + auto before_exec = std::chrono::high_resolution_clock::now(); + Error status = Error::Ok; + status = method->execute(); + auto after_exec = std::chrono::high_resolution_clock::now(); + double elapsed_time = std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; + + ET_LOG(Info, "Inference took %f ms", elapsed_time); + ET_CHECK_MSG( + status == Error::Ok, + "Execution of method %s failed with status 0x%" PRIx32, + method_name, + (uint32_t)status); + ET_LOG(Info, "Model executed successfully."); + + // Get output data + size_t output_size = method->outputs_size(); + ET_LOG(Info, "Number of outputs: %zu", output_size); + std::vector outputs(output_size); + status = method->get_outputs(outputs.data(), output_size); + ET_CHECK(status == Error::Ok); + for (size_t i = 0; i < output_size; i++) { + auto output_tensor = outputs[i].toTensor(); + auto output_file_name = FLAGS_output_folder + "/output_" + + std::to_string(inference_index) + "_" + std::to_string(i) + ".bin"; + std::ofstream fout(output_file_name.c_str(), std::ios::binary); + fout.write(output_tensor.const_data_ptr(), output_tensor.nbytes()); + fout.close(); + } + + inference_index++; + } + + return 0; +} diff --git a/examples/mediatek/model_export_scripts/deeplab_v3.py b/examples/mediatek/model_export_scripts/deeplab_v3.py new file mode 100755 index 00000000000..da6766c0f54 --- /dev/null +++ b/examples/mediatek/model_export_scripts/deeplab_v3.py @@ -0,0 +1,124 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
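+# Like the other export scripts in this patch, this one wraps the eager model in
+# an NhwcWrappedModel that permutes the NHWC input to NCHW before calling the
+# original model (and, for image-like outputs such as this one, permutes the
+# result back to NHWC), so the generated .pte consumes the NHWC .bin files
+# written below. The permutes are visible in forward(); the motivation (NHWC
+# being the preferred layout for the MediaTek backend) is an assumption, not
+# something stated in this patch.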
+ +import argparse +import os +import random + +import numpy as np + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.deeplabv3 = DeepLabV3ResNet101Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + nchw_output = self.deeplabv3(nchw_input1) + return nchw_output.permute(0, 2, 3, 1) + + +def get_dataset(data_size, dataset_dir, download): + from torchvision import datasets, transforms + + input_size = (224, 224) + preprocess = transforms.Compose( + [ + transforms.Resize(input_size), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + dataset = list( + datasets.VOCSegmentation( + root=os.path.join(dataset_dir, "voc_image"), + year="2009", + image_set="val", + transform=preprocess, + download=download, + ) + ) + + # prepare input data + random.shuffle(dataset) + inputs, targets, input_list = [], [], "" + for index, data in enumerate(dataset): + if index >= data_size: + break + image, target = data + inputs.append((image.unsqueeze(0).permute(0, 2, 3, 1),)) + targets.append(np.array(target.resize(input_size))) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./deeplab_v3", + default="./deeplab_v3", + type=str, + ) + + parser.add_argument( + "-d", + "--download", + help="If specified, download VOCSegmentation dataset by torchvision API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + + # ensure the working directory exist. + os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + data_size=data_num, dataset_dir=args.artifact, download=args.download + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + if idx == 0: + print("inp shape: ", d.detach().numpy().shape) + print("inp type: ", d.detach().numpy().dtype) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.tofile(file_name) + if idx == 0: + print("golden shape: ", data.shape) + print("golden type: ", data.dtype) + + # build pte + pte_filename = "deeplabV3Resnet101_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/edsr.py b/examples/mediatek/model_export_scripts/edsr.py new file mode 100755 index 00000000000..4192d67e569 --- /dev/null +++ b/examples/mediatek/model_export_scripts/edsr.py @@ -0,0 +1,170 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
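+# EDSR is exported here for 2x super-resolution: SrDataset resizes each image to
+# 224x224 for the low-resolution input and 448x448 for the high-resolution
+# golden, which matches the 448x448x3 shapes eval_oss_result.py assumes when
+# computing PSNR/SSIM.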
+ +import argparse +import os + +import numpy as np + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.edsr import EdsrModel + +from PIL import Image +from torch.utils.data import Dataset +from torchsr.datasets import B100 +from torchvision.transforms.functional import to_tensor + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.edsr = EdsrModel().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + nchw_output = self.edsr(nchw_input1) + return nchw_output.permute(0, 2, 3, 1) + + +class SrDataset(Dataset): + def __init__(self, hr_dir: str, lr_dir: str): + self.input_size = np.asanyarray([224, 224]) + self.hr = [] + self.lr = [] + + for file in sorted(os.listdir(hr_dir)): + self.hr.append(self._resize_img(os.path.join(hr_dir, file), 2)) + + for file in sorted(os.listdir(lr_dir)): + self.lr.append(self._resize_img(os.path.join(lr_dir, file), 1)) + + if len(self.hr) != len(self.lr): + raise AssertionError( + "The number of high resolution pics is not equal to low " + "resolution pics" + ) + + def __getitem__(self, idx: int): + return self.hr[idx], self.lr[idx] + + def __len__(self): + return len(self.lr) + + def _resize_img(self, file: str, scale: int): + with Image.open(file) as img: + return ( + to_tensor(img.resize(tuple(self.input_size * scale))) + .unsqueeze(0) + .permute(0, 2, 3, 1) + ) + + def get_input_list(self): + input_list = "" + for i in range(len(self.lr)): + input_list += f"input_{i}_0.bin\n" + return input_list + + +def get_b100( + dataset_dir: str, +): + hr_dir = f"{dataset_dir}/sr_bm_dataset/SRBenchmarks/benchmark/B100/HR" + lr_dir = f"{dataset_dir}/sr_bm_dataset/SRBenchmarks/benchmark/B100/LR_bicubic/X2" + + if not os.path.exists(hr_dir) or not os.path.exists(lr_dir): + B100(root=f"{dataset_dir}/sr_bm_dataset", scale=2, download=True) + + return SrDataset(hr_dir, lr_dir) + + +def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str): + if not (lr_dir and hr_dir) and not default_dataset: + raise RuntimeError( + "Nither custom dataset is provided nor using default dataset." + ) + + if (lr_dir and hr_dir) and default_dataset: + raise RuntimeError("Either use custom dataset, or use default dataset.") + + if default_dataset: + return get_b100(dataset_dir) + + return SrDataset(hr_dir, lr_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./edsr", + default="./edsr", + type=str, + ) + + parser.add_argument( + "-r", + "--hr_ref_dir", + help="Path to the high resolution images", + default="", + type=str, + ) + + parser.add_argument( + "-l", + "--lr_dir", + help="Path to the low resolution image inputs", + default="", + type=str, + ) + + parser.add_argument( + "-d", + "--default_dataset", + help="If specified, download and use B100 dataset by torchSR API", + action="store_true", + default=False, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
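+    # (after this script runs, the folder holds input_list.txt, the NHWC
+    # input_<i>_0.bin / golden_<i>_0.bin pairs, and edsr_mtk.pte, which is the
+    # layout mtk_oss_executor_runner and eval_oss_result.py expect)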
+ os.makedirs(args.artifact, exist_ok=True) + + dataset = get_dataset( + args.hr_ref_dir, args.lr_dir, args.default_dataset, args.artifact + ) + + inputs, targets, input_list = dataset.lr, dataset.hr, dataset.get_input_list() + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "edsr_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (inputs[0],), + f"{args.artifact}/{pte_filename}", + [(input,) for input in inputs], + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/inception_v3.py b/examples/mediatek/model_export_scripts/inception_v3.py new file mode 100755 index 00000000000..c28bd85b402 --- /dev/null +++ b/examples/mediatek/model_export_scripts/inception_v3.py @@ -0,0 +1,120 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.inception_v3 import InceptionV3Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.inception = InceptionV3Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.inception(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inceptionV3", + default="./inceptionV3", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + pte_filename = "inceptionV3_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/inception_v4.py b/examples/mediatek/model_export_scripts/inception_v4.py new file mode 100755 index 00000000000..ccb2ce16f22 --- /dev/null +++ b/examples/mediatek/model_export_scripts/inception_v4.py @@ -0,0 +1,120 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.inception_v4 import InceptionV4Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.inception = InceptionV4Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.inception(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize((299, 299)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./inceptionV4", + default="./inceptionV4", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "inceptionV4_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 299, 299, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/mobilenet_v2.py b/examples/mediatek/model_export_scripts/mobilenet_v2.py new file mode 100755 index 00000000000..97f2ed884eb --- /dev/null +++ b/examples/mediatek/model_export_scripts/mobilenet_v2.py @@ -0,0 +1,121 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.mobilenet_v2 import MV2Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.mobilenet = MV2Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.mobilenet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenetV2", + default="./mobilenetV2", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "mobilenetV2_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/mobilenet_v3.py b/examples/mediatek/model_export_scripts/mobilenet_v3.py new file mode 100755 index 00000000000..fed2497ca26 --- /dev/null +++ b/examples/mediatek/model_export_scripts/mobilenet_v3.py @@ -0,0 +1,121 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.mobilenet_v3 import MV3Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.mobilenet = MV3Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.mobilenet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./mobilenetV3", + default="./mobilenetV3", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "mobilenetV3_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/resnet18.py b/examples/mediatek/model_export_scripts/resnet18.py new file mode 100755 index 00000000000..2f3af57e7f3 --- /dev/null +++ b/examples/mediatek/model_export_scripts/resnet18.py @@ -0,0 +1,122 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.resnet import ResNet18Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.resnet = ResNet18Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.resnet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./resnet18", + default="./resnet18", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + aaa = data.detach().numpy() + data.detach().numpy().tofile(file_name) + + # build pte + pte_filename = "resnet18_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/model_export_scripts/resnet50.py b/examples/mediatek/model_export_scripts/resnet50.py new file mode 100755 index 00000000000..ce23842447b --- /dev/null +++ b/examples/mediatek/model_export_scripts/resnet50.py @@ -0,0 +1,121 @@ +# Copyright (c) MediaTek Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch +from executorch.backends.mediatek import Precision +from executorch.examples.mediatek.aot_utils.oss_utils.utils import ( + build_executorch_binary, +) +from executorch.examples.models.resnet import ResNet50Model + + +class NhwcWrappedModel(torch.nn.Module): + def __init__(self): + super(NhwcWrappedModel, self).__init__() + self.resnet = ResNet50Model().get_eager_model() + + def forward(self, input1): + nchw_input1 = input1.permute(0, 3, 1, 2) + output = self.resnet(nchw_input1) + return output + + +def get_dataset(dataset_path, data_size): + from torchvision import datasets, transforms + + def get_data_loader(): + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess) + return torch.utils.data.DataLoader( + imagenet_data, + shuffle=True, + ) + + # prepare input data + inputs, targets, input_list = [], [], "" + data_loader = get_data_loader() + for index, data in enumerate(data_loader): + if index >= data_size: + break + feature, target = data + feature = feature.permute(0, 2, 3, 1) # NHWC + inputs.append((feature,)) + targets.append(target) + input_list += f"input_{index}_0.bin\n" + + return inputs, targets, input_list + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. --dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./resnet50", + default="./resnet50", + type=str, + ) + + args = parser.parse_args() + + # ensure the working directory exist. 
+ os.makedirs(args.artifact, exist_ok=True) + + data_num = 100 + inputs, targets, input_list = get_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + ) + + # save data to inference on device + input_list_file = f"{args.artifact}/input_list.txt" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{args.artifact}/input_{idx}_{i}.bin" + d.detach().numpy().tofile(file_name) + for idx, data in enumerate(targets): + file_name = f"{args.artifact}/golden_{idx}_0.bin" + data.detach().numpy().tofile(file_name) + + # compile to pte + pte_filename = "resnet50_mtk" + instance = NhwcWrappedModel() + build_executorch_binary( + instance.eval(), + (torch.randn(1, 224, 224, 3),), + f"{args.artifact}/{pte_filename}", + inputs, + quant_dtype=Precision.A8W8, + ) diff --git a/examples/mediatek/requirements.txt b/examples/mediatek/requirements.txt index 038700059ba..7c3de886e27 100644 --- a/examples/mediatek/requirements.txt +++ b/examples/mediatek/requirements.txt @@ -4,3 +4,5 @@ safetensors sentencepiece tokenizers transformers +piq +pillow diff --git a/examples/mediatek/shell_scripts/export_oss.sh b/examples/mediatek/shell_scripts/export_oss.sh new file mode 100755 index 00000000000..3da5dc41f94 --- /dev/null +++ b/examples/mediatek/shell_scripts/export_oss.sh @@ -0,0 +1,29 @@ +model=$1 + +echo "Export model: $model" + +if [ $model = "deeplabv3" ] +then + python3 model_export_scripts/deeplab_v3.py -d +elif [ $model = "edsr" ] +then + python3 model_export_scripts/edsr.py -d +elif [ $model = "inceptionv3" ] +then + python3 model_export_scripts/inception_v3.py -d PATH_TO_DATASET +elif [ $model = "inceptionv4" ] +then + python3 model_export_scripts/inception_v4.py -d PATH_TO_DATASET +elif [ $model = "mobilenetv2" ] +then + python3 model_export_scripts/mobilenet_v2.py -d PATH_TO_DATASET +elif [ $model = "mobilenetv3" ] +then + python3 model_export_scripts/mobilenet_v3.py -d PATH_TO_DATASET +elif [ $model = "resnet18" ] +then + python3 model_export_scripts/resnet18.py -d PATH_TO_DATASET +elif [ $model = "resnet50" ] +then + python3 model_export_scripts/resnet50.py -d PATH_TO_DATASET +fi From af8080497c3d1b1066403b5f645d8e775b5c4378 Mon Sep 17 00:00:00 2001 From: Olivia Liu Date: Tue, 10 Sep 2024 20:13:25 -0700 Subject: [PATCH 313/531] Debug event populates event name (#5142) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5142 Intermediate debugging in delegate doesn't work without also doing intermediate latency profiling in delegates. This diff is to fix this issue. It's currently blocking modai and htp side of work. Reviewed By: Jack-Khuu Differential Revision: D60947913 fbshipit-source-id: 78cb252dc4f0088c2af3a27f467f8cb6182cc785 --- devtools/etdump/etdump_schema_flatcc.fbs | 4 ++ devtools/etdump/schema_flatcc.py | 1 + devtools/etdump/tests/serialize_test.py | 1 + devtools/inspector/_inspector.py | 59 +++++++++++++----- devtools/inspector/tests/event_blocks_test.py | 62 +++++++++++++++++++ devtools/inspector/tests/inspector_test.py | 4 ++ .../inspector/tests/inspector_utils_test.py | 1 + 7 files changed, 115 insertions(+), 17 deletions(-) diff --git a/devtools/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs index d90d278f5fc..1244ebd4aeb 100644 --- a/devtools/etdump/etdump_schema_flatcc.fbs +++ b/devtools/etdump/etdump_schema_flatcc.fbs @@ -76,6 +76,10 @@ table DebugEvent { // String based delegate debug identifier. 
delegate_debug_id_str:string; + + // Name assigned to this debug event by the runtime. If it is an operator + // call this will just be the name of the operator that was executed. + name:string; } // All the details pertaining to an allocation done in the runtime. The main diff --git a/devtools/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py index f19f328d3fa..404fa1c9758 100644 --- a/devtools/etdump/schema_flatcc.py +++ b/devtools/etdump/schema_flatcc.py @@ -93,6 +93,7 @@ class Value: @dataclass class DebugEvent: + name: Optional[str] chain_index: int instruction_id: int delegate_debug_id_int: Optional[int] diff --git a/devtools/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py index 1a7f3bd93f5..5cab3e5b2ba 100644 --- a/devtools/etdump/tests/serialize_test.py +++ b/devtools/etdump/tests/serialize_test.py @@ -83,6 +83,7 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC: profile_event=None, allocation_event=None, debug_event=flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py index 82b1ffe1f73..0539d4f5e4b 100644 --- a/devtools/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -152,6 +152,7 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature": # Signature of a DebugEvent @dataclass(frozen=True, order=True) class DebugEventSignature: + name: str = "" instruction_id: Optional[int] = -1 delegate_id: Optional[int] = None delegate_id_str: Optional[str] = None @@ -165,6 +166,7 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature": The Signature will convert these back to the intended None value """ return DebugEventSignature( + event.name or "", event.instruction_id if event.instruction_id != -1 else None, event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None, event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None, @@ -470,46 +472,63 @@ def _calculate_elapsed_time(start_time, end_time): return elapsed_time @staticmethod - def _populate_profiling_related_fields( + def _populate_event_signature_fields( ret_event: "Event", - profile_event_signature: Optional[ProfileEventSignature], - events: List[InstructionEvent], - scale_factor: float, + event_signature: Optional[Union[ProfileEventSignature, DebugEventSignature]], ) -> None: """ Given a partially constructed Event, populate the fields related to - the profile events + the profile event signature or debug event signature Fields Updated: name delegate_debug_identifier is_delegated_op - perf_data - delegate_debug_metadatas """ - - # Fill out fields from profile event signature - if profile_event_signature is not None: - if profile_event_signature.delegate_id is not None: # 0 is a valid value - delegate_debug_identifier = profile_event_signature.delegate_id + # TODO: T201347372 Push the None check to ealier in the stack. 
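        # Both ProfileEventSignature and DebugEventSignature expose name,
        # delegate_id and delegate_id_str, so the branch below treats them
        # uniformly: a delegated event is renamed to its delegate debug
        # identifier, while a non-delegated event keeps the signature name
        # (when one is present).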
+ if event_signature is not None: + if event_signature.delegate_id is not None: # 0 is a valid value + delegate_debug_identifier = event_signature.delegate_id else: - delegate_debug_identifier = ( - profile_event_signature.delegate_id_str or None - ) + delegate_debug_identifier = event_signature.delegate_id_str or None # Use the delegate identifier as the event name if delegated is_delegated_op = delegate_debug_identifier is not None name = ( - profile_event_signature.name + event_signature.name if not is_delegated_op else str(delegate_debug_identifier) ) # Update fields - ret_event.name = name + # This is for older version of etdump that doesn't have the name field for debug events, we don't update the name field + if name: + ret_event.name = name ret_event.delegate_debug_identifier = delegate_debug_identifier ret_event.is_delegated_op = is_delegated_op + @staticmethod + def _populate_profiling_related_fields( + ret_event: "Event", + profile_event_signature: Optional[ProfileEventSignature], + events: List[InstructionEvent], + scale_factor: float, + ) -> None: + """ + Given a partially constructed Event, populate the fields related to + the profile events + + Fields Updated: + name + delegate_debug_identifier + is_delegated_op + perf_data + delegate_debug_metadatas + """ + + # Fill out fields from profile event signature + Event._populate_event_signature_fields(ret_event, profile_event_signature) + # Fill out fields from profile event data = [] delegate_debug_metadatas = [] @@ -577,9 +596,15 @@ def _populate_debugging_related_fields( the debug events Fields Updated: + name + delegate_debug_identifier + is_delegated_op debug_data """ + # Fill out fields from debug event signature + Event._populate_event_signature_fields(ret_event, debug_event_signature) + debug_data: List[flatcc.Value] = [] for event in events: if (debug_events := event.debug_events) is None: diff --git a/devtools/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py index 4101035f99b..85b65aa5f34 100644 --- a/devtools/inspector/tests/event_blocks_test.py +++ b/devtools/inspector/tests/event_blocks_test.py @@ -62,6 +62,7 @@ def _gen_sample_profile_event( def _gen_sample_debug_event( instruction_id: int, delegate_debug_id: Optional[Union[int, str]] = None, + name: str = "test_debug_event", ) -> flatcc.DebugEvent: """ Helper for generating test DebugEvents @@ -77,6 +78,7 @@ def _gen_sample_debug_event( ) return flatcc.DebugEvent( + name=name, chain_index=0, instruction_id=instruction_id, delegate_debug_id_int=delegate_debug_id_int, @@ -299,6 +301,42 @@ def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC: return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3]) + @staticmethod + def _get_sample_etdump_flatcc_debug_events_only( + event_name: str, + delegate_debug_id: str, + ) -> flatcc.ETDumpFlatCC: + """ + Helper for getting a sample ETDumpFlatCC object with RunData signature_a + and (debug_event_delegated, debug_event_non_delegated, no profile event) + """ + + debug_event_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, delegate_debug_id=delegate_debug_id, name=event_name + ) + debug_event_non_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, name=event_name + ) + run_data_1 = flatcc.RunData( + name="signature_a", + bundled_input_index=-1, + allocators=[], + events=[ + flatcc.Event( + allocation_event=None, + debug_event=debug_event_delegated, + profile_event=None, + ), + flatcc.Event( + 
allocation_event=None, + debug_event=debug_event_non_delegated, + profile_event=None, + ), + ], + ) + + return ETDumpFlatCC(version=0, run_data=[run_data_1]) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def test_gen_from_etdump(self) -> None: @@ -370,6 +408,30 @@ def test_gen_from_etdump_inconsistent_debug_data(self) -> None: with self.assertRaises(AssertionError): EventBlock._gen_from_etdump(etdump) + def test_gen_from_etdump_debug_events_only(self) -> None: + """ + Test generation of EventBlocks given an ETDump with only debugging events + + Specifically it tests: + - Correct number of EventBlocks and Events + - Correct name of each Event + """ + event_name = "test_debug_event_only" + delegate_debug_id = "debug_id" + etdump: ETDumpFlatCC = ( + TestEventBlock._get_sample_etdump_flatcc_debug_events_only( + event_name=event_name, + delegate_debug_id=delegate_debug_id, + ) + ) + event_blocks = EventBlock._gen_from_etdump(etdump) + self.assertEqual(len(event_blocks), 1) + self.assertEqual(len(event_blocks[0].events), 2) + # Delegated event uses delegate_debug_id as event name + self.assertEqual(event_blocks[0].events[0].name, delegate_debug_id) + # Non delegated event uses event_name as event name + self.assertEqual(event_blocks[0].events[1].name, event_name) + def test_inspector_event_generation(self) -> None: """ Test Inspector.Event derivation from various ProfileEvent cases diff --git a/devtools/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py index e801557cabd..34c96eef534 100644 --- a/devtools/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -318,6 +318,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -341,6 +342,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): # Note the sizes of this tensor are different from the previous one debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -385,6 +387,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -408,6 +411,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): # Same as the event above except for offset debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, diff --git a/devtools/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py index 27e2cb0647f..73511f5fcd7 100644 --- a/devtools/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -78,6 +78,7 @@ def test_find_populated_event(self): end_time=2002, ) debug_event = flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", From 68397af396aca6cb7c8326da5ba29da0ffcf2d23 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 10 Sep 2024 22:16:03 -0700 Subject: [PATCH 314/531] Optimized op_mm using CPUBlas gemm (#5242) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5242 No immediate need for this, but it is extremely simple to implement so why not support it? 
ghstack-source-id: 241919004 exported-using-ghexport Reviewed By: kimishpatel Differential Revision: D62151659 fbshipit-source-id: 7cb5850981ad0666a304e7917d407847037ffa2d --- kernels/optimized/cpu/op_mm.cpp | 71 +++++++++++++++++++++++++++++++ kernels/optimized/cpu/targets.bzl | 7 +++ kernels/optimized/optimized.yaml | 5 +++ kernels/test/targets.bzl | 2 +- 4 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 kernels/optimized/cpu/op_mm.cpp diff --git a/kernels/optimized/cpu/op_mm.cpp b/kernels/optimized/cpu/op_mm.cpp new file mode 100644 index 00000000000..9131356aeb6 --- /dev/null +++ b/kernels/optimized/cpu/op_mm.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& opt_mm_out( + RuntimeContext& ctx, + const Tensor& in, + const Tensor& mat2, + Tensor& out) { + ET_KERNEL_CHECK(ctx, check_mm_args(in, mat2, out), InvalidArgument, out); + + size_t output_ndim = 0; + std::array output_sizes; + get_mm_out_target_size(in, mat2, output_sizes.data(), &output_ndim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {output_sizes.data(), output_ndim}) == Error::Ok, + InvalidArgument, + out); + + if (out.numel() == 0) { + return out; + } + ET_SWITCH_REAL_TYPES_AND2( + Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { + size_t n = in.size(0); + size_t k = in.size(1); + size_t m = mat2.size(1); + + // gemm expects column-major inputs and produces column-major + // output. So, we take advantage of the identity (A @ B).t() + // = B.t() @ A.t() here; row-major B is B.t() from gemm's + // column-major perspective, etc. 
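        // For illustration, take a row-major in of shape (2, 3) and mat2 of
        // shape (3, 4), so n = 2, k = 3, m = 4. Read as column-major, mat2's
        // buffer (lda = m) is mat2.t() and in's buffer (ldb = k) is in.t(),
        // so the call below computes mat2.t() @ in.t() = (in @ mat2).t() in
        // column-major order into out (ldc = m); read back row-major, that
        // buffer is exactly the (2, 4) product in @ mat2.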
+ executorch::cpublas::gemm( + executorch::cpublas::TransposeType::NoTranspose, + executorch::cpublas::TransposeType::NoTranspose, + m, + n, + k, + static_cast(1), + mat2.const_data_ptr(), + m, + in.const_data_ptr(), + k, + static_cast(0), + out.mutable_data_ptr(), + m); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index e7bb2d36bf4..225498aa8d1 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -52,6 +52,13 @@ _OPTIMIZED_ATEN_OPS = ( ], }), ), + op_target( + name = "op_mm", + deps = [ + "//executorch/kernels/optimized:libblas", + "//executorch/kernels/portable/cpu/util:matmul_ops_util", + ], + ), op_target( name = "op_mul", deps = [ diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml index 0d445deb3e8..7c2c4d35fd7 100644 --- a/kernels/optimized/optimized.yaml +++ b/kernels/optimized/optimized.yaml @@ -52,6 +52,11 @@ - arg_meta: null kernel_name: torch::executor::opt_le_tensor_out +- op: mm.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_mm_out + - op: mul.out kernels: - arg_meta: null diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 7ae17c5237a..cd3ca556fe6 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -244,7 +244,7 @@ def define_common_targets(): _common_op_test("op_mean_test", ["aten", "portable"]) _common_op_test("op_min_test", ["aten", "portable"]) _common_op_test("op_minimum_test", ["aten", "portable"]) - _common_op_test("op_mm_test", ["aten", "portable"]) + _common_op_test("op_mm_test", ["aten", "portable", "optimized"]) _common_op_test("op_mul_test", ["aten", "portable", "optimized"]) _common_op_test("op_narrow_copy_test", ["aten", "portable"]) _common_op_test("op_native_batch_norm_test", ["aten", "portable"]) From d73a653c000c35428a73b85be636432aebe09f11 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 10 Sep 2024 22:16:03 -0700 Subject: [PATCH 315/531] Add optimized op_linear (#5243) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5243 If we happen to be running without a delegate, directly implementing linear is much more efficient than permute_copy_out (materialize a transpose) followed by matmul. 
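For context: linear computes out = input @ weight.t(), i.e. out.t() = weight @ input.t().
The permute_copy_out path has to materialize weight.t() (copying every weight element)
before matmul can run, while cpublas::gemm accepts a TransposeType flag per operand, so
the kernel below folds the transpose into a single gemm call with no intermediate buffer.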
ghstack-source-id: 241918986 exported-using-ghexport Reviewed By: kimishpatel Differential Revision: D62154007 fbshipit-source-id: 7b764cf9de616729541f081a51384ba8e18e72f5 --- kernels/aten/functions.yaml | 2 + kernels/optimized/cpu/op_linear.cpp | 80 +++++ kernels/optimized/cpu/targets.bzl | 7 + kernels/optimized/optimized-oss.yaml | 5 + kernels/optimized/optimized.yaml | 5 + kernels/portable/cpu/util/matmul_ops_util.cpp | 25 ++ kernels/portable/cpu/util/matmul_ops_util.h | 8 + kernels/test/op_linear_test.cpp | 301 ++++++++++++++++++ kernels/test/targets.bzl | 1 + 9 files changed, 434 insertions(+) create mode 100644 kernels/optimized/cpu/op_linear.cpp create mode 100644 kernels/test/op_linear_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 1350fc090b0..e63863fc048 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -215,6 +215,8 @@ - op: linalg_vector_norm.out +- op: linear.out + - op: log.out - op: log10.out diff --git a/kernels/optimized/cpu/op_linear.cpp b/kernels/optimized/cpu/op_linear.cpp new file mode 100644 index 00000000000..56634d326f2 --- /dev/null +++ b/kernels/optimized/cpu/op_linear.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& opt_linear_out( + RuntimeContext& ctx, + const Tensor& in, + const Tensor& mat2, + const optional& bias, + Tensor& out) { + ET_KERNEL_CHECK_MSG( + ctx, + !bias.has_value(), + InvalidArgument, + out, + "bias not supported yet in linear"); + ET_KERNEL_CHECK(ctx, check_linear_args(in, mat2, out), InvalidArgument, out); + + size_t output_ndim = 0; + std::array output_sizes; + get_linear_out_target_size(in, mat2, output_sizes.data(), &output_ndim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {output_sizes.data(), output_ndim}) == Error::Ok, + InvalidArgument, + out); + + // gemm on some platforms doesn't tolerate empty input. 
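  // (A zero-element out means the flattened input or the weight has a
  // zero-sized dimension; there is nothing to compute, so return early instead
  // of handing gemm an empty buffer.)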
+ if (out.numel() == 0) { + return out; + } + + int flattened_input_dim = 1; + for (int ii = 0; ii < in.dim() - 1; ++ii) { + flattened_input_dim *= in.sizes()[ii]; + } + ET_SWITCH_REAL_TYPES_AND2( + Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { + size_t n = flattened_input_dim; + size_t k = in.sizes()[in.dim() - 1]; + size_t m = mat2.size(0); + + executorch::cpublas::gemm( + executorch::cpublas::TransposeType::Transpose, + executorch::cpublas::TransposeType::NoTranspose, + m, + n, + k, + static_cast(1), + mat2.const_data_ptr(), + k, + in.const_data_ptr(), + k, + static_cast(0), + out.mutable_data_ptr(), + m); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index 225498aa8d1..488d2af7fa1 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -40,6 +40,13 @@ _OPTIMIZED_ATEN_OPS = ( "//executorch/kernels/portable/cpu:scalar_utils", ], ), + op_target( + name = "op_linear", + deps = [ + "//executorch/kernels/optimized:libblas", + "//executorch/kernels/portable/cpu/util:matmul_ops_util", + ], + ), op_target( name = "op_log_softmax", deps = select({ diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml index f79d652b91d..797744f3bd4 100644 --- a/kernels/optimized/optimized-oss.yaml +++ b/kernels/optimized/optimized-oss.yaml @@ -45,6 +45,11 @@ - arg_meta: null kernel_name: torch::executor::opt_le_tensor_out +- op: linear.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_linear_out + - op: mul.out kernels: - arg_meta: null diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml index 7c2c4d35fd7..2421673f8a7 100644 --- a/kernels/optimized/optimized.yaml +++ b/kernels/optimized/optimized.yaml @@ -52,6 +52,11 @@ - arg_meta: null kernel_name: torch::executor::opt_le_tensor_out +- op: linear.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_linear_out + - op: mm.out kernels: - arg_meta: null diff --git a/kernels/portable/cpu/util/matmul_ops_util.cpp b/kernels/portable/cpu/util/matmul_ops_util.cpp index d7e49d64958..3d4f2e5e9ba 100644 --- a/kernels/portable/cpu/util/matmul_ops_util.cpp +++ b/kernels/portable/cpu/util/matmul_ops_util.cpp @@ -71,6 +71,19 @@ bool check_mm_args(const Tensor& in, const Tensor& mat2, Tensor& out) { return true; } +bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(in.dim() == out.dim()); + ET_LOG_AND_RETURN_IF_FALSE(in.dim() >= 2); + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(mat2, 2)); + + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, mat2, out)); + + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_size_at_dims(in, in.dim() - 1, mat2, 1)); + + return true; +} + void get_mm_out_target_size( const Tensor& mat1, const Tensor& mat2, @@ -81,5 +94,17 @@ void get_mm_out_target_size( out_sizes[1] = mat2.size(1); } +void get_linear_out_target_size( + const Tensor& mat1, + const Tensor& mat2, + Tensor::SizesType* out_sizes, + size_t* out_ndim) { + *out_ndim = mat1.dim(); + for (int ii = 0; ii < mat1.dim() - 1; ++ii) { + out_sizes[ii] = mat1.sizes()[ii]; + } + out_sizes[mat1.dim() - 1] = mat2.size(0); +} + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/matmul_ops_util.h b/kernels/portable/cpu/util/matmul_ops_util.h index 91e27ff2cc9..d2991868e95 100644 --- a/kernels/portable/cpu/util/matmul_ops_util.h +++ 
b/kernels/portable/cpu/util/matmul_ops_util.h @@ -37,5 +37,13 @@ void get_mm_out_target_size( Tensor::SizesType* out_sizes, size_t* out_ndim); +bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out); + +void get_linear_out_target_size( + const Tensor& mat1, + const Tensor& mat2, + Tensor::SizesType* out_sizes, + size_t* out_ndim); + } // namespace executor } // namespace torch diff --git a/kernels/test/op_linear_test.cpp b/kernels/test/op_linear_test.cpp new file mode 100644 index 00000000000..96875cc6f77 --- /dev/null +++ b/kernels/test/op_linear_test.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // Declares the operator +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpLinearOutTest : public OperatorTest { + protected: + Tensor& op_linear_out(const Tensor& self, const Tensor& mat2, Tensor& out) { + return torch::executor::aten::linear_outf(context_, self, mat2, {}, out); + } + + template + void test_dtype() { + TensorFactory tf; + + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + if (DTYPE == ScalarType::Half) { + GTEST_SKIP() + << "skip Half because torch::executor::aten::mm_out does not support Half"; + return; + } + } + + // matmul gives 4 * 2 * 3 = 24 + Tensor x = tf.full({3, 4}, 2); + Tensor y = tf.full({5, 4}, 3); + + // Output shape should be (3, 5) + Tensor out = tf.zeros({3, 5}); + + op_linear_out(x, y, out); + + Tensor expected = tf.full({3, 5}, 24); + + EXPECT_TENSOR_EQ(out, expected); + } +}; + +TEST_F(OpLinearOutTest, OutputDim) { + TensorFactory tf; + + // 3 tensors with compatible dimensions: (3, 5), (3, 4) and (4, 5). + Tensor x = tf.ones({3, 4}); + Tensor y = tf.ones({5, 4}); + Tensor out = tf.zeros({3, 5}); + + Tensor ret = op_linear_out(x, y, out); + + // Should always return the provided out Tensor. + EXPECT_TENSOR_EQ(ret, out); + + // Expected tensor, filled with 4. + Tensor expected = tf.full({3, 5}, 4); + + EXPECT_TENSOR_EQ(out, expected); +} + +/// A generic smoke test that works for any dtype that supports ones() and +/// zeros(). +TEST_F(OpLinearOutTest, AllDtypesSupported) { +#define TEST_ENTRY(ctype, dtype) test_dtype(); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY + // TODO: Also add tests for half, complex, quantized, and other types. Easiest + // way to do that would be to make TensorFactory support zeros() and ones() + // for those types. 
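  // (REALHBF16 covers the real types plus Half and BFloat16, mirroring the
  // ET_SWITCH_REAL_TYPES_AND2(Half, BFloat16, ...) dispatch in opt_linear_out.)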
+} + +TEST_F(OpLinearOutTest, EmptyInputWithEmptyOutTensorPasses) { + TensorFactory tf; + + // Empty input matrices + Tensor x = tf.make({0, 3}, {}); + Tensor y = tf.make({0, 3}, {}); + + // Output matrix is also empty + Tensor out = tf.make({0, 0}, {}); + + Tensor expected = tf.make({0, 0}, {}); + + EXPECT_TENSOR_EQ(op_linear_out(x, y, out), expected); +} + +TEST_F(OpLinearOutTest, InfinityTensorPasses) { + TensorFactory tff; + + Tensor x = tff.full({3, 4}, std::numeric_limits::infinity()); + Tensor y = tff.full({5, 4}, 3); + + // Output shape should be (3, 5) + Tensor out = tff.zeros({3, 5}); + + Tensor expected = tff.full({3, 5}, std::numeric_limits::infinity()); + + EXPECT_TENSOR_EQ(op_linear_out(x, y, out), expected); +} + +TEST_F(OpLinearOutTest, MismatchedDimensionsDies) { + TensorFactory tf; + + Tensor x = tf.full({2, 2}, 3); + + Tensor wrong_y = tf.full({1, 3}, 1); + Tensor right_y = tf.full({2, 2}, 1); + + // Make an empty out tensor and demonstrate that it's empty. + Tensor out = tf.full({2, 2}, 0); + + Tensor expected = tf.full({2, 2}, 6); + ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, wrong_y, out)); + + EXPECT_TENSOR_EQ(op_linear_out(x, right_y, out), expected); +} + +TEST_F(OpLinearOutTest, MismatchedDimensionSizeDies) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen kernel can handle mismatched dimension size"; + } + TensorFactory tf; + Tensor x = tf.full({2, 2}, 3); + + // wrong_y has incompatible dim + Tensor wrong_y = tf.full({2, 2, 2}, 1); + Tensor right_y = tf.full({2, 2}, 1); + + // wrong_out has incompatible dim + Tensor right_out = tf.ones({2, 2}); + Tensor wrong_out = tf.ones({2, 2, 3}); + + ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, right_y, wrong_out)); + ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, wrong_y, right_out)); +} + +TEST_F(OpLinearOutTest, WrongOutShapeDies) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen kernel can handle wrong out shape"; + } + TensorFactory tf; + Tensor x = tf.ones({10, 3}); + + Tensor y = tf.ones({4, 3}); + + // wrong_out has incompatible shape + Tensor right_out = tf.ones({10, 4}); + Tensor wrong_out = tf.ones({7, 5}); + + ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, y, wrong_out)); + + EXPECT_TENSOR_EQ(op_linear_out(x, y, right_out), tf.full({10, 4}, 3)); +} + +TEST_F(OpLinearOutTest, DynamicShapeUpperBoundSameAsExpected) { + TensorFactory tf; + + Tensor x = tf.make( + {3, 2}, + {0.17412060499191284, + 0.34793388843536377, + 0.8187907934188843, + 0.9979893565177917, + 0.7049332857131958, + 0.4255824089050293}); + Tensor y = tf.make( + {4, 2}, + {0.8071839213371277, + 0.31638312339782715, + 0.13667285442352295, + 0.3691965937614441, + 0.9002121090888977, + 0.09420186281204224, + 0.9070476293563843, + 0.9310881495475769}); + Tensor expected_result = tf.make( + {3, 4}, + {0.2506277561187744, + 0.15225356817245483, + 0.18952149152755737, + 0.48189279437065125, + 0.976661741733551, + 0.480360746383667, + 0.8310978412628174, + 1.6718982458114624, + 0.703657865524292, + 0.2534688115119934, + 0.6746801733970642, + 1.0356627702713013}); + + Tensor out = + tf.zeros({3, 4}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); + Tensor ret = op_linear_out(x, y, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} + +TEST_F(OpLinearOutTest, DynamicShapeUpperBoundLargerThanExpected) { + TensorFactory tf; + + Tensor x = tf.make( + {3, 2}, + {0.17412060499191284, + 0.34793388843536377, + 0.8187907934188843, + 
0.9979893565177917, + 0.7049332857131958, + 0.4255824089050293}); + Tensor y = tf.make( + {4, 2}, + {0.8071839213371277, + 0.31638312339782715, + 0.13667285442352295, + 0.3691965937614441, + 0.9002121090888977, + 0.09420186281204224, + 0.9070476293563843, + 0.9310881495475769}); + Tensor expected_result = tf.make( + {3, 4}, + {0.2506277561187744, + 0.15225356817245483, + 0.18952149152755737, + 0.48189279437065125, + 0.976661741733551, + 0.480360746383667, + 0.8310978412628174, + 1.6718982458114624, + 0.703657865524292, + 0.2534688115119934, + 0.6746801733970642, + 1.0356627702713013}); + + Tensor out = + tf.zeros({10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); + Tensor ret = op_linear_out(x, y, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} + +TEST_F(OpLinearOutTest, DynamicShapeUnbound) { + GTEST_SKIP() << "Dynamic shape not supported"; + TensorFactory tf; + + Tensor x = tf.make( + {3, 2}, + {0.17412060499191284, + 0.34793388843536377, + 0.8187907934188843, + 0.9979893565177917, + 0.7049332857131958, + 0.4255824089050293}); + Tensor y = tf.make( + {4, 2}, + {0.8071839213371277, + 0.31638312339782715, + 0.13667285442352295, + 0.3691965937614441, + 0.9002121090888977, + 0.09420186281204224, + 0.9070476293563843, + 0.9310881495475769}); + Tensor expected_result = tf.make( + {3, 4}, + {0.2506277561187744, + 0.15225356817245483, + 0.18952149152755737, + 0.48189279437065125, + 0.976661741733551, + 0.480360746383667, + 0.8310978412628174, + 1.6718982458114624, + 0.703657865524292, + 0.2534688115119934, + 0.6746801733970642, + 1.0356627702713013}); + + Tensor out = + tf.zeros({1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); + Tensor ret = op_linear_out(x, y, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} + +// TODO: support and test bias diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index cd3ca556fe6..f8ea484435a 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -226,6 +226,7 @@ def define_common_targets(): _common_op_test("op_le_test", ["aten", "portable", "optimized"]) _common_op_test("op_leaky_relu_test", ["aten", "portable"]) _common_op_test("op_lift_fresh_copy_test", ["aten", "portable"]) + _common_op_test("op_linear_test", ["aten", "optimized"]) _common_op_test("op_log_softmax_test", ["aten", "portable", "optimized"]) _common_op_test("op_log_test", ["aten", "portable"]) _common_op_test("op_log10_test", ["aten", "portable"]) From 3171ede404a93276db5d2a41622c659f8833c952 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 10 Sep 2024 23:12:49 -0700 Subject: [PATCH 316/531] Add scalar tensor tests. 
(#5260) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5260 overriding_review_checks_triggers_an_audit_and_retroactive_review Oncall Short Name: executorch Differential Revision: D62484498 fbshipit-source-id: 345fcd365d25beb1e2ae713cca9bea36e1db04d2 --- .../tensor/test/tensor_impl_ptr_test.cpp | 27 +++++++++++++++++-- extension/tensor/test/tensor_ptr_test.cpp | 22 +++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/extension/tensor/test/tensor_impl_ptr_test.cpp b/extension/tensor/test/tensor_impl_ptr_test.cpp index 1330dfa60f5..f7fd062c462 100644 --- a/extension/tensor/test/tensor_impl_ptr_test.cpp +++ b/extension/tensor/test/tensor_impl_ptr_test.cpp @@ -23,6 +23,29 @@ class TensorImplPtrTest : public ::testing::Test { } }; +TEST_F(TensorImplPtrTest, ScalarTensorCreation) { + float scalar_data = 3.14f; + auto tensor_impl = + make_tensor_impl_ptr(exec_aten::ScalarType::Float, {}, &scalar_data); + + EXPECT_EQ(tensor_impl->numel(), 1); + EXPECT_EQ(tensor_impl->dim(), 0); + EXPECT_EQ(tensor_impl->sizes().size(), 0); + EXPECT_EQ(tensor_impl->strides().size(), 0); + EXPECT_EQ((float*)tensor_impl->data(), &scalar_data); + EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f); +} + +TEST_F(TensorImplPtrTest, ScalarTensorOwningData) { + auto tensor_impl = make_tensor_impl_ptr({}, {3.14f}); + + EXPECT_EQ(tensor_impl->numel(), 1); + EXPECT_EQ(tensor_impl->dim(), 0); + EXPECT_EQ(tensor_impl->sizes().size(), 0); + EXPECT_EQ(tensor_impl->strides().size(), 0); + EXPECT_EQ(((float*)tensor_impl->data())[0], 3.14f); +} + TEST_F(TensorImplPtrTest, TensorImplCreation) { float data[20] = {2}; auto tensor_impl = make_tensor_impl_ptr( @@ -34,8 +57,8 @@ TEST_F(TensorImplPtrTest, TensorImplCreation) { EXPECT_EQ(tensor_impl->strides()[0], 5); EXPECT_EQ(tensor_impl->strides()[1], 1); EXPECT_EQ(tensor_impl->data(), data); - EXPECT_EQ(tensor_impl->mutable_data(), data); - EXPECT_EQ(((float*)tensor_impl->mutable_data())[0], 2); + EXPECT_EQ(tensor_impl->data(), data); + EXPECT_EQ(((float*)tensor_impl->data())[0], 2); } TEST_F(TensorImplPtrTest, TensorImplSharedOwnership) { diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp index 3f5e7ff58e2..d5582630494 100644 --- a/extension/tensor/test/tensor_ptr_test.cpp +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -22,6 +22,28 @@ class TensorPtrTest : public ::testing::Test { } }; +TEST_F(TensorPtrTest, ScalarTensorCreation) { + float scalar_data = 3.14f; + auto tensor = make_tensor_ptr(exec_aten::ScalarType::Float, {}, &scalar_data); + + EXPECT_EQ(tensor->numel(), 1); + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->sizes().size(), 0); + EXPECT_EQ(tensor->strides().size(), 0); + EXPECT_EQ(tensor->const_data_ptr(), &scalar_data); + EXPECT_EQ(tensor->const_data_ptr()[0], 3.14f); +} + +TEST_F(TensorPtrTest, ScalarTensorOwningData) { + auto tensor = make_tensor_ptr({}, {3.14f}); + + EXPECT_EQ(tensor->numel(), 1); + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->sizes().size(), 0); + EXPECT_EQ(tensor->strides().size(), 0); + EXPECT_EQ(tensor->const_data_ptr()[0], 3.14f); +} + TEST_F(TensorPtrTest, CreateTensorWithStridesAndDimOrder) { float data[20] = {2}; auto tensor = make_tensor_ptr( From 4da3c5d0bf1103950d0ac9d80cfa465ee51f5a28 Mon Sep 17 00:00:00 2001 From: yifan_shen3 Date: Wed, 11 Sep 2024 09:17:47 -0700 Subject: [PATCH 317/531] Add CoreML Quantize (#5228) Summary: ## Motivation Short term: TorchAO int4 quantization yields float zero point, but CoreML does not have good 
support for it yet. We will need CoreML int4 quantization for now. Intermediate term: Before torch implements all CoreML-supported quantizations (e.g. palettization, sparcification, joint compression...), it will be great to have a way to use/experiment those CoreML quantizations. ## Solution In CoreML preprocess, we add CoreML quantization config as a compile spec Pull Request resolved: https://github.com/pytorch/executorch/pull/5228 Reviewed By: kirklandsign Differential Revision: D62468184 Pulled By: cccclai fbshipit-source-id: 9f4987d19a01eaf5e2814c9ff8089324174644f8 --- .../coreml/compiler/coreml_preprocess.py | 61 ++++++++++++++++++- examples/models/llama2/export_llama_lib.py | 7 +++ extension/llm/export/partitioner_lib.py | 18 +++++- 3 files changed, 81 insertions(+), 5 deletions(-) diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py index 375fdf406b2..5084405c468 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -3,6 +3,7 @@ # CoreML backend for delegating a EdgeProgram to CoreML. import json +import logging import shutil import uuid @@ -14,6 +15,7 @@ from typing import Any, Dict, final, List, Optional, Tuple import coremltools as ct +import coremltools.optimize as cto import executorchcoreml from executorch.exir.backend.backend_details import ( @@ -23,12 +25,16 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + class COMPILE_SPEC_KEYS(Enum): COMPUTE_UNITS = "compute_units" MODEL_TYPE = "model_type" MIN_DEPLOYMENT_TARGET = "min_deployment_target" MODEL_COMPUTE_PRECISION = "model_compute_precision" + OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config" class MODEL_PATHS(Enum): @@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec( compute_unit.name.lower().encode("utf-8"), ) + @staticmethod + def generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config: Dict, + ) -> CompileSpec: + """ + Returns the compile spec representing the model post conversion quantization, + which is a dict that will construct cto.coreml.OpLinearQuantizerConfig + """ + str_representation = json.dumps(op_linear_quantizer_config) + byte_representation = str_representation.encode("utf-8") + return CompileSpec( + COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value, + byte_representation, + ) + + @staticmethod + def op_linear_quantizer_config_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> cto.coreml.OpLinearQuantizerConfig: + """ + Returns the model's post conversion quantization by parsing the list of compile specs. + """ + for compile_spec in compile_specs: + if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value: + config_dict_str = compile_spec.value.decode("utf-8") + config_dict = json.loads(config_dict_str) + config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict) + return config + + return None + @staticmethod def generate_compile_specs( compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL, minimum_deployment_target: ct.target = ct.target.iOS15, compute_precision: ct.precision = ct.precision.FLOAT16, model_type: MODEL_TYPE = MODEL_TYPE.MODEL, + op_linear_quantizer_config: Optional[Dict] = None, ) -> List[CompileSpec]: """ Returns the list of compile specs that's used by CoreMLBackend to lower the module. 
@@ -192,6 +230,12 @@ def generate_compile_specs( CoreMLBackend.generate_compute_precision_compile_spec(compute_precision) ) compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type)) + if op_linear_quantizer_config is not None: + compile_specs.append( + CoreMLBackend.generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config + ) + ) return compile_specs @@ -368,18 +412,18 @@ def preprocess( compile_specs, ) ) - model_compute_precision: ct.precision = ( CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs) ) - minimum_deployment_target: ct.target = ( CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs) ) - compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs( compile_specs ) + op_linear_quantizer_config = ( + CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs) + ) mlmodel = ct.convert( model=edge_program, @@ -392,4 +436,15 @@ def preprocess( compute_units=compute_units, ) + if op_linear_quantizer_config is not None: + logger.warning( + "Core ML Backend op_linear_quantizer_config API is experimental" + ) + config = cto.coreml.OptimizationConfig( + global_config=op_linear_quantizer_config, + # skip embedding + op_type_configs={"gather": None}, + ) + mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config) + return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index dd5822c23f6..97228bb5c5d 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -304,6 +304,12 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="This option is only for coreml, and is only supported for MacOS15+/iOS18+", ) + parser.add_argument( + "--coreml-quantize", + default=None, + choices=["b4w"], + help="This option is only for coreml: Use coreml quantization, e.g. 
b4w (for blockwise 4 bit weight)", + ) parser.add_argument( "--qnn", action="store_true", @@ -523,6 +529,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 args.use_kv_cache and args.coreml_enable_state, args.embedding_quantize, args.pt2e_quantize, + args.coreml_quantize, ) partitioners.append(coreml_partitioner) modelname = f"coreml_{modelname}" diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index f5cc04ead48..eca78bc9346 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -59,6 +59,7 @@ def get_coreml_partitioner( enable_state: bool = False, embedding_quantize: Optional[str] = None, pt2e_quantize: Optional[str] = None, + coreml_quantize: Optional[str] = None, ): try: import coremltools as ct @@ -87,16 +88,29 @@ def get_coreml_partitioner( minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17) # In Core ML, 4-bit weight compression is introduced in iOS 18 if ( - embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4 - ) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"): + (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4) + or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w") + or coreml_quantize == "b4w" + ): minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18) + op_linear_quantizer_config = None + if coreml_quantize == "b4w": + op_linear_quantizer_config = { + "mode": "linear_symmetric", + "dtype": "int4", + "granularity": "per_block", + "block_size": 32, + "weight_threshold": 512, + } + compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] minimum_deployment_target=minimum_deployment_target, compute_precision=ct.precision(ct.precision.FLOAT16.value), # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU` compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()], model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16] + op_linear_quantizer_config=op_linear_quantizer_config, ) return CoreMLPartitioner( # pyre-fixme[16] compile_specs=compile_specs, From d6b800bb68b430b5f43872d877e3c6ec247900a7 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 11 Sep 2024 09:41:31 -0700 Subject: [PATCH 318/531] Add helper function to create empty, full, ones and zeros tensors. (#5261) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5261 . 
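A minimal usage sketch of the new creation helpers (the include path is assumed from the
header location in this diff, and the scalar type defaults to Float as declared there;
treat this as illustrative rather than an exhaustive API reference):

    #include <executorch/extension/tensor/tensor_ptr_maker.h>

    using namespace ::executorch::extension;

    void creation_helpers_example() {
      auto a = ones({2, 3});                // 2x3 float tensor filled with 1
      auto b = full({2, 3}, 7);             // 2x3 float tensor filled with 7
      auto c = ones_like(b);                // same sizes/strides/dtype as b
      auto s = scalar_tensor(3.14);         // 0-dimensional float tensor
      auto e = empty({4}, exec_aten::ScalarType::Int); // uninitialized 1-D int tensor
      float first = a->const_data_ptr<float>()[0];     // == 1.0f
      (void)first;
    }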
Reviewed By: kirklandsign Differential Revision: D62486240 fbshipit-source-id: 1c89db9ed2b31d85ffa68320348f00bc297686f8 --- extension/tensor/targets.bzl | 1 + extension/tensor/tensor_ptr.h | 27 ++- extension/tensor/tensor_ptr_maker.cpp | 114 +++++++++ extension/tensor/tensor_ptr_maker.h | 221 +++++++++++++++++- .../tensor/test/tensor_ptr_maker_test.cpp | 139 +++++++++++ extension/tensor/test/tensor_ptr_test.cpp | 16 ++ 6 files changed, 513 insertions(+), 5 deletions(-) create mode 100644 extension/tensor/tensor_ptr_maker.cpp diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index 4998b5cf15b..8493d093fa1 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): srcs = [ "tensor_impl_ptr.cpp", "tensor_ptr.cpp", + "tensor_ptr_maker.cpp", ], exported_headers = [ "tensor.h", diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index c760de4f038..f477199a3e1 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -142,8 +142,7 @@ inline TensorPtr make_tensor_ptr( * * This template overload is specialized for cases where the tensor data is * provided as a vector. The scalar type is automatically deduced from the - * vector's data type. The deleter ensures that the data vector is properly - * managed and its lifetime is tied to the TensorImpl. + * vector's data type. * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param sizes A vector specifying the size of each dimension. @@ -174,8 +173,7 @@ TensorPtr make_tensor_ptr( * * This template overload is specialized for cases where the tensor data is * provided as a vector. The scalar type is automatically deduced from the - * vector's data type. The deleter ensures that the data vector is properly - * managed and its lifetime is tied to the TensorImpl. + * vector's data type. * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param data A vector containing the tensor's data. @@ -190,6 +188,27 @@ TensorPtr make_tensor_ptr( return make_tensor_ptr(make_tensor_impl_ptr(std::move(data), dynamism)); } +/** + * Creates a TensorPtr that manages a Tensor with the specified properties. + * + * This template overload allows creating a Tensor from an initializer list + * of data. The scalar type is automatically deduced from the type of the + * initializer list's elements. + * + * @tparam T The C++ type of the tensor elements, deduced from the initializer + * list. + * @param data An initializer list containing the tensor's data. + * @param dynamism Specifies the mutability of the tensor's shape. + * @return A TensorPtr that manages the newly created TensorImpl. + */ +template +TensorPtr make_tensor_ptr( + std::initializer_list data, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return make_tensor_ptr(std::vector(data), dynamism); +} + /** * Creates a TensorPtr that manages a Tensor with the specified properties. * diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp new file mode 100644 index 00000000000..1c7b0efe589 --- /dev/null +++ b/extension/tensor/tensor_ptr_maker.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace extension { +namespace { +template < + typename INT_T, + typename std::enable_if< + std::is_integral::value && !std::is_same::value, + bool>::type = true> +bool extract_scalar(exec_aten::Scalar scalar, INT_T* out_val) { + if (!scalar.isIntegral(/*includeBool=*/false)) { + return false; + } + int64_t val = scalar.to(); + if (val < std::numeric_limits::lowest() || + val > std::numeric_limits::max()) { + return false; + } + *out_val = static_cast(val); + return true; +} + +template < + typename FLOAT_T, + typename std::enable_if::value, bool>:: + type = true> +bool extract_scalar(exec_aten::Scalar scalar, FLOAT_T* out_val) { + double val; + if (scalar.isFloatingPoint()) { + val = scalar.to(); + if (std::isfinite(val) && + (val < std::numeric_limits::lowest() || + val > std::numeric_limits::max())) { + return false; + } + } else if (scalar.isIntegral(/*includeBool=*/false)) { + val = static_cast(scalar.to()); + } else { + return false; + } + *out_val = static_cast(val); + return true; +} + +template < + typename BOOL_T, + typename std::enable_if::value, bool>::type = + true> +bool extract_scalar(exec_aten::Scalar scalar, BOOL_T* out_val) { + if (scalar.isIntegral(false)) { + *out_val = static_cast(scalar.to()); + return true; + } + if (scalar.isBoolean()) { + *out_val = scalar.to(); + return true; + } + return false; +} + +#define ET_EXTRACT_SCALAR(scalar, out_val) \ + ET_CHECK_MSG( \ + extract_scalar(scalar, &out_val), \ + #scalar " could not be extracted: wrong type or out of range"); + +} // namespace + +TensorPtr empty_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + std::vector data( + exec_aten::compute_numel(sizes.data(), sizes.size()) * + exec_aten::elementSize(type)); + return make_tensor_ptr( + type, + std::move(sizes), + std::move(data), + {}, + std::move(strides), + dynamism); +} + +TensorPtr full_strided( + std::vector sizes, + std::vector strides, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + auto tensor = + empty_strided(std::move(sizes), std::move(strides), type, dynamism); + ET_SWITCH_REALB_TYPES(type, nullptr, "full_strided", CTYPE, [&] { + CTYPE value; + ET_EXTRACT_SCALAR(fill_value, value); + std::fill( + tensor->mutable_data_ptr(), + tensor->mutable_data_ptr() + tensor->numel(), + value); + }); + return tensor; +} + +} // namespace extension +} // namespace executorch diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h index fd97e53dbca..132bd1f12c6 100644 --- a/extension/tensor/tensor_ptr_maker.h +++ b/extension/tensor/tensor_ptr_maker.h @@ -15,7 +15,7 @@ namespace extension { /** * A helper class for creating TensorPtr instances from raw data and tensor - * properties. Note the the TensorPtr created by this class will not own the + * properties. Note that the TensorPtr created by this class will not own the * data, so it must outlive the TensorPtr. * * TensorPtrMaker provides a fluent interface for specifying various properties @@ -31,6 +31,7 @@ class TensorPtrMaker final { // But it is movable. TensorPtrMaker(TensorPtrMaker&&) = default; TensorPtrMaker& operator=(TensorPtrMaker&&) = default; + /** * Sets the scalar type of the tensor elements. * @@ -278,5 +279,223 @@ inline TensorPtr from_blob( .make_tensor_ptr(); } +/** + * Creates a TensorPtr with the specified sizes, strides, and properties. 
+ * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. The tensor is created with the + * specified strides. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr empty_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates an empty TensorPtr with the same size and properties as the given + * tensor. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr empty_like( + const TensorPtr& other, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + if (type == exec_aten::ScalarType::Undefined) { + type = other->scalar_type(); + } + return empty_strided( + {other->sizes().begin(), other->sizes().end()}, + {other->strides().begin(), other->strides().end()}, + type, + dynamism); +} + +/** + * Creates an empty TensorPtr with the specified sizes and properties. + * + * This function allocates memory for the tensor elements but does not + * initialize them with any specific values. + * + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr empty( + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return empty_strided(std::move(sizes), {}, type, dynamism); +} + +/** + * Creates a TensorPtr filled with the specified value. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param fill_value The value to fill the tensor with. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr full_strided( + std::vector sizes, + std::vector strides, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates a TensorPtr filled with the specified value, with the same size and + * properties as another tensor. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param fill_value The value to fill the tensor with. + * @param type The scalar type of the tensor elements. 
If not specified, the + * scalar type of the other tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr full_like( + const TensorPtr& other, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + if (type == exec_aten::ScalarType::Undefined) { + type = other->scalar_type(); + } + return full_strided( + {other->sizes().begin(), other->sizes().end()}, + {other->strides().begin(), other->strides().end()}, + fill_value, + type, + dynamism); +} + +/** + * Creates a TensorPtr filled with the specified value. + * + * @param sizes A vector specifying the size of each dimension. + * @param fill_value The value to fill the tensor with. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr full( + std::vector sizes, + exec_aten::Scalar fill_value, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return full_strided(std::move(sizes), {}, fill_value, type, dynamism); +} + +/** + * Creates a TensorPtr that holds a scalar value. + * + * @param value The scalar value to create the tensor with. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created scalar Tensor. + */ +inline TensorPtr scalar_tensor( + exec_aten::Scalar value, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return full({}, value, type, dynamism); +} + +/** + * Creates a TensorPtr filled with ones, with the same size and properties as + * another tensor. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param type The scalar type of the tensor elements. If not specified, the + * scalar type of the `other` tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr ones_like( + const TensorPtr& other, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return full_like(other, 1, type, dynamism); +} + +/** + * Creates a TensorPtr filled with ones. + * + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr ones( + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return full(std::move(sizes), 1, type, dynamism); +} + +/** + * Creates a TensorPtr filled with zeros, with the same size and properties as + * another tensor. + * + * @param other A reference to another tensor, whose size and properties will be + * used. 
+ * @param type The scalar type of the tensor elements. If not specified, the + * scalar type of the `other` tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr zeros_like( + const TensorPtr& other, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return full_like(other, 0, type, dynamism); +} + +/** + * Creates a TensorPtr filled with zeros. + * + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr zeros( + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return full(std::move(sizes), 0, type, dynamism); +} + } // namespace extension } // namespace executorch diff --git a/extension/tensor/test/tensor_ptr_maker_test.cpp b/extension/tensor/test/tensor_ptr_maker_test.cpp index d1b4179a260..7530a3709ab 100644 --- a/extension/tensor/test/tensor_ptr_maker_test.cpp +++ b/extension/tensor/test/tensor_ptr_maker_test.cpp @@ -178,3 +178,142 @@ TEST_F(TensorPtrMakerTest, TensorDeleterReleasesCapturedSharedPtr) { EXPECT_TRUE(deleter_called); EXPECT_EQ(data_ptr.use_count(), 1); } + +TEST_F(TensorPtrMakerTest, CreateEmpty) { + auto tensor = empty({4, 5}); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + + auto tensor2 = empty({4, 5}, exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->dim(), 2); + EXPECT_EQ(tensor2->size(0), 4); + EXPECT_EQ(tensor2->size(1), 5); + EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int); + + auto tensor3 = empty({4, 5}, exec_aten::ScalarType::Long); + EXPECT_EQ(tensor3->dim(), 2); + EXPECT_EQ(tensor3->size(0), 4); + EXPECT_EQ(tensor3->size(1), 5); + EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long); + + auto tensor4 = empty({4, 5}, exec_aten::ScalarType::Double); + EXPECT_EQ(tensor4->dim(), 2); + EXPECT_EQ(tensor4->size(0), 4); + EXPECT_EQ(tensor4->size(1), 5); + EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double); +} + +TEST_F(TensorPtrMakerTest, CreateFull) { + auto tensor = full({4, 5}, 7); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + EXPECT_EQ(tensor->const_data_ptr()[0], 7); + + auto tensor2 = full({4, 5}, 3, exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->dim(), 2); + EXPECT_EQ(tensor2->size(0), 4); + EXPECT_EQ(tensor2->size(1), 5); + EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->const_data_ptr()[0], 3); + + auto tensor3 = full({4, 5}, 9, exec_aten::ScalarType::Long); + EXPECT_EQ(tensor3->dim(), 2); + EXPECT_EQ(tensor3->size(0), 4); + EXPECT_EQ(tensor3->size(1), 5); + EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long); + EXPECT_EQ(tensor3->const_data_ptr()[0], 9); + + auto tensor4 = full({4, 5}, 11, exec_aten::ScalarType::Double); + EXPECT_EQ(tensor4->dim(), 2); + EXPECT_EQ(tensor4->size(0), 4); + EXPECT_EQ(tensor4->size(1), 5); + 
EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double); + EXPECT_EQ(tensor4->const_data_ptr()[0], 11); +} + +TEST_F(TensorPtrMakerTest, CreateScalar) { + auto tensor = scalar_tensor(3.14f); + + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + EXPECT_EQ(tensor->const_data_ptr()[0], 3.14f); + + auto tensor2 = scalar_tensor(5, exec_aten::ScalarType::Int); + + EXPECT_EQ(tensor2->dim(), 0); + EXPECT_EQ(tensor2->numel(), 1); + EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->const_data_ptr()[0], 5); + + auto tensor3 = scalar_tensor(7.0, exec_aten::ScalarType::Double); + + EXPECT_EQ(tensor3->dim(), 0); + EXPECT_EQ(tensor3->numel(), 1); + EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Double); + EXPECT_EQ(tensor3->const_data_ptr()[0], 7.0); +} + +TEST_F(TensorPtrMakerTest, CreateOnes) { + auto tensor = ones({4, 5}); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + EXPECT_EQ(tensor->const_data_ptr()[0], 1); + + auto tensor2 = ones({4, 5}, exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->dim(), 2); + EXPECT_EQ(tensor2->size(0), 4); + EXPECT_EQ(tensor2->size(1), 5); + EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->const_data_ptr()[0], 1); + + auto tensor3 = ones({4, 5}, exec_aten::ScalarType::Long); + EXPECT_EQ(tensor3->dim(), 2); + EXPECT_EQ(tensor3->size(0), 4); + EXPECT_EQ(tensor3->size(1), 5); + EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long); + EXPECT_EQ(tensor3->const_data_ptr()[0], 1); + + auto tensor4 = ones({4, 5}, exec_aten::ScalarType::Double); + EXPECT_EQ(tensor4->dim(), 2); + EXPECT_EQ(tensor4->size(0), 4); + EXPECT_EQ(tensor4->size(1), 5); + EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double); + EXPECT_EQ(tensor4->const_data_ptr()[0], 1); +} + +TEST_F(TensorPtrMakerTest, CreateZeros) { + auto tensor = zeros({4, 5}); + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + EXPECT_EQ(tensor->const_data_ptr()[0], 0); + + auto tensor2 = zeros({4, 5}, exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->dim(), 2); + EXPECT_EQ(tensor2->size(0), 4); + EXPECT_EQ(tensor2->size(1), 5); + EXPECT_EQ(tensor2->scalar_type(), exec_aten::ScalarType::Int); + EXPECT_EQ(tensor2->const_data_ptr()[0], 0); + + auto tensor3 = zeros({4, 5}, exec_aten::ScalarType::Long); + EXPECT_EQ(tensor3->dim(), 2); + EXPECT_EQ(tensor3->size(0), 4); + EXPECT_EQ(tensor3->size(1), 5); + EXPECT_EQ(tensor3->scalar_type(), exec_aten::ScalarType::Long); + EXPECT_EQ(tensor3->const_data_ptr()[0], 0); + + auto tensor4 = zeros({4, 5}, exec_aten::ScalarType::Double); + EXPECT_EQ(tensor4->dim(), 2); + EXPECT_EQ(tensor4->size(0), 4); + EXPECT_EQ(tensor4->size(1), 5); + EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double); + EXPECT_EQ(tensor4->const_data_ptr()[0], 0); +} diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp index d5582630494..653e2ef98d7 100644 --- a/extension/tensor/test/tensor_ptr_test.cpp +++ b/extension/tensor/test/tensor_ptr_test.cpp @@ -197,6 +197,18 @@ TEST_F(TensorPtrTest, TensorOwningEmptyData) { EXPECT_EQ(tensor->strides()[0], 5); EXPECT_EQ(tensor->strides()[1], 1); EXPECT_EQ(tensor->data_ptr(), nullptr); + EXPECT_EQ(tensor->scalar_type(), 
exec_aten::ScalarType::Float); +} + +TEST_F(TensorPtrTest, TensorImplDataOnly) { + auto tensor = make_tensor_ptr({1.0f, 2.0f, 3.0f, 4.0f}); + + EXPECT_EQ(tensor->dim(), 1); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->strides()[0], 1); + EXPECT_EQ(tensor->const_data_ptr()[0], 1.0); + EXPECT_EQ(tensor->const_data_ptr()[3], 4.0); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); } TEST_F(TensorPtrTest, TensorImplDataOnlyDoubleType) { @@ -208,6 +220,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyDoubleType) { EXPECT_EQ(tensor->strides()[0], 1); EXPECT_EQ(tensor->const_data_ptr()[0], 1.0); EXPECT_EQ(tensor->const_data_ptr()[3], 4.0); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double); } TEST_F(TensorPtrTest, TensorImplDataOnlyInt32Type) { @@ -219,6 +232,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyInt32Type) { EXPECT_EQ(tensor->strides()[0], 1); EXPECT_EQ(tensor->const_data_ptr()[0], 10); EXPECT_EQ(tensor->const_data_ptr()[3], 40); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int); } TEST_F(TensorPtrTest, TensorImplDataOnlyInt64Type) { @@ -230,6 +244,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyInt64Type) { EXPECT_EQ(tensor->strides()[0], 1); EXPECT_EQ(tensor->const_data_ptr()[0], 100); EXPECT_EQ(tensor->const_data_ptr()[3], 400); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long); } TEST_F(TensorPtrTest, TensorImplDataOnlyUint8Type) { @@ -241,6 +256,7 @@ TEST_F(TensorPtrTest, TensorImplDataOnlyUint8Type) { EXPECT_EQ(tensor->strides()[0], 1); EXPECT_EQ(tensor->const_data_ptr()[0], 10); EXPECT_EQ(tensor->const_data_ptr()[3], 40); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Byte); } TEST_F(TensorPtrTest, TensorImplAmbiguityWithMixedVectors) { From 75a56a2058e7118aefe4dd36a1ca0948e412c0d6 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 11 Sep 2024 09:57:31 -0700 Subject: [PATCH 319/531] Add helper function to create random tensors. 
(#5266) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5266 Reviewed By: kirklandsign Differential Revision: D62501925 fbshipit-source-id: 790ca389887bb3921fe13d92dbc61e804cfe0c19 --- extension/tensor/tensor_ptr_maker.cpp | 63 ++++++ extension/tensor/tensor_ptr_maker.h | 185 ++++++++++++++++++ .../tensor/test/tensor_ptr_maker_test.cpp | 120 ++++++++++++ 3 files changed, 368 insertions(+) diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp index 1c7b0efe589..1a09fea4cac 100644 --- a/extension/tensor/tensor_ptr_maker.cpp +++ b/extension/tensor/tensor_ptr_maker.cpp @@ -8,9 +8,12 @@ #include +#include + namespace executorch { namespace extension { namespace { + template < typename INT_T, typename std::enable_if< @@ -72,6 +75,25 @@ bool extract_scalar(exec_aten::Scalar scalar, BOOL_T* out_val) { extract_scalar(scalar, &out_val), \ #scalar " could not be extracted: wrong type or out of range"); +template +TensorPtr random_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism, + Distribution&& distribution) { + auto tensor = + empty_strided(std::move(sizes), std::move(strides), type, dynamism); + std::default_random_engine gen{std::random_device{}()}; + + ET_SWITCH_REALB_TYPES(type, nullptr, "random_strided", CTYPE, [&] { + std::generate_n(tensor->mutable_data_ptr(), tensor->numel(), [&]() { + return static_cast(distribution(gen)); + }); + }); + return tensor; +} + } // namespace TensorPtr empty_strided( @@ -110,5 +132,46 @@ TensorPtr full_strided( return tensor; } +TensorPtr rand_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + std::uniform_real_distribution(0.0f, 1.0f)); +} + +TensorPtr randn_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + std::normal_distribution(0.0f, 1.0f)); +} + +TensorPtr randint_strided( + int64_t low, + int64_t high, + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type, + exec_aten::TensorShapeDynamism dynamism) { + return random_strided( + std::move(sizes), + std::move(strides), + type, + dynamism, + std::uniform_int_distribution(low, high - 1)); +} + } // namespace extension } // namespace executorch diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h index 132bd1f12c6..4e65480b7fd 100644 --- a/extension/tensor/tensor_ptr_maker.h +++ b/extension/tensor/tensor_ptr_maker.h @@ -497,5 +497,190 @@ inline TensorPtr zeros( return full(std::move(sizes), 0, type, dynamism); } +/** + * Creates a TensorPtr filled with random values between 0 and 1. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. 
+ **/ +TensorPtr rand_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates a TensorPtr filled with random values between 0 and 1. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param type The scalar type of the tensor elements. If not specified, the + * scalar type of the other tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr rand_like( + const TensorPtr& other, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + if (type == exec_aten::ScalarType::Undefined) { + type = other->scalar_type(); + } + return rand_strided( + {other->sizes().begin(), other->sizes().end()}, + {other->strides().begin(), other->strides().end()}, + type, + dynamism); +} + +/** + * Creates a TensorPtr filled with random values between 0 and 1. + * + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr rand( + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return rand_strided(std::move(sizes), {}, type, dynamism); +} + +/** + * Creates a TensorPtr filled with random values from a normal distribution. + * + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr randn_strided( + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates a TensorPtr filled with random values from a normal distribution. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param type The scalar type of the tensor elements. If not specified, the + * scalar type of the other tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr randn_like( + const TensorPtr& other, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + if (type == exec_aten::ScalarType::Undefined) { + type = other->scalar_type(); + } + return randn_strided( + {other->sizes().begin(), other->sizes().end()}, + {other->strides().begin(), other->strides().end()}, + type, + dynamism); +} + +/** + * Creates a TensorPtr filled with random values from a normal distribution. + * + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. 
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr randn( + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Float, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return randn_strided(std::move(sizes), {}, type, dynamism); +} + +/** + * Creates a TensorPtr filled with random integer values in the given range. + * + * @param low The lower bound (inclusive) of the random values. + * @param high The upper bound (exclusive) of the random values. + * @param sizes A vector specifying the size of each dimension. + * @param strides A vector specifying the stride for each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +TensorPtr randint_strided( + int64_t low, + int64_t high, + std::vector sizes, + std::vector strides, + exec_aten::ScalarType type = exec_aten::ScalarType::Int, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND); + +/** + * Creates a TensorPtr filled with random integer values in the given range. + * + * @param other A reference to another tensor, whose size and properties will be + * used. + * @param low The lower bound (inclusive) of the random values. + * @param high The upper bound (exclusive) of the random values. + * @param type The scalar type of the tensor elements. If not specified, the + * scalar type of the other tensor is used. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. + */ +inline TensorPtr randint_like( + const TensorPtr& other, + int64_t low, + int64_t high, + exec_aten::ScalarType type = exec_aten::ScalarType::Undefined, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + if (type == exec_aten::ScalarType::Undefined) { + type = other->scalar_type(); + } + return randint_strided( + low, + high, + {other->sizes().begin(), other->sizes().end()}, + {other->strides().begin(), other->strides().end()}, + type, + dynamism); +} + +/** + * Creates a TensorPtr filled with random integer values in the given range. + * + * @param low The lower bound (inclusive) of the random values. + * @param high The upper bound (exclusive) of the random values. + * @param sizes A vector specifying the size of each dimension. + * @param type The scalar type of the tensor elements. + * @param dynamism Specifies whether the tensor's shape is static or dynamic. + * @return A TensorPtr instance managing the newly created Tensor. 
+ */ +inline TensorPtr randint( + int64_t low, + int64_t high, + std::vector sizes, + exec_aten::ScalarType type = exec_aten::ScalarType::Int, + exec_aten::TensorShapeDynamism dynamism = + exec_aten::TensorShapeDynamism::DYNAMIC_BOUND) { + return randint_strided(low, high, std::move(sizes), {}, type, dynamism); +} + } // namespace extension } // namespace executorch diff --git a/extension/tensor/test/tensor_ptr_maker_test.cpp b/extension/tensor/test/tensor_ptr_maker_test.cpp index 7530a3709ab..41f3fa21439 100644 --- a/extension/tensor/test/tensor_ptr_maker_test.cpp +++ b/extension/tensor/test/tensor_ptr_maker_test.cpp @@ -317,3 +317,123 @@ TEST_F(TensorPtrMakerTest, CreateZeros) { EXPECT_EQ(tensor4->scalar_type(), exec_aten::ScalarType::Double); EXPECT_EQ(tensor4->const_data_ptr()[0], 0); } + +TEST_F(TensorPtrMakerTest, CreateRandTensor) { + auto tensor = rand({4, 5}); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + + for (auto i = 0; i < tensor->numel(); ++i) { + auto val = tensor->const_data_ptr()[i]; + EXPECT_GE(val, 0.0f); + EXPECT_LT(val, 1.0f); + } +} + +TEST_F(TensorPtrMakerTest, CreateRandTensorWithIntType) { + auto tensor = rand({4, 5}, exec_aten::ScalarType::Int); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int); + + for (auto i = 0; i < tensor->numel(); ++i) { + auto val = tensor->const_data_ptr()[i]; + EXPECT_EQ(val, 0); + } +} + +TEST_F(TensorPtrMakerTest, CreateRandTensorWithDoubleType) { + auto tensor = rand({4, 5}, exec_aten::ScalarType::Double); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double); + + for (auto i = 0; i < tensor->numel(); ++i) { + auto val = tensor->const_data_ptr()[i]; + EXPECT_GE(val, 0.0); + EXPECT_LT(val, 1.0); + } +} + +TEST_F(TensorPtrMakerTest, CreateRandnTensor) { + auto tensor = randn({4, 5}); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Float); + + auto sum = 0.0f; + for (auto i = 0; i < tensor->numel(); ++i) { + sum += tensor->const_data_ptr()[i]; + } + const auto average = sum / tensor->numel(); + EXPECT_NEAR(average, 0.0f, 0.5f); +} + +TEST_F(TensorPtrMakerTest, CreateRandnTensorWithDoubleType) { + auto tensor = randn({4, 5}, exec_aten::ScalarType::Double); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Double); + + auto sum = 0.0; + for (auto i = 0; i < tensor->numel(); ++i) { + sum += tensor->const_data_ptr()[i]; + } + const auto average = sum / tensor->numel(); + EXPECT_NEAR(average, 0.0, 0.5); +} + +TEST_F(TensorPtrMakerTest, CreateRandIntTensorWithIntType) { + auto tensor = randint(10, 20, {4, 5}, exec_aten::ScalarType::Int); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int); + + for (auto i = 0; i < tensor->numel(); ++i) { + auto val = tensor->const_data_ptr()[i]; + EXPECT_GE(val, 10); + EXPECT_LT(val, 20); + } +} + +TEST_F(TensorPtrMakerTest, CreateRandIntTensorWithLongType) { + auto tensor = randint(10, 20, {4, 5}, exec_aten::ScalarType::Long); + + 
EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Long); + + for (auto i = 0; i < tensor->numel(); ++i) { + auto val = tensor->const_data_ptr()[i]; + EXPECT_GE(val, 10); + EXPECT_LT(val, 20); + } +} + +TEST_F(TensorPtrMakerTest, CreateRandnTensorWithIntType) { + auto tensor = rand({4, 5}, exec_aten::ScalarType::Int); + + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 4); + EXPECT_EQ(tensor->size(1), 5); + EXPECT_EQ(tensor->scalar_type(), exec_aten::ScalarType::Int); + + for (auto i = 0; i < tensor->numel(); ++i) { + auto val = tensor->const_data_ptr()[i]; + EXPECT_EQ(val, 0); + } +} From e462e5a3f514270b164655eebc793c6b50599ef1 Mon Sep 17 00:00:00 2001 From: Max Ren Date: Wed, 11 Sep 2024 11:00:52 -0700 Subject: [PATCH 320/531] Bug fix partitioner (#5239) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5239 Forgot to copy over this part of the constraint when I refactored the partitioner Reviewed By: balakv504, kirklandsign Differential Revision: D62471496 fbshipit-source-id: d2fb76ffd2dea5671b38f89e050beac3a35aff4e --- backends/xnnpack/partition/config/node_configs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backends/xnnpack/partition/config/node_configs.py b/backends/xnnpack/partition/config/node_configs.py index 1e4d1f05fe4..2449d9d6440 100644 --- a/backends/xnnpack/partition/config/node_configs.py +++ b/backends/xnnpack/partition/config/node_configs.py @@ -85,6 +85,11 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: supported_dtypes = {torch.float32, torch.float16, torch.int8, torch.qint8} node_val = node.meta.get("val") output_0 = node_val[0] + + input_node = node.all_input_nodes[0] + if len(input_node.meta.get("val").shape) != 4: + why(node, f"Unsupported input rank {input_node.meta.get('val').shape}") + return False # Don't check indicies dtype if output_0.dtype not in supported_dtypes: why(node, f"Unsupported output dtype {output_0.dtype}") From 0af6c126be4560251aac51eb5402332fbd4583bc Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 11 Sep 2024 11:11:52 -0700 Subject: [PATCH 321/531] Use ones() to create tensors. (#5273) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5273 . 
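Illustrative sketch (not part of the diff below, just to clarify the intent of this change): the benchmark previously built each input by hand-allocating a byte buffer and handing it to make_tensor_ptr; with the ones() helper from extension/tensor the same input can be created in one call, roughly

    auto input = executorch::extension::ones(
        {sizes.begin(), sizes.end()}, tensor_meta->scalar_type());

where the helper allocates the storage and fills it with ones for the given shape and dtype, so no manually sized fill buffer is needed.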
Reviewed By: guangy10 Differential Revision: D62511672 fbshipit-source-id: 8d697c85ad80d1ba2a2e409676c2d804a12fb2d0 --- extension/apple/Benchmark/Tests/Tests.mm | 6 ++--- .../apple/Benchmark/Tests/Tests.xcconfig | 24 +++++++++---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/extension/apple/Benchmark/Tests/Tests.mm b/extension/apple/Benchmark/Tests/Tests.mm index dd85cb69542..2730cd8963f 100644 --- a/extension/apple/Benchmark/Tests/Tests.mm +++ b/extension/apple/Benchmark/Tests/Tests.mm @@ -96,10 +96,8 @@ + (void)initialize { XCTAssertEqual(tensor_meta.error(), Error::Ok); const auto sizes = tensor_meta->sizes(); - tensors.emplace_back(make_tensor_ptr( - tensor_meta->scalar_type(), - {sizes.begin(), sizes.end()}, - std::vector(tensor_meta->nbytes(), 0b01010101))); + tensors.emplace_back(ones({sizes.begin(), sizes.end()}, + tensor_meta->scalar_type())); inputs.emplace_back(tensors.back()); } break; default: diff --git a/extension/apple/Benchmark/Tests/Tests.xcconfig b/extension/apple/Benchmark/Tests/Tests.xcconfig index e8168046c3d..838cc61a43d 100644 --- a/extension/apple/Benchmark/Tests/Tests.xcconfig +++ b/extension/apple/Benchmark/Tests/Tests.xcconfig @@ -1,26 +1,26 @@ OTHER_LDFLAGS[sdk=iphonesimulator*] = $(inherited) \ -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-simulator-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-simulator-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a OTHER_LDFLAGS[sdk=iphoneos*] = $(inherited) \ -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-ios-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-ios-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-ios-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-release.a +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-ios-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-ios-release.a OTHER_LDFLAGS[sdk=macos*] = $(inherited) \ -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-macos-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-macos-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-macos-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-macos-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-macos-release.a \ -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-macos-release.a \ --force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-macos-release.a +-force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-macos-release.a \ +-force_load 
$(BUILT_PRODUCTS_DIR)/libkernels_custom-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_portable-macos-release.a \ +-force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-macos-release.a From 92d0559bc638989f5b2e64418e5f2dff76881759 Mon Sep 17 00:00:00 2001 From: generatedunixname89002005307016 Date: Wed, 11 Sep 2024 11:12:44 -0700 Subject: [PATCH 322/531] Add missing Pyre mode headers] [batch:21/424] [shard:6/N] Differential Revision: D62490198 fbshipit-source-id: 9a10193bda402a6132ddc681895de50da3fc9a70 --- backends/arm/arm_backend.py | 2 ++ backends/arm/arm_partitioner.py | 2 ++ backends/arm/arm_vela.py | 2 ++ backends/arm/operators/__init__.py | 2 ++ backends/arm/operators/node_visitor.py | 2 ++ backends/arm/operators/op_add.py | 2 ++ backends/arm/operators/op_addmm.py | 2 ++ backends/arm/operators/op_avg_pool2d.py | 2 ++ backends/arm/operators/op_batch_norm.py | 2 ++ backends/arm/operators/op_bmm.py | 2 ++ backends/arm/operators/op_cat.py | 2 ++ backends/arm/operators/op_conv2d.py | 2 ++ backends/arm/operators/op_dequant.py | 2 ++ backends/arm/operators/op_div.py | 2 ++ backends/arm/operators/op_exp.py | 2 ++ backends/arm/operators/op_full.py | 2 ++ backends/arm/operators/op_get_item.py | 2 ++ backends/arm/operators/op_hardtanh.py | 2 ++ backends/arm/operators/op_log.py | 2 ++ backends/arm/operators/op_mean_dim.py | 2 ++ backends/arm/operators/op_mm.py | 2 ++ backends/arm/operators/op_mul.py | 2 ++ backends/arm/operators/op_output.py | 2 ++ backends/arm/operators/op_permute.py | 2 ++ backends/arm/operators/op_placeholder.py | 2 ++ backends/arm/operators/op_quant.py | 2 ++ backends/arm/operators/op_relu.py | 2 ++ backends/arm/operators/op_repeat.py | 2 ++ backends/arm/operators/op_sigmoid.py | 2 ++ backends/arm/operators/op_slice.py | 2 ++ backends/arm/operators/op_softmax.py | 2 ++ backends/arm/operators/op_sub.py | 2 ++ backends/arm/operators/op_unsqueeze.py | 2 ++ backends/arm/operators/op_view.py | 2 ++ backends/arm/passes/annotate_channels_last_dim_order_pass.py | 2 ++ backends/arm/passes/arm_pass_manager.py | 2 ++ backends/arm/passes/convert_expand_copy_to_repeat.py | 2 ++ backends/arm/passes/convert_split_to_slice.py | 2 ++ backends/arm/passes/meandim_to_averagepool_pass.py | 2 ++ backends/arm/passes/remove_clone_pass.py | 2 ++ backends/arm/passes/size_adjust_conv2d_pass.py | 2 ++ backends/arm/passes/tag_io_quant_pass.py | 2 ++ backends/arm/quantizer/arm_quantizer.py | 2 ++ backends/arm/quantizer/arm_quantizer_utils.py | 2 ++ backends/arm/quantizer/quantization_annotation/__init__.py | 2 ++ .../quantization_annotation/adaptive_ang_pool2d_annotator.py | 2 ++ backends/arm/quantizer/quantization_annotation/add_annotator.py | 2 ++ backends/arm/quantizer/quantization_annotation/cat_annotator.py | 2 ++ .../arm/quantizer/quantization_annotation/conv_annotator.py | 2 ++ .../arm/quantizer/quantization_annotation/linear_annotator.py | 2 ++ .../quantizer/quantization_annotation/max_pool2d_annotator.py | 2 ++ backends/arm/quantizer/quantization_annotation/mm_annotator.py | 2 ++ backends/arm/quantizer/quantization_annotation/mul_annotator.py | 2 ++ .../quantizer/quantization_annotation/one_to_one_annotator.py | 2 ++ .../arm/quantizer/quantization_annotation/sigmoid_annotator.py | 2 ++ backends/arm/quantizer/quantization_annotation/sub_annotator.py | 2 ++ backends/arm/quantizer/quantization_config.py | 2 ++ backends/arm/tosa_mapping.py | 2 ++ backends/arm/tosa_quant_utils.py | 2 ++ backends/arm/tosa_utils.py | 2 ++ 60 files changed, 120 insertions(+) diff --git 
a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 27fd36ca0e1..7803cf84950 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Main implementation of AoT flow to partition and preprocess for Arm target # backends. Converts via TOSA as an intermediate form supported by AoT and diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index 524316613ff..6b57c3d9658 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import logging import operator import os diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index 53533947c49..d491437ded3 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import os import struct import tempfile diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 5c1109eec1f..7b94bfa837d 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from . import ( # noqa node_visitor, op_add, diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py index 59edc01e745..99fd0388e45 100644 --- a/backends/arm/operators/node_visitor.py +++ b/backends/arm/operators/node_visitor.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Dict, List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index 33c0c49744b..ec2ade9e8ad 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_addmm.py b/backends/arm/operators/op_addmm.py index 4a0581376c2..b4f782db4a3 100644 --- a/backends/arm/operators/op_addmm.py +++ b/backends/arm/operators/op_addmm.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py index e6d07610c81..4caaad92028 100644 --- a/backends/arm/operators/op_avg_pool2d.py +++ b/backends/arm/operators/op_avg_pool2d.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_batch_norm.py b/backends/arm/operators/op_batch_norm.py index c41941722b3..d17c3a1b81f 100644 --- a/backends/arm/operators/op_batch_norm.py +++ b/backends/arm/operators/op_batch_norm.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py index 59f28d3bad8..161b5d22396 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_bmm.py @@ -3,6 +3,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py index f2b41656572..652eb397371 100644 --- a/backends/arm/operators/op_cat.py +++ b/backends/arm/operators/op_cat.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 935c923ba42..64cde0724f5 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import cast, List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py index 269afceccb7..afa1dda9467 100644 --- a/backends/arm/operators/op_dequant.py +++ b/backends/arm/operators/op_dequant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_div.py b/backends/arm/operators/op_div.py index e365cf6cfe2..0857e0ed32a 100644 --- a/backends/arm/operators/op_div.py +++ b/backends/arm/operators/op_div.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py index f9319b5ea8b..f98bb3f88c2 100644 --- a/backends/arm/operators/op_exp.py +++ b/backends/arm/operators/op_exp.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_full.py b/backends/arm/operators/op_full.py index f929b02ee67..eec27bb9090 100644 --- a/backends/arm/operators/op_full.py +++ b/backends/arm/operators/op_full.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_get_item.py b/backends/arm/operators/op_get_item.py index 59004f49686..a696b33aa75 100644 --- a/backends/arm/operators/op_get_item.py +++ b/backends/arm/operators/op_get_item.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py index 3d58f6d628c..62c0a27f05f 100644 --- a/backends/arm/operators/op_hardtanh.py +++ b/backends/arm/operators/op_hardtanh.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py index a76eb57f710..5276173efa3 100644 --- a/backends/arm/operators/op_log.py +++ b/backends/arm/operators/op_log.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_mean_dim.py b/backends/arm/operators/op_mean_dim.py index 339aa62719f..3c9aea30856 100644 --- a/backends/arm/operators/op_mean_dim.py +++ b/backends/arm/operators/op_mean_dim.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_mm.py b/backends/arm/operators/op_mm.py index 98152215035..ebddb3a40e2 100644 --- a/backends/arm/operators/op_mm.py +++ b/backends/arm/operators/op_mm.py @@ -3,6 +3,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py index f7c593e9fe3..c152e8759ef 100644 --- a/backends/arm/operators/op_mul.py +++ b/backends/arm/operators/op_mul.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import cast, List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_output.py b/backends/arm/operators/op_output.py index 89654ed2d48..1b053b18edc 100644 --- a/backends/arm/operators/op_output.py +++ b/backends/arm/operators/op_output.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import cast import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index eafd6af3678..167a0c382f4 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index 918a270bb00..b5dcf3f9873 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import numpy as np import serializer.tosa_serializer as ts import torch.fx diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py index e6a62b3f206..8f83e79442d 100644 --- a/backends/arm/operators/op_quant.py +++ b/backends/arm/operators/op_quant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py index 5afe1ac7bce..20bba3f6545 100644 --- a/backends/arm/operators/op_relu.py +++ b/backends/arm/operators/op_relu.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import executorch.backends.arm.tosa_quant_utils as tqutils import serializer.tosa_serializer as ts import torch.fx diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index 261fcca12e7..20de9e0846a 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import serializer.tosa_serializer as ts import torch from executorch.backends.arm.operators.node_visitor import ( diff --git a/backends/arm/operators/op_sigmoid.py b/backends/arm/operators/op_sigmoid.py index 884c803482b..0087b1f7a81 100644 --- a/backends/arm/operators/op_sigmoid.py +++ b/backends/arm/operators/op_sigmoid.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py index e562e0724e2..0dfb287cd75 100644 --- a/backends/arm/operators/op_slice.py +++ b/backends/arm/operators/op_slice.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_softmax.py b/backends/arm/operators/op_softmax.py index 6baf4ea16f6..1ac42413189 100644 --- a/backends/arm/operators/op_softmax.py +++ b/backends/arm/operators/op_softmax.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 3dc1519f370..2089b6e9e96 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_unsqueeze.py b/backends/arm/operators/op_unsqueeze.py index a7ff8ce0b40..c14128fdc8c 100644 --- a/backends/arm/operators/op_unsqueeze.py +++ b/backends/arm/operators/op_unsqueeze.py @@ -5,6 +5,8 @@ # # Follows this specification: https://pytorch.org/docs/stable/generated/torch.unsqueeze.html +# pyre-unsafe + import serializer.tosa_serializer as ts import torch.fx from executorch.backends.arm.operators.node_visitor import ( diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 5baedfc9627..8667df590dc 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py index 8ba02c2f7e3..a5b657af49f 100644 --- a/backends/arm/passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/passes/annotate_channels_last_dim_order_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import cast import torch diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index db8511df613..75ef551171e 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.backends.arm.passes.annotate_channels_last_dim_order_pass import ( AnnotateChannelsLastDimOrder, diff --git a/backends/arm/passes/convert_expand_copy_to_repeat.py b/backends/arm/passes/convert_expand_copy_to_repeat.py index 5f409e1ae5f..249c014ae67 100644 --- a/backends/arm/passes/convert_expand_copy_to_repeat.py +++ b/backends/arm/passes/convert_expand_copy_to_repeat.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import cast import torch.fx diff --git a/backends/arm/passes/convert_split_to_slice.py b/backends/arm/passes/convert_split_to_slice.py index ff978d4d9ec..29aae37fe9e 100644 --- a/backends/arm/passes/convert_split_to_slice.py +++ b/backends/arm/passes/convert_split_to_slice.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import torch.fx from executorch.backends.arm.tosa_mapping import extract_tensor_meta from executorch.exir.dialects._ops import ops as exir_ops diff --git a/backends/arm/passes/meandim_to_averagepool_pass.py b/backends/arm/passes/meandim_to_averagepool_pass.py index 3f57e8023ca..0974eac740c 100644 --- a/backends/arm/passes/meandim_to_averagepool_pass.py +++ b/backends/arm/passes/meandim_to_averagepool_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Any, cast, Dict, Tuple import torch.fx diff --git a/backends/arm/passes/remove_clone_pass.py b/backends/arm/passes/remove_clone_pass.py index 6108080cb0d..64a1ae8f43e 100644 --- a/backends/arm/passes/remove_clone_pass.py +++ b/backends/arm/passes/remove_clone_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/passes/size_adjust_conv2d_pass.py b/backends/arm/passes/size_adjust_conv2d_pass.py index ea161b74928..980ab09e597 100644 --- a/backends/arm/passes/size_adjust_conv2d_pass.py +++ b/backends/arm/passes/size_adjust_conv2d_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import cast, Optional import torch.fx diff --git a/backends/arm/passes/tag_io_quant_pass.py b/backends/arm/passes/tag_io_quant_pass.py index d2bf74462ed..2fce6cf3fd4 100644 --- a/backends/arm/passes/tag_io_quant_pass.py +++ b/backends/arm/passes/tag_io_quant_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index e8fb78fea49..853fd47c29c 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Quantizer for Arm backend # diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index 1cac297bc92..fe9c5e34e6b 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Utility functions for ArmQuantizer # diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py index f3017c2d7df..f7219201dec 100644 --- a/backends/arm/quantizer/quantization_annotation/__init__.py +++ b/backends/arm/quantizer/quantization_annotation/__init__.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + from typing import Callable, Dict, List, NamedTuple, Optional diff --git a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py index acbdc45b6b9..723a48f6644 100644 --- a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/add_annotator.py b/backends/arm/quantizer/quantization_annotation/add_annotator.py index 2926e92f243..35801bd5681 100644 --- a/backends/arm/quantizer/quantization_annotation/add_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/add_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/cat_annotator.py b/backends/arm/quantizer/quantization_annotation/cat_annotator.py index 992070ac172..6e138cd9def 100644 --- a/backends/arm/quantizer/quantization_annotation/cat_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/cat_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools from typing import Callable, cast, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/conv_annotator.py b/backends/arm/quantizer/quantization_annotation/conv_annotator.py index 40a1f1ee9ea..4ff7dd9e800 100644 --- a/backends/arm/quantizer/quantization_annotation/conv_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/conv_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.f +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/linear_annotator.py b/backends/arm/quantizer/quantization_annotation/linear_annotator.py index 95b881a9548..7c3f91ec707 100644 --- a/backends/arm/quantizer/quantization_annotation/linear_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/linear_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py index 3d9d8b2e6c8..0ef2ee39fe5 100644 --- a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/mm_annotator.py b/backends/arm/quantizer/quantization_annotation/mm_annotator.py index 7fb5c51b224..b48c6d59905 100644 --- a/backends/arm/quantizer/quantization_annotation/mm_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mm_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/mul_annotator.py b/backends/arm/quantizer/quantization_annotation/mul_annotator.py index 6ec8f95531b..4717eac320d 100644 --- a/backends/arm/quantizer/quantization_annotation/mul_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mul_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py index 2c3c485b055..8d507c11ef3 100644 --- a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py index bd683d81f0b..3d242694836 100644 --- a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/sub_annotator.py b/backends/arm/quantizer/quantization_annotation/sub_annotator.py index 4686d480edb..92f1808d023 100644 --- a/backends/arm/quantizer/quantization_annotation/sub_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sub_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index f94c3e18da6..1e776d37a6f 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + from dataclasses import dataclass import torch diff --git a/backends/arm/tosa_mapping.py b/backends/arm/tosa_mapping.py index 5749d1e2043..0baf3e2ec1b 100644 --- a/backends/arm/tosa_mapping.py +++ b/backends/arm/tosa_mapping.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # PyTorch to Tosa mapping - simple mapping functions and multi-type extraction # of key information. These are used by the initial compile stage which captures diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index d93f2544070..8a90e432a69 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # Utiliy functions for TOSA quantized lowerings import math diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index 5353dd49fae..aee8aae8df3 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import logging import os from typing import Any, cast, Dict From 41b463e29d53ec860f44490c2f8574007d22e44d Mon Sep 17 00:00:00 2001 From: Lucy Qiu Date: Wed, 11 Sep 2024 11:26:43 -0700 Subject: [PATCH 323/531] Do not load constant_segment if only the placeholder exists (#5229) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5229 1. If constant segment offsets only contains one value, it is the placeholder value for non-const tensors. This means the constant segment is empty, and does not need to be loaded. (T201034228) 2. Clean up before returning error in `prepare_input_tensors`. (T201103889) Reviewed By: dbort Differential Revision: D62455987 fbshipit-source-id: ecd3ff792e4501b00c778242e89bdb97a6211e64 --- extension/runner_util/inputs.cpp | 1 + runtime/executor/program.cpp | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/extension/runner_util/inputs.cpp b/extension/runner_util/inputs.cpp index f4c77cae194..c33716be679 100644 --- a/extension/runner_util/inputs.cpp +++ b/extension/runner_util/inputs.cpp @@ -31,6 +31,7 @@ Result prepare_input_tensors(Method& method) { for (size_t i = 0; i < num_inputs; i++) { auto tag = method_meta.input_tag(i); if (!tag.ok()) { + BufferCleanup cleanup({inputs, num_allocated}); return tag.error(); } if (tag.get() != Tag::Tensor) { diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp index 6a889625c6a..48d36602d33 100644 --- a/runtime/executor/program.cpp +++ b/runtime/executor/program.cpp @@ -150,9 +150,12 @@ Result get_execution_plan( // Constant data may live inside the flatbuffer data (constant_buffer) or in a // separate segment (constant_segment). It should not be in both. + // Check constant_segment->offsets()->size() > 1, as the offsets list will + // always contain a placeholder value 0 for non-const tensors. If this is the + // only offset, the constant segment is empty and does not need to be loaded. 
const auto* constant_segment = flatbuffer_program->constant_segment();
   if (constant_segment != nullptr && constant_segment->offsets() != nullptr &&
-      constant_segment->offsets()->size() > 0) {
+      constant_segment->offsets()->size() > 1) {
     // The constant data is inside a separate segment.
     const auto* constant_buffer = flatbuffer_program->constant_buffer();
     ET_CHECK_OR_RETURN_ERROR(

From d80f78fc7aa38c65164a51c198d8bb07711ee54f Mon Sep 17 00:00:00 2001
From: Mergen Nachin
Date: Wed, 11 Sep 2024 12:30:02 -0700
Subject: [PATCH 324/531] Read SpinQuant checkpoints (#5259)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/5259

Read SpinQuant checkpoints that are exported with scales/weights.

bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: iseeyuan, helunwencser

Differential Revision: D62403094

fbshipit-source-id: 283ae18a1d2053306677086b9edd5cb5f38120ee
---
 examples/models/llama2/export_llama_lib.py    | 35 ++++---
 examples/models/llama2/model.py               | 47 +++++++++-
 .../source_transformation/spin_quant.py       | 93 +++++++++++++++++++
 examples/models/llama2/tests/TARGETS          | 13 +++
 .../llama2/tests/test_spinquant_transforms.py | 89 ++++++++++++++++++
 pytest.ini                                    |  2 +
 6 files changed, 262 insertions(+), 17 deletions(-)
 create mode 100644 examples/models/llama2/tests/test_spinquant_transforms.py

diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py
index 97228bb5c5d..2a03c0cebda 100644
--- a/examples/models/llama2/export_llama_lib.py
+++ b/examples/models/llama2/export_llama_lib.py
@@ -695,6 +695,7 @@ def _load_llama_model(
         fairseq2=weight_type == WeightType.FAIRSEQ2,
         max_seq_len=max_seq_len,
         enable_dynamic_shape=enable_dynamic_shape,
+        args=args,
     )
     state_dict = model.state_dict()
     dtype = state_dict[next(iter(state_dict))].dtype
@@ -747,9 +748,26 @@ def _get_source_transforms(
     transforms = []
     if args.quantization_mode:
         modelname = f"{modelname}_q"
-        transforms.append(
-            get_quant_weight_transform(args, dtype_override, verbose_export())
-        )
+        if args.use_spin_quant is None:
+            transforms.append(
+                get_quant_weight_transform(args, dtype_override, verbose_export())
+            )
+        # For SpinQuant, the checkpoints are already quantized,
+        # i.e. the weights come with their corresponding scale values,
+        # so we don't need to apply the quantization
+        # transform. However, we still need to apply
+        # transformations that change the model structure to
+        # match the checkpoint format.
+        # transform_for_spinquant() will apply these transformations
+        # later in model.py.
+ elif args.use_spin_quant == "cuda": + from .source_transformation.spin_quant import ( + inject_fast_hadamard_transform_cuda_for_spin_quant, + ) + + transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant) + elif args.use_spin_quant == "native": + raise NotImplementedError("native SpinQuant is not implemented yet.") if args.embedding_quantize: modelname = f"{modelname}_e" @@ -783,15 +801,4 @@ def _get_source_transforms( transforms.append(replace_sdpa_with_simple_sdpa) transforms.append(replace_causal_mask) - if args.use_spin_quant: - if args.use_spin_quant == "cuda": - from .source_transformation.spin_quant import ( - inject_fast_hadamard_transform_cuda_for_spin_quant, - ) - - transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant) - - elif args.use_spin_quant == "native": - raise NotImplementedError("native SpinQuant is not implemented yet.") - return transforms diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index f58a2a2def9..174f562f93a 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -65,6 +65,7 @@ def __init__(self, **kwargs): self.enable_dynamic_shape = kwargs.get("enable_dynamic_shape", False) self.max_seq_len = kwargs.get("max_seq_len", 128) + self.args = kwargs.get("args", None) # The example is using a dummy small model with random weights for demo purpose only. # Follow the instruction in https://github.com/facebookresearch/llama to download the model device = "cpu" @@ -126,7 +127,8 @@ def __init__(self, **kwargs): # get checkpoint dtype self.dtype = None if len(checkpoint) > 0: - first = checkpoint[next(iter(checkpoint))] + first_key = next(iter(checkpoint)) + first = checkpoint[first_key] self.dtype = first.dtype mismatched_dtypes = [ (key, value.dtype) @@ -135,7 +137,7 @@ def __init__(self, **kwargs): ] if len(mismatched_dtypes) > 0: print( - f"Mixed dtype model. Dtype of {first.key}: {first.dtype}. Mismatches in the checkpoint: {mismatched_dtypes}" + f"Mixed dtype model. Dtype of {first_key}: {first.dtype}. Mismatches in the checkpoint: {mismatched_dtypes}" ) with open(params_path, "r") as f: params = json.loads(f.read()) @@ -179,15 +181,54 @@ def __init__(self, **kwargs): self.model_ = Int8DynActInt4WeightQuantizer()._convert_for_runtime( self.model_ ) + elif hasattr(self.args, "use_spin_quant") and self.args.use_spin_quant: + print("Using SPIN quantization.") + assert hasattr(self.args, "group_size"), "group_size must be specified" + assert hasattr( + self.args, "quantization_mode" + ), "quantization_mode must be specified" + assert hasattr( + self.args, "dtype_override" + ), "dtype_override must be specified" + from .source_transformation.spin_quant import ( + sanitize_checkpoint_from_spinquant, + transform_for_spinquant, + ) + + mapping = { + "fp32": torch.float32, + "fp16": torch.float16, + "bf16": torch.bfloat16, + } + + self.model_ = transform_for_spinquant( + self.model_, + checkpoint, + self.args.group_size, + self.args.quantization_mode, + mapping[self.args.dtype_override], + ) + + sanitize_checkpoint_from_spinquant( + checkpoint, + self.args.group_size, + ) # assign=True: load params/buffers by assignment instead of performing an in-place copy. # Because we are using device="meta", tensors do not have memory associated with them # and an in-place copy is a no-op. Use assign=True in load_state_dict for this scenario. 
- self.model_.load_state_dict( + missing, unexpected = self.model_.load_state_dict( checkpoint, strict=False, assign=True, ) # self.model_ = Transformer(gptconf) + if kwargs.get("verbose", False): + print("============= missing keys ================") + print(missing) + print("============= /missing ================") + print("============= unexpected keys ================") + print(unexpected) + print("============= /unexpected ================") def get_eager_model(self): if self.dtype: diff --git a/examples/models/llama2/source_transformation/spin_quant.py b/examples/models/llama2/source_transformation/spin_quant.py index 7b38312c182..a45db190f48 100644 --- a/examples/models/llama2/source_transformation/spin_quant.py +++ b/examples/models/llama2/source_transformation/spin_quant.py @@ -9,12 +9,16 @@ # Helper functions for tranforming the model to be able to run SpinQuant. # See https://github.com/facebookresearch/SpinQuant for more details about SpinQuant. +from typing import Any + import torch import torch.nn.functional as F from executorch.examples.models.llama2.llama_transformer import FeedForward from torch import nn +from torchao.quantization.GPTQ import _check_linear_int4_k, Int8DynActInt4WeightLinear +from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module): @@ -53,3 +57,92 @@ def inject_fast_hadamard_transform_cuda_for_spin_quant( ) -> torch.nn.Module: _inject_fast_hadamard_transform_cuda_for_spin_quant(module) return module + + +def _replace_linear_with_linear_8da4w_for_spin_quant( + module: torch.nn.Module, + checkpoint: Any, + group_size: int, + precision: torch.dtype, + scales_precision: torch.dtype, +): + def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool: + # Only replace linear layers where the checkpoint contains explicit scales + scales_key = f"{cur_fqn}.scale" + if isinstance(child, nn.Linear) and scales_key in checkpoint: + assert _check_linear_int4_k(child.in_features, group_size) + assert checkpoint[f"{cur_fqn}.weight"].dtype == torch.int8 + assert checkpoint[scales_key].dtype == scales_precision + return True + return False + + def replacement_fn(child: torch.nn.Module) -> torch.nn.Module: + new_linear = Int8DynActInt4WeightLinear( + child.in_features, + child.out_features, + bias=False, + device=child.weight.device, + groupsize=group_size, + precision=precision, + scales_precision=scales_precision, + ) + return new_linear + + _replace_with_custom_fn_if_matches_filter(module, replacement_fn, filter_fn) + + +def transform_for_spinquant( + module: torch.nn.Module, + checkpoint: Any, + group_size: int, + quantization_mode: str, + dtype: torch.dtype, +) -> torch.nn.Module: + """ + Transform the model to be able to load SpinQuant checkpoints that + are quantized with the given group size and quantization mode. + """ + + if group_size not in [32, 64, 128, 256]: + raise ValueError(f"Group size {group_size} is not supported for SpinQuant.") + if quantization_mode not in ["8da4w"]: + raise ValueError( + f"Quantization mode {quantization_mode} is not compatible with SpinQuant." + ) + _replace_linear_with_linear_8da4w_for_spin_quant( + module, + checkpoint, + group_size, + dtype, + dtype, + ) + return module + + +def sanitize_checkpoint_from_spinquant( + checkpoint: Any, + group_size: int, +): + """ + Sanitize the SpinQuant checkpoint. 
+ - Renames 'scale' to 'scales' + - Groups scales + - Removes 'o_weight' + - Converts all tensors to contiguous format + """ + keys_to_rename = [] + keys_to_remove = [] + for k, _ in checkpoint.items(): + if k.endswith(".scale"): + new_key = k + "s" + keys_to_rename.append((k, new_key)) + if k.endswith(".o_weight"): + keys_to_remove.append(k) + + for old_key, new_key in keys_to_rename: + old_val = checkpoint.pop(old_key) + checkpoint[new_key] = old_val if group_size == -1 else old_val[:, ::group_size] + for k in keys_to_remove: + checkpoint.pop(k) + for k, v in checkpoint.items(): + checkpoint[k] = v.contiguous() diff --git a/examples/models/llama2/tests/TARGETS b/examples/models/llama2/tests/TARGETS index 3d2aef6209f..76981d8f317 100644 --- a/examples/models/llama2/tests/TARGETS +++ b/examples/models/llama2/tests/TARGETS @@ -13,3 +13,16 @@ python_unittest( "//executorch/examples/models/llama2:llama_transformer", ], ) + +python_unittest( + name = "test_spinquant_transforms", + srcs = [ + "test_spinquant_transforms.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama2:export_library", + "//executorch/examples/models/llama2:llama_transformer", + "//pytorch/ao:torchao", + ], +) diff --git a/examples/models/llama2/tests/test_spinquant_transforms.py b/examples/models/llama2/tests/test_spinquant_transforms.py new file mode 100644 index 00000000000..bd56632c5f5 --- /dev/null +++ b/examples/models/llama2/tests/test_spinquant_transforms.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama2.source_transformation.spin_quant import ( + sanitize_checkpoint_from_spinquant, + transform_for_spinquant, +) +from torchao.quantization.utils import group_quantize_tensor_symmetric + + +class SpinQuantTests(unittest.TestCase): + def test_transforms_for_spinquant(self): + + # Step 1: Create llama class with dummy weights + params = { + "dim": 768, + "multiple_of": 32, + "n_heads": 12, + "n_layers": 12, + "norm_eps": 1e-05, + "vocab_size": 32000, + } + + model_args = ModelArgs( + max_seq_len=2048, + max_batch_size=1, + use_kv_cache=False, + use_sdpa_with_kv_cache_op=False, + generate_full_logits=False, + enable_dynamic_shape=True, + **params, + ) + + model = Transformer(model_args) + checkpoint = model.state_dict() + + # Step 2: + # Do group-wise quantization and amend the checkpoints with + # int8 weight and fp32 scales + group_size = 32 + n_bit = 4 + scales_precision = torch.float32 + for fqn, mod in model.named_modules(): + # Quantize everything except the last layer + if isinstance(mod, torch.nn.Linear) and ("output" not in fqn): + weight = mod.weight.data + ( + weight_int8, + scales, + zeros, + ) = group_quantize_tensor_symmetric( + weight.to(torch.float32), n_bit, group_size, scales_precision + ) + checkpoint[f"{fqn}.weight"] = weight_int8.to("cpu") + checkpoint[f"{fqn}.scale"] = scales.to("cpu") + + # Step 3: + # Transform the model so that it is compatible with the new checkpoint + transform_for_spinquant( + model, + checkpoint, + 32, + "8da4w", + torch.float32, + ) + sanitize_checkpoint_from_spinquant( + checkpoint, + -1, + ) + + model.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + + new_checkpoint = model.state_dict() + + 
for k, v in checkpoint.items(): + # The new_checkpoint contains zeros so + # have to iterate over the keys. + self.assertTrue(torch.allclose(new_checkpoint[k], v)) diff --git a/pytest.ini b/pytest.ini index 7298773255a..701c0187ecf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -38,6 +38,8 @@ addopts = test/end2end/test_end2end.py --ignore=backends/xnnpack/test/ops/linear.py --ignore=backends/xnnpack/test/models/llama2_et_example.py + # T200992559: Add torchao to ET as core dependency + --ignore=examples/models/llama2/tests/test_spinquant_transforms.py --ignore=exir/backend/test/demos --ignore=exir/backend/test/test_backends.py --ignore=exir/backend/test/test_backends_lifted.py From 6ac13656a2d83cbf5514d8e29ff8c303792bfc8d Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 11 Sep 2024 13:25:21 -0700 Subject: [PATCH 325/531] Fix Android LlamaDemo setup.sh (#5274) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5274 Reviewed By: shoumikhin Differential Revision: D62514059 Pulled By: kirklandsign fbshipit-source-id: 9e19c88d2926ab6a13dee8daffdbb58eaa4a85aa --- examples/demo-apps/android/LlamaDemo/setup.sh | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 5e65929426b..b89c1829944 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -31,26 +31,13 @@ else fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release -cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - -cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android From d6897222e75becaea17d6a0355ac1bf36c9c352d Mon Sep 17 00:00:00 2001 From: Olivia Liu Date: Wed, 11 Sep 2024 13:48:42 -0700 Subject: [PATCH 326/531] Make `compare_results()` import path public (#5225) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5225 `compare_results()` is a util function that is used outside of the inspector tool itself. We will also probably mention it in PTC as a public util function. This diff makes it "public" aka removes the underscore prefix in the import paths. 
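For example (illustrative usage only, mirroring the import-path change made to `devtools/inspector/inspector_cli.py` in this diff), callers can now import the helper from the package root:

    from executorch.devtools.inspector import compare_results, TimeScale

instead of reaching into the private module:

    from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale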
Reviewed By: dbort Differential Revision: D62450959 fbshipit-source-id: 5688a79001b76c2e99ac76a9238828c6031f7291 --- devtools/inspector/TARGETS | 2 +- devtools/inspector/__init__.py | 13 +++++++++++-- devtools/inspector/inspector_cli.py | 4 +++- docs/source/sdk-debugging.md | 2 +- examples/apple/coreml/scripts/inspector_cli.py | 2 +- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/devtools/inspector/TARGETS b/devtools/inspector/TARGETS index 2b1cbecff32..bba5f7f8951 100644 --- a/devtools/inspector/TARGETS +++ b/devtools/inspector/TARGETS @@ -26,8 +26,8 @@ python_binary( main_function = ".inspector_cli.main", main_src = "inspector_cli.py", deps = [ - ":inspector_utils", "//executorch/devtools:lib", + "//executorch/devtools/inspector:lib", ], ) diff --git a/devtools/inspector/__init__.py b/devtools/inspector/__init__.py index ff9bb814791..375123a0a5b 100644 --- a/devtools/inspector/__init__.py +++ b/devtools/inspector/__init__.py @@ -4,12 +4,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from executorch.devtools.inspector._inspector import ( Event, EventBlock, Inspector, PerfData, ) -from executorch.devtools.inspector._inspector_utils import TimeScale +from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale -__all__ = ["Event", "EventBlock", "Inspector", "PerfData", "TimeScale"] +__all__ = [ + "Event", + "EventBlock", + "Inspector", + "PerfData", + "compare_results", + "TimeScale", +] diff --git a/devtools/inspector/inspector_cli.py b/devtools/inspector/inspector_cli.py index bd76607a944..db3536a84bf 100644 --- a/devtools/inspector/inspector_cli.py +++ b/devtools/inspector/inspector_cli.py @@ -4,10 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import argparse from executorch.devtools import Inspector -from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale +from executorch.devtools.inspector import compare_results, TimeScale def main() -> None: diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 14d4af0f153..b890653558e 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -67,7 +67,7 @@ We've also provided a simple set of utilities that let users perform quality ana ```python -from executorch.devtools.inspector._inspector_utils import compare_results +from executorch.devtools.inspector import compare_results # Run a simple quality analysis between the model outputs sourced from the # runtime and a set of reference outputs. diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py index e0b81d4affb..c63d4791fcf 100644 --- a/examples/apple/coreml/scripts/inspector_cli.py +++ b/examples/apple/coreml/scripts/inspector_cli.py @@ -9,7 +9,7 @@ from pathlib import Path from executorch.devtools import Inspector -from executorch.devtools.inspector._inspector_utils import compare_results +from executorch.devtools.inspector import compare_results def get_root_dir_path() -> Path: From 338ef26098849504db3a43b32c8eb447304804cb Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Wed, 11 Sep 2024 14:13:18 -0700 Subject: [PATCH 327/531] Remove explicit dereferencing for TesnorPtr converted implicitly to EValue. 
(#5278) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5278 Reviewed By: kirklandsign Differential Revision: D62512518 fbshipit-source-id: c1f32dd398cb58833ca3fa95b0cd1ab5c9984de9 --- examples/qualcomm/oss_scripts/llama2/runner/runner.cpp | 2 +- extension/llm/runner/text_decoder_runner.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp index 0ccaefa79e0..d8da43c74ce 100644 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp @@ -145,7 +145,7 @@ Result Runner::run_model_step( token->mutable_data_ptr()[0] = input_token; // inputs:[tokens, start_pos, atten_mask, k_cache, v_cache] - auto outputs_res = module_->forward({*token, *start_pos, *atten_mask}); + auto outputs_res = module_->forward({token, start_pos, atten_mask}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); // TODO: need to handle batch size != 1 diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 928a21244a2..faf4d1344e1 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -42,7 +42,7 @@ ::executorch::runtime::Result TextDecoderRunner::step( TensorPtr& start_pos) { // ET_LOG(Info, "Input token %" PRIu64, input_token); if (use_kv_cache_) { - auto outputs_res = module_->forward({*tokens, *start_pos}); + auto outputs_res = module_->forward({tokens, start_pos}); ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); ET_CHECK_MSG( outputs_res.get().size() == 1, From 6328d41eb87ee2fe8a8c0c5532e0b07ec4e90c73 Mon Sep 17 00:00:00 2001 From: Olivia Liu Date: Wed, 11 Sep 2024 14:32:32 -0700 Subject: [PATCH 328/531] Rename "SDK" -> "Developer Tools" in documentations (OSS files) (#5238) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5238 Part of the bigger "sdk" rename to "devtools" task, context in [post](https://fb.workplace.com/groups/222849770514616/permalink/467450562721201/). This diff replaces "SDK" with "Developer Tools" in the documentations. 
Reviewed By: dbort Differential Revision: D62462792 fbshipit-source-id: 18a46779a9e6d7fb782f563fe73f7c612afc4838 --- CMakeLists.txt | 2 +- README.md | 6 +++--- devtools/debug_format/base_schema.py | 4 +++- devtools/debug_format/et_schema.py | 4 +++- devtools/etrecord/_etrecord.py | 6 ++++-- devtools/etrecord/tests/etrecord_test.py | 4 +++- .../compiler-delegate-and-partitioner.md | 8 ++++---- docs/source/concepts.md | 4 ++-- docs/source/getting-started-architecture.md | 6 +++--- docs/source/index.rst | 8 ++++---- docs/source/intro-overview.md | 6 +++--- docs/source/llm/getting-started.md | 4 ++-- ...e-delegates-executorch-xnnpack-delegate.md | 2 +- docs/source/sdk-debugging.md | 2 +- docs/source/sdk-delegate-integration.md | 2 +- docs/source/sdk-etdump.md | 2 +- docs/source/sdk-etrecord.rst | 2 +- docs/source/sdk-overview.md | 20 +++++++++---------- docs/source/sdk-profiling.md | 4 ++-- docs/source/sdk-tutorial.md | 4 ++-- .../sdk-integration-tutorial.py | 14 ++++++------- runtime/core/event_tracer.h | 2 +- 22 files changed, 62 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a19f405e80c..add38ec56e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,7 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF) -option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK") +option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch Developer Tools") option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF) diff --git a/README.md b/README.md index 914eab472e7..6368c873f62 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. +- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. diff --git a/devtools/debug_format/base_schema.py b/devtools/debug_format/base_schema.py index b987c288744..9b6247051ec 100644 --- a/devtools/debug_format/base_schema.py +++ b/devtools/debug_format/base_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + """ -Base Intermediate Representation for Productivity SDK consumers +Base Intermediate Representation for Developer Tools consumers (e.g. TensorBoard, Terminal Debugger) """ diff --git a/devtools/debug_format/et_schema.py b/devtools/debug_format/et_schema.py index abe155233ae..bb15d70abc4 100644 --- a/devtools/debug_format/et_schema.py +++ b/devtools/debug_format/et_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + """ -Intermediate Representation of ExecuTorch Concepts in Productivity SDK +Intermediate Representation of ExecuTorch Concepts in Developer Tools """ from __future__ import annotations diff --git a/devtools/etrecord/_etrecord.py b/devtools/etrecord/_etrecord.py index cd213254980..de7cf93990a 100644 --- a/devtools/etrecord/_etrecord.py +++ b/devtools/etrecord/_etrecord.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import json import os import pickle @@ -182,7 +184,7 @@ def generate_etrecord( is the closest graph module representation of what is eventually run on the device. In addition to all the graph modules, we also serialize the program buffer, which the users can provide to the ExecuTorch runtime to run the model, and the debug handle map - for SDK tooling usage. + for Developer Tools usage. Args: et_record: Path to where the `ETRecord` file will be saved to. @@ -201,7 +203,7 @@ def generate_etrecord( etrecord_zip = ZipFile(et_record, "w") # Write the magic file identifier that will be used to verify that this file - # is an etrecord when it's used later in the SDK tooling. + # is an etrecord when it's used later in the Developer Tools. etrecord_zip.writestr(ETRecordReservedFileNames.ETRECORD_IDENTIFIER, "") if export_modules is not None: diff --git a/devtools/etrecord/tests/etrecord_test.py b/devtools/etrecord/tests/etrecord_test.py index b8e08dfe8c1..daef7c3e1e2 100644 --- a/devtools/etrecord/tests/etrecord_test.py +++ b/devtools/etrecord/tests/etrecord_test.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import copy import json import tempfile @@ -75,7 +77,7 @@ def get_test_model_with_manager(self): return (aten_dialect, edge_program_copy, edge_program.to_executorch()) # Serialized and deserialized graph modules are not completely the same, so we check - # that they are close enough and match especially on the parameters we care about in the SDK. + # that they are close enough and match especially on the parameters we care about in the Developer Tools. def check_graph_closeness(self, graph_a, graph_b): self.assertEqual(len(graph_a.graph.nodes), len(graph_b.graph.nodes)) for node_a, node_b in zip(graph_a.graph.nodes, graph_b.graph.nodes): diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index fa41ec93c9d..c82af7d98fe 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -127,13 +127,13 @@ static auto success_with_compiler = register_backend(backend); ``` -## SDK Integration: Debuggability +## Developer Tools Integration: Debuggability -Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native SDK (Software Development Kit) for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./sdk-etrecord). +Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native Developer Tools for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./sdk-etrecord). 
-Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native SDK does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. +Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native Developer Tools does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. -In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, SDK provides an interface to correlate delegated (sub)graph to original (sub)graph. The SDK does so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [SDK delegate integration](./sdk-delegate-integration). +In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, Developer Tools provide an interface to correlate delegated (sub)graph to original (sub)graph. The Developer Tools do so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [Developer Tools Delegate Integration](./sdk-delegate-integration). By leveraging the debug identifier, backend developer can embed the debug as part of the delegated blob diff --git a/docs/source/concepts.md b/docs/source/concepts.md index 33d944c376a..0c1512b5519 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -283,9 +283,9 @@ Techniques for performing computations and memory accesses on tensors with lower The ExecuTorch runtime executes models on edge devices. It is responsible for program initialization, program execution and, optionally, destruction (releasing backend owned resources). -## [SDK](./sdk-overview.md) +## [Developer Tools](./sdk-overview.md) -Software Development Kit. The tooling users need to profile, debug and visualize programs that are running with ExecuTorch. +A collection of tools users need to profile, debug and visualize programs that are running with ExecuTorch. ## [Selective build](./kernel-library-selective-build.md) diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index 2c3f85aff17..bccb74b2104 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -87,8 +87,8 @@ The ExecuTorch runtime is written in C++ with minimal dependencies for portabili _Executor_ is the entry point to load the program and execute it. 
The execution triggers corresponding operator kernels or backend execution from this very minimal runtime. -## SDK +## Developer Tools -It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch SDK](./sdk-overview.md) to improve productivity. The SDK is not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. +It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](./sdk-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. -During the program preparation and execution, users can use the ExecuTorch SDK to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. +During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. diff --git a/docs/source/index.rst b/docs/source/index.rst index d8955c513e4..3b0e0959cd7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -193,7 +193,7 @@ Topics in this section will help you get started with ExecuTorch. .. toctree:: :glob: :maxdepth: 1 - :caption: SDK + :caption: Developer Tools :hidden: sdk-overview @@ -244,11 +244,11 @@ ExecuTorch tutorials. :tags: .. customcarditem:: - :header: Using the ExecuTorch SDK to Profile a Model - :card_description: A tutorial for using the ExecuTorch SDK to profile and analyze a model with linkage back to source code. + :header: Using the ExecuTorch Developer Tools to Profile a Model + :card_description: A tutorial for using the ExecuTorch Developer Tools to profile and analyze a model with linkage back to source code. :image: _static/img/generic-pytorch-logo.png :link: tutorials/sdk-integration-tutorial.html - :tags: SDK + :tags: devtools .. customcarditem:: :header: Integrating and Running ExecuTorch on Apple Platforms diff --git a/docs/source/intro-overview.md b/docs/source/intro-overview.md index f80caff4679..96c7982b8fe 100644 --- a/docs/source/intro-overview.md +++ b/docs/source/intro-overview.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. 
+- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 9c03399444e..46a5dc604fc 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -746,7 +746,7 @@ In the fragment of the output for nanoGPT below, observe that embedding and add ### Performance Analysis -Through the ExecuTorch SDK, users are able to profile model execution, giving timing information for each operator in the model. +Through the ExecuTorch Developer Tools, users are able to profile model execution, giving timing information for each operator in the model. #### Prerequisites @@ -805,7 +805,7 @@ if (result.buf != nullptr && result.size > 0) { } ``` -Additionally, update CMakeLists.txt to build with SDK and enable events to be traced and logged into ETDump: +Additionally, update CMakeLists.txt to build with Developer Tools and enable events to be traced and logged into ETDump: ``` option(EXECUTORCH_BUILD_SDK "" ON) diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 1d12daef9d8..72e90161d0c 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,7 +74,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index b890653558e..88c05b8c03f 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -1,6 +1,6 @@ # Debugging Models in ExecuTorch -With the ExecuTorch SDK, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). 
+With the ExecuTorch Developer Tools, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). Currently, ExecuTorch supports the following debugging flows: - Extraction of model level outputs via ETDump. diff --git a/docs/source/sdk-delegate-integration.md b/docs/source/sdk-delegate-integration.md index 80033711552..a2f67157c89 100644 --- a/docs/source/sdk-delegate-integration.md +++ b/docs/source/sdk-delegate-integration.md @@ -1,4 +1,4 @@ -# SDK Delegate Integration +# Developer Tools Delegate Integration [Delegate backends](compiler-delegate-and-partitioner.md) are a prominent component of on-device models due to their flexibility in defining behavior. A side effect of this flexibility is that it operates as an opaque transformation. This obfuscates rich associations and mutations that are valuable in post-processing. - For example, if two different operator fusions were to occur within a delegate, post processing wouldn’t be able to separate the two transformations. diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index aad623efc8a..c58efb40de7 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -1,6 +1,6 @@ # Prerequisite | ETDump - ExecuTorch Dump -ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch SDK experience. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. +ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch Developer Tools. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. ## Generating an ETDump diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index b3b7f042cc4..63546f43ca6 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -9,7 +9,7 @@ users ahead of time (when they export their model to run on ExecuTorch). To draw a rough equivalent to conventional software development, ``ETRecord`` can be considered as the binary built with debug symbols that is used for debugging in GNU Debugger (gdb). It is expected that -the user will supply this to the ExecuTorch SDK tooling in order for +the user will supply this to the ExecuTorch Developer Tools in order for them to debug and visualize their model. ``ETRecord`` contains numerous components such as: diff --git a/docs/source/sdk-overview.md b/docs/source/sdk-overview.md index 53f7d88613a..13fd8e00597 100644 --- a/docs/source/sdk-overview.md +++ b/docs/source/sdk-overview.md @@ -1,12 +1,12 @@ -# Introduction to the ExecuTorch SDK +# Introduction to the ExecuTorch Developer Tools -ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch SDK enables this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. 
+ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch Developer Tools enable this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. -All the components of the SDK have been designed from the ground up with deep integration in both the export process and the runtime. This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. +All the components of the Developer Tools have been designed from the ground up with deep integration in both the export process and the runtime. This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. -## SDK Features +## Developer Tools Features -The ExecuTorch SDK supports the following features: +The ExecuTorch Developer Tools support the following features: - **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. - **Profiling** models with operator level breakdown of performance stats @@ -17,12 +17,12 @@ The ExecuTorch SDK supports the following features: - **Debugging** - Intermediate outputs and output quality analysis - **Visualization** - Coming soon -## Fundamental components of the SDK +## Fundamental components of the Developer Tools -In order to fully understand and leverage the power of the SDK in this section, the fundamental components that power the SDK will be detailed. +In order to fully understand and leverage the power of the Developer Tools in this section, the fundamental components that power the Developer Tools will be detailed. ### ETRecord -ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the SDK tooling to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. +ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the Developer Tools to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. To draw a rough equivalence to conventional software development ETRecord can be considered as the binary built with debug symbols that is used for debugging in GNU Project debugger (gdb). @@ -32,13 +32,13 @@ More details are available in the [ETRecord documentation](sdk-etrecord.rst) on ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly as above, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. ```{note} -If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the SDK. For the full experience, it is recommended that the users also generate an ETRecord. 
+If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the Developer Tools. For the full experience, it is recommended that the users also generate an ETRecord. ``` More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. ### Inspector APIs -The Inspector Python APIs are the main user enrty point into the SDK. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. +The Inspector Python APIs are the main user enrty point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. diff --git a/docs/source/sdk-profiling.md b/docs/source/sdk-profiling.md index 83276d8d180..f827d108b1f 100644 --- a/docs/source/sdk-profiling.md +++ b/docs/source/sdk-profiling.md @@ -4,7 +4,7 @@ Profiling in ExecuTorch gives users access to these runtime metrics: - Model Load Time. - Operator Level Execution Time. - Delegate Execution Time. - - If the delegate that the user is calling into has been integrated with the [SDK](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. + - If the delegate that the user is calling into has been integrated with the [Developer Tools](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. - End-to-end Inference Execution Time. One uniqe aspect of ExecuTorch Profiling is the ability to link every runtime executed operator back to the exact line of python code from which this operator originated. This capability enables users to easily identify hotspots in their model, source them back to the exact line of Python code, and optimize if chosen to. @@ -20,4 +20,4 @@ We provide access to all the profiling data via the Python [Inspector API](./sdk - Through the Inspector API, users can do a wide range of analysis varying from printing out performance details to doing more finer granular calculation on module level. -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. +Please refer to the [Developer Tools tutorial](./tutorials/sdk-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. diff --git a/docs/source/sdk-tutorial.md b/docs/source/sdk-tutorial.md index 90c9ed6d343..2fad3ea9366 100644 --- a/docs/source/sdk-tutorial.md +++ b/docs/source/sdk-tutorial.md @@ -1,3 +1,3 @@ -## SDK usage tutorial +## Developer Tools Usage Tutorial -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the SDK. +Please refer to the [Developer Tools tutorial](./tutorials/sdk-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools. 
diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index 35d200204cb..bc1d2ebe788 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -6,23 +6,23 @@ # LICENSE file in the root directory of this source tree. """ -Using the ExecuTorch SDK to Profile a Model +Using the ExecuTorch Developer Tools to Profile a Model ======================== **Author:** `Jack Khuu `__ """ ###################################################################### -# The `ExecuTorch SDK <../sdk-overview.html>`__ is a set of tools designed to +# The `ExecuTorch Developer Tools <../sdk-overview.html>`__ is a set of tools designed to # provide users with the ability to profile, debug, and visualize ExecuTorch # models. # -# This tutorial will show a full end-to-end flow of how to utilize the SDK. +# This tutorial will show a full end-to-end flow of how to utilize the Developer Tools to profile a model. # Specifically, it will: # -# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). +# 1. Generate the artifacts consumed by the Developer Tools (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). # 2. Create an Inspector class consuming these artifacts. -# 3. Utilize the Inspector class to analyze the model. +# 3. Utilize the Inspector class to analyze the model profiling result. ###################################################################### # Prerequisites @@ -288,13 +288,13 @@ def forward(self, x): # ---------- # # In this tutorial, we learned about the steps required to consume an ExecuTorch -# model with the ExecuTorch SDK. It also showed how to use the Inspector APIs +# model with the ExecuTorch Developer Tools. It also showed how to use the Inspector APIs # to analyze the model run results. # # Links Mentioned # ^^^^^^^^^^^^^^^ # -# - `ExecuTorch SDK <../sdk-overview.html>`__ +# - `ExecuTorch Developer Tools Overview <../sdk-overview.html>`__ # - `ETRecord <../sdk-etrecord.html>`__ # - `ETDump <../sdk-etdump.html>`__ # - `Inspector <../sdk-inspector.html>`__ diff --git a/runtime/core/event_tracer.h b/runtime/core/event_tracer.h index eb8ee1fefc9..5a26d24ca45 100644 --- a/runtime/core/event_tracer.h +++ b/runtime/core/event_tracer.h @@ -97,7 +97,7 @@ struct EventTracerEntry { * EventTracer is a class that users can inherit and implement to * log/serialize/stream etc. the profiling and debugging events that are * generated at runtime for a model. An example of this is the ETDump - * implementation in the SDK codebase that serializes these events to a + * implementation in the devtools codebase that serializes these events to a * flatbuffer. */ class EventTracer { From 7c76e0302ed8decada5626eed0aadf356b150c2d Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 11 Sep 2024 14:57:30 -0700 Subject: [PATCH 329/531] Switch Optimizer to std::map (#5230) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5230 Switch to map api which is directly compatible with TrainingModule Update the simple end 2 end test to use TrainingModule as well. 
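To make the new optimizer surface concrete before the diff itself, here is a hedged C++ sketch of how the map-based API is driven from TrainingModule. The method name "forward", the 0.1 learning rate, and the error handling are illustrative assumptions; the updated training_loop_test.cpp further down is the authoritative usage.

```cpp
// Sketch only: one training step with the std::map-based SGD API.
#include <executorch/extension/training/module/training_module.h>
#include <executorch/extension/training/optimizer/sgd.h>

#include <vector>

using executorch::extension::training::TrainingModule;
using executorch::extension::training::optimizer::SGD;
using executorch::extension::training::optimizer::SGDOptions;

executorch::runtime::Error train_one_step(
    TrainingModule& mod,
    const std::vector<executorch::runtime::EValue>& inputs) {
  // Run the joint forward/backward graph so gradients are populated.
  auto outputs = mod.execute_forward_backward("forward", inputs);
  if (!outputs.ok()) {
    return outputs.error();
  }
  // Parameters and gradients are both keyed by fully qualified name, so they
  // plug directly into the optimizer without any index alignment.
  auto params = mod.named_parameters("forward");
  if (!params.ok()) {
    return params.error();
  }
  auto grads = mod.named_gradients("forward");
  if (!grads.ok()) {
    return grads.error();
  }
  SGD optimizer(params.get(), SGDOptions{/*lr=*/0.1});
  return optimizer.step(grads.get());
}
```

Compared with the previous Span-based step(gradient_names, gradient_data), there is no length-mismatch failure mode left, which is why the corresponding test is removed below.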
Reviewed By: davidlin54 Differential Revision: D62453507 fbshipit-source-id: d40929997d42ea827a97f6fb2a1e38250ac298da --- extension/training/module/training_module.cpp | 4 + extension/training/module/training_module.h | 3 +- extension/training/optimizer/sgd.cpp | 167 ++++++++---------- extension/training/optimizer/sgd.h | 61 +++---- .../training/optimizer/test/sgd_test.cpp | 62 ++----- extension/training/test/targets.bzl | 1 + .../training/test/training_loop_test.cpp | 74 +++----- 7 files changed, 147 insertions(+), 225 deletions(-) diff --git a/extension/training/module/training_module.cpp b/extension/training/module/training_module.cpp index 7b38292fd1f..28128552e2f 100644 --- a/extension/training/module/training_module.cpp +++ b/extension/training/module/training_module.cpp @@ -107,6 +107,10 @@ TrainingModule::named_parameters(const std::string& method_name) { uint64_t param_start = param_res.get()[0].toInt(); + auto e = executorch::extension::Module::load_method(method_name); + if (e != runtime::Error::Ok) { + return e; + } auto& method = methods_.at(method_name).method; // create dict diff --git a/extension/training/module/training_module.h b/extension/training/module/training_module.h index 7571aacecf6..ade3f6f2f4f 100644 --- a/extension/training/module/training_module.h +++ b/extension/training/module/training_module.h @@ -68,8 +68,7 @@ class ET_EXPERIMENTAL TrainingModule final : executorch::extension::Module { * parameters for. * * @returns A Result object containing a map of the fully qualified name to - * parameter tensor, or an error if the method is not a joint graph or has not - * been executed yet. + * parameter tensor, or an error if the method is not a joint graph. */ ET_EXPERIMENTAL runtime::Result> diff --git a/extension/training/optimizer/sgd.cpp b/extension/training/optimizer/sgd.cpp index ad6130183e5..fd63722b4f7 100644 --- a/extension/training/optimizer/sgd.cpp +++ b/extension/training/optimizer/sgd.cpp @@ -16,7 +16,6 @@ using exec_aten::Tensor; using exec_aten::TensorImpl; using ::executorch::runtime::Error; using ::executorch::runtime::KernelRuntimeContext; -using ::executorch::runtime::Span; namespace executorch { namespace extension { @@ -39,25 +38,13 @@ void SGDParamGroup::set_options(std::unique_ptr options) { options_ = std::move(options); } -Span SGDParamGroup::param_names() { - return param_names_; -} - -const Span SGDParamGroup::param_names() const { - return param_names_; -} - -Span SGDParamGroup::param_data() { - return param_data_; -} - -const Span SGDParamGroup::param_data() const { - return param_data_; +const std::map& +SGDParamGroup::named_parameters() const { + return named_parameters_; } void SGD::add_param_group(const SGDParamGroup& param_group) { - SGDParamGroup param_group_( - param_group.param_names(), param_group.param_data()); + SGDParamGroup param_group_(param_group.named_parameters()); if (!param_group.has_options()) { param_group_.set_options(defaults_->clone()); } else { @@ -66,13 +53,8 @@ void SGD::add_param_group(const SGDParamGroup& param_group) { param_groups_.emplace_back(std::move(param_group_)); } -Error SGD::step(Span gradient_names, Span gradient_data) { - // check that the number of gradient names matches the number of gradients - ET_CHECK_OR_RETURN_ERROR( - gradient_names.size() == gradient_data.size(), - InvalidState, - "Gradient names and gradients must have the same length."); - +Error SGD::step(const std::map& + named_gradients) { KernelRuntimeContext context; for (auto& group : param_groups_) { auto& options = 
static_cast(group.options()); @@ -81,85 +63,82 @@ Error SGD::step(Span gradient_names, Span gradient_data) { auto dampening = options.dampening(); auto nesterov = options.nesterov(); - for (int i = 0; i < group.param_names().size(); i++) { - for (int j = 0; j < gradient_names.size(); j++) { - // if param name and gradient name match, run the optimizer step - if (strcmp(group.param_names()[i], gradient_names[j]) == 0) { - auto d_p = gradient_data[j]; - auto p = group.param_data()[i]; - if (weight_decay != 0) { - // uses weight_decay specified and adds it to the gradient - torch::executor::aten::add_outf(context, d_p, p, weight_decay, d_p); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } + for (auto param_iter = group.named_parameters().begin(); + param_iter != group.named_parameters().end(); + ++param_iter) { + // if param name and gradient name match, run the optimizer step + const auto& named_gradient = named_gradients.find(param_iter->first); + if (named_gradient != named_gradients.end()) { + auto d_p = named_gradient->second; + auto p = param_iter->second; + if (weight_decay != 0) { + // uses weight_decay specified and adds it to the gradient + torch::executor::aten::add_outf(context, d_p, p, weight_decay, d_p); + if (context.failure_state() != Error::Ok) { + return context.failure_state(); } - if (momentum != 0) { - Tensor buf(nullptr); - auto param_state = state_.find(p.unsafeGetTensorImpl()); - // look for the momentum buffer for the given parameter. this is the - // momentum as of the previous epoch - if (param_state == state_.end()) { - // create a new momentum buffer if it doesn't exist. this memory - // needs to be freed when the optimizer is destroyed - void* buf_ptr = malloc(d_p.nbytes()); + } + if (momentum != 0) { + Tensor buf(nullptr); + auto param_state = state_.find(p.unsafeGetTensorImpl()); + // look for the momentum buffer for the given parameter. this is the + // momentum as of the previous epoch + if (param_state == state_.end()) { + // create a new momentum buffer if it doesn't exist. 
this memory + // needs to be freed when the optimizer is destroyed + void* buf_ptr = malloc(d_p.nbytes()); #ifdef USE_ATEN_LIB - std::vector sizes( - d_p.sizes().begin(), d_p.sizes().end()); - buf = torch::from_blob(buf_ptr, sizes, d_p.scalar_type()); + std::vector sizes(d_p.sizes().begin(), d_p.sizes().end()); + buf = torch::from_blob(buf_ptr, sizes, d_p.scalar_type()); #else - TensorImpl* buf_impl = new TensorImpl( - d_p.scalar_type(), - d_p.sizes().size(), - const_cast(d_p.sizes().data()), - buf_ptr, - const_cast( - d_p.dim_order().data())); - buf = Tensor(buf_impl); + TensorImpl* buf_impl = new TensorImpl( + d_p.scalar_type(), + d_p.sizes().size(), + const_cast(d_p.sizes().data()), + buf_ptr, + const_cast(d_p.dim_order().data())); + buf = Tensor(buf_impl); #endif - torch::executor::aten::clone_outf( - context, d_p, exec_aten::MemoryFormat::Contiguous, buf); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } - - // save the state of the momentum buffer to be reused in later - // epochs - auto state = std::make_unique(buf); - state_[p.unsafeGetTensorImpl()] = std::move(state); - } else { - buf = static_cast(*param_state->second) - .momentum_buffer(); - - // update the momentum buffer and apply dampening - torch::executor::aten::mul_outf(context, buf, momentum, buf); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } - torch::executor::aten::add_outf( - context, buf, d_p, 1 - dampening, buf); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } + torch::executor::aten::clone_outf( + context, d_p, exec_aten::MemoryFormat::Contiguous, buf); + if (context.failure_state() != Error::Ok) { + return context.failure_state(); } - if (nesterov) { - // apply nesterov momentum - torch::executor::aten::add_outf(context, d_p, buf, momentum, d_p); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); - } - } else { - d_p = buf; + + // save the state of the momentum buffer to be reused in later + // epochs + auto state = std::make_unique(buf); + state_[p.unsafeGetTensorImpl()] = std::move(state); + } else { + buf = static_cast(*param_state->second) + .momentum_buffer(); + + // update the momentum buffer and apply dampening + torch::executor::aten::mul_outf(context, buf, momentum, buf); + if (context.failure_state() != Error::Ok) { + return context.failure_state(); + } + torch::executor::aten::add_outf( + context, buf, d_p, 1 - dampening, buf); + if (context.failure_state() != Error::Ok) { + return context.failure_state(); } } - // update the parameter using the gradient and learning rate - torch::executor::aten::add_outf( - context, p, d_p, -1 * options.lr(), p); - if (context.failure_state() != Error::Ok) { - return context.failure_state(); + if (nesterov) { + // apply nesterov momentum + torch::executor::aten::add_outf(context, d_p, buf, momentum, d_p); + if (context.failure_state() != Error::Ok) { + return context.failure_state(); + } + } else { + d_p = buf; } - break; + } + // update the parameter using the gradient and learning rate + torch::executor::aten::add_outf(context, p, d_p, -1 * options.lr(), p); + if (context.failure_state() != Error::Ok) { + return context.failure_state(); } } } diff --git a/extension/training/optimizer/sgd.h b/extension/training/optimizer/sgd.h index fb797e4d5d6..3a85f85c77b 100644 --- a/extension/training/optimizer/sgd.h +++ b/extension/training/optimizer/sgd.h @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include @@ 
-133,52 +133,42 @@ class SGDParamGroup { // NOTE: In order to store `SGDParamGroup` in a `std::vector`, it has // to be copy-constructible. SGDParamGroup(const SGDParamGroup& param_group) - : param_data_(param_group.param_data()), - param_names_(param_group.param_names()), + : named_parameters_(param_group.named_parameters()), options_( param_group.has_options() ? param_group.options().clone() : nullptr) {} SGDParamGroup& operator=(const SGDParamGroup& param_group) { - this->param_data_ = param_group.param_data(); - this->param_names_ = param_group.param_names(); + this->named_parameters_ = param_group.named_parameters_; this->options_ = param_group.has_options() ? param_group.options().clone() : nullptr; return *this; } /** - * This constructs a SGD param group. We expect that the two spans are of the - * same size, and that for a given param data, its index in param_data is the - * same as its param name in param_name. + * Constructs a SGD param group. * - * @param[in] param_names The names of the params for this group. - * @param[in] param_data The tensors representing the param data. + * @param[in] named_parameters The parameters to be optimized and their fully + * qualified names. */ /* implicit */ SGDParamGroup( - ::executorch::runtime::Span param_names, - ::executorch::runtime::Span param_data) - : param_data_(std::move(param_data)), - param_names_(std::move(param_names)) {} + const std::map& + named_parameters) + : named_parameters_(named_parameters) {} SGDParamGroup( - ::executorch::runtime::Span param_names, - ::executorch::runtime::Span param_data, + const std::map& + named_parameters, std::unique_ptr options) - : param_data_(std::move(param_data)), - param_names_(std::move(param_names)), - options_(std::move(options)) {} + : named_parameters_(named_parameters), options_(std::move(options)) {} bool has_options() const; SGDOptions& options(); const SGDOptions& options() const; void set_options(std::unique_ptr options); - ::executorch::runtime::Span param_names(); - const ::executorch::runtime::Span param_names() const; - ::executorch::runtime::Span param_data(); - const ::executorch::runtime::Span param_data() const; + const std::map& named_parameters() + const; private: - ::executorch::runtime::Span param_data_; - ::executorch::runtime::Span param_names_; + std::map named_parameters_; std::unique_ptr options_; }; @@ -198,11 +188,10 @@ class SGD { } explicit SGD( - ::executorch::runtime::Span param_names, - ::executorch::runtime::Span param_data, + const std::map& + named_parameters, SGDOptions defaults) - : SGD({SGDParamGroup(std::move(param_names), std::move(param_data))}, - defaults) {} + : SGD({SGDParamGroup(named_parameters)}, defaults) {} // Adds the given param_group to the optimizer's param_group list. void add_param_group(const SGDParamGroup& param_group); @@ -212,18 +201,12 @@ class SGD { /** * Performs the optimization step. * - * The two spans must be of the same size. It is expected that the gradient in - * 'gradient_data' at index 'i' represents the gradient calculated in the loss - * function for the parameter with the name in 'gradient_names' at index 'i'. - * - * @param[in] gradient_names The names of the params that matches the gradient - * in 'gradient_data' at the same index. - * @param[in] gradient_data The gradient tensors to be used for optimization - * step. + * @param[in] named_gradients The gradients of the tensors specified by the + * fully qualified name. 
*/ ::executorch::runtime::Error step( - ::executorch::runtime::Span gradient_names, - ::executorch::runtime::Span gradient_data); + const std::map& + named_gradients); private: std::vector param_groups_; diff --git a/extension/training/optimizer/test/sgd_test.cpp b/extension/training/optimizer/test/sgd_test.cpp index 33a70b4fe95..92b329d8edb 100644 --- a/extension/training/optimizer/test/sgd_test.cpp +++ b/extension/training/optimizer/test/sgd_test.cpp @@ -23,7 +23,6 @@ using ::executorch::extension::training::optimizer::SGD; using ::executorch::extension::training::optimizer::SGDOptions; using ::executorch::extension::training::optimizer::SGDParamState; using ::executorch::runtime::Error; -using ::executorch::runtime::Span; using ::executorch::runtime::testing::TensorFactory; class SGDOptimizerTest : public ::testing::Test { @@ -69,70 +68,47 @@ TEST_F(SGDOptimizerTest, SGDOptionsDefaultValuesTest) { TEST_F(SGDOptimizerTest, SGDOptimizerSimple) { TensorFactory tf; - const char* param_name[1] = {"param1"}; - Span param_names(param_name, 1); + std::map named_parameters; + std::map named_gradients; - Tensor param_data[1] = {tf.make({1, 1}, {1})}; - Span param_data_span(param_data, 1); + named_parameters.insert({"param1", tf.make({1, 1}, {1})}); // dummy gradient of -1 for all epochs - Tensor grad_data[1] = {tf.make({1, 1}, {-1})}; - Span grad_data_span(grad_data, 1); + named_gradients.insert({"param1", tf.make({1, 1}, {-1})}); - SGD optimizer(param_names, param_data_span, SGDOptions{0.1}); + SGD optimizer(named_parameters, SGDOptions{0.1}); for (int i = 0; i < 10; ++i) { - optimizer.step(param_names, grad_data_span); + optimizer.step(named_gradients); } auto p1 = static_cast( - param_data_span[0].unsafeGetTensorImpl()->data()); + named_parameters.at("param1").unsafeGetTensorImpl()->data()); EXPECT_NEAR(p1[0], 2.0, 0.1); } -TEST_F(SGDOptimizerTest, SGDOptimizerMismatchedGradientSpans) { - TensorFactory tf; - - const char* param_name[1] = {"param1"}; - Span param_names(param_name, 1); - - Tensor param_data[1] = {tf.make({1, 1}, {1})}; - Span param_data_span(param_data, 1); - - // dummy gradient of -1 for all epochs - Tensor grad_data[2] = {tf.make({1, 1}, {-1}), tf.make({1, 1}, {-1})}; - Span grad_data_span(grad_data, 2); - - SGD optimizer(param_names, param_data_span, SGDOptions{0.1}); - - Error error = optimizer.step(param_names, grad_data_span); - - EXPECT_EQ(error, Error::InvalidState); -} - TEST_F(SGDOptimizerTest, SGDOptimizerComplex) { TensorFactory tf; - const char* param_name[2] = {"param1", "param2"}; - Span param_names(param_name, 2); + std::map named_parameters; - Tensor param_data[2] = {tf.make({1, 1}, {1.0}), tf.make({1, 1}, {2.0})}; - Span param_data_span(param_data, 2); + named_parameters.insert({"param1", tf.make({1, 1}, {1.0})}); + named_parameters.insert({"param2", tf.make({1, 1}, {2.0})}); - SGD optimizer(param_names, param_data_span, SGDOptions{0.1, 0.1, 0, 2, true}); + SGD optimizer(named_parameters, SGDOptions{0.1, 0.1, 0, 2, true}); for (int i = 0; i < 10; ++i) { + std::map named_gradients; // dummy gradient of -1 for all epochs - Tensor grad_data[2] = {tf.make({1, 1}, {-1}), tf.make({1, 1}, {-1})}; - Span grad_data_span(grad_data, 2); - - optimizer.step(param_names, grad_data_span); + named_gradients.insert({"param1", tf.make({1, 1}, {-1})}); + named_gradients.insert({"param2", tf.make({1, 1}, {-1})}); + optimizer.step(named_gradients); } - auto p1 = static_cast( - param_data_span[0].unsafeGetTensorImpl()->data()); - auto p2 = static_cast( - 
param_data_span[1].unsafeGetTensorImpl()->data()); + auto p1 = + static_cast(named_parameters.at("param1").const_data_ptr()); + auto p2 = + static_cast(named_parameters.at("param2").const_data_ptr()); EXPECT_NEAR(p1[0], 0.540303, 0.1); EXPECT_NEAR(p2[0], 0.620909, 0.1); } diff --git a/extension/training/test/targets.bzl b/extension/training/test/targets.bzl index 22107409c2a..9710f512060 100644 --- a/extension/training/test/targets.bzl +++ b/extension/training/test/targets.bzl @@ -30,6 +30,7 @@ def define_common_targets(is_fbcode = False): "//executorch/extension/evalue_util:print_evalue", "//executorch/runtime/executor/test:managed_memory_manager", "//executorch/extension/training/optimizer:sgd", + "//executorch/extension/training/module:training_module", "//executorch/kernels/portable:generated_lib", ], env = modules_env, diff --git a/extension/training/test/training_loop_test.cpp b/extension/training/test/training_loop_test.cpp index 8e62663c9f7..bc162ab26bb 100644 --- a/extension/training/test/training_loop_test.cpp +++ b/extension/training/test/training_loop_test.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -30,73 +31,52 @@ using exec_aten::Tensor; using namespace torch::executor; using torch::executor::util::FileDataLoader; -constexpr size_t kDefaultNonConstMemBytes = 32 * 1024; -constexpr size_t kDefaultRuntimeMemBytes = 32 * 1024; - class TrainingLoopTest : public ::testing::Test { protected: - void SetUp() override { - // Create a loader for the serialized ModuleAdd program. - const char* path = std::getenv("ET_MODULE_SIMPLE_TRAIN_PATH"); - Result loader = FileDataLoader::from(path); - ASSERT_EQ(loader.error(), Error::Ok); - loader_ = std::make_unique(std::move(loader.get())); - - // Use it to load the program. - Result program = Program::load( - loader_.get(), Program::Verification::InternalConsistency); - ASSERT_EQ(program.error(), Error::Ok); - program_ = std::make_unique(std::move(program.get())); - } - - // Must outlive program_, but tests shouldn't need to touch it. - std::unique_ptr loader_; - - std::unique_ptr program_; + void SetUp() override {} }; TEST_F(TrainingLoopTest, OptimizerSteps) { - // Execute model with constants stored in segment. - ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes); - Result method = program_->load_method("forward", &mmm.get()); - ASSERT_EQ(method.error(), Error::Ok); + const char* path = std::getenv("ET_MODULE_SIMPLE_TRAIN_PATH"); + executorch::runtime::Result + loader_res = torch::executor::util::FileDataLoader::from(path); + ASSERT_EQ(loader_res.error(), Error::Ok); + auto loader = std::make_unique( + std::move(loader_res.get())); + + auto mod = executorch::extension::training::TrainingModule(std::move(loader)); // Create inputs. TensorFactory tf; Tensor input = tf.make({3}, {1.0, 1.0, 1.0}); Tensor label = tf.make({3}, {1.0, 0.0, 0.0}); - Error e = method->set_input(input, 0); - e = method->set_input(label, 1); + auto res = mod.execute_forward_backward("forward", {input, label}); + ASSERT_TRUE(res.ok()); // Set up optimizer. 
- const char* param_name[2] = {"mod.linear1.weight", "mod.linear2.bias"}; - Span param_names(param_name, 2); - - Tensor param_data[2] = { - method.get().get_output(3).toTensor(), // mod.linear1.weight - method.get().get_output(4).toTensor()}; // mod.linear1.bias - Span param_data_span(param_data, 2); - - auto orig_data = param_data[0].data_ptr()[0]; + // Get the params and names + auto param_res = mod.named_parameters("forward"); + ASSERT_EQ(param_res.error(), Error::Ok); - Tensor grad_data[2] = { - method.get().get_output(1).toTensor(), // mod.linear1.weight.grad - method.get().get_output(2).toTensor()}; // mod.linear1.bias.grad - ; - Span grad_data_span(grad_data, 2); + float orig_data = param_res.get().at("linear.weight").data_ptr()[0]; SGDOptions options{0.1}; - SGD optimizer(param_names, param_data_span, options); + SGD optimizer(param_res.get(), options); - // Execute the method. (Forward and Backward) - Error err = method->execute(); - ASSERT_EQ(err, Error::Ok); + // Get the gradients + auto grad_res = mod.named_gradients("forward"); + ASSERT_EQ(grad_res.error(), Error::Ok); + auto& grad = grad_res.get(); + ASSERT_EQ(grad.size(), 2); + ASSERT_NE(grad.find("linear.weight"), grad.end()); + ASSERT_NE(grad.find("linear.bias"), grad.end()); // Step - auto opt_err = optimizer.step(param_names, grad_data_span); + auto opt_err = optimizer.step(grad_res.get()); ASSERT_EQ(opt_err, Error::Ok); // Check that the data has changed. - ASSERT_NE(param_data[0].data_ptr()[0], orig_data); + ASSERT_NE( + param_res.get().at("linear.weight").data_ptr()[0], orig_data); } From de3057283783ea7bd47105db5e95d3b22fc8f295 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 11 Sep 2024 15:44:42 -0700 Subject: [PATCH 330/531] Just print Android instrument log when the test passes (#5280) Summary: This should make it easier to reuse the test spec on Minibench where we don't need to rely on instrument tests to run the benchmark anymore while just keeping it to satisfy AWS as one of its required input. 
Pull Request resolved: https://github.com/pytorch/executorch/pull/5280 Reviewed By: kirklandsign Differential Revision: D62521107 Pulled By: huydhn fbshipit-source-id: 08c1ec2ceb4ba2044931267d379a07595304c116 --- .../LlamaDemo/android-llm-device-farm-test-spec.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml index 896e7b73fbf..dc6401806d4 100644 --- a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml +++ b/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml @@ -63,14 +63,7 @@ phases: # Check for this last to make sure that there is no failure elif [ $TESTS_PASSED -ne 0 ]; then - OBSERVED_TPS=$(grep "INSTRUMENTATION_STATUS: TPS=" $INSTRUMENT_LOG | tail -n 1) - - if [ -n "${OBSERVED_TPS}" ]; - then - echo "[PyTorch] ${OBSERVED_TPS}"; - else - echo "[PyTorch] Test passes but couldn't find the observed TPS from instrument log"; - fi + cat "${INSTRUMENT_LOG}" fi; # Run the new generic benchmark activity https://developer.android.com/tools/adb#am From 7e3ec96c76f5ef03774f33b38d08661ad69b89e5 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Wed, 11 Sep 2024 16:31:00 -0700 Subject: [PATCH 331/531] Remove references to exec_aten::RuntimeContext (#5257) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5257 RuntimeContext should never have been in that namespace since it's not an ATen type. There are still other internal users outside of //executorch, but the important thing right now is that we set good examples for OSS users. Reviewed By: manuelcandales Differential Revision: D62478758 fbshipit-source-id: 2c0c753a5ef7a766fdde4dd796c12eedce312868 --- extension/llm/custom_ops/op_sdpa_aot.cpp | 2 +- extension/llm/custom_ops/op_sdpa_test.cpp | 2 +- extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp | 2 +- kernels/aten/cpu/op__to_dim_order_copy.cpp | 2 +- kernels/optimized/cpu/op_gelu.cpp | 2 +- kernels/portable/cpu/op__to_dim_order_copy.cpp | 2 +- kernels/portable/test/op_gelu_test.cpp | 2 +- kernels/quantized/test/op_add_test.cpp | 4 ++-- kernels/quantized/test/op_embedding4b_test.cpp | 4 ++-- kernels/quantized/test/op_embedding_test.cpp | 4 ++-- kernels/quantized/test/op_mixed_linear_test.cpp | 6 +++--- kernels/quantized/test/op_mixed_mm_test.cpp | 4 ++-- kernels/test/TestUtil.h | 2 +- kernels/test/custom_kernel_example/op_relu.cpp | 5 +++-- kernels/test/op_atan2_test.cpp | 2 +- kernels/test/op_cdist_forward_test.cpp | 2 +- kernels/test/op_clamp_test.cpp | 2 +- kernels/test/op_diagonal_copy_test.cpp | 2 +- kernels/test/op_expm1_test.cpp | 2 +- kernels/test/op_flip_test.cpp | 2 +- kernels/test/op_ge_test.cpp | 2 +- kernels/test/op_gt_test.cpp | 2 +- kernels/test/op_le_test.cpp | 2 +- kernels/test/op_log10_test.cpp | 2 +- kernels/test/op_log1p_test.cpp | 2 +- kernels/test/op_log2_test.cpp | 2 +- kernels/test/op_lt_test.cpp | 2 +- kernels/test/op_maximum_test.cpp | 2 +- kernels/test/op_native_batch_norm_test.cpp | 2 +- kernels/test/op_native_group_norm_test.cpp | 2 +- kernels/test/op_ne_test.cpp | 4 ++-- kernels/test/op_pdist_forward_test.cpp | 2 +- kernels/test/op_prod_test.cpp | 4 ++-- kernels/test/op_reflection_pad1d_test.cpp | 2 +- kernels/test/op_reflection_pad2d_test.cpp | 2 +- kernels/test/op_reflection_pad3d_test.cpp | 2 +- kernels/test/op_replication_pad1d_test.cpp | 2 +- kernels/test/op_replication_pad2d_test.cpp | 2 +- 
kernels/test/op_replication_pad3d_test.cpp | 2 +- kernels/test/op_roll_test.cpp | 2 +- kernels/test/op_topk_test.cpp | 2 +- kernels/test/op_trunc_test.cpp | 2 +- 42 files changed, 52 insertions(+), 51 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa_aot.cpp b/extension/llm/custom_ops/op_sdpa_aot.cpp index 3fc790af792..6db8a0ed7cb 100644 --- a/extension/llm/custom_ops/op_sdpa_aot.cpp +++ b/extension/llm/custom_ops/op_sdpa_aot.cpp @@ -33,7 +33,7 @@ Tensor& sdpa_with_kv_cache_out_no_context( // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy const optional scale, Tensor& output) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::native::sdpa_with_kv_cache_out( context, q_projected, diff --git a/extension/llm/custom_ops/op_sdpa_test.cpp b/extension/llm/custom_ops/op_sdpa_test.cpp index 43f20229174..7d7a35b4f96 100644 --- a/extension/llm/custom_ops/op_sdpa_test.cpp +++ b/extension/llm/custom_ops/op_sdpa_test.cpp @@ -28,7 +28,7 @@ exec_aten::Tensor op_scaled_dot_product_attention( bool is_causal, exec_aten::optional scale, exec_aten::Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::native::flash_attention_kernel_out( context, query, key, value, attn_mask, dropout_p, is_causal, scale, out); } diff --git a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp index 2a8124bc1e5..e53ddb97663 100644 --- a/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp +++ b/extension/llm/custom_ops/op_sdpa_with_kv_cache_test.cpp @@ -31,7 +31,7 @@ exec_aten::Tensor op_sdpa_with_kv_cache( bool is_causal, exec_aten::optional scale, exec_aten::Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::native::sdpa_with_kv_cache_out( context, query, diff --git a/kernels/aten/cpu/op__to_dim_order_copy.cpp b/kernels/aten/cpu/op__to_dim_order_copy.cpp index 63a301531d9..209276f319c 100644 --- a/kernels/aten/cpu/op__to_dim_order_copy.cpp +++ b/kernels/aten/cpu/op__to_dim_order_copy.cpp @@ -115,7 +115,7 @@ Tensor& _to_dim_order_copy_out( bool non_blocking, OptionalArrayRef dim_order, Tensor& out) { - exec_aten::RuntimeContext ctx{}; + executorch::runtime::KernelRuntimeContext ctx{}; return _to_dim_order_copy_out(ctx, self, non_blocking, dim_order, out); } diff --git a/kernels/optimized/cpu/op_gelu.cpp b/kernels/optimized/cpu/op_gelu.cpp index e65f3008484..92f98ec95fd 100644 --- a/kernels/optimized/cpu/op_gelu.cpp +++ b/kernels/optimized/cpu/op_gelu.cpp @@ -38,7 +38,7 @@ namespace { */ template void gelu( - exec_aten::RuntimeContext& context, + executorch::runtime::KernelRuntimeContext& context, const Tensor& input, string_view approximate, Tensor& output) { diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp index c7941f6098a..3a16cdc8998 100644 --- a/kernels/portable/cpu/op__to_dim_order_copy.cpp +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -118,7 +118,7 @@ Tensor& _to_dim_order_copy_out( bool non_blocking, OptionalArrayRef dim_order, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return _to_dim_order_copy_out(context, self, non_blocking, dim_order, out); } diff --git a/kernels/portable/test/op_gelu_test.cpp b/kernels/portable/test/op_gelu_test.cpp index ed6b8db4299..7bd3964aedf 
100644 --- a/kernels/portable/test/op_gelu_test.cpp +++ b/kernels/portable/test/op_gelu_test.cpp @@ -25,7 +25,7 @@ using torch::executor::testing::TensorFactory; // executorch/kernels/test/op_gelu_test.cpp instead. Tensor& op_gelu_out(const Tensor& self, string_view approximate, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::native::gelu_out(context, self, approximate, out); } diff --git a/kernels/quantized/test/op_add_test.cpp b/kernels/quantized/test/op_add_test.cpp index a48ba10c66e..573d9b1dca5 100644 --- a/kernels/quantized/test/op_add_test.cpp +++ b/kernels/quantized/test/op_add_test.cpp @@ -21,10 +21,10 @@ using namespace ::testing; using exec_aten::ArrayRef; using exec_aten::optional; -using exec_aten::RuntimeContext; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::native::add_out; using torch::executor::native::dequantize_per_tensor_out; using torch::executor::native::quantize_per_tensor_out; @@ -193,7 +193,7 @@ TEST(OpQuantizeAddTest, ConsitencyWithReferencePattern) { optional out_dtype = optional(); - RuntimeContext context{}; + KernelRuntimeContext context{}; // q -> qadd -> dq // 3.5 / 0.5 + 1 = 8 quantize_per_tensor_out( diff --git a/kernels/quantized/test/op_embedding4b_test.cpp b/kernels/quantized/test/op_embedding4b_test.cpp index 1eb7aa11b2a..4125e557d4b 100644 --- a/kernels/quantized/test/op_embedding4b_test.cpp +++ b/kernels/quantized/test/op_embedding4b_test.cpp @@ -19,9 +19,9 @@ using namespace ::testing; using exec_aten::ArrayRef; using exec_aten::optional; -using exec_aten::RuntimeContext; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::native::quantized_embedding_4bit_out; using torch::executor::testing::TensorFactory; @@ -62,7 +62,7 @@ TEST(OpQuantizedEmbedding4bTest, TestGroupWiseQuantizedEmbedding) { EXPECT_TENSOR_EQ(out, expected); out = tf.zeros({3, 4}); - auto context = RuntimeContext(); + auto context = KernelRuntimeContext(); torch::executor::native::quantized_embedding_4bit_out( context, qweight, diff --git a/kernels/quantized/test/op_embedding_test.cpp b/kernels/quantized/test/op_embedding_test.cpp index 76114561e53..cc0abe3cb94 100644 --- a/kernels/quantized/test/op_embedding_test.cpp +++ b/kernels/quantized/test/op_embedding_test.cpp @@ -21,10 +21,10 @@ using namespace ::testing; using exec_aten::ArrayRef; using exec_aten::optional; -using exec_aten::RuntimeContext; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::native::dequantize_per_tensor_out; using torch::executor::native::embedding_out; using torch::executor::native::quantize_per_tensor_out; @@ -120,7 +120,7 @@ TEST(OpQuantizedEmbeddingTest, ConsitencyWithReferencePattern) { TensorFactory tfo; Tensor qweight = tfo.zeros({3, 1}); - RuntimeContext context{}; + KernelRuntimeContext context{}; // 3.5 / 0.5 + 1 = 8 // 5.5 / 0.5 + 1 = 12 // 1 / 0.5 + 1 = 3 diff --git a/kernels/quantized/test/op_mixed_linear_test.cpp b/kernels/quantized/test/op_mixed_linear_test.cpp index df141cb1cb3..5728134d983 100644 --- a/kernels/quantized/test/op_mixed_linear_test.cpp +++ b/kernels/quantized/test/op_mixed_linear_test.cpp @@ -18,9 +18,9 @@ using namespace ::testing; using exec_aten::optional; -using exec_aten::RuntimeContext; using 
exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::native::quantized_mixed_linear_out; using torch::executor::testing::TensorFactory; @@ -57,7 +57,7 @@ void test_dtype() { /*sizes=*/{1, 2}, /*data=*/{2.3, 3.6}); - RuntimeContext ctx{}; + KernelRuntimeContext ctx{}; quantized_mixed_linear_out( ctx, input, weight, weight_scales, opt_weight_zp, opt_dtype_out, out); @@ -112,7 +112,7 @@ void test_dtype_partials() { {(1.0 * 5 + 1.5 * 3) * 0.2 + 2.0 * 1 * 1, (1.0 * 4 + 1.5 * 2) * 0.4 + 2.0 * 1 * 0.5}); - RuntimeContext ctx{}; + KernelRuntimeContext ctx{}; quantized_mixed_linear_out( ctx, input, weight, weight_scales, opt_weight_zp, opt_dtype_out, out); diff --git a/kernels/quantized/test/op_mixed_mm_test.cpp b/kernels/quantized/test/op_mixed_mm_test.cpp index 3e181ab5231..0dc71abc7ac 100644 --- a/kernels/quantized/test/op_mixed_mm_test.cpp +++ b/kernels/quantized/test/op_mixed_mm_test.cpp @@ -18,9 +18,9 @@ using namespace ::testing; using exec_aten::optional; -using exec_aten::RuntimeContext; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::native::quantized_mixed_mm_out; using torch::executor::testing::TensorFactory; @@ -55,7 +55,7 @@ void test_dtype() { /*sizes=*/{1, 2}, /*data=*/{3.8, 3.0}); - RuntimeContext ctx{}; + KernelRuntimeContext ctx{}; quantized_mixed_mm_out(ctx, input, weight, weight_scales, opt_weight_zp, out); diff --git a/kernels/test/TestUtil.h b/kernels/test/TestUtil.h index 8d782d3c2a9..aa220f5bfd5 100644 --- a/kernels/test/TestUtil.h +++ b/kernels/test/TestUtil.h @@ -116,6 +116,6 @@ class OperatorTest : public ::testing::Test { } protected: - exec_aten::RuntimeContext context_; + executorch::runtime::KernelRuntimeContext context_; bool expect_failure_; }; diff --git a/kernels/test/custom_kernel_example/op_relu.cpp b/kernels/test/custom_kernel_example/op_relu.cpp index e86d0ca4e62..e59fbf4bd72 100644 --- a/kernels/test/custom_kernel_example/op_relu.cpp +++ b/kernels/test/custom_kernel_example/op_relu.cpp @@ -18,8 +18,8 @@ namespace native { using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; -using exec_aten::RuntimeContext; using executor::Error; +using executorch::runtime::KernelRuntimeContext; namespace { @@ -61,7 +61,8 @@ void relu(const Tensor& input, Tensor& output) { * * relu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
*/ -Tensor& my_relu_out(RuntimeContext& context, const Tensor& input, Tensor& out) { +Tensor& +my_relu_out(KernelRuntimeContext& context, const Tensor& input, Tensor& out) { (void)context; resize(out, input.sizes()); ET_KERNEL_CHECK( diff --git a/kernels/test/op_atan2_test.cpp b/kernels/test/op_atan2_test.cpp index 0c5baffd414..2acdeeb9134 100644 --- a/kernels/test/op_atan2_test.cpp +++ b/kernels/test/op_atan2_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_atan2_out(const Tensor& self, const Tensor& other, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::atan2_outf(context, self, other, out); } diff --git a/kernels/test/op_cdist_forward_test.cpp b/kernels/test/op_cdist_forward_test.cpp index 04ccb6d34af..c8c18c36add 100644 --- a/kernels/test/op_cdist_forward_test.cpp +++ b/kernels/test/op_cdist_forward_test.cpp @@ -28,7 +28,7 @@ Tensor& op_cdist_forward_out( double p, optional compute_mode, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::_cdist_forward_outf( context, x1, x2, p, compute_mode, out); } diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index 0244fd55700..d9d45509084 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -260,7 +260,7 @@ class OpClampTensorOutTest : public OperatorTest { const optional& min, const optional& max, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::clamp_outf(context, self, min, max, out); } }; diff --git a/kernels/test/op_diagonal_copy_test.cpp b/kernels/test/op_diagonal_copy_test.cpp index 71f2839db77..5ad69066532 100644 --- a/kernels/test/op_diagonal_copy_test.cpp +++ b/kernels/test/op_diagonal_copy_test.cpp @@ -27,7 +27,7 @@ Tensor& op_diagonal_copy_out( int64_t dim1, int64_t dim2, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::diagonal_copy_outf( context, input, offset, dim1, dim2, out); } diff --git a/kernels/test/op_expm1_test.cpp b/kernels/test/op_expm1_test.cpp index 0538e7b6b2e..c0d3a226309 100644 --- a/kernels/test/op_expm1_test.cpp +++ b/kernels/test/op_expm1_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_expm1_out(const Tensor& a, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::expm1_outf(context, a, out); } diff --git a/kernels/test/op_flip_test.cpp b/kernels/test/op_flip_test.cpp index 36d85d8a1fd..01c79a283e9 100644 --- a/kernels/test/op_flip_test.cpp +++ b/kernels/test/op_flip_test.cpp @@ -22,7 +22,7 @@ using exec_aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_flip_out(const Tensor& input, IntArrayRef dims, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::flip_outf(context, input, dims, out); } diff --git a/kernels/test/op_ge_test.cpp b/kernels/test/op_ge_test.cpp index 5e7414a735e..21f21dfbfd2 100644 --- a/kernels/test/op_ge_test.cpp +++ b/kernels/test/op_ge_test.cpp @@ -15,10 +15,10 @@ #include using namespace ::testing; -using 
exec_aten::RuntimeContext; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpGeTensorOutTest : public OperatorTest { diff --git a/kernels/test/op_gt_test.cpp b/kernels/test/op_gt_test.cpp index ae94e2109f0..140c08ae274 100644 --- a/kernels/test/op_gt_test.cpp +++ b/kernels/test/op_gt_test.cpp @@ -15,10 +15,10 @@ #include using namespace ::testing; -using exec_aten::RuntimeContext; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpGtScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp index afbe42ab7ec..ab437327ba2 100644 --- a/kernels/test/op_le_test.cpp +++ b/kernels/test/op_le_test.cpp @@ -15,10 +15,10 @@ #include using namespace ::testing; -using exec_aten::RuntimeContext; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpLeScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_log10_test.cpp b/kernels/test/op_log10_test.cpp index 3a030d9e20b..d4e14100497 100644 --- a/kernels/test/op_log10_test.cpp +++ b/kernels/test/op_log10_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_log10_out(const Tensor& a, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::log10_outf(context, a, out); } diff --git a/kernels/test/op_log1p_test.cpp b/kernels/test/op_log1p_test.cpp index dd6dc9981d6..3d4b0f1c567 100644 --- a/kernels/test/op_log1p_test.cpp +++ b/kernels/test/op_log1p_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_log1p_out(const Tensor& a, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::log1p_outf(context, a, out); } diff --git a/kernels/test/op_log2_test.cpp b/kernels/test/op_log2_test.cpp index 17bc0a63880..cbbd8f6a985 100644 --- a/kernels/test/op_log2_test.cpp +++ b/kernels/test/op_log2_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_log2_out(const Tensor& a, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::log2_outf(context, a, out); } diff --git a/kernels/test/op_lt_test.cpp b/kernels/test/op_lt_test.cpp index 34ff9c93b68..45767bcd0ba 100644 --- a/kernels/test/op_lt_test.cpp +++ b/kernels/test/op_lt_test.cpp @@ -15,10 +15,10 @@ #include using namespace ::testing; -using exec_aten::RuntimeContext; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpLtScalarOutTest : public OperatorTest { diff --git a/kernels/test/op_maximum_test.cpp b/kernels/test/op_maximum_test.cpp index 92e6121c3fa..254725d634f 100644 --- a/kernels/test/op_maximum_test.cpp +++ b/kernels/test/op_maximum_test.cpp @@ -21,7 +21,7 @@ using exec_aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& 
op_maximum_out(const Tensor& self, const Tensor& other, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::maximum_outf(context, self, other, out); } diff --git a/kernels/test/op_native_batch_norm_test.cpp b/kernels/test/op_native_batch_norm_test.cpp index 459a5e5a557..c6810f737fd 100644 --- a/kernels/test/op_native_batch_norm_test.cpp +++ b/kernels/test/op_native_batch_norm_test.cpp @@ -61,7 +61,7 @@ class OpNativeBatchNormLegitOutTest : public OperatorTest { exec_aten::Tensor& out0, exec_aten::Tensor& out1, exec_aten::Tensor& out2) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::_native_batch_norm_legit_outf( context, input, diff --git a/kernels/test/op_native_group_norm_test.cpp b/kernels/test/op_native_group_norm_test.cpp index 6bc4785ce4d..aab4d9d76a4 100644 --- a/kernels/test/op_native_group_norm_test.cpp +++ b/kernels/test/op_native_group_norm_test.cpp @@ -32,7 +32,7 @@ ::std::tuple op_native_group_norm_out( Tensor& out0, Tensor& out1, Tensor& out2) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::native_group_norm_outf( context, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2); } diff --git a/kernels/test/op_ne_test.cpp b/kernels/test/op_ne_test.cpp index 9603dee03c2..81ec9d01fce 100644 --- a/kernels/test/op_ne_test.cpp +++ b/kernels/test/op_ne_test.cpp @@ -15,10 +15,10 @@ #include using namespace ::testing; -using exec_aten::RuntimeContext; using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; using torch::executor::testing::TensorFactory; class OpNeTest : public OperatorTest { @@ -34,7 +34,7 @@ class OpNeTest : public OperatorTest { Tensor a = tf_input.make(/*sizes=*/{2, 2}, /*data=*/{2, 3, 2, 4}); Tensor b = tf_input.make({2, 2}, {2, 2, 2, 2}); Tensor out = tf_bool.zeros({2, 2}); - RuntimeContext context{}; + KernelRuntimeContext context{}; torch::executor::aten::ne_outf(context, a, b, out); EXPECT_TENSOR_EQ(out, tf_bool.make({2, 2}, {false, true, false, true})); diff --git a/kernels/test/op_pdist_forward_test.cpp b/kernels/test/op_pdist_forward_test.cpp index a21c1eb8256..f022c9af94f 100644 --- a/kernels/test/op_pdist_forward_test.cpp +++ b/kernels/test/op_pdist_forward_test.cpp @@ -22,7 +22,7 @@ using exec_aten::Tensor; using torch::executor::testing::TensorFactory; Tensor& op_pdist_forward_out(const Tensor& input, double p, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::_pdist_forward_outf(context, input, p, out); } diff --git a/kernels/test/op_prod_test.cpp b/kernels/test/op_prod_test.cpp index 3e9f7e6af14..f96eea9564c 100644 --- a/kernels/test/op_prod_test.cpp +++ b/kernels/test/op_prod_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::TensorFactory; Tensor& op_prod_out(const Tensor& self, optional dtype, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::prod_outf(context, self, dtype, out); } @@ -33,7 +33,7 @@ Tensor& op_prod_int_out( bool keepdim, optional dtype, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::prod_outf( context, self, dim, keepdim, dtype, out); } diff 
--git a/kernels/test/op_reflection_pad1d_test.cpp b/kernels/test/op_reflection_pad1d_test.cpp index 91f9da57b42..2c357ffe384 100644 --- a/kernels/test/op_reflection_pad1d_test.cpp +++ b/kernels/test/op_reflection_pad1d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad1d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad1d_outf( context, input, padding, out); } diff --git a/kernels/test/op_reflection_pad2d_test.cpp b/kernels/test/op_reflection_pad2d_test.cpp index 295c8772ceb..6e0c7780f51 100644 --- a/kernels/test/op_reflection_pad2d_test.cpp +++ b/kernels/test/op_reflection_pad2d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad2d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad2d_outf( context, input, padding, out); } diff --git a/kernels/test/op_reflection_pad3d_test.cpp b/kernels/test/op_reflection_pad3d_test.cpp index e49ec715496..8ef8b6154df 100644 --- a/kernels/test/op_reflection_pad3d_test.cpp +++ b/kernels/test/op_reflection_pad3d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_reflection_pad3d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::reflection_pad3d_outf( context, input, padding, out); } diff --git a/kernels/test/op_replication_pad1d_test.cpp b/kernels/test/op_replication_pad1d_test.cpp index 77ecc9d5a07..942a38e27ba 100644 --- a/kernels/test/op_replication_pad1d_test.cpp +++ b/kernels/test/op_replication_pad1d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad1d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad1d_outf( context, input, padding, out); } diff --git a/kernels/test/op_replication_pad2d_test.cpp b/kernels/test/op_replication_pad2d_test.cpp index af5c7cd7264..2b9147b575f 100644 --- a/kernels/test/op_replication_pad2d_test.cpp +++ b/kernels/test/op_replication_pad2d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad2d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad2d_outf( context, input, padding, out); } diff --git a/kernels/test/op_replication_pad3d_test.cpp b/kernels/test/op_replication_pad3d_test.cpp index 8f9c35219d9..61a23ffbaba 100644 --- a/kernels/test/op_replication_pad3d_test.cpp +++ b/kernels/test/op_replication_pad3d_test.cpp @@ -25,7 +25,7 @@ Tensor& op_replication_pad3d_out( const Tensor& input, ArrayRef padding, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::replication_pad3d_outf( context, input, padding, out); } diff --git a/kernels/test/op_roll_test.cpp b/kernels/test/op_roll_test.cpp index dc7b23ca50e..16e09ec83f5 100644 --- a/kernels/test/op_roll_test.cpp +++ b/kernels/test/op_roll_test.cpp @@ -26,7 +26,7 @@ Tensor& op_roll_out( ArrayRef shifts, ArrayRef dims, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::roll_outf(context, input, shifts, 
dims, out); } diff --git a/kernels/test/op_topk_test.cpp b/kernels/test/op_topk_test.cpp index 9f57225ba4f..44a709687f0 100644 --- a/kernels/test/op_topk_test.cpp +++ b/kernels/test/op_topk_test.cpp @@ -104,7 +104,7 @@ std::tuple op_topk_values( Tensor& values, Tensor& indices) { TempMemoryAllocator allocator = TempMemoryAllocator(); - exec_aten::RuntimeContext context(nullptr, &allocator); + executorch::runtime::KernelRuntimeContext context(nullptr, &allocator); return torch::executor::aten::topk_outf( context, input, k, dim, largest, sorted, values, indices); } diff --git a/kernels/test/op_trunc_test.cpp b/kernels/test/op_trunc_test.cpp index f78fb9d37fa..d380886b29e 100644 --- a/kernels/test/op_trunc_test.cpp +++ b/kernels/test/op_trunc_test.cpp @@ -23,7 +23,7 @@ using torch::executor::testing::SupportedFeatures; using torch::executor::testing::TensorFactory; Tensor& op_trunc_out(const Tensor& a, Tensor& out) { - exec_aten::RuntimeContext context{}; + executorch::runtime::KernelRuntimeContext context{}; return torch::executor::aten::trunc_outf(context, a, out); } From f9da6758a6674911efb140e3ccc79f70e535a6d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Palma=20S=C3=A1nchez?= Date: Wed, 11 Sep 2024 16:43:11 -0700 Subject: [PATCH 332/531] Tuning LLM from PTE (#5233) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5233 * Add example of finetuning using executorch Reviewed By: JacobSzwejbka, dvorjackz Differential Revision: D61689035 fbshipit-source-id: cf73b02bb337335ba3737934b8a294cee4b010ca --- examples/llm_pte_finetuning/TARGETS | 70 ++++++++ examples/llm_pte_finetuning/model_exporter.py | 87 +++++++++ .../llm_pte_finetuning/model_loading_lib.py | 88 +++++++++ .../phi3_alpaca_code_config.yaml | 49 +++++ examples/llm_pte_finetuning/phi3_config.yaml | 40 +++++ .../llm_pte_finetuning/qwen_05b_config.yaml | 39 ++++ examples/llm_pte_finetuning/runner.py | 118 ++++++++++++ examples/llm_pte_finetuning/training_lib.py | 170 ++++++++++++++++++ extension/pybindings/pybindings.cpp | 66 +++++-- kernels/aten/functions.yaml | 6 + 10 files changed, 718 insertions(+), 15 deletions(-) create mode 100644 examples/llm_pte_finetuning/TARGETS create mode 100644 examples/llm_pte_finetuning/model_exporter.py create mode 100644 examples/llm_pte_finetuning/model_loading_lib.py create mode 100644 examples/llm_pte_finetuning/phi3_alpaca_code_config.yaml create mode 100644 examples/llm_pte_finetuning/phi3_config.yaml create mode 100644 examples/llm_pte_finetuning/qwen_05b_config.yaml create mode 100644 examples/llm_pte_finetuning/runner.py create mode 100644 examples/llm_pte_finetuning/training_lib.py diff --git a/examples/llm_pte_finetuning/TARGETS b/examples/llm_pte_finetuning/TARGETS new file mode 100644 index 00000000000..fee67914909 --- /dev/null +++ b/examples/llm_pte_finetuning/TARGETS @@ -0,0 +1,70 @@ +load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +oncall("papaya_oncall") + +python_library( + name = "model_loading_lib", + srcs = [ + "model_loading_lib.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/examples/llm_pte_finetuning:training_lib", + "fbcode//executorch/exir:lib", + "fbcode//executorch/extension/pybindings:aten_lib", # @manual For PTE loader + "fbcode//pytorch/torchtune:lib", + "fbsource//third-party/pypi/blobfile:blobfile", # @manual For tokenizer + "fbsource//third-party/pypi/omegaconf:omegaconf", + "fbsource//third-party/pypi/tiktoken:tiktoken", 
# @manual For tokenizer + ], +) + +python_library( + name = "training_lib", + srcs = [ + "training_lib.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/extension/pybindings:aten_lib", # @manual For PTE loader + "fbcode//pytorch/torchtune:lib", + "fbsource//third-party/pypi/blobfile:blobfile", # @manual For tokenizer + "fbsource//third-party/pypi/tiktoken:tiktoken", # @manual For tokenizer + "fbsource//third-party/pypi/tqdm:tqdm", + ], +) + +python_binary( + name = "runner", + srcs = [ + "runner.py", + ], + main_function = "executorch.examples.llm_pte_finetuning.runner.main", + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/examples/llm_pte_finetuning:training_lib", + "fbcode//pytorch/torchtune:lib", + "fbsource//third-party/pypi/blobfile:blobfile", # @manual For tokenizer + "fbsource//third-party/pypi/omegaconf:omegaconf", + "fbsource//third-party/pypi/tiktoken:tiktoken", # @manual For tokenizer + "fbsource//third-party/pypi/tqdm:tqdm", + ], +) + +python_binary( + name = "model_exporter", + srcs = [ + "model_exporter.py", + ], + main_function = "executorch.examples.llm_pte_finetuning.model_exporter.main", + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/examples/llm_pte_finetuning:model_loading_lib", # @manual for model loading + "fbcode//executorch/examples/llm_pte_finetuning:training_lib", # @manual for model exporting + "fbcode//pytorch/torchtune:lib", + "fbsource//third-party/pypi/blobfile:blobfile", # @manual For tokenizer + "fbsource//third-party/pypi/omegaconf:omegaconf", + "fbsource//third-party/pypi/tiktoken:tiktoken", # @manual For tokenizer + ], +) diff --git a/examples/llm_pte_finetuning/model_exporter.py b/examples/llm_pte_finetuning/model_exporter.py new file mode 100644 index 00000000000..e7f074c8769 --- /dev/null +++ b/examples/llm_pte_finetuning/model_exporter.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import argparse + +import torch +from executorch.examples.llm_pte_finetuning.model_loading_lib import ( + export_model_lora_training, + load_checkpoint, + setup_model, +) + +from executorch.examples.llm_pte_finetuning.training_lib import ( + get_dataloader, + TrainingModule, +) + +from omegaconf import OmegaConf +from torch.nn import functional as F +from torchtune import config + +from torchtune.training import MODEL_KEY + +parser = argparse.ArgumentParser( + prog="ModelExporter", + description="Export a LoRA model to ExecuTorch.", + epilog="Model exported to be used for fine-tuning.", +) + +parser.add_argument("--cfg", type=str, help="Path to the config file.") +parser.add_argument("--output_file", type=str, help="Path to the output ET model.") + + +def main() -> None: + args = parser.parse_args() + config_file = args.cfg + output_file = args.output_file + cfg = OmegaConf.load(config_file) + tokenizer = config.instantiate( + cfg.tokenizer, + ) + + loss_fn = config.instantiate(cfg.loss) + + ds = config.instantiate(cfg.dataset, tokenizer) + train_set, val_set = torch.utils.data.random_split(ds, [0.8, 0.2]) + train_dataloader = get_dataloader(cfg, train_set, tokenizer, loss_fn) + + max_seq_len = cfg.tokenizer.max_seq_len + + # Example inputs, needed for ET export. 
+ batch = next(iter(train_dataloader)) + tokens, labels = batch["tokens"], batch["labels"] + token_size = tokens.shape[1] + labels_size = labels.shape[1] + + if token_size > max_seq_len: + tokens = tokens[:, :max_seq_len] + else: + tokens = F.pad(tokens, (0, max_seq_len - token_size), value=0) + + if labels_size > max_seq_len: + labels = labels[:, :max_seq_len] + else: + labels = F.pad(labels, (0, max_seq_len - labels_size), value=0) + + # Load pre-trained checkpoint. + checkpoint_dict = load_checkpoint(cfg=cfg) + model = setup_model( + # pyre-ignore + cfg=cfg, + base_model_state_dict=checkpoint_dict[MODEL_KEY], + ) + + training_module = TrainingModule(model, loss_fn) + + # Export the model to ExecuTorch for training. + export_model_lora_training(training_module, (tokens, labels), output_file) + + +if __name__ == "__main__": + main() diff --git a/examples/llm_pte_finetuning/model_loading_lib.py b/examples/llm_pte_finetuning/model_loading_lib.py new file mode 100644 index 00000000000..3372a97e269 --- /dev/null +++ b/examples/llm_pte_finetuning/model_loading_lib.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from typing import Any, Dict, Tuple + +import torch +from executorch.examples.llm_pte_finetuning.training_lib import TrainingModule +from executorch.exir import to_edge + +from omegaconf import DictConfig +from torch.export import export, ExportedProgram +from torch.export.experimental import _export_forward_backward +from torch.nn.attention import sdpa_kernel, SDPBackend +from torchtune import config +from torchtune.modules.peft import get_adapter_params, set_trainable_params +from torchtune.training.precision import get_dtype, set_default_dtype +from torchtune.utils._device import get_device + + +def load_checkpoint(cfg: Any) -> Dict[str, Any]: # pyre-ignore[2] + """ + Extract the checkpoint state from file and validate. This includes the + base model weights. If resume_from_checkpoint is True, this also includes + the adapter weights and recipe state + """ + checkpointer = config.instantiate( + cfg.checkpointer, + resume_from_checkpoint=cfg.resume_from_checkpoint, + ) + checkpoint_dict = checkpointer.load_checkpoint() + return checkpoint_dict + + +def setup_model( + cfg: DictConfig, + base_model_state_dict: Dict[str, Any], +) -> torch.nn.Module: + device = get_device(device=cfg.device) + dtype = get_dtype(cfg.dtype, device=device) + with set_default_dtype(dtype), device: + model = config.instantiate(cfg.model) + + adapter_params = get_adapter_params(model) + set_trainable_params(model, adapter_params) + model.load_state_dict(base_model_state_dict, strict=False) + return model + + +def export_model_lora_training( + model: TrainingModule, + example_args: Tuple[Any, ...], # pyre-ignore[2] + output_file: str, +) -> None: + """ + Export model with LoRA model to executorch for training, only. + """ + + # 0. Mark the LoRA layers as trainable (requires_grad = True) in order + # to just export the backwards pass for these layers later in the + # export process. + set_trainable_params(model, get_adapter_params(model)) + + print("Exporting model with LoRA for training") + # 1. torch.export: Defines the program with the ATen operator set. 
+ + with sdpa_kernel([SDPBackend.MATH]): + exported_graph: ExportedProgram = export(model, example_args, strict=False) + print("Creating a joint forward-backwards graph for training") + joint_graph = _export_forward_backward(exported_graph) + + # 2. to_edge: Make optimizations for Edge devices. + print("Lowering to edge dialect") + edge_program = to_edge(joint_graph) + + print(edge_program._edge_programs["forward"].graph_module) + + # 3. to_executorch: Convert the graph to an ExecuTorch program. + print("Exporting to executorch") + executorch_program = edge_program.to_executorch() + print(executorch_program.exported_program().graph_signature) + print(f"Saving to {output_file}") + with open(output_file, "wb") as file: + file.write(executorch_program.buffer) diff --git a/examples/llm_pte_finetuning/phi3_alpaca_code_config.yaml b/examples/llm_pte_finetuning/phi3_alpaca_code_config.yaml new file mode 100644 index 00000000000..88e5bfac700 --- /dev/null +++ b/examples/llm_pte_finetuning/phi3_alpaca_code_config.yaml @@ -0,0 +1,49 @@ +tokenizer: + _component_: torchtune.models.phi3.phi3_mini_tokenizer + path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model + max_seq_len: 1024 + +dataset: + _component_: torchtune.datasets.instruct_dataset + template: papaya.toolkit.experimental.llm_pte_finetuning.utils.DatabricksDolly + source: iamtarun/python_code_instructions_18k_alpaca + split: train + column_map: + instruction: instruction + prompt: prompt + input: input + output: output +seed: null +shuffle: True +batch_size: 1 + +loss: + _component_: torch.nn.CrossEntropyLoss + +model: + _component_: torchtune.models.phi3.lora_phi3_mini + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Phi-3-mini-4k-instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Phi-3-mini-4k-instruct/ + model_type: PHI3_MINI + +resume_from_checkpoint: False +save_adapter_weights_only: False + +device: cpu +dtype: fp32 + +enable_activation_checkpointing: True +compile: False diff --git a/examples/llm_pte_finetuning/phi3_config.yaml b/examples/llm_pte_finetuning/phi3_config.yaml new file mode 100644 index 00000000000..7417ece79bd --- /dev/null +++ b/examples/llm_pte_finetuning/phi3_config.yaml @@ -0,0 +1,40 @@ +tokenizer: + _component_: torchtune.models.phi3.phi3_mini_tokenizer + path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model + max_seq_len: 512 + +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 1 + +loss: + _component_: torch.nn.CrossEntropyLoss + +model: + _component_: torchtune.models.phi3.lora_phi3_mini + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Phi-3-mini-4k-instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Phi-3-mini-4k-instruct/ + model_type: PHI3_MINI +resume_from_checkpoint: False +save_adapter_weights_only: False + +device: cpu +dtype: fp32 + +enable_activation_checkpointing: True +compile: False diff --git a/examples/llm_pte_finetuning/qwen_05b_config.yaml b/examples/llm_pte_finetuning/qwen_05b_config.yaml new file 
mode 100644 index 00000000000..b93517b8fda --- /dev/null +++ b/examples/llm_pte_finetuning/qwen_05b_config.yaml @@ -0,0 +1,39 @@ +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt + max_seq_len: 512 + +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 1 + +loss: + _component_: torch.nn.CrossEntropyLoss + +model: + _component_: torchtune.models.qwen2.lora_qwen2_0_5b + lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] + apply_lora_to_mlp: False + lora_rank: 32 + lora_alpha: 64 + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-0.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-0.5B-Instruct + model_type: QWEN2 +resume_from_checkpoint: False +save_adapter_weights_only: False + +device: cpu +dtype: fp32 + +enable_activation_checkpointing: True +compile: False diff --git a/examples/llm_pte_finetuning/runner.py b/examples/llm_pte_finetuning/runner.py new file mode 100644 index 00000000000..2e01fdafe8d --- /dev/null +++ b/examples/llm_pte_finetuning/runner.py @@ -0,0 +1,118 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import argparse + +import torch +from executorch.examples.llm_pte_finetuning.training_lib import ( + eval_model, + get_dataloader, + update_function, +) + +from executorch.extension.pybindings.aten_lib import ( # @manual + _load_for_executorch_from_buffer, +) +from omegaconf import OmegaConf +from torch.nn import functional as F +from torchtune import config +from tqdm import tqdm + +parser = argparse.ArgumentParser( + prog="Runner", + description="Fine tunes LoRA model using ExecuTorch.", + epilog="Model exported to be used for fine-tuning.", +) +parser.add_argument("--cfg", type=str, help="Path to the config file.") +parser.add_argument("--model_file", type=str, help="Path to the ET model file.") + + +def main() -> None: + args = parser.parse_args() + config_file = args.cfg + file = args.model_file + cfg = OmegaConf.load(config_file) + tokenizer = config.instantiate( + cfg.tokenizer, + ) + + loss_fn = config.instantiate(cfg.loss) + + ds = config.instantiate(cfg.dataset, tokenizer) + train_set, val_set = torch.utils.data.random_split(ds, [0.8, 0.2]) + train_dataloader = get_dataloader(cfg, train_set, tokenizer, loss_fn) + val_dataloader = get_dataloader(cfg, val_set, tokenizer, loss_fn) + + max_seq_len = cfg.tokenizer.max_seq_len + # Num of steps to run training. Assume 1 epoch + num_steps = 100 + with open(file, "rb") as f: + model_bytes = f.read() + et_mod = _load_for_executorch_from_buffer(model_bytes) + + # Evaluate the model before training. 
+ print("Evaluating the model before training") + eval_loss = eval_model( + model=et_mod, + dataloader=val_dataloader, + loss_fn=loss_fn, + max_seq_len=max_seq_len, + num_eval_steps=10, + ) + print("Eval loss: ", eval_loss) + + # Based on executorch/extension/training/module/training_module.cpp + # grads run from [grad_start, param_start] + # params run from [param_start, outputs_end] + grad_start = et_mod.run_method("__et_training_gradients_index_forward", [])[0] + param_start = et_mod.run_method("__et_training_parameters_index_forward", [])[0] + learning_rate = 5e-3 + f.seek(0) + losses = [] + for i, batch in tqdm(enumerate(train_dataloader), total=num_steps): + # Run for a limited number of steps. + if i >= num_steps: + break + tokens, labels = batch["tokens"], batch["labels"] + token_size = tokens.shape[1] + labels_size = labels.shape[1] + + # Fixed length for now. We need to resize as the input shapes + # should be the same passed as examples to the export function. + if token_size > max_seq_len: + tokens = tokens[:, :max_seq_len] + else: + tokens = F.pad(tokens, (0, max_seq_len - token_size), value=0) + + if labels_size > max_seq_len: + labels = labels[:, :max_seq_len] + else: + labels = F.pad(labels, (0, max_seq_len - labels_size), value=0) + + out = et_mod.forward((tokens, labels)) + + loss = out[0] + losses.append(loss.item()) + with torch.no_grad(): + for grad, param in zip(out[grad_start:param_start], out[param_start:]): + update_function(param, grad, learning_rate) + + print("Losses: ", losses) + # Evaluate the model after training. + eval_loss = eval_model( + model=et_mod, + dataloader=val_dataloader, + loss_fn=loss_fn, + max_seq_len=max_seq_len, + num_eval_steps=10, + ) + print("Eval loss: ", eval_loss) + + +if __name__ == "__main__": + main() diff --git a/examples/llm_pte_finetuning/training_lib.py b/examples/llm_pte_finetuning/training_lib.py new file mode 100644 index 00000000000..6324d93814e --- /dev/null +++ b/examples/llm_pte_finetuning/training_lib.py @@ -0,0 +1,170 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from functools import partial +from typing import Any, Dict, Mapping, Optional + +import torch +from executorch.extension.pybindings.aten_lib import ExecuTorchModule # @manual + +from torch.nn import functional as F +from torch.utils.data import DataLoader, Dataset, DistributedSampler +from torchtune.data import InstructTemplate +from torchtune.data._collate import padded_collate_sft +from tqdm import tqdm + + +class TrainingModule(torch.nn.Module): + """ + The model being trained should return the loss from forward(). This + class wraps the actual model and computes the loss for an LLM + fine-tuning task. The loss is computed as the cross entropy between + the tokens and a shifted version of the labels so we learn to predict + the next token. + """ + + def __init__( + self, model: torch.nn.Module, loss: torch.nn.modules.loss._Loss + ) -> None: + super().__init__() + self.model = model + self.loss = loss + + def forward(self, input: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: + # Output is of the shape (seq_len, vocab_size). 
+ logits = self.model(input) + logits = logits[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + logits = logits.transpose(1, 2) + return self.loss(logits, labels) + + +class DatabricksDolly(InstructTemplate): + """ + Used for the Dolly dataset from Databricks. + + https://huggingface.co/datasets/databricks/databricks-dolly-15k + """ + + template = "Instruction:\n{instruction}\n\nContext:\n{input}\n\nResponse: " + + @classmethod + def format( + cls, + sample: Mapping[str, Any], + column_map: Optional[Dict[str, str]], + ) -> str: + assert column_map is not None + instruction = sample[column_map["instruction"]] + input = sample[column_map["input"]] + return cls.template.format(instruction=instruction, input=input) + + +class PythonCodeInstructions(InstructTemplate): + """ + https://huggingface.co/datasets/iamtarun/python_code_instructions_18k_alpaca + """ + + template = ( + "{prompt}\n\n" + "Instruction:\n{instruction}" + "\n\nContext:\n{input}\n\nResponse: " + ) + + @classmethod + def format( + cls, + sample: Mapping[str, Any], + column_map: Optional[Dict[str, str]], + ) -> str: + assert column_map is not None + instruction = sample[column_map["instruction"]] + input = sample[column_map["input"]] + prompt = sample[column_map["prompt"]] + return cls.template.format(instruction=instruction, input=input, prompt=prompt) + + +def update_function( + param: torch.Tensor, + grad: torch.Tensor, + learning_rate: float, + weight_decay: float = 1.0, +) -> None: + """SGD update function.""" + grad = grad + weight_decay * param + param.sub_(learning_rate * grad) + + +def eval_model( + model: ExecuTorchModule, + dataloader: DataLoader, + loss_fn: torch.nn.modules.loss._Loss, + max_seq_len: int, + num_eval_steps: int, +) -> float: + total_loss = 0 + for i, batch in tqdm(enumerate(dataloader), total=num_eval_steps): + if i >= num_eval_steps: + break + tokens, labels = batch["tokens"], batch["labels"] + token_size = tokens.shape[1] + labels_size = labels.shape[1] + + tokens, labels = batch["tokens"], batch["labels"] + token_size = tokens.shape[1] + labels_size = labels.shape[1] + + # Fixed length for now. We need to resize as the input shapes + # should be the same passed as examples to the export function. 
+ if token_size > max_seq_len: + tokens = tokens[:, :max_seq_len] + else: + tokens = F.pad(tokens, (0, max_seq_len - token_size), value=0) + + if labels_size > max_seq_len: + labels = labels[:, :max_seq_len] + else: + labels = F.pad(labels, (0, max_seq_len - labels_size), value=0) + + out = model.forward((tokens, labels)) + loss = out[0] + total_loss += loss + return total_loss / num_eval_steps + + +def get_dataloader( + cfg: Any, # pyre-ignore[2] + ds: Dataset[Any], # pyre-ignore[2] + tokenizer: Any, # pyre-ignore[2] + loss_fn: torch.nn.modules.loss._Loss, +) -> DataLoader: + """Given a dataset, tokenizer, and loss function, return a dataloader.""" + packed = cfg.dataset.get("packed", False) + + sampler = DistributedSampler( + ds, + num_replicas=1, + rank=0, + shuffle=cfg.shuffle, + seed=0, + ) + dataloader = DataLoader( + dataset=ds, + sampler=sampler, + batch_size=cfg.batch_size, + collate_fn=( + partial( + padded_collate_sft, + padding_idx=tokenizer.pad_id, + ignore_idx=loss_fn.ignore_index, + ) + if not packed + else None + ), + ) + return dataloader diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 57bc44d1394..d674f2fe58c 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -509,7 +509,8 @@ struct PyModule final { py::list run_method( const std::string& method_name, - const py::sequence& inputs) { + const py::sequence& inputs, + bool clone_outputs = true) { const auto inputs_size = py::len(inputs); std::vector cpp_inputs; cpp_inputs.reserve(inputs_size); @@ -603,17 +604,19 @@ struct PyModule final { module_->run_method(method_name, cpp_inputs, output_storage_spans); // Retrieve outputs - return get_outputs_as_py_list(outputs); + return get_outputs_as_py_list(outputs, clone_outputs); } - py::list forward(const py::sequence& inputs) { - return run_method("forward", inputs); + py::list forward(const py::sequence& inputs, bool clone_outputs = true) { + return run_method("forward", inputs, clone_outputs); } - py::list forward_single_input(const torch::Tensor& inputTensor) { + py::list forward_single_input( + const torch::Tensor& inputTensor, + bool clone_outputs = true) { py::list py_list; py_list.append(py::cast(inputTensor)); - return run_method("forward", py_list); + return run_method("forward", py_list, clone_outputs); } bool has_etdump() { @@ -686,7 +689,9 @@ struct PyModule final { return outputs; } - py::list plan_execute(const std::string method_name) { + py::list plan_execute( + const std::string method_name, + bool clone_outputs = true) { auto& method = module_->get_method(method_name); // Need to pre-allocate space for outputs just like in run_method. 
const auto num_outputs = method.outputs_size(); @@ -703,10 +708,12 @@ struct PyModule final { "executing execution plan for method 'forward' failed with error: 0x%" PRIx32, static_cast(status)); const auto outputs = module_->get_outputs(method_name); - return get_outputs_as_py_list(outputs); + return get_outputs_as_py_list(outputs, clone_outputs); } - py::list get_outputs_as_py_list(const std::vector& outputs) { + py::list get_outputs_as_py_list( + const std::vector& outputs, + bool clone_outputs = true) { const auto outputs_size = outputs.size(); py::list list(outputs_size); for (size_t i = 0; i < outputs_size; ++i) { @@ -725,9 +732,17 @@ struct PyModule final { #ifdef USE_ATEN_LIB // Clone so the outputs in python do not share a lifetime with the // module object - list[i] = py::cast(v.toTensor().clone()); + if (clone_outputs) { + list[i] = py::cast(v.toTensor().clone()); + } else { + list[i] = py::cast(v.toTensor()); + } #else - list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone()); + if (clone_outputs) { + list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone()); + } else { + list[i] = py::cast(alias_attensor_to_etensor(v.toTensor())); + } #endif } else { ET_ASSERT_UNREACHABLE_MSG("Invalid model output type"); @@ -845,14 +860,25 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { py::arg("rtol") = 1e-5, py::arg("atol") = 1e-8, call_guard) - .def("plan_execute", &PyModule::plan_execute, call_guard) + .def( + "plan_execute", + &PyModule::plan_execute, + py::arg("method_name"), + py::arg("clone_outputs") = true, + call_guard) .def( "run_method", &PyModule::run_method, py::arg("method_name"), py::arg("inputs") = py::list(), + py::arg("clone_outputs") = true, + call_guard) + .def( + "forward", + &PyModule::forward, + py::arg("inputs") = py::list(), + py::arg("clone_outputs") = true, call_guard) - .def("forward", &PyModule::forward, call_guard) .def("has_etdump", &PyModule::has_etdump, call_guard) .def( "write_etdump_result_to_file", @@ -860,8 +886,18 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { py::arg("path"), py::arg("debug_buffer_path") = py::none(), call_guard) - .def("__call__", &PyModule::forward, call_guard) - .def("__call__", &PyModule::forward_single_input, call_guard); + .def( + "__call__", + &PyModule::forward, + py::arg("inputs") = py::list(), + py::arg("clone_outputs") = true, + call_guard) + .def( + "__call__", + &PyModule::forward_single_input, + py::arg("inputs") = py::list(), + py::arg("clone_outputs") = true, + call_guard); py::class_(m, "BundledModule"); } diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index e63863fc048..cba03b8a743 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -410,3 +410,9 @@ - op: zeros_like.out - op: zeros.out + +- op: gather.out + +- op: scatter.value_out + +- op: aten::native_dropout.out From c5c121ba9b9b10d3db6822a7cb722bd76f07f05c Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 11 Sep 2024 17:25:54 -0700 Subject: [PATCH 333/531] Move test spec file (#5218) Summary: Use the new minibench for both LLM and generic model benchmarking. 
Pull Request resolved: https://github.com/pytorch/executorch/pull/5218 Reviewed By: guangy10 Differential Revision: D62478090 Pulled By: kirklandsign fbshipit-source-id: 62a1b7a5cc0212031453e4962122db51c34e0559 --- .github/workflows/android-perf.yml | 4 ++-- .github/workflows/upload-android-test-specs.yml | 8 ++++---- build/build_android_llm_demo.sh | 3 ++- .../benchmark}/android-llm-device-farm-test-spec.yml | 6 +++--- 4 files changed, 11 insertions(+), 10 deletions(-) rename {examples/demo-apps/android/LlamaDemo => extension/android/benchmark}/android-llm-device-farm-test-spec.yml (93%) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index ba58435c69a..4045d6f99ef 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -292,8 +292,8 @@ jobs: # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer. # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only # one app+flavor that could load and run the model. - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo/app-debug-androidTest.apk + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/minibench/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/minibench/app-debug-androidTest.apk # NB: Need to set the default spec here so that it works for periodic too test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }} # Uploaded to S3 from the previous job diff --git a/.github/workflows/upload-android-test-specs.yml b/.github/workflows/upload-android-test-specs.yml index 04f7cf40d73..dd6bcca4309 100644 --- a/.github/workflows/upload-android-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -4,13 +4,13 @@ on: pull_request: paths: - .github/workflows/upload-android-test-specs.yml - - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml push: branches: - main paths: - .github/workflows/upload-android-test-specs.yml - - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -30,7 +30,7 @@ jobs: ${{ github.repository }}/${{ github.run_id }}/artifact retention-days: 1 if-no-files-found: error - path: examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + path: extension/android/benchmark/android-llm-device-farm-test-spec.yml validate-android-test-spec: needs: upload-android-test-spec-for-validation @@ -75,7 +75,7 @@ jobs: - name: Upload the spec to S3 ossci-android bucket shell: bash - working-directory: examples/demo-apps/android/LlamaDemo/ + working-directory: extension/android/benchmark/ env: SPEC_FILE: 
android-llm-device-farm-test-spec.yml run: | diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 917512d71b6..c5d0118afda 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -111,7 +111,7 @@ build_android_demo_apps() { mkdir -p extension/android/benchmark/app/libs cp ${BUILD_AAR_DIR}/executorch.aar extension/android/benchmark/app/libs pushd extension/android/benchmark - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest popd } @@ -136,6 +136,7 @@ collect_artifacts_to_be_uploaded() { MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench" mkdir -p "${MINIBENCH_APP_DIR}" cp extension/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}" } BUILD_AAR_DIR="$(mktemp -d)" diff --git a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml b/extension/android/benchmark/android-llm-device-farm-test-spec.yml similarity index 93% rename from examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml rename to extension/android/benchmark/android-llm-device-farm-test-spec.yml index dc6401806d4..4e3274ce66f 100644 --- a/examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml +++ b/extension/android/benchmark/android-llm-device-farm-test-spec.yml @@ -69,7 +69,7 @@ phases: # Run the new generic benchmark activity https://developer.android.com/tools/adb#am - echo "Run LLM benchmark" - | - adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n com.example.executorchllamademo/.LlmBenchmarkRunner \ + adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n org.pytorch.minibench/.LlmBenchmarkActivity \ --es "model_dir" "/data/local/tmp/llama" \ --es "tokenizer_path" "/data/local/tmp/llama/tokenizer.bin" @@ -82,12 +82,12 @@ phases: MAX_ATTEMPT=10 while [ -z "${BENCHMARK_RESULTS}" ] && [ $ATTEMPT -lt $MAX_ATTEMPT ]; do echo "Waiting for benchmark results..." 
- BENCHMARK_RESULTS=$(adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo cat files/benchmark_results.json) + BENCHMARK_RESULTS=$(adb -s $DEVICEFARM_DEVICE_UDID shell run-as org.pytorch.minibench cat files/benchmark_results.json) sleep 30 ((ATTEMPT++)) done - adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo ls -la files/ + adb -s $DEVICEFARM_DEVICE_UDID shell run-as org.pytorch.minibench ls -la files/ # Trying to pull the file using adb ends up with permission error, but this works too, so why not echo "${BENCHMARK_RESULTS}" > $DEVICEFARM_LOG_DIR/benchmark_results.json From a4be79fe791f5f1c9f651b936dd8b56e20bf5956 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 11 Sep 2024 18:12:14 -0700 Subject: [PATCH 334/531] Switch Apple benchmark workflow to use the generic ET benchmark iOS app (#5212) Summary: This requires shoumikhin change in https://github.com/pytorch/executorch/pull/5208, so I will rebase and test it out again after https://github.com/pytorch/executorch/pull/5208 lands ### Testing https://github.com/pytorch/executorch/actions/runs/10787058020 Pull Request resolved: https://github.com/pytorch/executorch/pull/5212 Reviewed By: shoumikhin, guangy10 Differential Revision: D62415898 Pulled By: huydhn fbshipit-source-id: 16aeb94e01519c48d75666454e4db933151dc650 --- .github/workflows/apple-perf.yml | 50 +++++++++++++----- .github/workflows/apple.yml | 4 +- build/build_apple_llm_demo.sh | 51 +++++++------------ ...fault-ios-device-farm-appium-test-spec.yml | 4 +- .../Benchmark.xcodeproj/project.pbxproj | 12 +++-- 5 files changed, 67 insertions(+), 54 deletions(-) diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index cb1b2b6a1b2..e214e33ac1c 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -204,22 +204,19 @@ jobs: if-no-files-found: ignore path: ${{ runner.temp }}/artifacts/ - build-llm-demo: - name: build-llm-demo + build-benchmark-app: + name: build-benchmark-app uses: pytorch/test-infra/.github/workflows/macos_job.yml@main needs: - set-parameters secrets: inherit - strategy: - matrix: - tokenizer: [bpe] with: runner: macos-latest-xlarge python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} upload-artifact: ios-apps - secrets-env: BUILD_CERTIFICATE_BASE64 BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD timeout: 90 script: | set -eux @@ -234,7 +231,7 @@ jobs: export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Setup Apple certificate for iOS development - BUILD_PROVISION_PROFILE_BASE64="${SECRET_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \ BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ .ci/scripts/setup-ios.sh @@ -248,11 +245,38 @@ jobs: backends/apple/mps/install_requirements.sh echo "::endgroup::" + echo "::group::Build ExecuTorch iOS frameworks" + FRAMEWORKS=( + "executorch" + "backend_coreml" + "backend_mps" + "backend_xnnpack" + "kernels_custom" + "kernels_optimized" + "kernels_portable" + "kernels_quantized" + ) + + # Build Release iOS Frameworks + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack + 
+ mkdir -p extension/apple/Benchmark/Frameworks + for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( + cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/ + ) done + echo "::endgroup::" + + # NB: Although exported models can be copied to this directory and bundled together with the + # app, we don't use this in CI and rely on AWS extra data parameter to make the model and the + # tokenizer available to the benchmark. This decouples the app and the model. We just need to + # create the directory here to pass the build + mkdir -p extension/apple/Benchmark/Models ${CONDA_RUN} --no-capture-output \ - build/build_apple_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} + build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME} - upload-ios-apps: - needs: build-llm-demo + upload-benchmark-app: + needs: build-benchmark-app runs-on: linux.2xlarge steps: - name: Download the apps from GitHub @@ -281,7 +305,7 @@ jobs: benchmark-on-device: needs: - set-parameters - - upload-ios-apps + - upload-benchmark-app - upload-models permissions: id-token: write @@ -302,7 +326,7 @@ jobs: project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 device-pool-arn: ${{ matrix.device }} # Uploaded to S3 from the previous job - ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/LLaMAPerfBenchmark.ipa - ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/LLaMAPerfBenchmark.xctestrun.zip + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/Benchmark.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/Benchmark.xctestrun.zip test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index a74fbcdaf5f..2224c2d5159 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -15,7 +15,7 @@ on: - build/build_apple_frameworks.sh - build/create_frameworks.sh - build/test_ios_ci.sh - - examples/demo-apps/apple/** + - examples/demo-apps/apple_ios/** - extension/apple/** - extension/module/** workflow_dispatch: @@ -35,7 +35,7 @@ concurrency: jobs: build-demo-ios: - name: test-demo-ios + name: build-demo-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: diff --git a/build/build_apple_llm_demo.sh b/build/build_apple_llm_demo.sh index 08652f04718..9fe1c1bcd77 100755 --- a/build/build_apple_llm_demo.sh +++ b/build/build_apple_llm_demo.sh @@ -7,50 +7,33 @@ set -euo pipefail -TOKENIZER="${1:-bpe}" -ARTIFACTS_DIR_NAME="$2" - -APP_PATH="examples/demo-apps/apple_ios/LLaMA/LLaMA" - -if [[ "${TOKENIZER}" = "bpe" ]]; then - xcodebuild build-for-testing \ - -project "${APP_PATH}.xcodeproj" \ - -scheme LLaMAPerfBenchmark \ - -destination platform="iOS" \ - -allowProvisioningUpdates \ - DEVELOPMENT_TEAM=78E7V7QP35 \ - CODE_SIGN_STYLE=Manual \ - PROVISIONING_PROFILE_SPECIFIER=iLLaMA \ - CODE_SIGN_IDENTITY="iPhone Distribution" \ - CODE_SIGNING_REQUIRED=No \ - CODE_SIGNING_ALLOWED=No \ - GCC_PREPROCESSOR_DEFINITIONS="DEBUG=1 ET_USE_TIKTOKEN=0" -else - xcodebuild build-for-testing \ - 
-project "${APP_PATH}.xcodeproj" \ - -scheme LLaMAPerfBenchmark \ - -destination platform="iOS" \ - -allowProvisioningUpdates \ - DEVELOPMENT_TEAM=78E7V7QP35 \ - CODE_SIGN_STYLE=Manual \ - PROVISIONING_PROFILE_SPECIFIER=iLLaMA \ - CODE_SIGN_IDENTITY="iPhone Distribution" \ - CODE_SIGNING_REQUIRED=No \ - CODE_SIGNING_ALLOWED=No -fi +ARTIFACTS_DIR_NAME="$1" +APP_PATH="extension/apple/Benchmark/Benchmark" + +xcodebuild build-for-testing \ + -project "${APP_PATH}.xcodeproj" \ + -scheme Benchmark \ + -destination "platform=iOS" \ + -sdk iphoneos \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER="ExecuTorch Benchmark" \ + CODE_SIGN_IDENTITY="iPhone Distribution" \ + CODE_SIGNING_REQUIRED=No \ + CODE_SIGNING_ALLOWED=No # The hack to figure out where the xctest package locates BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") # Prepare the demo app, debug mode here is the default from xcodebuild and match # with what we have in the test spec -# TODO (huydhn): See if we can switch to release mode here -MODE="Debug" +MODE="Release" PLATFORM="iphoneos" pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" rm -rf Payload && mkdir Payload -APP_NAME=LLaMAPerfBenchmark +APP_NAME=Benchmark ls -lah cp -r "${APP_NAME}.app" Payload && zip -vr "${APP_NAME}.ipa" Payload diff --git a/examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml b/examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml index 5b66e165c4e..fcb2e7a978c 100644 --- a/examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml +++ b/examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml @@ -11,8 +11,10 @@ phases: pre_test: commands: - mkdir $DEVICEFARM_TEST_PACKAGE_PATH/Debug-iphoneos + - mkdir $DEVICEFARM_TEST_PACKAGE_PATH/Release-iphoneos - unzip $DEVICEFARM_APP_PATH -d /tmp - - mv /tmp/Payload/*.app $DEVICEFARM_TEST_PACKAGE_PATH/Debug-iphoneos/ + - cp -r /tmp/Payload/*.app $DEVICEFARM_TEST_PACKAGE_PATH/Debug-iphoneos/ + - cp -r /tmp/Payload/*.app $DEVICEFARM_TEST_PACKAGE_PATH/Release-iphoneos/ # The test phase includes commands that run your test suite execution. 
test: diff --git a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index 1bc3188fe17..41b2bd16a53 100644 --- a/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -411,7 +411,8 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; PRODUCT_NAME = Benchmark; SDKROOT = auto; - SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; + SUPPORTS_MACCATALYST = NO; SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; @@ -444,7 +445,8 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.Benchmark; PRODUCT_NAME = Benchmark; SDKROOT = auto; - SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; + SUPPORTS_MACCATALYST = NO; SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; @@ -467,7 +469,8 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = auto; - SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; + SUPPORTS_MACCATALYST = NO; SWIFT_EMIT_LOC_STRINGS = NO; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; @@ -491,7 +494,8 @@ PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = auto; - SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; + SUPPORTS_MACCATALYST = NO; SWIFT_EMIT_LOC_STRINGS = NO; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; From 665fa03555379e2d4f2b8ed2bceba36edce07a86 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 11 Sep 2024 18:57:16 -0700 Subject: [PATCH 335/531] Fix android setup qnn sh (#5275) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5275 Reviewed By: shoumikhin Differential Revision: D62527528 Pulled By: kirklandsign fbshipit-source-id: ff60a76ef930528a3bb985ecb428bb6e57a547e6 --- .../android/LlamaDemo/setup-with-qnn.sh | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 68d191685d3..c3b778d9b11 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -8,7 +8,6 @@ set -eu CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" -EXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN:-OFF}" # Note: Set up ANDROID_NDK and ANDROID_ABI cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ @@ -32,26 +31,13 @@ else fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release -cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - -cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android From 81b543850390409465941fdfb014f4da84f763c5 Mon Sep 17 00:00:00 2001 From: Chester Hu Date: Wed, 11 Sep 2024 19:18:51 -0700 Subject: [PATCH 336/531] Reskin the demo app with new UI assets and colors (#5282) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/5282 Reskin the background, icons, buttons and fonts. No change to the UX flow Reviewed By: cmodi-meta, shoumikhin, kirklandsign Differential Revision: D62525651 fbshipit-source-id: 38bdf40212471fa766c277ce007c2335626bf70d --- .../executorchllamademo/LogsActivity.java | 6 + .../executorchllamademo/MainActivity.java | 6 + .../executorchllamademo/SettingsActivity.java | 6 + .../src/main/res/drawable/banner_shape.xml | 4 +- .../src/main/res/drawable/baseline_add_24.xml | 2 +- .../baseline_add_photo_alternate_24.xml | 2 +- .../main/res/drawable/baseline_article_24.xml | 3 +- .../main/res/drawable/baseline_close_24.xml | 3 +- .../drawable/baseline_delete_forever_24.xml | 2 +- .../res/drawable/baseline_restart_alt_24.xml | 2 +- .../main/res/drawable/baseline_send_24.xml | 3 +- .../res/drawable/baseline_settings_24.xml | 3 +- .../main/res/drawable/baseline_stop_24.xml | 3 +- .../src/main/res/drawable/chat_background.xml | 21 +++ .../main/res/drawable/expand_circle_down.xml | 9 ++ .../main/res/drawable/input_text_shape.xml | 7 +- .../main/res/drawable/outline_add_box_48.xml | 3 +- .../outline_arrow_drop_down_circle_24.xml | 5 - .../src/main/res/drawable/prompt_shape.xml | 2 +- .../main/res/drawable/received_message.xml | 2 +- .../app/src/main/res/layout/activity_main.xml | 40 +++--- .../src/main/res/layout/activity_settings.xml | 126 +++++++++++------- .../src/main/res/layout/received_message.xml | 34 +++-- .../app/src/main/res/layout/sent_message.xml | 17 +-- .../app/src/main/res/values/colors.xml | 4 +- 25 files changed, 197 insertions(+), 118 deletions(-) create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml delete mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java 
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java index 8700528d44a..7777b275e6e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java @@ -10,10 +10,12 @@ import android.app.AlertDialog; import android.content.DialogInterface; +import android.os.Build; import android.os.Bundle; import android.widget.ImageButton; import android.widget.ListView; import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; import androidx.core.graphics.Insets; import androidx.core.view.ViewCompat; import androidx.core.view.WindowInsetsCompat; @@ -26,6 +28,10 @@ public class LogsActivity extends AppCompatActivity { protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_logs); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } ViewCompat.setOnApplyWindowInsetsListener( requireViewById(R.id.main), (v, insets) -> { diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index ac14270ed51..f5e50845eca 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -16,6 +16,7 @@ import android.content.Intent; import android.content.pm.PackageManager; import android.net.Uri; +import android.os.Build; import android.os.Bundle; import android.os.Handler; import android.os.Looper; @@ -218,6 +219,11 @@ protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_main); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } + try { Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); } catch (ErrnoException e) { diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 0736c8cda94..773fef19dd7 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -10,6 +10,7 @@ import android.app.AlertDialog; import android.content.DialogInterface; +import android.os.Build; import android.os.Bundle; import android.text.Editable; import android.text.TextWatcher; @@ -18,6 +19,7 @@ import android.widget.ImageButton; import android.widget.TextView; import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; import androidx.core.graphics.Insets; import androidx.core.view.ViewCompat; import androidx.core.view.WindowInsetsCompat; @@ -49,6 +51,10 @@ public class SettingsActivity extends AppCompatActivity { 
protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_settings); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } ViewCompat.setOnApplyWindowInsetsListener( requireViewById(R.id.main), (v, insets) -> { diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml index 70f251ee649..0868ffffa6f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml @@ -1,7 +1,5 @@ - - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml index 9f83b8fbe79..2ae27b8409e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml index d710d27110a..7077fedd483 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml index 30d5d26b985..a6837b9c69f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml index f8ca0c64b98..fb902d4331b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml index 2c71fc6e568..4680bc6629e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml index 9285db079aa..860470ab109 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml @@ -1,4 +1,4 @@ - diff --git 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml index 3abc6cb33be..2de1f642089 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml @@ -1,5 +1,6 @@ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml index 42593b298e9..c51d84b9f4f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml @@ -3,7 +3,8 @@ android:height="24dp" android:viewportWidth="960" android:viewportHeight="960" - android:tint="#000000"> + android:tint="#FFFFFF +"> diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml index 817d57b76a8..832e2585954 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml new file mode 100644 index 00000000000..eb8b9d1f1a9 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml @@ -0,0 +1,21 @@ + + + + + + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml new file mode 100644 index 00000000000..0a7a71f0700 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml @@ -0,0 +1,9 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml index 15c404c60df..35c778a437d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml @@ -1,10 +1,7 @@ - - + - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml index c8b2c96d585..bb45d63d85b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml deleted file mode 100644 index a8c859d8b36..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml index 1627ed98c0d..5f81396e382 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml @@ -1,6 +1,6 @@ - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml index ea2d1bbfa14..c2288b5bfce 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml @@ -1,6 +1,6 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index ec215e63ba1..7b8b8d1760d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -20,38 +20,32 @@ - - + android:textSize="14sp" /> + android:paddingTop="20dp" + android:src="@drawable/baseline_article_24" /> @@ -83,7 +77,7 @@ android:id="@+id/mediaPreviewConstraintLayout" android:layout_width="match_parent" android:layout_height="wrap_content" - android:background="#edf0ee" + android:background="#16293D" android:visibility="gone"> @@ -169,7 +163,7 @@ + android:text="" + android:textColor="#ffffff" + android:textColorHint="#ffffff" + android:translationY="5dp" /> + + android:textAlignment="viewStart" + android:textColor="#FFFFFF" + android:textSize="22sp" + android:translationX="5dp" + android:translationY="5dp" /> + android:translationX="5dp" /> + android:text="no model selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleType="center" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> @@ -65,10 +74,12 @@ + android:translationX="5dp" /> + android:text="no tokenizer selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> @@ -97,10 +111,12 @@ + android:translationX="5dp" /> + android:text="no model type selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> +