diff --git a/exir/backend/test/test_backends.py b/exir/backend/test/test_backends.py index 887d9756e3b..544b97bb53c 100644 --- a/exir/backend/test/test_backends.py +++ b/exir/backend/test/test_backends.py @@ -319,16 +319,16 @@ def forward(self, x): ) buff = exec_prog.buffer - executorch_module = _load_for_executorch_from_buffer(buff) + # This line should raise an exception like # RuntimeError: failed with error 0x12 - executorch_module.run_method("forward") + _load_for_executorch_from_buffer(buff) @vary_segments def test_backend_with_compiler_out_of_range(self, extract_delegate_segments: bool): with self.assertRaisesRegex( RuntimeError, - "Failed to get method forward, error: 0x12", + "loading method forward failed with error 0x12", ): self.run_model_in_unsupported_backend( extract_delegate_segments=extract_delegate_segments diff --git a/exir/backend/test/test_backends_lifted.py b/exir/backend/test/test_backends_lifted.py index 5c91d6b47ed..b6aea7f8bb3 100644 --- a/exir/backend/test/test_backends_lifted.py +++ b/exir/backend/test/test_backends_lifted.py @@ -347,16 +347,15 @@ def forward(self, x): buff = exec_prog.buffer - executorch_module = _load_for_executorch_from_buffer(buff) # This line should raise an exception like # RuntimeError: failed with error 0x12 - executorch_module.run_method("forward") + _load_for_executorch_from_buffer(buff) @vary_segments def test_backend_with_compiler_out_of_range(self, extract_delegate_segments: bool): with self.assertRaisesRegex( RuntimeError, - "Failed to get method forward, error: 0x12", + "loading method forward failed with error 0x12", ): self.run_model_in_unsupported_backend( extract_delegate_segments=extract_delegate_segments diff --git a/exir/backend/test/test_compatibility.py b/exir/backend/test/test_compatibility.py index e5dde6a7080..bcda1d36516 100644 --- a/exir/backend/test/test_compatibility.py +++ b/exir/backend/test/test_compatibility.py @@ -62,13 +62,12 @@ def forward(self, x): ) ) - executorch_module = 
_load_for_executorch_from_buffer(buff) # Throw runtime error with error code 0x30, meaning delegate is incompatible. with self.assertRaisesRegex( RuntimeError, - "Failed to get method forward, error: 0x30", + "loading method forward failed with error 0x30", ): - executorch_module.run_method("forward") + executorch_module = _load_for_executorch_from_buffer(buff) def test_compatibility_in_runtime_edge_program_manager(self): class SinModule(torch.nn.Module): @@ -109,10 +108,9 @@ def forward(self, x): ) ) - executorch_module = _load_for_executorch_from_buffer(buff) # Throw runtime error with error code 0x30, meaning delegate is incompatible. with self.assertRaisesRegex( RuntimeError, - "Failed to get method forward, error: 0x30", + "loading method forward failed with error 0x30", ): - executorch_module.run_method("forward") + executorch_module = _load_for_executorch_from_buffer(buff) diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 2b6c82583f9..56f92356870 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -156,6 +156,239 @@ void setup_output_storage( } } +class Module final { + public: + explicit Module( + std::unique_ptr loader, + std::unique_ptr tracer = nullptr, + size_t debug_buffer_size = 0, + Program::Verification program_verification = + Program::Verification::InternalConsistency) + : loader_(std::move(loader)), + event_tracer_(std::move(tracer)), + debug_buffer_size_(debug_buffer_size) { + ::executorch::runtime::runtime_init(); + Result program = + Program::load(loader_.get(), program_verification); + THROW_IF_ERROR( + program.error(), + "loading program failed with error: 0x%" PRIx32, + static_cast(program.error())); + program_ = std::make_unique(std::move(program.get())); + + // Figure out the size of each non_const layer we need to support every + // method in the program. 
Map will be easier to use than a list because we + // don't know how many non_const arenas there will be + std::map non_const_buffer_sizes; + for (size_t i = 0; i < program_->num_methods(); ++i) { + auto name = program_->get_method_name(i).get(); + auto method_meta = program_->method_meta(name).get(); + for (size_t j = 0; j < method_meta.num_non_const_buffers(); j++) { + int64_t buffer_size = method_meta.non_const_buffer_size(j).get(); + if (non_const_buffer_sizes.find(j) == non_const_buffer_sizes.end()) { + non_const_buffer_sizes.insert({j, buffer_size}); + } else { + non_const_buffer_sizes[j] = + std::max(non_const_buffer_sizes[j], buffer_size); + } + } + } + + // Allocate the arenas. Using vector because we need to remember the size as + // well, so vector is easier than unique_ptr. + std::vector> non_const_buffers_; + for (std::map::iterator i = non_const_buffer_sizes.begin(); + i != non_const_buffer_sizes.end(); + i++) { + non_const_buffers_.push_back(std::vector(i->second)); + } + + memory_ = std::make_unique(std::move(non_const_buffers_)); + if (event_tracer_ && debug_buffer_size > 0) { + // If a debug buffer was requested for the ETDump, allocate it and make + // sure its lifetime is as long as the event_tracer. + debug_buffer_ = std::make_unique(debug_buffer_size); + event_tracer_->set_debug_buffer(get_etdump_debug_buffer()); + event_tracer_->set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); + } + + // Load methods + for (size_t i = 0; i < program_->num_methods(); ++i) { + auto name = program_->get_method_name(i).get(); + // It's safe to use the same memory manager for all modules because + // we can guarantee that only one will be executing at a time. + // Everything in this module runs on a single thread.
+ Result method = program_->load_method( + name, memory_->mem_manager(), event_tracer_.get()); + THROW_IF_ERROR( + method.error(), + "loading method %s failed with error 0x%" PRIx32, + name, + static_cast(method.error())); + methods_.insert( + {std::string(name), + std::make_unique(std::move(method.get()))}); + } + } + + Module(const Module&) = delete; + Module& operator=(const Module&) = delete; + Module(Module&&) = default; + Module& operator=(Module&&) = default; + + /// Executes the specified method on the provided inputs and returns its + /// outputs. + std::vector run_method( + const std::string& method_name, + const std::vector& args, + const std::optional>>& output_storages = + std::nullopt) { + auto& method = get_method(method_name); + executorch::aten::ArrayRef input_evalue_list( + args.data(), args.size()); + + Error set_inputs_status = method.set_inputs(input_evalue_list); + THROW_IF_ERROR( + set_inputs_status, + "method->set_inputs() for method '%s' failed with error 0x%" PRIx32, + method_name.c_str(), + static_cast(set_inputs_status)); + +#ifdef USE_ATEN_LIB + // [TLS handling] This is to workaround an assertion failure + // (https://fburl.com/code/302jyn8d) running `gelu` in ATen mode in fbcode + // (such as bento). The problem is ExecuTorch ATen mode doesn't have + // Thread Local State, but `torch-cpp` is assuming tls init is done. There + // are two more checks: MKLDNN disabled and C10_MOBILE, if any of them is + // true we won't be hitting this assertion error. However in `torch-cpp` + // lib both checks are false. Production impact: this should not make any + // impact in production environment, given that in xplat we are depending + // on a library that enables C10_MOBILE (`torch_mobile_core`). 
+ c10::impl::ExcludeDispatchKeyGuard no_autograd( + c10::autograd_dispatch_keyset); +#endif + if (output_storages) { + setup_output_storage(method, *output_storages); + } + Error execute_status = method.execute(); + THROW_IF_ERROR( + execute_status, + "method->execute() failed with error 0x%" PRIx32, + static_cast(execute_status)); + // process outputs + return get_outputs(method_name); + } + + std::vector get_outputs(const std::string& method_name) { + auto& method = get_method(method_name); + std::vector result(method.outputs_size()); + + Error get_outputs_status = + method.get_outputs(result.data(), method.outputs_size()); + THROW_IF_ERROR( + get_outputs_status, + "method->get_outputs() for method '%s' failed with error 0x%" PRIx32, + method_name.c_str(), + static_cast(get_outputs_status)); + + return result; + } + + Method& get_method(const std::string& method_name) { + if (methods_.count(method_name) == 0) { + THROW_IF_ERROR( + Error::InvalidArgument, + "no such method in program: %s", + method_name.c_str()); + } + return *methods_[method_name].get(); + } + + /// Returns the names of all methods in the program. + std::vector method_names() const { + std::vector names; + for (const auto& method : methods_) { + names.push_back(method.first); + } + return names; + } + + bool has_etdump() { + return static_cast(event_tracer_); + } + + ETDumpGen& etdump() { + return *event_tracer_; + } + + bool has_etdump_debug_buffer() const { + return static_cast(debug_buffer_); + } + + Span get_etdump_debug_buffer() { + return Span(debug_buffer_.get(), debug_buffer_size_); + } + + private: + /// A wrapper/util class for executorch memory allocations/manager.
+ class Memory { + public: + explicit Memory(std::vector>&& non_const_buffers) + : runtime_allocator_(), + non_const_buffers_(std::move(non_const_buffers)), + non_const_spans_(create_non_const_spans()), + non_const_allocator_( + {non_const_spans_.data(), non_const_spans_.size()}), + mem_manager_( + &const_allocator_, + &non_const_allocator_, + &runtime_allocator_, + &temp_allocator_) {} + + /// Returns a pointer to the internal memory manager, the Memory instance + /// must outlive this pointer. + MemoryManager* mem_manager() { + return &mem_manager_; + } + + Memory(const Memory&) = delete; + Memory& operator=(const Memory&) = delete; + + private: + MemoryAllocator const_allocator_{MemoryAllocator(0, nullptr)}; + + MallocMemoryAllocator runtime_allocator_; + + MallocMemoryAllocator temp_allocator_{}; + + std::vector> non_const_buffers_; + + std::vector> non_const_spans_; + + HierarchicalAllocator non_const_allocator_; + + MemoryManager mem_manager_; + + std::vector> create_non_const_spans() { + std::vector> result; + for (size_t i = 0; i < non_const_buffers_.size(); i++) { + result.push_back( + {non_const_buffers_[i].data(), non_const_buffers_[i].size()}); + } + return result; + } + }; + + std::unique_ptr memory_; + std::unique_ptr loader_; // program_ points to this. + std::unique_ptr program_; // methods_ entries points to this. + std::unordered_map> methods_; + std::unique_ptr event_tracer_; + std::unique_ptr debug_buffer_; + size_t debug_buffer_size_; +}; + inline std::unique_ptr load_module_from_buffer( const void* ptr, size_t ptr_len, @@ -166,10 +399,9 @@ inline std::unique_ptr load_module_from_buffer( auto loader = std::make_unique(ptr, ptr_len); return std::make_unique( std::move(loader), - nullptr, // memory_allocator - nullptr, // temp_allocator enable_etdump ? 
std::make_unique() : nullptr, - nullptr); // data_map_loader + debug_buffer_size, + program_verification); } inline std::unique_ptr load_module_from_file( @@ -190,10 +422,9 @@ inline std::unique_ptr load_module_from_file( auto loader = std::make_unique(std::move(res.get())); return std::make_unique( std::move(loader), - nullptr, // memory_allocator - nullptr, // temp_allocator enable_etdump ? std::make_unique() : nullptr, - nullptr); // data_map_loader + debug_buffer_size, + program_verification); } inline py::list get_outputs_as_py_list( @@ -647,17 +878,19 @@ struct PyModule final { } } - // Set up output storage before execution. - allocate_output_storages(method_name); - auto outputs = module_->execute(method_name, cpp_inputs); - THROW_IF_ERROR( - outputs.error(), - "Failed to execute method %s, error: 0x%" PRIx32, - method_name.c_str(), - static_cast(outputs.error())); + const auto& method = module_->get_method(method_name); + const auto num_outputs = method.outputs_size(); + output_storages_ = make_output_storages(method); + std::vector> output_storage_spans(num_outputs); + for (int i = 0; i < output_storages_.size(); ++i) { + output_storage_spans[i] = + Span(output_storages_[i].data(), output_storages_[i].size()); + } + auto outputs = + module_->run_method(method_name, cpp_inputs, output_storage_spans); // Retrieve outputs - return get_outputs_as_py_list(outputs.get(), clone_outputs); + return get_outputs_as_py_list(outputs, clone_outputs); } py::list forward(const py::sequence& inputs, bool clone_outputs = true) { @@ -673,8 +906,7 @@ struct PyModule final { } bool has_etdump() { - ETDumpGen* etdump = dynamic_cast(module_->event_tracer()); - return etdump != nullptr; + return module_->has_etdump(); } void write_etdump_result_to_file( @@ -683,20 +915,19 @@ struct PyModule final { if (!has_etdump()) { throw std::runtime_error("No etdump found"); } - ETDumpGen* etdump = dynamic_cast(module_->event_tracer()); - etdump_result result = etdump->get_etdump_data(); + 
auto& etdump = module_->etdump(); + etdump_result result = etdump.get_etdump_data(); if (result.buf != nullptr && result.size > 0) { write_data_to_file(path, result.buf, result.size); free(result.buf); - if (py::isinstance(debug_buffer_path)) { + if (module_->has_etdump_debug_buffer() && + py::isinstance(debug_buffer_path)) { // Also write out the debug buffer to a separate file if requested. std::string debug_buffer_path_str = py::cast(debug_buffer_path); - const auto debug_buffer = module_->debug_buffer(); - if (debug_buffer.size() > 0) { - write_data_to_file( - debug_buffer_path_str, debug_buffer.data(), debug_buffer.size()); - } + const auto debug_buffer = module_->get_etdump_debug_buffer(); + write_data_to_file( + debug_buffer_path_str, debug_buffer.data(), debug_buffer.size()); } } else { ET_LOG( @@ -710,33 +941,32 @@ struct PyModule final { py::list plan_execute( const std::string method_name, bool clone_outputs = true) { - auto status = module_->load_method(method_name); - - THROW_IF_ERROR( - status, - "executing execution plan for method 'load' failed with error: 0x%" PRIx32, - static_cast(status)); - auto output = module_->execute(method_name.c_str()); + auto& method = module_->get_method(method_name); + // Need to pre-allocate space for outputs just like in run_method. 
+ const auto num_outputs = method.outputs_size(); + output_storages_ = make_output_storages(method); + std::vector> output_storage_spans(num_outputs); + for (int i = 0; i < output_storages_.size(); ++i) { + output_storage_spans[i] = + Span(output_storages_[i].data(), output_storages_[i].size()); + } + setup_output_storage(method, output_storage_spans); + auto status = method.execute(); THROW_IF_ERROR( status, "executing execution plan for method 'forward' failed with error: 0x%" PRIx32, static_cast(status)); - return get_outputs_as_py_list(output.get(), clone_outputs); + const auto outputs = module_->get_outputs(method_name); + return get_outputs_as_py_list(outputs, clone_outputs); } std::unique_ptr method_meta(const std::string method_name) { - auto method_data = module_->method_meta(method_name); - return std::make_unique(module_, method_data.get()); + auto& method = module_->get_method(method_name); + return std::make_unique(module_, method.method_meta()); } std::vector method_names() { - auto result = module_->method_names(); - THROW_IF_ERROR( - result.error(), - "Failed to get method names, error: 0x%" PRIx32, - static_cast(result.error())); - const auto& method_set = result.get(); - return std::vector(method_set.begin(), method_set.end()); + return module_->method_names(); } private: @@ -745,52 +975,38 @@ struct PyModule final { // bundled programs. std::vector> output_storages_; - void allocate_output_storages(const std::string& method_name) { - auto method_result = module_->method(method_name); - THROW_IF_ERROR( - method_result.error(), - "Failed to get method %s, error: 0x%" PRIx32, - method_name.c_str(), - static_cast(method_result.error())); - - auto* method = method_result.get(); - const auto num_outputs = method->outputs_size(); + std::vector> make_output_storages(const Method& method) { + const auto num_outputs = method.outputs_size(); // Create a buffer for each output tensor. 
Memory planned outputs and non // tensor outputs get an empty buffer in this list which is ignored later. - output_storages_.clear(); - output_storages_.reserve(num_outputs); + std::vector> output_storages; + output_storages.reserve(num_outputs); - auto meta = method->method_meta(); + auto meta = method.method_meta(); for (size_t i = 0; i < num_outputs; ++i) { auto output_type = meta.output_tag(i); THROW_IF_ERROR( output_type.error(), "Failed to get output type for output %zu", i); if (output_type.get() != Tag::Tensor) { // Skip allocating storage for non-tensor outputs. - output_storages_.emplace_back(); + output_storages.emplace_back(); continue; } const auto& output_tensor_meta = - method->method_meta().output_tensor_meta(i); + method.method_meta().output_tensor_meta(i); THROW_IF_ERROR( output_tensor_meta.error(), "Failed to get output tensor meta for output %zu", i); if (output_tensor_meta.get().is_memory_planned()) { // Skip allocating storage for planned memory outputs. - output_storages_.emplace_back(); + output_storages.emplace_back(); continue; } // Allocate storage for the output tensor.
const size_t output_size = output_tensor_meta.get().nbytes(); - output_storages_.emplace_back(output_size); - } - // Set up output storage for non-empty buffers - std::vector> output_storage_spans(num_outputs); - for (size_t i = 0; i < output_storages_.size(); ++i) { - output_storage_spans[i] = - Span(output_storages_[i].data(), output_storages_[i].size()); + output_storages.emplace_back(output_size); } - setup_output_storage(*method, output_storage_spans); + return output_storages; } }; diff --git a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl index 7e14ca8713a..55a268d5d34 100644 --- a/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl @@ -17,8 +17,6 @@ PORTABLE_MODULE_DEPS = [ "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/module:bundled_module", - "//executorch/extension/module:module", - "//executorch/extension/tensor:tensor", "//executorch/runtime/executor/test:test_backend_compiler_lib", "//executorch/devtools/etdump:etdump_flatcc", ] + get_all_cpu_backend_targets() @@ -32,8 +30,6 @@ ATEN_MODULE_DEPS = [ "//executorch/extension/data_loader:mmap_data_loader", "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/module:bundled_module_aten", - "//executorch/extension/module:module_aten", - "//executorch/extension/tensor:tensor_aten", "//executorch/devtools/bundled_program:runtime_aten", "//executorch/runtime/executor/test:test_backend_compiler_lib_aten", "//executorch/devtools/etdump:etdump_flatcc",