diff --git a/backends/arm/scripts/build_executor_runner.sh b/backends/arm/scripts/build_executor_runner.sh index a05287ac4bf..104e3d02a25 100755 --- a/backends/arm/scripts/build_executor_runner.sh +++ b/backends/arm/scripts/build_executor_runner.sh @@ -44,7 +44,7 @@ help() { echo " --memory_mode= Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms." echo " Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)." echo " Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85" - echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --etdump Adds Devtools etdump support to track timing and output, etdump area will be base64 encoded in the log" echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --output= Output folder Default: /_.pte" echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" @@ -161,7 +161,7 @@ if [ "$bundleio" = true ] ; then fi if [ "$build_with_etdump" = true ] ; then - build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " + build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON -DET_DUMP_INTERMEDIATE_OUTPUTS=ON " fi echo "Building with BundleIO/etdump/extra flags: ${build_bundleio_flags} ${build_with_etdump_flags} ${extra_build_flags}" diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh index 769b2e30282..0f76d0496de 100755 --- a/backends/arm/scripts/run_fvp.sh +++ b/backends/arm/scripts/run_fvp.sh @@ -21,6 +21,7 @@ elf_file="" data_file="" target="ethos-u55-128" timeout="600" +etrecord_file="" help() { echo "Usage: $(basename $0) [options]" @@ -29,6 +30,7 @@ help() { echo " --data=@
Place a file in memory at this address, useful to emulate a PTE flashed into memory instead as part of the code." echo " --target= Target to build and run for Default: ${target}" echo " --timeout= Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}" + echo " --etrecord= If ETDump is used you can supply a ETRecord file matching the PTE" exit 0 } @@ -39,6 +41,7 @@ for arg in "$@"; do --data=*) data_file="--data ${arg#*=}";; --target=*) target="${arg#*=}";; --timeout=*) timeout="${arg#*=}";; + --etrecord=*) etrecord_file="${arg#*=}";; *) ;; esac @@ -115,15 +118,23 @@ echo "Checking for a etdump in log" ! grep "#\[RUN THIS\]" ${log_file} >/dev/null if [ $? != 0 ]; then echo "Found ETDump in log!" + devtools_extra_args="" echo "#!/bin/sh" > etdump_script.sh sed -n '/^#\[RUN THIS\]$/,/^#\[END\]$/p' ${log_file} >> etdump_script.sh # You can run etdump_script.sh if you do # $ chmod a+x etdump_script.sh # $ ./etdump_script.sh # But lets not trust the script as a bad patch would run bad code on your machine - grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 >etdump.base64 - base64 -d etdump.base64 >etdump.bin - python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles + grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >etdump.bin + ! grep ">debug_buffer.bin" etdump_script.sh >/dev/null + if [ $? != 0 ]; then + grep ">debug_buffer.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >debug_buffer.bin + devtools_extra_args="${devtools_extra_args} --debug_buffer_path debug_buffer.bin" + fi + if [[ ${etrecord_file} != "" ]]; then + devtools_extra_args="${devtools_extra_args} --etrecord_path ${etrecord_file}" + fi + python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin ${devtools_extra_args} --source_time_scale cycles --target_time_scale cycles fi echo "Checking for problems in log:" diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py index 8833b7050e7..c336d67ad51 100755 --- a/backends/arm/test/test_model.py +++ b/backends/arm/test/test_model.py @@ -184,7 +184,7 @@ def build_ethosu_runtime( "--build_type=Release", f"--system_config={system_config}", f"--memory_mode={memory_mode}", - f"--extra_build_flags=-DET_DUMP_OUTPUT=OFF {extra_flags}", + f"--extra_build_flags=-DET_LOG_DUMP_OUTPUT=OFF {extra_flags}", f"--output={elf_build_path}", ] ) diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index ff6f73398c3..4e4a8eeb409 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -6,30 +6,59 @@ cmake_minimum_required(VERSION 3.20) project(arm_executor_runner) -option(SEMIHOSTING "Enable semihosting" OFF) -option( - ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" - OFF -) option( ET_MODEL_PTE_ADDR "Place in memory that the PTE file is located/flashed, if set to OFF the PTE is built into the code as a big data area." OFF ) -option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) -option(ET_ATOL "Set atol to use for BundleIO testing" OFF) -option(ET_RTOL "Set rtol to use for BundleIO testing" OFF) -option(ET_DUMP_INPUT "Dump input in log" OFF) -option(ET_DUMP_OUTPUT "Dump output in log" ON) -option(FETCH_ETHOS_U_CONTENT - "Fetch ethos_u dependencies instead of relying on pre-downloads" ON -) + set(ET_NUM_INFERENCES "1" CACHE STRING "Number of inferences to run" ) +option(ET_LOG_DUMP_INPUT "Dump input in log" OFF) +option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON) + +option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) +set(ET_ATOL + "0.01" + CACHE STRING "Set atol to use for BundleIO testing (Requires ET_BUNDLE_IO)" +) +set(ET_RTOL + "0.01" + CACHE STRING "Set atol to use for BundleIO testing (Requires ET_BUNDLE_IO)" +) + +option( + ET_DUMP_OUTPUTS + "Collect and print outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)" + OFF +) +option( + ET_DUMP_INTERMEDIATE_OUTPUTS + "Collect and print intermediate outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)" + OFF +) +set(ET_DEBUG_BUFFER_SIZE + "2097152" + CACHE + STRING + "Size of buffer to collect intermediate outputs/outputs buffers (Requires EXECUTORCH_ENABLE_EVENT_TRACER and ET_DUMP_OUTPUTS or ET_DUMP_INTERMEDIATE_OUTPUTS)" +) + +option(SEMIHOSTING "Enable semihosting" OFF) + +option( + ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE + "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" + OFF +) + +option(FETCH_ETHOS_U_CONTENT + "Fetch ethos_u dependencies instead of relying on pre-downloads" ON +) + if(NOT DEFINED ET_MODEL_PTE_ADDR AND NOT DEFINED ET_PTE_FILE_PATH AND NOT DEFINED SEMIHOSTING @@ -322,37 +351,29 @@ if(NOT ${ET_MODEL_PTE_ADDR} AND NOT SEMIHOSTING) add_dependencies(arm_executor_runner gen_model_header) endif() -if(SEMIHOSTING) - target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) -endif() - -if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) +if(ET_MODEL_PTE_ADDR) target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE} + arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR} ) endif() -target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} -) -if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) +if(ET_NUM_INFERENCES) target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} + arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} ) endif() -if(ET_MODEL_PTE_ADDR) - target_compile_definitions( - arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR} - ) +if(ET_LOG_DUMP_INPUT) + target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_INPUT) +endif() + +if(ET_LOG_DUMP_OUTPUT) + target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_OUTPUT) endif() +# Devtool BundleIO: Use Bundle PTE with input and reference output included to +# check if it matches. + if(ET_BUNDLE_IO) target_compile_definitions(arm_executor_runner PUBLIC -DET_BUNDLE_IO) endif() @@ -365,17 +386,50 @@ if(ET_RTOL) target_compile_definitions(arm_executor_runner PUBLIC ET_RTOL=${ET_RTOL}) endif() -if(ET_DUMP_INPUT) - target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_INPUT) +# Devtools ETDump: Speed and dumping output + +if(ET_DUMP_OUTPUTS) + target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUTS) endif() -if(ET_DUMP_OUTPUT) - target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT) +if(ET_DUMP_INTERMEDIATE_OUTPUTS) + target_compile_definitions( + arm_executor_runner PUBLIC -DET_DUMP_INTERMEDIATE_OUTPUTS + ) endif() -if(ET_NUM_INFERENCES) +if(ET_DEBUG_BUFFER_SIZE) target_compile_definitions( - arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} + arm_executor_runner PUBLIC ET_DEBUG_BUFFER_SIZE=${ET_DEBUG_BUFFER_SIZE} + ) +endif() + +# Semihosting FVP (FVP Simulator can access host filesystem) + +if(SEMIHOSTING) + target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) +endif() + +# Memory buffer sizes for Executorch flow + +if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE} + ) +endif() + +target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} +) +if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} ) endif() diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index d56710e27ad..696817450b5 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -6,10 +6,10 @@ * LICENSE file in the root directory of this source tree. */ -/* This is an example executorch runner running on Arm Cortex-m and Ethos-U +/* This is an example ExecuTorch runner running on Arm Cortex-M and Ethos-U * based hardware. This example tries to illustrate a few ways to use ExecuTorch * and you can use it as is or remove the unneeded parts. Please use this code - * as inpiration. + * as inspiration. * * Some defines used to configure the code: * @@ -20,24 +20,43 @@ * that is controlled by your memory mode via the * ETHOSU_MODEL cmake parameter. * If SEMIHOSTING is define this is not used - * ET_DUMP_INPUT - Control if you want input to be dumped to the log. - * ET_DUMP_OUTPUT - Control if you want output to be dumped to the log. - * ET_BUNDLE_IO - Build in devtools BundelIO, this makes it possible to + * ET_NUM_INFERENCES - Numbers of times to run the inference + * ET_LOG_DUMP_INPUT - Control if you want input to be dumped to the log. + * ET_LOG_DUMP_OUTPUT - Control if you want output to be dumped to the log. + * + * Devtool BundleIO: Use Bundle PTE with input and reference output included to + * check if it matches. + * + * ET_BUNDLE_IO - Build in Devtools BundleIO, this makes it possible to * use bpte with bundled input and output refdata to * compare output. * See also ET_ATOL and ET_RTOL - * ET_ATOL - The atol used to compare the output and ref data when - * using ET_BUNDLE_IO - * ET_RTOL - The rtol used to compare the output and ref data when - * using ET_BUNDLE_IO - * ET_EVENT_TRACER_ENABLED - Build in devtools event trace code to generate - * ETDump and print it base64 coded of it in the logs - * so you can get it out of your embedded target. - * This can be used to benchmark where time is spent. - * If you run on Ethos-U the delegate/commandstream - * is run in one go, this means that per op - * measurements is not possible. - * Warning: CPU time meassurements is NOT possible in the FVP simulator and a + * ET_ATOL - The atol used to compare the output and ref data + * when using ET_BUNDLE_IO ET_RTOL - The rtol used to compare the + * output and ref data when using ET_BUNDLE_IO + * + * Devtools ETDump: Speed and dumping output + * + * ET_EVENT_TRACER_ENABLED - Build in Devtools ETDump event trace code + * to generate cycle data and print it base64 + * coded in the log so you can get it out of + * your embedded target. This can be used to + * benchmark where time is spent. If you run + * on Ethos-U the delegate/commandstream is + * run in one go, this means that per op + * measurements is not possible. + * ET_DUMP_OUTPUTS - Collect and print outputs as a base64 buffer + * in the log, see ExecuTorch Devtools for more + * info. (Requires ET_EVENT_TRACER_ENABLED) + * ET_DUMP_INTERMEDIATE_OUTPUTS - Collect and print intermediate outputs as a + * base64 buffer in the log, see ExecuTorch + * Devtools for more info. + * (Requires ET_EVENT_TRACER_ENABLED) + * ET_DEBUG_BUFFER_SIZE - Override the size of memory area used by + * ET_DUMP_OUTPUTS or + * ET_DUMP_INTERMEDIATE_OUTPUTS + * + * Warning: CPU time measurements is NOT possible in the FVP simulator and a * real target or FPGA must be used. NPU number are roughly OK, and can be used * as guidance if timeing adaptor values are set correctly. * @@ -54,11 +73,12 @@ * left over memory after code is linked. This needs to be big enough to fit * and run your model. In our example using the FVP simulator we have much * memory and set this quite high to be able to test larger models. - * Regarding heap/mallocs type of allocation from executorch, + * Regarding heap/mallocs type of allocation from ExecuTorch, * et_pal_allocate() is not implemented or needed. * - * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area - * used when setting up the model + * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area + * used when setting up + * the model * ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE - Size of memory area * used when running * inferences @@ -86,10 +106,21 @@ #if defined(ET_EVENT_TRACER_ENABLED) #include + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#include + +#if !defined(ET_DEBUG_BUFFER_SIZE) +#define ET_DEBUG_BUFFER_SIZE (2 * 1024 * 1024) +#endif + +#endif + #if !defined(SEMIHOSTING) #include #endif -#endif + +#endif // defined(ET_EVENT_TRACER_ENABLED) #if defined(SEMIHOSTING) @@ -158,8 +189,10 @@ using executorch::bundled_program::ErrorStats; using executorch::bundled_program::verify_method_outputs; #endif #if defined(ET_EVENT_TRACER_ENABLED) +using executorch::etdump::BufferDataSink; using executorch::etdump::ETDumpGen; using executorch::etdump::ETDumpResult; +using executorch::runtime::EventTracerDebugLogLevel; using torch::executor::etdump_result; #endif /** @@ -505,6 +538,9 @@ struct RunnerContext { Box> method; #if defined(ET_EVENT_TRACER_ENABLED) Box etdump_gen; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + void* debug_buffer; +#endif #endif #if defined(SEMIHOSTING) Box input_file_allocator; @@ -622,7 +658,60 @@ void runner_init( ET_LOG(Info, "Setting up ETDump"); ctx.etdump_gen.reset(); event_tracer_ptr = &ctx.etdump_gen.value(); -#endif + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + // Alloc debug buffer and create if and only if we need to log intermediate + // tensor outputs + ctx.debug_buffer = ctx.method_allocator->allocate(ET_DEBUG_BUFFER_SIZE, 16); + if (ctx.debug_buffer != nullptr) { + Span debug_buffer_span( + (uint8_t*)ctx.debug_buffer, ET_DEBUG_BUFFER_SIZE); + + Result result = + ctx.etdump_gen.value().set_debug_buffer(debug_buffer_span); + + if (result.ok()) { + // Everything worked, we got the buffer setup, lets enable output logging + // depending on the compile flag ET_DUMP_INTERMEDIATE_OUTPUTS e.g. + // kIntermediateOutputs or kProgramOutputs +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated intermediate output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); +#else // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); +#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + + } else { + // set_debug_buffer() failed + // Here we would free ctx.debug_buffer if it was possible, but we can't as + // the allocator don't support it. + ctx.debug_buffer = nullptr; + ET_LOG( + Error, + "ETDump: Could not set_debug_buffer() for output buffer size %zu error:0x%" PRIx32, + ET_DEBUG_BUFFER_SIZE, + result.error()); + } + } else { + // debug buffer allocation failed + ET_LOG( + Error, + "ETDump: Could not allocate memory for output buffer size %zu", + ET_DEBUG_BUFFER_SIZE); + } +#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#endif // defined(ET_EVENT_TRACER_ENABLED) ctx.method.reset( program->load_method(ctx.method_name, &memory_manager, event_tracer_ptr)); @@ -660,7 +749,7 @@ void runner_init( ET_CHECK_MSG( status == Error::Ok, "Failed to prepare inputs 0x%" PRIx32, status); } -#if defined(ET_DUMP_INPUT) +#if defined(ET_LOG_DUMP_INPUT) { std::vector inputs((*ctx.method.value())->inputs_size()); ET_LOG(Info, "%zu inputs: ", inputs.size()); @@ -712,7 +801,7 @@ void runner_init( ET_LOG(Info, "Input prepared."); } -void log_mem_status(const RunnerContext& ctx) { +void log_mem_status(RunnerContext& ctx) { size_t executor_memsize = ctx.method_allocator->used_size() - ctx.executor_membase; @@ -765,6 +854,20 @@ void log_mem_status(const RunnerContext& ctx) { if (ctx.temp_allocator->size() > 0) { ET_LOG(Info, "temp_allocator: %zu", ctx.temp_allocator->size()); } +#if defined(ET_EVENT_TRACER_ENABLED) +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + ET_LOG( + Info, + "ETDump_outputs_buffer: %zu / %zu free: %zu ( used: %zu %% ) ", + outputdump_len, + ET_DEBUG_BUFFER_SIZE, + ET_DEBUG_BUFFER_SIZE - outputdump_len, + 100 * outputdump_len / ET_DEBUG_BUFFER_SIZE); + } +#endif +#endif } void print_outputs(RunnerContext& ctx) { @@ -779,7 +882,7 @@ void print_outputs(RunnerContext& ctx) { if (outputs[i].isTensor()) { Tensor tensor = outputs[i].toTensor(); #if !defined(SEMIHOSTING) -#if defined(ET_DUMP_OUTPUT) +#if defined(ET_LOG_DUMP_OUTPUT) // The output might be collected and parsed so printf() is used instead // of ET_LOG() here for (int j = 0; j < tensor.numel(); ++j) { @@ -811,7 +914,7 @@ void print_outputs(RunnerContext& ctx) { } } #endif -#else +#else //! defined(SEMIHOSTING) char out_filename[255]; snprintf(out_filename, 255, "%s-%d.bin", ctx.output_basename, i); ET_LOG(Info, "Writing output to file: %s", out_filename); @@ -819,7 +922,7 @@ void print_outputs(RunnerContext& ctx) { auto written_size = fwrite(tensor.const_data_ptr(), 1, tensor.nbytes(), out_file); fclose(out_file); -#endif +#endif //! defined(SEMIHOSTING) } else { printf("Output[%d]: Not Tensor\n", i); } @@ -835,29 +938,96 @@ void write_etdump(RunnerContext& ctx) { if (result.buf != nullptr && result.size > 0) { // On a device with no file system we can't just write it out // to the file-system so we base64 encode it and dump it on the log. + bool dump_outputs = false; int mode = base64_enc_modifier_padding | base64_dec_modifier_skipspace; - size_t len = result.size; - size_t encoded_len = base64_encoded_size(result.size, mode); + size_t etdump_len = result.size; + size_t encoded_etdump_len = base64_encoded_size(etdump_len, mode); + size_t base64buffer_len = encoded_etdump_len; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + // Make base64 buffer fit both so it can be reused istead of allocating two + // buffers. + size_t outputdump_len = 0; + size_t encoded_outputdump_len = 0; + if (ctx.debug_buffer != nullptr) { + outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + if (outputdump_len > 0) { + encoded_outputdump_len = base64_encoded_size(outputdump_len, mode); + if (encoded_outputdump_len > 0) { + base64buffer_len = + std::max(encoded_etdump_len, encoded_outputdump_len); + dump_outputs = true; + } else { + ET_LOG( + Error, + "Problem getting the size of the base64 ETDump output buffers"); + } + } else { + ET_LOG(Error, "No ETDump output buffers saved in the data area"); + } + } +#endif + ET_LOG(Info, "[base64] buffer size: %d", base64buffer_len); + uint8_t* encoded_buf = reinterpret_cast( - ctx.method_allocator->allocate(encoded_len + 1)); + ctx.method_allocator->allocate(base64buffer_len + 1)); if (encoded_buf != nullptr) { - int ret = base64_encode( - encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode); - encoded_buf[encoded_len] = 0x00; // Ensure null termination - ET_LOG(Info, "Writing etdump.bin [base64]"); + int ret; + const char* debug_buffer_flag = ""; + printf("#[RUN THIS]\n"); +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (dump_outputs) { + ret = base64_encode( + encoded_buf, + (uint8_t*)ctx.debug_buffer, + &encoded_outputdump_len, + &outputdump_len, + mode); + encoded_buf[encoded_outputdump_len] = 0x00; // Ensure null termination + printf("# Writing debug_buffer.bin [base64]\n"); + printf("echo \"%s\" | base64 -d >debug_buffer.bin\n", encoded_buf); + debug_buffer_flag = "--debug_buffer_path debug_buffer.bin"; + } +#endif + ret = base64_encode( + encoded_buf, + (uint8_t*)result.buf, + &encoded_etdump_len, + &etdump_len, + mode); + encoded_buf[encoded_etdump_len] = 0x00; // Ensure null termination + printf("# Writing etdump.bin [base64]\n"); + printf("echo \"%s\" | base64 -d >etdump.bin\n", encoded_buf); + + printf("# Generate cpu cycle table with:\n"); printf( - "#[RUN THIS]\necho \"%s\" | base64 -d >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#[END]\n", - encoded_buf); + "python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin %s --source_time_scale cycles --target_time_scale cycles\n", + debug_buffer_flag); + printf("#[END]\n"); + } else { ET_LOG( Error, "Could not allocate memory etdump base64 encoding size %zu", - encoded_len + 1); + encoded_etdump_len + 1); } } -#else - // Dump the etdump data containing profiling/debugging data to the specified - // file. +#else // !defined(SEMIHOSTING) +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + // Dump the etdump outputs data to a file. + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + const char* etdump_output_filename = "debug_buffer.bin"; + ET_LOG( + Info, + "Writing etdump debug_buffer to file: %s", + etdump_output_filename); + FILE* f = fopen(etdump_output_filename, "w+"); + fwrite((uint8_t*)ctx.debug_buffer, 1, outputdump_len, f); + fclose(f); + } +#endif + + // Dump the etdump data containing profiling/debugging data to a file. etdump_result result = ctx.etdump_gen->get_etdump_data(); if (result.buf != nullptr && result.size > 0) { // On a device with a file system we can just write it out @@ -869,11 +1039,12 @@ void write_etdump(RunnerContext& ctx) { fclose(f); free(result.buf); } -#endif -#endif +#endif // !defined(SEMIHOSTING) +#endif // defined(ET_EVENT_TRACER_ENABLED) } -void verify_result(RunnerContext& ctx, const void* model_pte) { +bool verify_result(RunnerContext& ctx, const void* model_pte) { + bool model_ok = false; #if defined(ET_BUNDLE_IO) if (ctx.bundle_io) { // Check result @@ -899,6 +1070,7 @@ void verify_result(RunnerContext& ctx, const void* model_pte) { if (status == Error::Ok) { ET_LOG(Info, "Model output match expected BundleIO bpte ref data."); ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx); + model_ok = true; } else { ET_LOG( Error, @@ -906,19 +1078,24 @@ void verify_result(RunnerContext& ctx, const void* model_pte) { et_rtol, et_atol); ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx); + ET_LOG( + Error, "Bundle verification failed with status 0x%" PRIx32, status); + model_ok = false; } - ET_CHECK_MSG( - status == Error::Ok, - "Bundle verification failed with status 0x%" PRIx32, - status); + } else { + // No checking done, assume true + model_ok = true; } -#else +#else // defined(ET_BUNDLE_IO) (void)ctx; (void)model_pte; -#endif + // No checking done, assume true + model_ok = true; +#endif // defined(ET_BUNDLE_IO) + return model_ok; } -void run_model(RunnerContext& ctx, const void* model_pte) { +bool run_model(RunnerContext& ctx, const void* model_pte) { Error status; ET_LOG(Info, "Starting running %d inferences...", num_inferences); int n = 0; @@ -946,7 +1123,10 @@ void run_model(RunnerContext& ctx, const void* model_pte) { ET_LOG(Info, "%d inferences finished", num_inferences); print_outputs(ctx); - verify_result(ctx, model_pte); + bool model_ok = verify_result(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + + return model_ok; } } // namespace @@ -1047,10 +1227,14 @@ int main(int argc, const char* argv[]) { model_pte[7]); runner_init(ctx, input_buffers, pte_size); - run_model(ctx, model_pte); + bool model_ok = run_model(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + log_mem_status(ctx); write_etdump(ctx); + ET_CHECK_MSG(model_ok == true, "Problem running model"); + ET_LOG(Info, "Program complete, exiting."); #if defined(SEMIHOSTING) _exit(0); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 77dddfe6451..182a7308964 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -52,7 +52,7 @@ function help() { echo " --no_delegate Do not delegate the model (can't override builtin models)" echo " --no_quantize Do not quantize the model (can't override builtin models)" echo " --portable_kernels= TO BE DEPRECATED: Alias to select_ops_list." - echo " --select_ops_list= Comma separated list of portable (non delagated) kernels to include Default: ${select_ops_list}" + echo " --select_ops_list= Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}" echo " NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio." echo " See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information." echo " --target= Target to build and run for Default: ${target}" @@ -289,6 +289,12 @@ for i in "${!test_model[@]}"; do pte_file=$(realpath ${pte_file}) + if [ "${etrecord_flag}" != "" ] ; then + etrecord_filename="${output_folder}/${model_filename}_etrecord.bin" + etrecord_filename=$(realpath ${etrecord_filename}) + etrecord_flag="--etrecord=${etrecord_filename}" + fi + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "pte_data_size: $(wc -c ${pte_file})" echo "pte_file: ${pte_file}" @@ -322,7 +328,8 @@ for i in "${!test_model[@]}"; do backends/arm/scripts/build_executor_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file_or_mem}" --build_type=${build_type} --target=${target} --system_config=${system_config} --memory_mode=${memory_mode} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" --toolchain="${toolchain}" --select_ops_list="${select_ops_list}" if [ "$build_only" = false ] ; then # Execute the executor_runner on FVP Simulator - backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target + + backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target ${etrecord_flag} fi set +x fi