diff --git a/backends/arm/scripts/build_executor_runner.sh b/backends/arm/scripts/build_executor_runner.sh
index a05287ac4bf..104e3d02a25 100755
--- a/backends/arm/scripts/build_executor_runner.sh
+++ b/backends/arm/scripts/build_executor_runner.sh
@@ -44,7 +44,7 @@ help() {
     echo "  --memory_mode=<CONFIG>               Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms."
     echo "                                       Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)."
     echo "                                       Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85"
-    echo "  --etdump                             Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
+    echo "  --etdump                             Adds Devtools etdump support to track timing and output, etdump area will be base64 encoded in the log"
     echo "  --extra_build_flags=<FLAGS>          Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
     echo "  --output=<FOLDER>                    Output folder Default: <MODEL>/<MODEL>_<TARGET INFO>.pte"
     echo "  --et_build_root=<FOLDER>             Build output root folder to use, defaults to ${et_build_root}"
@@ -161,7 +161,7 @@ if [ "$bundleio" = true ] ; then
 fi
 
 if [ "$build_with_etdump" = true ] ; then
-    build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON "
+    build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON -DET_DUMP_INTERMEDIATE_OUTPUTS=ON "
 fi
 
 echo "Building with BundleIO/etdump/extra flags: ${build_bundleio_flags} ${build_with_etdump_flags} ${extra_build_flags}"
diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh
index 769b2e30282..0f76d0496de 100755
--- a/backends/arm/scripts/run_fvp.sh
+++ b/backends/arm/scripts/run_fvp.sh
@@ -21,6 +21,7 @@ elf_file=""
 data_file=""
 target="ethos-u55-128"
 timeout="600"
+etrecord_file=""
 
 help() {
     echo "Usage: $(basename $0) [options]"
@@ -29,6 +30,7 @@ help() {
     echo "  --data=<FILE>@<ADDRESS>  Place a file in memory at this address, useful to emulate a PTE flashed into memory instead as part of the code."
     echo "  --target=<TARGET>        Target to build and run for Default: ${target}"
     echo "  --timeout=<TIME_IN_SEC>  Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}"
+    echo "  --etrecord=<FILE>        If ETDump is used you can supply a ETRecord file matching the PTE"
     exit 0
 }
 
@@ -39,6 +41,7 @@ for arg in "$@"; do
       --data=*) data_file="--data ${arg#*=}";;
       --target=*) target="${arg#*=}";;
       --timeout=*) timeout="${arg#*=}";;
+      --etrecord=*) etrecord_file="${arg#*=}";;
       *)
       ;;
     esac
@@ -115,15 +118,23 @@ echo "Checking for a etdump in log"
 ! grep "#\[RUN THIS\]" ${log_file} >/dev/null
 if [ $? != 0 ]; then
     echo "Found ETDump in log!"
+    devtools_extra_args=""
     echo "#!/bin/sh" > etdump_script.sh
     sed -n '/^#\[RUN THIS\]$/,/^#\[END\]$/p' ${log_file} >> etdump_script.sh
     # You can run etdump_script.sh if you do
     # $ chmod a+x etdump_script.sh
     # $ ./etdump_script.sh
     # But lets not trust the script as a bad patch would run bad code on your machine
-    grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 >etdump.base64
-    base64 -d etdump.base64 >etdump.bin
-    python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin  --source_time_scale cycles --target_time_scale cycles
+    grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >etdump.bin
+    ! grep ">debug_buffer.bin" etdump_script.sh >/dev/null
+    if [ $? != 0 ]; then
+        grep ">debug_buffer.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >debug_buffer.bin
+        devtools_extra_args="${devtools_extra_args} --debug_buffer_path debug_buffer.bin"
+    fi
+    if [[ ${etrecord_file} != "" ]]; then
+        devtools_extra_args="${devtools_extra_args} --etrecord_path ${etrecord_file}"
+    fi
+    python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin ${devtools_extra_args} --source_time_scale cycles --target_time_scale cycles
 fi
 
 echo "Checking for problems in log:"
diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py
index 8833b7050e7..c336d67ad51 100755
--- a/backends/arm/test/test_model.py
+++ b/backends/arm/test/test_model.py
@@ -184,7 +184,7 @@ def build_ethosu_runtime(
             "--build_type=Release",
             f"--system_config={system_config}",
             f"--memory_mode={memory_mode}",
-            f"--extra_build_flags=-DET_DUMP_OUTPUT=OFF {extra_flags}",
+            f"--extra_build_flags=-DET_LOG_DUMP_OUTPUT=OFF {extra_flags}",
             f"--output={elf_build_path}",
         ]
     )
diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt
index ff6f73398c3..4e4a8eeb409 100644
--- a/examples/arm/executor_runner/CMakeLists.txt
+++ b/examples/arm/executor_runner/CMakeLists.txt
@@ -6,30 +6,59 @@
 cmake_minimum_required(VERSION 3.20)
 project(arm_executor_runner)
 
-option(SEMIHOSTING "Enable semihosting" OFF)
-option(
-  ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE
-  "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size"
-  OFF
-)
 option(
   ET_MODEL_PTE_ADDR
   "Place in memory that the PTE file is located/flashed, if set to OFF the PTE is built into the code as a big data area."
   OFF
 )
-option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF)
-option(ET_ATOL "Set atol to use for BundleIO testing" OFF)
-option(ET_RTOL "Set rtol to use for BundleIO testing" OFF)
-option(ET_DUMP_INPUT "Dump input in log" OFF)
-option(ET_DUMP_OUTPUT "Dump output in log" ON)
-option(FETCH_ETHOS_U_CONTENT
-       "Fetch ethos_u dependencies instead of relying on pre-downloads" ON
-)
+
 set(ET_NUM_INFERENCES
     "1"
     CACHE STRING "Number of inferences to run"
 )
 
+option(ET_LOG_DUMP_INPUT "Dump input in log" OFF)
+option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON)
+
+option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF)
+set(ET_ATOL
+    "0.01"
+    CACHE STRING "Set atol to use for BundleIO testing (Requires ET_BUNDLE_IO)"
+)
+set(ET_RTOL
+    "0.01"
+    CACHE STRING "Set atol to use for BundleIO testing (Requires ET_BUNDLE_IO)"
+)
+
+option(
+  ET_DUMP_OUTPUTS
+  "Collect and print outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)"
+  OFF
+)
+option(
+  ET_DUMP_INTERMEDIATE_OUTPUTS
+  "Collect and print intermediate outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)"
+  OFF
+)
+set(ET_DEBUG_BUFFER_SIZE
+    "2097152"
+    CACHE
+      STRING
+      "Size of buffer to collect intermediate outputs/outputs buffers (Requires EXECUTORCH_ENABLE_EVENT_TRACER and ET_DUMP_OUTPUTS or ET_DUMP_INTERMEDIATE_OUTPUTS)"
+)
+
+option(SEMIHOSTING "Enable semihosting" OFF)
+
+option(
+  ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE
+  "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size"
+  OFF
+)
+
+option(FETCH_ETHOS_U_CONTENT
+       "Fetch ethos_u dependencies instead of relying on pre-downloads" ON
+)
+
 if(NOT DEFINED ET_MODEL_PTE_ADDR
    AND NOT DEFINED ET_PTE_FILE_PATH
    AND NOT DEFINED SEMIHOSTING
@@ -322,37 +351,29 @@ if(NOT ${ET_MODEL_PTE_ADDR} AND NOT SEMIHOSTING)
   add_dependencies(arm_executor_runner gen_model_header)
 endif()
 
-if(SEMIHOSTING)
-  target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING)
-endif()
-
-if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE)
+if(ET_MODEL_PTE_ADDR)
   target_compile_definitions(
-    arm_executor_runner
-    PUBLIC
-      ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE}
+    arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR}
   )
 endif()
 
-target_compile_definitions(
-  arm_executor_runner
-  PUBLIC
-    ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE}
-)
-if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE)
+if(ET_NUM_INFERENCES)
   target_compile_definitions(
-    arm_executor_runner
-    PUBLIC
-      ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE}
+    arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES}
   )
 endif()
 
-if(ET_MODEL_PTE_ADDR)
-  target_compile_definitions(
-    arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR}
-  )
+if(ET_LOG_DUMP_INPUT)
+  target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_INPUT)
+endif()
+
+if(ET_LOG_DUMP_OUTPUT)
+  target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_OUTPUT)
 endif()
 
+# Devtool BundleIO: Use Bundle PTE with input and reference output included to
+# check if it matches.
+
 if(ET_BUNDLE_IO)
   target_compile_definitions(arm_executor_runner PUBLIC -DET_BUNDLE_IO)
 endif()
@@ -365,17 +386,50 @@ if(ET_RTOL)
   target_compile_definitions(arm_executor_runner PUBLIC ET_RTOL=${ET_RTOL})
 endif()
 
-if(ET_DUMP_INPUT)
-  target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_INPUT)
+# Devtools ETDump: Speed and dumping output
+
+if(ET_DUMP_OUTPUTS)
+  target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUTS)
 endif()
 
-if(ET_DUMP_OUTPUT)
-  target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT)
+if(ET_DUMP_INTERMEDIATE_OUTPUTS)
+  target_compile_definitions(
+    arm_executor_runner PUBLIC -DET_DUMP_INTERMEDIATE_OUTPUTS
+  )
 endif()
 
-if(ET_NUM_INFERENCES)
+if(ET_DEBUG_BUFFER_SIZE)
   target_compile_definitions(
-    arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES}
+    arm_executor_runner PUBLIC ET_DEBUG_BUFFER_SIZE=${ET_DEBUG_BUFFER_SIZE}
+  )
+endif()
+
+# Semihosting FVP (FVP Simulator can access host filesystem)
+
+if(SEMIHOSTING)
+  target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING)
+endif()
+
+# Memory buffer sizes for Executorch flow
+
+if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE)
+  target_compile_definitions(
+    arm_executor_runner
+    PUBLIC
+      ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE}
+  )
+endif()
+
+target_compile_definitions(
+  arm_executor_runner
+  PUBLIC
+    ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE}
+)
+if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE)
+  target_compile_definitions(
+    arm_executor_runner
+    PUBLIC
+      ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE}
   )
 endif()
 
diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp
index d56710e27ad..696817450b5 100644
--- a/examples/arm/executor_runner/arm_executor_runner.cpp
+++ b/examples/arm/executor_runner/arm_executor_runner.cpp
@@ -6,10 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-/* This is an example executorch runner running on Arm Cortex-m and Ethos-U
+/* This is an example ExecuTorch runner running on Arm Cortex-M and Ethos-U
  * based hardware. This example tries to illustrate a few ways to use ExecuTorch
  * and you can use it as is or remove the unneeded parts. Please use this code
- * as inpiration.
+ * as inspiration.
  *
  * Some defines used to configure the code:
  *
@@ -20,24 +20,43 @@
  *                      that is controlled by your memory mode via the
  *                      ETHOSU_MODEL cmake parameter.
  *                      If SEMIHOSTING is define this is not used
- * ET_DUMP_INPUT      - Control if you want input to be dumped to the log.
- * ET_DUMP_OUTPUT     - Control if you want output to be dumped to the log.
- * ET_BUNDLE_IO       - Build in devtools BundelIO, this makes it possible to
+ * ET_NUM_INFERENCES  - Numbers of times to run the inference
+ * ET_LOG_DUMP_INPUT  - Control if you want input to be dumped to the log.
+ * ET_LOG_DUMP_OUTPUT     - Control if you want output to be dumped to the log.
+ *
+ * Devtool BundleIO: Use Bundle PTE with input and reference output included to
+ * check if it matches.
+ *
+ * ET_BUNDLE_IO       - Build in Devtools BundleIO, this makes it possible to
  *                      use bpte with bundled input and output refdata to
  *                      compare output.
  *                      See also ET_ATOL and ET_RTOL
- * ET_ATOL            - The atol used to compare the output and ref data when
- *                      using ET_BUNDLE_IO
- * ET_RTOL            - The rtol used to compare the output and ref data when
- *                      using ET_BUNDLE_IO
- * ET_EVENT_TRACER_ENABLED - Build in devtools event trace code to generate
- *                           ETDump and print it base64 coded of it in the logs
- *                           so you can get it out of your embedded target.
- *                           This can be used to benchmark where time is spent.
- *                           If you run on Ethos-U the delegate/commandstream
- *                           is run in one go, this means that per op
- *                           measurements is not possible.
- * Warning: CPU time meassurements is NOT possible in the FVP simulator and a
+ *   ET_ATOL              - The atol used to compare the output and ref data
+ * when using ET_BUNDLE_IO ET_RTOL              - The rtol used to compare the
+ * output and ref data when using ET_BUNDLE_IO
+ *
+ * Devtools ETDump: Speed and dumping output
+ *
+ * ET_EVENT_TRACER_ENABLED       - Build in Devtools ETDump event trace code
+ *                                 to generate cycle data and print it base64
+ *                                 coded in the log so you can get it out of
+ *                                 your embedded target. This can be used to
+ *                                 benchmark where time is spent. If you run
+ *                                 on Ethos-U the delegate/commandstream is
+ *                                 run in one go, this means that per op
+ *                                 measurements is not possible.
+ *  ET_DUMP_OUTPUTS              - Collect and print outputs as a base64 buffer
+ *                                 in the log, see ExecuTorch Devtools for more
+ *                                 info. (Requires ET_EVENT_TRACER_ENABLED)
+ *  ET_DUMP_INTERMEDIATE_OUTPUTS - Collect and print intermediate outputs as a
+ *                                 base64 buffer in the log, see ExecuTorch
+ *                                 Devtools for more info.
+ *                                 (Requires ET_EVENT_TRACER_ENABLED)
+ *  ET_DEBUG_BUFFER_SIZE         - Override the size of memory area used by
+ *                                 ET_DUMP_OUTPUTS or
+ * ET_DUMP_INTERMEDIATE_OUTPUTS
+ *
+ * Warning: CPU time measurements is NOT possible in the FVP simulator and a
  * real target or FPGA must be used. NPU number are roughly OK, and can be used
  * as guidance if timeing adaptor values are set correctly.
  *
@@ -54,11 +73,12 @@
  *    left over memory after code is linked. This needs to be big enough to fit
  *    and run your model. In our example using the FVP simulator we have much
  *    memory and set this quite high to be able to test larger models.
- *    Regarding heap/mallocs type of allocation from executorch,
+ *    Regarding heap/mallocs type of allocation from ExecuTorch,
  *    et_pal_allocate() is not implemented or needed.
  *
- * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area
- *                                               used when setting up the model
+ * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE            - Size of memory area
+ *                                                          used when setting up
+ *                                                          the model
  * ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE - Size of memory area
  *                                                          used when running
  *                                                          inferences
@@ -86,10 +106,21 @@
 
 #if defined(ET_EVENT_TRACER_ENABLED)
 #include <executorch/devtools/etdump/etdump_flatcc.h>
+
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+#include <executorch/devtools/etdump/data_sinks/buffer_data_sink.h>
+
+#if !defined(ET_DEBUG_BUFFER_SIZE)
+#define ET_DEBUG_BUFFER_SIZE (2 * 1024 * 1024)
+#endif
+
+#endif
+
 #if !defined(SEMIHOSTING)
 #include <executorch/third-party/flatcc/include/flatcc/portable/pbase64.h>
 #endif
-#endif
+
+#endif // defined(ET_EVENT_TRACER_ENABLED)
 
 #if defined(SEMIHOSTING)
 
@@ -158,8 +189,10 @@ using executorch::bundled_program::ErrorStats;
 using executorch::bundled_program::verify_method_outputs;
 #endif
 #if defined(ET_EVENT_TRACER_ENABLED)
+using executorch::etdump::BufferDataSink;
 using executorch::etdump::ETDumpGen;
 using executorch::etdump::ETDumpResult;
+using executorch::runtime::EventTracerDebugLogLevel;
 using torch::executor::etdump_result;
 #endif
 /**
@@ -505,6 +538,9 @@ struct RunnerContext {
   Box<Result<Method>> method;
 #if defined(ET_EVENT_TRACER_ENABLED)
   Box<ETDumpGen> etdump_gen;
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+  void* debug_buffer;
+#endif
 #endif
 #if defined(SEMIHOSTING)
   Box<ArmMemoryAllocator> input_file_allocator;
@@ -622,7 +658,60 @@ void runner_init(
   ET_LOG(Info, "Setting up ETDump");
   ctx.etdump_gen.reset();
   event_tracer_ptr = &ctx.etdump_gen.value();
-#endif
+
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+  // Alloc debug buffer and create if and only if we need to log intermediate
+  // tensor outputs
+  ctx.debug_buffer = ctx.method_allocator->allocate(ET_DEBUG_BUFFER_SIZE, 16);
+  if (ctx.debug_buffer != nullptr) {
+    Span<uint8_t> debug_buffer_span(
+        (uint8_t*)ctx.debug_buffer, ET_DEBUG_BUFFER_SIZE);
+
+    Result<bool> result =
+        ctx.etdump_gen.value().set_debug_buffer(debug_buffer_span);
+
+    if (result.ok()) {
+      // Everything worked, we got the buffer setup, lets enable output logging
+      // depending on the compile flag ET_DUMP_INTERMEDIATE_OUTPUTS e.g.
+      // kIntermediateOutputs or kProgramOutputs
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS)
+      ET_LOG(
+          Info,
+          "ETDump: Allocated intermediate output buffer size: %d at 0x%p",
+          ET_DEBUG_BUFFER_SIZE,
+          ctx.debug_buffer);
+      ctx.etdump_gen.value().set_event_tracer_debug_level(
+          EventTracerDebugLogLevel::kIntermediateOutputs);
+#else // defined(ET_DUMP_INTERMEDIATE_OUTPUTS)
+      ET_LOG(
+          Info,
+          "ETDump: Allocated output buffer size: %d at 0x%p",
+          ET_DEBUG_BUFFER_SIZE,
+          ctx.debug_buffer);
+      ctx.etdump_gen.value().set_event_tracer_debug_level(
+          EventTracerDebugLogLevel::kProgramOutputs);
+#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS)
+
+    } else {
+      // set_debug_buffer() failed
+      // Here we would free ctx.debug_buffer if it was possible, but we can't as
+      // the allocator don't support it.
+      ctx.debug_buffer = nullptr;
+      ET_LOG(
+          Error,
+          "ETDump: Could not set_debug_buffer() for output buffer size %zu error:0x%" PRIx32,
+          ET_DEBUG_BUFFER_SIZE,
+          result.error());
+    }
+  } else {
+    // debug buffer allocation failed
+    ET_LOG(
+        Error,
+        "ETDump: Could not allocate memory for output buffer size %zu",
+        ET_DEBUG_BUFFER_SIZE);
+  }
+#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+#endif // defined(ET_EVENT_TRACER_ENABLED)
 
   ctx.method.reset(
       program->load_method(ctx.method_name, &memory_manager, event_tracer_ptr));
@@ -660,7 +749,7 @@ void runner_init(
     ET_CHECK_MSG(
         status == Error::Ok, "Failed to prepare inputs 0x%" PRIx32, status);
   }
-#if defined(ET_DUMP_INPUT)
+#if defined(ET_LOG_DUMP_INPUT)
   {
     std::vector<EValue> inputs((*ctx.method.value())->inputs_size());
     ET_LOG(Info, "%zu inputs: ", inputs.size());
@@ -712,7 +801,7 @@ void runner_init(
   ET_LOG(Info, "Input prepared.");
 }
 
-void log_mem_status(const RunnerContext& ctx) {
+void log_mem_status(RunnerContext& ctx) {
   size_t executor_memsize =
       ctx.method_allocator->used_size() - ctx.executor_membase;
 
@@ -765,6 +854,20 @@ void log_mem_status(const RunnerContext& ctx) {
   if (ctx.temp_allocator->size() > 0) {
     ET_LOG(Info, "temp_allocator:            %zu", ctx.temp_allocator->size());
   }
+#if defined(ET_EVENT_TRACER_ENABLED)
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+  if (ctx.debug_buffer != nullptr) {
+    size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes();
+    ET_LOG(
+        Info,
+        "ETDump_outputs_buffer:     %zu / %zu free: %zu ( used: %zu %% ) ",
+        outputdump_len,
+        ET_DEBUG_BUFFER_SIZE,
+        ET_DEBUG_BUFFER_SIZE - outputdump_len,
+        100 * outputdump_len / ET_DEBUG_BUFFER_SIZE);
+  }
+#endif
+#endif
 }
 
 void print_outputs(RunnerContext& ctx) {
@@ -779,7 +882,7 @@ void print_outputs(RunnerContext& ctx) {
     if (outputs[i].isTensor()) {
       Tensor tensor = outputs[i].toTensor();
 #if !defined(SEMIHOSTING)
-#if defined(ET_DUMP_OUTPUT)
+#if defined(ET_LOG_DUMP_OUTPUT)
       // The output might be collected and parsed so printf() is used instead
       // of ET_LOG() here
       for (int j = 0; j < tensor.numel(); ++j) {
@@ -811,7 +914,7 @@ void print_outputs(RunnerContext& ctx) {
         }
       }
 #endif
-#else
+#else //! defined(SEMIHOSTING)
       char out_filename[255];
       snprintf(out_filename, 255, "%s-%d.bin", ctx.output_basename, i);
       ET_LOG(Info, "Writing output to file: %s", out_filename);
@@ -819,7 +922,7 @@ void print_outputs(RunnerContext& ctx) {
       auto written_size =
           fwrite(tensor.const_data_ptr<char>(), 1, tensor.nbytes(), out_file);
       fclose(out_file);
-#endif
+#endif //! defined(SEMIHOSTING)
     } else {
       printf("Output[%d]: Not Tensor\n", i);
     }
@@ -835,29 +938,96 @@ void write_etdump(RunnerContext& ctx) {
   if (result.buf != nullptr && result.size > 0) {
     // On a device with no file system we can't just write it out
     // to the file-system so we base64 encode it and dump it on the log.
+    bool dump_outputs = false;
     int mode = base64_enc_modifier_padding | base64_dec_modifier_skipspace;
-    size_t len = result.size;
-    size_t encoded_len = base64_encoded_size(result.size, mode);
+    size_t etdump_len = result.size;
+    size_t encoded_etdump_len = base64_encoded_size(etdump_len, mode);
+    size_t base64buffer_len = encoded_etdump_len;
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+    // Make base64 buffer fit both so it can be reused istead of allocating two
+    // buffers.
+    size_t outputdump_len = 0;
+    size_t encoded_outputdump_len = 0;
+    if (ctx.debug_buffer != nullptr) {
+      outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes();
+      if (outputdump_len > 0) {
+        encoded_outputdump_len = base64_encoded_size(outputdump_len, mode);
+        if (encoded_outputdump_len > 0) {
+          base64buffer_len =
+              std::max(encoded_etdump_len, encoded_outputdump_len);
+          dump_outputs = true;
+        } else {
+          ET_LOG(
+              Error,
+              "Problem getting the size of the base64 ETDump output buffers");
+        }
+      } else {
+        ET_LOG(Error, "No ETDump output buffers saved in the data area");
+      }
+    }
+#endif
+    ET_LOG(Info, "[base64] buffer size: %d", base64buffer_len);
+
     uint8_t* encoded_buf = reinterpret_cast<uint8_t*>(
-        ctx.method_allocator->allocate(encoded_len + 1));
+        ctx.method_allocator->allocate(base64buffer_len + 1));
     if (encoded_buf != nullptr) {
-      int ret = base64_encode(
-          encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode);
-      encoded_buf[encoded_len] = 0x00; // Ensure null termination
-      ET_LOG(Info, "Writing etdump.bin [base64]");
+      int ret;
+      const char* debug_buffer_flag = "";
+      printf("#[RUN THIS]\n");
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+      if (dump_outputs) {
+        ret = base64_encode(
+            encoded_buf,
+            (uint8_t*)ctx.debug_buffer,
+            &encoded_outputdump_len,
+            &outputdump_len,
+            mode);
+        encoded_buf[encoded_outputdump_len] = 0x00; // Ensure null termination
+        printf("# Writing debug_buffer.bin [base64]\n");
+        printf("echo \"%s\" | base64 -d >debug_buffer.bin\n", encoded_buf);
+        debug_buffer_flag = "--debug_buffer_path debug_buffer.bin";
+      }
+#endif
+      ret = base64_encode(
+          encoded_buf,
+          (uint8_t*)result.buf,
+          &encoded_etdump_len,
+          &etdump_len,
+          mode);
+      encoded_buf[encoded_etdump_len] = 0x00; // Ensure null termination
+      printf("# Writing etdump.bin [base64]\n");
+      printf("echo \"%s\" | base64 -d >etdump.bin\n", encoded_buf);
+
+      printf("# Generate cpu cycle table with:\n");
       printf(
-          "#[RUN THIS]\necho \"%s\" | base64 -d >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin  --source_time_scale cycles --target_time_scale cycles\n#[END]\n",
-          encoded_buf);
+          "python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin %s --source_time_scale cycles --target_time_scale cycles\n",
+          debug_buffer_flag);
+      printf("#[END]\n");
+
     } else {
       ET_LOG(
           Error,
           "Could not allocate memory etdump base64 encoding size %zu",
-          encoded_len + 1);
+          encoded_etdump_len + 1);
     }
   }
-#else
-  // Dump the etdump data containing profiling/debugging data to the specified
-  // file.
+#else // !defined(SEMIHOSTING)
+#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS)
+  if (ctx.debug_buffer != nullptr) {
+    // Dump the etdump outputs data to a file.
+    size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes();
+    const char* etdump_output_filename = "debug_buffer.bin";
+    ET_LOG(
+        Info,
+        "Writing etdump debug_buffer to file: %s",
+        etdump_output_filename);
+    FILE* f = fopen(etdump_output_filename, "w+");
+    fwrite((uint8_t*)ctx.debug_buffer, 1, outputdump_len, f);
+    fclose(f);
+  }
+#endif
+
+  // Dump the etdump data containing profiling/debugging data to a file.
   etdump_result result = ctx.etdump_gen->get_etdump_data();
   if (result.buf != nullptr && result.size > 0) {
     // On a device with a file system we can just write it out
@@ -869,11 +1039,12 @@ void write_etdump(RunnerContext& ctx) {
     fclose(f);
     free(result.buf);
   }
-#endif
-#endif
+#endif // !defined(SEMIHOSTING)
+#endif // defined(ET_EVENT_TRACER_ENABLED)
 }
 
-void verify_result(RunnerContext& ctx, const void* model_pte) {
+bool verify_result(RunnerContext& ctx, const void* model_pte) {
+  bool model_ok = false;
 #if defined(ET_BUNDLE_IO)
   if (ctx.bundle_io) {
     // Check result
@@ -899,6 +1070,7 @@ void verify_result(RunnerContext& ctx, const void* model_pte) {
     if (status == Error::Ok) {
       ET_LOG(Info, "Model output match expected BundleIO bpte ref data.");
       ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx);
+      model_ok = true;
     } else {
       ET_LOG(
           Error,
@@ -906,19 +1078,24 @@ void verify_result(RunnerContext& ctx, const void* model_pte) {
           et_rtol,
           et_atol);
       ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx);
+      ET_LOG(
+          Error, "Bundle verification failed with status 0x%" PRIx32, status);
+      model_ok = false;
     }
-    ET_CHECK_MSG(
-        status == Error::Ok,
-        "Bundle verification failed with status 0x%" PRIx32,
-        status);
+  } else {
+    // No checking done, assume true
+    model_ok = true;
   }
-#else
+#else // defined(ET_BUNDLE_IO)
   (void)ctx;
   (void)model_pte;
-#endif
+  // No checking done, assume true
+  model_ok = true;
+#endif // defined(ET_BUNDLE_IO)
+  return model_ok;
 }
 
-void run_model(RunnerContext& ctx, const void* model_pte) {
+bool run_model(RunnerContext& ctx, const void* model_pte) {
   Error status;
   ET_LOG(Info, "Starting running %d inferences...", num_inferences);
   int n = 0;
@@ -946,7 +1123,10 @@ void run_model(RunnerContext& ctx, const void* model_pte) {
 
   ET_LOG(Info, "%d inferences finished", num_inferences);
   print_outputs(ctx);
-  verify_result(ctx, model_pte);
+  bool model_ok = verify_result(ctx, model_pte);
+  ET_LOG(Info, "Model run: %d", model_ok);
+
+  return model_ok;
 }
 
 } // namespace
@@ -1047,10 +1227,14 @@ int main(int argc, const char* argv[]) {
       model_pte[7]);
 
   runner_init(ctx, input_buffers, pte_size);
-  run_model(ctx, model_pte);
+  bool model_ok = run_model(ctx, model_pte);
+  ET_LOG(Info, "Model run: %d", model_ok);
+
   log_mem_status(ctx);
   write_etdump(ctx);
 
+  ET_CHECK_MSG(model_ok == true, "Problem running model");
+
   ET_LOG(Info, "Program complete, exiting.");
 #if defined(SEMIHOSTING)
   _exit(0);
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 77dddfe6451..182a7308964 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -52,7 +52,7 @@ function help() {
     echo "  --no_delegate                          Do not delegate the model (can't override builtin models)"
     echo "  --no_quantize                          Do not quantize the model (can't override builtin models)"
     echo "  --portable_kernels=<OPS>               TO BE DEPRECATED: Alias to select_ops_list."
-    echo "  --select_ops_list=<OPS>                Comma separated list of portable (non delagated) kernels to include Default: ${select_ops_list}"
+    echo "  --select_ops_list=<OPS>                Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}"
     echo "                                           NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio."
     echo "                                           See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information."
     echo "  --target=<TARGET>                      Target to build and run for Default: ${target}"
@@ -289,6 +289,12 @@ for i in "${!test_model[@]}"; do
 
     pte_file=$(realpath ${pte_file})
 
+    if [ "${etrecord_flag}" != "" ] ; then
+        etrecord_filename="${output_folder}/${model_filename}_etrecord.bin"
+        etrecord_filename=$(realpath ${etrecord_filename})
+        etrecord_flag="--etrecord=${etrecord_filename}"
+    fi
+
     [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "pte_data_size: $(wc -c ${pte_file})"
     echo "pte_file: ${pte_file}"
@@ -322,7 +328,8 @@ for i in "${!test_model[@]}"; do
         backends/arm/scripts/build_executor_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file_or_mem}" --build_type=${build_type} --target=${target} --system_config=${system_config} --memory_mode=${memory_mode} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" --toolchain="${toolchain}" --select_ops_list="${select_ops_list}"
         if [ "$build_only" = false ] ; then
             # Execute the executor_runner on FVP Simulator
-            backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target
+
+            backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target ${etrecord_flag}
         fi
         set +x
     fi